# **LDA of complaints against the CPD (Chi version)**

### *Importing Packages*

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction import text 
from spacy.tokenizer import Tokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from spacy.lang.en import English
from collections import Counter
from string import punctuation
from nltk import word_tokenize
import pandas as pd
import altair as alt
import numpy as np
import nltk
import re

# Fetch the NLTK resources this notebook relies on: the 'punkt' tokenizer
# models (used by word_tokenize) and the 'stopwords' corpus (used when building
# stop-word sets). The return value of the last call is the cell's output.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

### *Pre-processing*
0. Set up allegations data from csv
1. Convert data to lowercase
2. Remove special characters (punctuation and numbers)
3. Tokenize into terms
4. Remove stop words (generic + allegation specific)
5. Stemming
6. Term document matrix


In [0]:
# 0. Set up allegations data from csv #
#
# Read the narratives csv and keep only the intake-allegation rows.
narratives_csv_url = "https://raw.githubusercontent.com/andresnigenda/cpd_complaints_nlp/andres/narratives.csv"
narratives_raw = pd.read_csv(narratives_csv_url)
intake_df = narratives_raw[narratives_raw.column_name == "Initial / Intake Allegation"]

# Check whether the duplicates are caused by variation in columns other than
# cr_id and text. `drop` returns a new frame here (no inplace=True), so the
# filtered slice is never mutated and pandas has no reason to emit a
# SettingWithCopyWarning.
filtered_df = (
    intake_df
    .drop(columns=['pdf_name', 'doccloud_url', 'dropbox_path', 'page_num'])
    .drop_duplicates()
    .reset_index()
)
print("There are {} complaints after dropping columns".format(filtered_df.shape[0]))
print(filtered_df)

# Most of the duplicates come from the other columns, so we keep only rows
# that are unique on (cr_id, text).
df = intake_df[['cr_id', 'text']]
print("There are {} complaints".format(df.shape[0]))
# Drop allegations with the same id + text. The explicit copy makes `df` an
# independent frame, so later cells can add or overwrite columns on it safely.
df = df.drop_duplicates(['cr_id', 'text']).copy()
print("There are {} unique complaints".format(df.shape[0]))
df.head()

There are 17059 complaints after dropping columns
       index  ...  batch_id
0          0  ...         1
1          4  ...         1
2          9  ...         1
3         12  ...         1
4         14  ...         1
...      ...  ...       ...
17054  30703  ...         5
17055  30706  ...         5
17056  30707  ...         5
17057  30711  ...         5
17058  30715  ...         5

[17059 rows x 6 columns]
There are 19966 complaints
There are 17001 unique complaints


Unnamed: 0,cr_id,text
0,1048960,The reporting party alleges that the\naccused ...
4,1048962,The victim alleges that an unknown male\nblack...
9,1048964,The reporting party alleges that he was a\nvi...
12,1048965,The reporting party alleges that while\nwaitin...
14,1048965,The reporting party alleges that while\nwaitin...


In [0]:
# The text column of some rows contains the same content twice in a row.
# The functions below help detect those rows.

from sklearn.metrics.pairwise import cosine_similarity

def cosine_sim_vectors(vec1, vec2):
  '''
  Cosine similarity between two vectors, returned as a single scalar.

  Each input is reshaped into a one-row matrix before being handed to
  sklearn's cosine_similarity; the only entry of the resulting 1x1 matrix
  is returned.
  '''
  row_a, row_b = (vec.reshape(1, -1) for vec in (vec1, vec2))
  similarity_matrix = cosine_similarity(row_a, row_b)
  return similarity_matrix[0][0]

# Cache for the NLTK stop-word list so the corpus file is read once rather
# than on every call (the function is applied to every row of the data).
_STOPWORDS_CACHE = {}

def _english_stopwords():
  '''Return NLTK's English stop words as a frozenset, loading them only once.'''
  if 'english' not in _STOPWORDS_CACHE:
    _STOPWORDS_CACHE['english'] = frozenset(nltk.corpus.stopwords.words('english'))
  return _STOPWORDS_CACHE['english']

def split_half_and_compare(str_input, threshold=.9):
  '''
  Decide whether a string looks like the same content written twice.

  The string is cut at its midpoint (for odd lengths the middle character is
  skipped), both halves are turned into bag-of-words count vectors with
  CountVectorizer, and the halves are compared by cosine similarity.

  Parameters:
    str_input: the text to inspect.
    threshold: similarity above which the halves count as duplicates
      (defaults to .9).

  Returns:
    True when the similarity of the two halves exceeds `threshold`.
    False for strings shorter than two characters, for 'nfi'/'NFI', for a
    string that is itself an English stop word, and for strings whose halves
    contain no token CountVectorizer can use.
  '''
  if len(str_input) < 2 or str_input in ['nfi', 'NFI'] or str_input in _english_stopwords():
    return False

  midpoint = len(str_input) // 2
  first_half = str_input[:midpoint]
  # For odd lengths start one character later so both halves are equally long.
  second_half = str_input[midpoint + len(str_input) % 2:]

  try:
    counts = CountVectorizer().fit_transform([first_half, second_half])
  except ValueError:
    # CountVectorizer raises ValueError ("empty vocabulary") when neither half
    # yields a token, e.g. for "12"; such text cannot be a duplicated sentence.
    return False

  vectors = counts.toarray()
  similarity_score = cosine_sim_vectors(vectors[0], vectors[1])
  return bool(similarity_score > threshold)

# Spot-check the detector on one narrative (positional row 17055 of
# filtered_df); the result of the call is the cell's displayed value.
test_str = filtered_df['text'].iloc[17055]
split_half_and_compare(test_str)

True

In [0]:
# Record each narrative's length and whether its two halves look identical.
df['TextCount'] = df['text'].map(len)
df['DuplicateText'] = df['text'].map(split_half_and_compare)

# Collect the flagged rows and print one of them as a manual sanity check.
is_duplicated = df['DuplicateText'] == True
duplicated_df = df[is_duplicated].reset_index()
sample_text = duplicated_df.iloc[4]['text']
print(sample_text)
print("")
n_duplicated = duplicated_df.shape[0]
print("Threre are {} rows with duplicated content issue".format(n_duplicated))

The reporting party alleges that the accused
officers failed to inventory or return his
identification
The reporting party alleges that the accused
officers failed to inventory or return his
identification

Threre are 726 rows with duplicated content issue


In [0]:
# For rows flagged in DuplicateText, keep only the first half of the text;
# all other rows keep their text unchanged.
# NOTE: this overwrites df['text'] while DuplicateText stays True, so running
# the cell a second time would halve the flagged texts again.
def _keep_first_half(row):
  narrative = row['text']
  if row['DuplicateText'] == False:
    return narrative
  return narrative[0:len(narrative)//2]

df['text'] = df.apply(_keep_first_half, axis=1)
# Double-check the result on the same flagged row that was inspected earlier.
flagged_rows = df[df['DuplicateText'] == True].reset_index()
flagged_rows.iloc[4]['text']

'The reporting party alleges that the accused\nofficers failed to inventory or return his\nidentification'

We want to look at categories like: nudity_penetration,sexual_relations_with_a_minor_,sexual_harassment_sexual_remarks,domestic_violence_police_committing_,sexual_humiliation_sexual_extortion_prostitution_sex_work,tasers_baton_aggressive_physical_touch_gun,trespass_robbery,biometric_surveillance_fitting_a_description_gang_related_,racial_slurs_xenophobic_remarks_,undocumented_status_asking_for_someone_s_status_calling_ice_,planting_drug_guns,neglect_of_duty_failure_to_serve,refusing_to_provide_medical_assistance,workplace_harassment,_irrational_aggressive_unstable_,suicide_in_jail_improper_care_,dcfs_threats,pregnant_women,school,searching_patting_down_arresting_minors

So we will eliminate words like accused, reporting, party, alleges, officer, alleged, complainant, officers, victim, vehicle, failed, police, justification, stated, report, states, called, did, unknown, told, provide, incident, regarding, issued.

In [0]:
class PreProcess():
  '''
  Pre-processes a table of text documents into a sparse document-term matrix
  of counts, using scikit-learn's CountVectorizer with a custom analyzer.

  Parameters:
    raw_data: object exposing the documents as `raw_data.text` (the notebook
      passes a pandas DataFrame with a `text` column).
    additional_stopwords: iterable of words to drop in addition to NLTK's
      English stop words.
    stem: when True (default) every kept token is Porter-stemmed.

  Attributes:
    vectorizer: the CountVectorizer, or None until the data has been fitted.
    doc_term_matrix: the fitted sparse count matrix, or None until fitted.
      Both are populated lazily by plot_word_distributions.

  Source:
  https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py
  '''

  def __init__(self, raw_data, additional_stopwords, stem=True):
    self.raw_data = raw_data
    self.stemmer = PorterStemmer()
    self.stop_words = set(nltk.corpus.stopwords.words('english')).union(additional_stopwords)
    self.stem = stem
    self.vectorizer = None
    self.doc_term_matrix = None

  def _tokenize_text(self, text):
    '''
    Analyzer used by the vectorizer: tokenizes `text`, lowercases each token,
    removes every character that is not a-z, drops empty tokens and stop
    words, and stems what is left when `self.stem` is set.

    Returns the list of resulting tokens.
    '''
    tokenized_text = []

    for token in word_tokenize(text):
      # lowercase, then strip everything that is not a plain letter
      # (digits, punctuation and apostrophes are all removed)
      token = re.sub('[^a-z]', '', token.lower())
      # stop words are matched on the un-stemmed token
      if not token or token in self.stop_words:
        continue
      tokenized_text.append(self.stemmer.stem(token) if self.stem else token)

    return tokenized_text

  def _vectorize(self):
    '''
    Instantiate the CountVectorizer with our tokenizer as its analyzer.
    '''
    # Terms appearing in more than 80% of the documents, or in fewer than two
    # documents, are dropped from the vocabulary.
    self.vectorizer = CountVectorizer(max_df=0.8,
                                      min_df=2,
                                      analyzer=self._tokenize_text)

  def _fit_vectorizer(self):
    '''
    Create the vectorizer, fit it on the documents and store the resulting
    document-term matrix in `self.doc_term_matrix`.
    '''
    self._vectorize()
    self.doc_term_matrix = self.vectorizer.fit_transform(self.raw_data.text.values.astype('U'))

  def _ensure_fitted(self):
    '''
    Fit the vectorizer the first time it is needed.
    '''
    # Compare against None explicitly: a SciPy sparse matrix with more than
    # one element raises ValueError when evaluated in a boolean context.
    if self.doc_term_matrix is None:
      self._fit_vectorizer()

  def _feature_names(self):
    '''
    Return the fitted vocabulary as a list of terms, in column order.
    '''
    # get_feature_names() was removed in scikit-learn 1.2; prefer the newer
    # get_feature_names_out() and fall back for older installations.
    getter = getattr(self.vectorizer, 'get_feature_names_out', None)
    if getter is None:
      getter = self.vectorizer.get_feature_names
    return list(getter())

  def plot_word_distributions(self, N):
    '''
    Plot the N most frequent terms as a bar chart, each bar showing the
    term's share of the total count among those N terms. Fits the vectorizer
    first if that has not happened yet.

    Returns the Altair chart.

    Source:
      - https://altair-viz.github.io/gallery/percentage_of_total.html
    '''
    self._ensure_fitted()

    word_lst = self._feature_names()
    # column sums of the document-term matrix = total count per term
    counts_lst = np.asarray(self.doc_term_matrix.sum(axis=0)).tolist()[0]

    source = pd.DataFrame({'Word': word_lst, 'Count': counts_lst}).sort_values(by=['Count'], ascending=False)[:N]

    alt.data_transformers.disable_max_rows()

    plot = alt.Chart(source).transform_joinaggregate(
        TotalCount='sum(Count)',
    ).transform_calculate(
        PercentOfTotal="datum.Count / datum.TotalCount"
    ).mark_bar().encode(
        alt.X('PercentOfTotal:Q', axis=alt.Axis(format='.0%')),
        alt.Y('Word:N', sort='-x')
    )
    return plot

In [0]:
# Words that are generic to allegation narratives and therefore carry no
# topical signal, merged with scikit-learn's built-in English stop words.
additional_stopwords = {
    "accused", "reporting", "party", "alleges", "officer",
    "alleged", "complainant", "officers", "victim",
    "police", "stated", "report", "states", "called",
    "did", "told", "provide", "incident", "regarding", "issued",
    "reported", "vehicle", "car", "justification",
    "district", "uniformed", "threatened", "witness", "th",
    "number", "scene",
} | set(text.ENGLISH_STOP_WORDS)

In [0]:
# Word frequencies with stemming enabled: chart of the 20 most frequent stems.
stemmed_data = PreProcess(raw_data=df,
                          additional_stopwords=additional_stopwords,
                          stem=True)
stemmed_data.plot_word_distributions(N=20)

In [0]:
# Word frequencies with stemming disabled: chart of the 50 most frequent words.
non_stemmed_data = PreProcess(raw_data=df,
                              additional_stopwords=additional_stopwords,
                              stem=False)
non_stemmed_data.plot_word_distributions(N=50)

### *Analysis*

In [0]:
# Fit a 20-topic LDA model (fixed seed) on the stemmed document-term matrix.
# stemmed_data.doc_term_matrix is only populated once
# stemmed_data.plot_word_distributions has been called, so that cell must run first.
LDA = LatentDirichletAllocation(random_state=8, n_components=20)
LDA.fit(X=stemmed_data.doc_term_matrix)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=20, n_jobs=None,
                          perp_tol=0.1, random_state=8, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [0]:
# Look the vocabulary up once instead of on every iteration.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back for older installations.
topic_vectorizer = stemmed_data.vectorizer
if hasattr(topic_vectorizer, 'get_feature_names_out'):
    feature_names = topic_vectorizer.get_feature_names_out()
else:
    feature_names = topic_vectorizer.get_feature_names()

for i, topic in enumerate(LDA.components_):
    print(f'Top 10 words for topic {i + 1}:')
    # argsort is ascending, so the last ten indices are the ten highest-weighted
    # terms of the topic, listed from lowest to highest weight.
    print([str(feature_names[term_idx]) for term_idx in topic.argsort()[-10:]])
    print('\n')

Top 10 words for topic 1:
['refus', 'place', 'allow', 'alleg', 'probabl', 'victim', 'search', 'fals', 'caus', 'arrest']


Top 10 words for topic 2:
['answer', 'inform', 'refus', 'unprofession', 'supervisor', 'phone', 'hung', 'rude', 'speak', 'telephon']


Top 10 words for topic 3:
['person', 'secur', 'properti', 'stolen', 'case', 'thorough', 'inform', 'conduct', 'investig', 'fail']


Top 10 words for topic 4:
['took', 'cell', 'properti', 'card', 'remov', 'usc', 'phone', 'fail', 'inventori', 'return']


Top 10 words for topic 5:
['unknown', 'home', 'damag', 'permiss', 'apart', 'warrant', 'door', 'enter', 'search', 'resid']


Top 10 words for topic 6:
['plant', 'drug', 'possess', 'assault', 'refus', 'batteri', 'fail', 'fals', 'offend', 'arrest']


Top 10 words for topic 7:
['possibl', 'supervisor', 'gun', 'process', 'unknown', 'white', 'femal', 'black', 'subject', 'male']


Top 10 words for topic 8:
['appar', 'reason', 'femal', 'plaincloth', 'hispan', 'unknown', 'possibl', 'black', 'whit