<a href="https://colab.research.google.com/github/andresnigenda/cpd_complaints_nlp/blob/andres/lda_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **LDA of complaints against the CPD**

### *Importing Packages*

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction import text 
from spacy.tokenizer import Tokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from spacy.lang.en import English
from collections import Counter
from string import punctuation
from nltk import word_tokenize
import pandas as pd
import altair as alt
import numpy as np
import nltk
import re

# Fetch the NLTK resources this notebook relies on:
#   - 'punkt': tokenizer models backing nltk.word_tokenize
#   - 'stopwords': the English stop-word list read via nltk.corpus.stopwords
# NOTE(review): newer NLTK releases may additionally require 'punkt_tab' for
# word_tokenize — confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

### *Pre-processing*
0. Set up allegations data from csv
1. Convert data to lowercase
2. Remove special characters (punctuation and numbers)
3. Tokenize into terms
4. Remove stop words (generic + allegation specific)
5. Stemming
6. Term document matrix


In [2]:
# 0. Set up allegations data from csv #
#
# Source CSV of complaint narratives (one row per narrative section).
narratives_csv_url = "https://raw.githubusercontent.com/andresnigenda/cpd_complaints_nlp/andres/narratives.csv"
narratives_raw = pd.read_csv(narratives_csv_url)

# Keep only the intake-allegation section, and only the id + text columns.
is_intake = narratives_raw["column_name"] == "Initial / Intake Allegation"
intake_allegations = narratives_raw.loc[is_intake, ["cr_id", "text"]]
print("There are {} complaints".format(len(intake_allegations)))

# Rows repeating the same (cr_id, text) pair are exact duplicates; keep the first.
df = intake_allegations.drop_duplicates(subset=["cr_id", "text"])
print("There are {} unique complaints".format(len(df)))

df.head()

There are 19966 complaints
There are 17001 unique complaints


Unnamed: 0,cr_id,text
0,1048960,The reporting party alleges that the\naccused ...
4,1048962,The victim alleges that an unknown male\nblack...
9,1048964,The reporting party alleges that he was a\nvi...
12,1048965,The reporting party alleges that while\nwaitin...
14,1048965,The reporting party alleges that while\nwaitin...


We want to look at categories like: nudity_penetration,sexual_relations_with_a_minor_,sexual_harassment_sexual_remarks,domestic_violence_police_committing_,sexual_humiliation_sexual_extortion_prostitution_sex_work,tasers_baton_aggressive_physical_touch_gun,trespass_robbery,biometric_surveillance_fitting_a_description_gang_related_,racial_slurs_xenophobic_remarks_,undocumented_status_asking_for_someone_s_status_calling_ice_,planting_drug_guns,neglect_of_duty_failure_to_serve,refusing_to_provide_medical_assistance,workplace_harassment,_irrational_aggressive_unstable_,suicide_in_jail_improper_care_,dcfs_threats,pregnant_women,school,searching_patting_down_arresting_minors

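Many words appear in almost every allegation regardless of its category, so they do not help separate these topics. We therefore remove corpus-specific stop words such as: accused, reporting, party, alleges, officer, alleged, complainant, officers, victim, vehicle, failed, police, justification, stated, report, states, called, did, unknown, told, provide, incident, regarding, issued. The exact list applied is the `additional_stopwords` set defined in the code below.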

In [0]:
class PreProcess():
    '''
    Pre-process a DataFrame of text documents into a sparse document-term
    matrix of counts built with scikit-learn's CountVectorizer.

    Parameters
    ----------
    raw_data : object exposing a ``text`` column (e.g. a pandas DataFrame)
        One document per row.
    additional_stopwords : iterable of str
        Extra stop words, added to NLTK's English stop-word list.
    stem : bool, default True
        Whether surviving tokens are reduced with the Porter stemmer.

    Attributes
    ----------
    vectorizer : CountVectorizer or None
        Set by ``_vectorize``; None until then.
    doc_term_matrix : sparse matrix or None
        Set by ``_fit_vectorizer``; None until then.

    Source:
    https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py
    '''

    def __init__(self, raw_data, additional_stopwords, stem=True):
        self.raw_data = raw_data
        self.stemmer = PorterStemmer()
        self.stop_words = set(nltk.corpus.stopwords.words('english')).union(additional_stopwords)
        self.stem = stem
        self.vectorizer = None
        self.doc_term_matrix = None

    def _tokenize_text(self, text):
        '''
        Tokenize one document into a list of cleaned terms.

        Each token from ``word_tokenize`` is lowercased and stripped of every
        character outside a-z (digits, punctuation and apostrophes included).
        Empty results and stop words are dropped. Stop words are matched
        against the un-stemmed token; stemming (on by default) happens last.
        '''
        tokenized_text = []

        for token in word_tokenize(text):
            # lowercase, then keep alphabetic characters only
            token = re.sub('[^a-z]', '', token.lower())
            if not token or token in self.stop_words:
                continue
            if self.stem:
                token = self.stemmer.stem(token)
            tokenized_text.append(token)

        return tokenized_text

    def _vectorize(self):
        '''
        Instantiate the CountVectorizer with our tokenizer as its analyzer.
        '''
        # max_df=0.8: ignore terms found in more than 80% of documents
        # min_df=2:   ignore terms found in fewer than two documents
        # analyzer:   replace the default analysis with _tokenize_text
        self.vectorizer = CountVectorizer(max_df=0.8,
                                          min_df=2,
                                          analyzer=self._tokenize_text)

    def _fit_vectorizer(self):
        '''
        Create the vectorizer, fit it on the ``text`` column and store the
        resulting counts in ``self.doc_term_matrix``.
        '''
        self._vectorize()
        # astype('U') coerces every entry (including missing values) to str
        self.doc_term_matrix = self.vectorizer.fit_transform(self.raw_data.text.values.astype('U'))

    def _feature_names(self):
        '''
        Return the fitted vocabulary as a list of plain ``str``.

        Prefers ``get_feature_names_out`` and falls back to the older
        ``get_feature_names`` when the former is not available.
        '''
        if hasattr(self.vectorizer, 'get_feature_names_out'):
            names = self.vectorizer.get_feature_names_out()
        else:
            names = self.vectorizer.get_feature_names()
        return [str(name) for name in names]

    def _word_counts(self, N):
        '''
        Return a DataFrame with columns ``Word`` and ``Count`` holding the N
        most frequent terms, sorted by descending count. Fits the vectorizer
        first if that has not happened yet.
        '''
        # Compare against None explicitly: truth-testing a sparse matrix
        # raises ValueError, which made every call after the first one fail.
        if self.doc_term_matrix is None:
            self._fit_vectorizer()

        word_lst = self._feature_names()
        # column sums = total occurrences of each term over all documents
        counts_lst = np.asarray(self.doc_term_matrix.sum(axis=0)).ravel().tolist()

        counts = pd.DataFrame({'Word': word_lst, 'Count': counts_lst})
        return counts.sort_values(by=['Count'], ascending=False)[:N]

    def plot_word_distributions(self, N):
        '''
        Plots frequencies for top N words.

        The x axis shows each word's share of the summed counts of the N
        plotted words (not of the whole corpus). Returns the Altair chart.

        Source:
          - https://altair-viz.github.io/gallery/percentage_of_total.html
        '''
        source = self._word_counts(N)

        # lifts Altair's row limit globally for this session
        alt.data_transformers.disable_max_rows()

        plot = alt.Chart(source).transform_joinaggregate(
            TotalCount='sum(Count)',
        ).transform_calculate(
            PercentOfTotal="datum.Count / datum.TotalCount"
        ).mark_bar().encode(
            alt.X('PercentOfTotal:Q', axis=alt.Axis(format='.0%')),
            alt.Y('Word:N', sort='-x')
        )
        return plot

In [0]:
# Allegation-specific boilerplate terms, merged with scikit-learn's built-in
# English stop-word list.
additional_stopwords = {
    "accused", "alleged", "alleges", "called", "car", "complainant",
    "did", "district", "incident", "issued", "justification", "number",
    "officer", "officers", "party", "police", "provide", "regarding",
    "report", "reported", "reporting", "scene", "stated", "states",
    "th", "threatened", "told", "uniformed", "vehicle", "victim",
    "witness",
} | text.ENGLISH_STOP_WORDS

In [7]:
# Stemmed pipeline: chart the 20 most frequent stems.
# The plot call is also what fits the vectorizer and fills doc_term_matrix.
stemmed_data = PreProcess(raw_data=df,
                          additional_stopwords=additional_stopwords,
                          stem=True)
stemmed_data.plot_word_distributions(N=20)

In [177]:
# Un-stemmed pipeline: chart the 50 most frequent surface forms.
non_stemmed_data = PreProcess(raw_data=df,
                              additional_stopwords=additional_stopwords,
                              stem=False)
non_stemmed_data.plot_word_distributions(N=50)

### *Analysis*

In [8]:
# 30-topic LDA with a fixed random_state.
# NOTE: stemmed_data.doc_term_matrix is only populated once
# stemmed_data.plot_word_distributions(...) has run in the cell above.
LDA = LatentDirichletAllocation(random_state=8, n_components=30)
stemmed_counts = stemmed_data.doc_term_matrix
LDA.fit(stemmed_counts)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=30, n_jobs=None,
                          perp_tol=0.1, random_state=8, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [10]:
# Resolve the vocabulary once instead of once per topic.
# get_feature_names() is gone from recent scikit-learn releases, so prefer
# get_feature_names_out() and fall back to the legacy accessor.
_vectorizer = stemmed_data.vectorizer
if hasattr(_vectorizer, 'get_feature_names_out'):
    _vocab = _vectorizer.get_feature_names_out()
else:
    _vocab = _vectorizer.get_feature_names()
# cast to plain str so the printed lists look the same whatever type is returned
feature_names = [str(term) for term in _vocab]

for i, topic in enumerate(LDA.components_):
    print(f'Top 10 words for topic {i + 1}:')
    # argsort is ascending, so the last 10 indices are the highest-weighted terms
    print([feature_names[term_idx] for term_idx in topic.argsort()[-10:]])
    print('\n')

Top 10 words for topic 1:
['plaintiff', 'charg', 'impound', 'probabl', 'plant', 'possess', 'caus', 'drug', 'fals', 'arrest']


Top 10 words for topic 2:
['handcuf', 'stop', 'femal', 'aggress', 'white', 'possibl', 'manner', 'male', 'appar', 'reason']


Top 10 words for topic 3:
['refus', 'resid', 'assault', 'secur', 'assist', 'servic', 'action', 'time', 'fail', 'respond']


Top 10 words for topic 4:
['radio', 'miss', 'indebted', 'properti', 'unknown', 'inform', 'citi', 'member', 'chicago', 'depart']


Top 10 words for topic 5:
['black', 'court', 'order', 'number', 'male', 'possibl', 'name', 'refus', 'request', 'star']


Top 10 words for topic 6:
['listen', 'spoke', 'ask', 'nt', 'refus', 'disrespect', 'traffic', 'yell', 'unprofession', 'rude']


Top 10 words for topic 7:
['rd', 'inform', 'quot', 'assign', 'thorough', 'conduct', 'case', 'detect', 'fail', 'investig']


Top 10 words for topic 8:
['state', 'physic', 'nigger', 'alterc', 'bitch', 'fuck', 'ass', 'refer', 'abus', 'verbal']


Top