<a href="https://colab.research.google.com/github/andresnigenda/cpd_complaints_nlp/blob/andres/lda_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **LDA of complaints against the CPD**

### *Importing Packages*

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction import text 
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from collections import Counter
import pandas as pd
import altair as alt
import numpy as np
import re

### *Pre-processing*
0. Set up allegations data from csv
1. Convert data to lowercase
2. Remove special characters (punctuation and numbers)
3. Tokenize into terms
4. Remove stop words (generic + allegation specific)
5. Stemming
6. Term document matrix


In [2]:
# 0. Load the allegation narratives from the project's CSV #
#
# location of the raw narratives file
narratives_csv_url = "https://raw.githubusercontent.com/andresnigenda/cpd_complaints_nlp/andres/narratives.csv"
df = pd.read_csv(narratives_csv_url)
# keep only the intake-allegation rows, and only the id + narrative columns
df = df.loc[df["column_name"] == "Initial / Intake Allegation", ["cr_id", "text"]]
print("There are {} complaints".format(len(df)))
# collapse rows that repeat the same (cr_id, text) pair
df = df.drop_duplicates(subset=["cr_id", "text"])
print("There are {} unique complaints".format(len(df)))
df.head()

There are 19966 complaints
There are 17001 unique complaints


Unnamed: 0,cr_id,text
0,1048960,The reporting party alleges that the\naccused ...
4,1048962,The victim alleges that an unknown male\nblack...
9,1048964,The reporting party alleges that he was a\nvi...
12,1048965,The reporting party alleges that while\nwaitin...
14,1048965,The reporting party alleges that while\nwaitin...


In [0]:
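# 1.-4. and 6. Pre-processing: lowercase, strip non-letters, tokenize, remove stop words, build the term-document matrix #
# (step 5, stemming, is not implemented in the class below)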


class PreProcess():
  '''
  Class for pre-processing a table of text documents into a sparse matrix of
  counts following scikit-learn's CountVectorizer

  Parameters:
    - raw_data: pandas DataFrame with a `text` column, one document per row
    - stopwords: optional iterable of extra stop words. When given, they are
      merged with scikit-learn's English stop-word list. When omitted (or
      empty), NO stop words are removed at all, not even the English ones.

  Source:
  https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py
  '''

  def __init__(self, raw_data, stopwords=None):
      self.raw_data = raw_data
      self.stop_words = stopwords
      # populated by _vectorize(); None until then
      self.vectorizer = None

  def text_cleaner(self, text):
      '''
      Custom text preprocessor before tokenization. Converts data to lowercase
      and replaces every character that is not a-z (punctuation, digits,
      whitespace, ...) with a space
      '''
      # convert to lowercase
      text = text.lower()
      # replace anything that is not a lowercase ASCII letter with a space
      text = re.sub('[^a-z]', ' ', text)

      return text

  def _add_stopwords(self):
      '''
      Merges the user-supplied stop words with scikit-learn's English list
      '''
      # Recent scikit-learn releases validate CountVectorizer(stop_words=...)
      # and accept only 'english', a list or None, so store a list rather than
      # the frozenset that union() returns. Sorted to keep it deterministic.
      self.stop_words = sorted(text.ENGLISH_STOP_WORDS.union(self.stop_words))

  def _vectorize(self):
      '''
      Launch a vectorizer with CountVectorizer
      '''
      # English stop words are only added when extra stop words were supplied
      if self.stop_words:
        self._add_stopwords()
      self.vectorizer = CountVectorizer(max_df=0.8,
                                        min_df=2,
                                        preprocessor=self.text_cleaner,
                                        stop_words=self.stop_words)

  def _documents(self):
      '''
      Returns the `text` column as an array of unicode strings, with missing
      values replaced by empty strings
      '''
      # astype('U') on its own would turn a missing value into the token 'nan'
      return self.raw_data['text'].fillna('').values.astype('U')

  def _feature_names(self):
      '''
      Returns the fitted vectorizer's feature names as a list
      '''
      # get_feature_names() was removed in scikit-learn 1.2 in favour of
      # get_feature_names_out(); fall back to the old name on older releases
      getter = getattr(self.vectorizer, 'get_feature_names_out', None)
      if getter is None:
          getter = self.vectorizer.get_feature_names
      return list(getter())

  def _fit_vectorizer(self):
      '''
      Builds a fresh vectorizer, fits it on the documents and returns the
      resulting document-term count matrix
      '''
      self._vectorize()
      X = self.vectorizer.fit_transform(self._documents())

      return X

  def plot_word_distributions(self, N):
      '''
      Plots frequencies for top N words. The percentage shown for each word is
      its share of the summed counts of the N plotted words (the total is
      computed after the top-N cut), not of the whole corpus.

      Note: refits the vectorizer on every call and disables altair's
      max-rows limit globally.

      Source:
        - https://altair-viz.github.io/gallery/percentage_of_total.html
      '''
      # get word list and the total count of each word across all documents
      X = self._fit_vectorizer()
      word_lst = self._feature_names()
      # ravel() flattens the column sums whether they come back 1-D or 1 x n
      counts_lst = np.asarray(X.sum(axis=0)).ravel().tolist()

      source = pd.DataFrame({'Word': word_lst, 'Count': counts_lst}).sort_values(by=['Count'], ascending=False)[:N]

      alt.data_transformers.disable_max_rows()

      plot = alt.Chart(source).transform_joinaggregate(
          TotalCount='sum(Count)',
      ).transform_calculate(
          PercentOfTotal="datum.Count / datum.TotalCount"
      ).mark_bar().encode(
          alt.X('PercentOfTotal:Q', axis=alt.Axis(format='.0%')),
          alt.Y('Word:N', sort='-x')
      )
      return plot


In [0]:
# Fit a vectorizer without any stop words and collect, for every word in the
# vocabulary, its total count across all complaints
preprocessed_df = PreProcess(df)
X = preprocessed_df._fit_vectorizer()
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name on older releases
if hasattr(preprocessed_df.vectorizer, "get_feature_names_out"):
    word_lst = list(preprocessed_df.vectorizer.get_feature_names_out())
else:
    word_lst = list(preprocessed_df.vectorizer.get_feature_names())
counts_lst = np.asarray(X.sum(axis=0)).tolist()[0]


We want to look at categories like: `nudity_penetration`, `sexual_relations_with_a_minor_`, `sexual_harassment_sexual_remarks`, `domestic_violence_police_committing_`, `sexual_humiliation_sexual_extortion_prostitution_sex_work`, `tasers_baton_aggressive_physical_touch_gun`, `trespass_robbery`, `biometric_surveillance_fitting_a_description_gang_related_`, `racial_slurs_xenophobic_remarks_`, `undocumented_status_asking_for_someone_s_status_calling_ice_`, `planting_drug_guns`, `neglect_of_duty_failure_to_serve`, `refusing_to_provide_medical_assistance`, `workplace_harassment`, `_irrational_aggressive_unstable_`, `suicide_in_jail_improper_care_`, `dcfs_threats`, `pregnant_women`, `school`, `searching_patting_down_arresting_minors`

So we will eliminate words like accused, reporting, party, alleges, officer, alleged, complainant, officers, victim, vehicle, failed, police, justification, stated, report, states, called, did, unknown, told, provide, incident, regarding, issued.

In [65]:
# Drop boilerplate complaint vocabulary (in addition to the English stop
# words that PreProcess adds) and plot the 20 most frequent remaining words
additional_stopwords = [
    "accused", "reporting", "party", "alleges", "officer",
    "alleged", "complainant", "officers", "victim", "failed",
    "police", "stated", "report", "states", "called",
    "did", "told", "provide", "incident", "regarding", "issued",
    "reported", "vehicle", "car", "justification",
]
preprocessed_df = PreProcess(df, stopwords=additional_stopwords)
preprocessed_df.plot_word_distributions(20)


In [0]:
# Top 20 words with 'it' as the only extra stop word. Passing a non-empty
# list is what makes PreProcess merge in scikit-learn's English stop-word
# list; with no extra stop words it removes none at all.
additional_stopwords = ["it"]
preprocessed_df = PreProcess(df, stopwords=additional_stopwords)
preprocessed_df.plot_word_distributions(20)


In [31]:
# `source` is never defined at notebook level (it is only a local inside
# PreProcess.plot_word_distributions), so this cell raised NameError on a
# fresh kernel. Rebuild it from the word counts computed in the earlier
# vectorizer cell (word_lst / counts_lst).
source = (
    pd.DataFrame({'Word': word_lst, 'Count': counts_lst})
    .sort_values(by=['Count'], ascending=False)
    .head(20)  # same top-20 cut-off as the plot_word_distributions(20) calls
)

alt.data_transformers.disable_max_rows()

# PercentOfTotal is each word's share of the summed counts of the plotted rows
alt.Chart(source).transform_joinaggregate(
    TotalCount='sum(Count)',
).transform_calculate(
    PercentOfTotal="datum.Count / datum.TotalCount"
).mark_bar().encode(
    alt.X('PercentOfTotal:Q', axis=alt.Axis(format='.0%')),
    alt.Y('Word:N', sort='-x')
)

