<a href="https://colab.research.google.com/github/andresnigenda/cpd_complaints_nlp/blob/andres/lda_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **LDA of complaints against the CPD**

### *Importing Packages*

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction import text 
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from collections import Counter
import pandas as pd
import re

### *Pre-processing*
0. Set up allegations data from csv
1. Convert data to lowercase
2. Remove special characters (punctuation and numbers)
3. Tokenize into terms
4. Remove stop words (generic + allegation specific)
5. Stemming
6. Term document matrix


In [66]:
# 0. Set up allegations data from csv #
#
# read in csv
narratives_csv_url = "https://raw.githubusercontent.com/andresnigenda/cpd_complaints_nlp/andres/narratives.csv"
df = pd.read_csv(narratives_csv_url)
# filter to relevant section
df = df[df.column_name == "Initial / Intake Allegation"]
# filter to relevant columns
df = df[['cr_id', 'text']]
print("There are {} complaints".format(df.shape[0]))
# drop allegations with same id + text
df = df.drop_duplicates(['cr_id', 'text'])
print("There are {} unique complaints".format(df.shape[0]))
#allegations_lst = df['text'].to_list()
df.head()

There are 19966 complaints
There are 17001 unique complaints


Unnamed: 0,cr_id,text
0,1048960,The reporting party alleges that the\naccused ...
4,1048962,The victim alleges that an unknown male\nblack...
9,1048964,The reporting party alleges that he was a\nvi...
12,1048965,The reporting party alleges that while\nwaitin...
14,1048965,The reporting party alleges that while\nwaitin...


In [0]:
# 1. Convert to lowercase #
#
class PreProcess():
  '''
  Class for pre-processing a csv of text documents into a  sparse matrix of
  counts following scikit-learn's CountVectorizer

  Source:
  https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py
  '''
  def __init__(self, raw_data):
    self.raw_data = raw_data
    self.preprocessor = preprocess_text
    self.vectorizer = CountVectorizer(max_df=0.8, 
                                      min_df=2, 
                                      preprocessor=self.preprocessor)

  @staticmethod
  def preprocess_text(text):
    '''
    Custom text preprocessor before tokenization. Converts data to lowercase and
    removes special characters (punctuation and numbers)
    '''
    # to lowercase
    text = text.lower()
    # no digits
    text = re.sub(r'\d+', '', text)
    # strip punctuation
    text = re.sub(r'[^\w\s]', '', text)

    return text

  def _fit_vectorizer(self):
    '''
    Fit vectorizer
    '''
    self.vectorizer.fit_transform(self.raw_data.text.values.astype('U'))


In [93]:
preprocessed_df = PreProcess(df)
preprocessed_df._fit_vectorizer()
preprocessed_df.vectorizer.get_feature_names()

['__',
 '___',
 '____',
 '_____',
 '___and',
 '__and',
 '__for',
 '__while',
 '_and',
 '_had',
 '_the',
 'aa',
 'aaccused',
 'abandoned',
 'abandonment',
 'abc',
 'abdomen',
 'abduct',
 'abducted',
 'abduction',
 'abetting',
 'abide',
 'ability',
 'abitch',
 'able',
 'abour',
 'about',
 'above',
 'abrasive',
 'abrupt',
 'abruptly',
 'absence',
 'absences',
 'absent',
 'abuse',
 'abused',
 'abuser',
 'abuses',
 'abusing',
 'abusive',
 'ac',
 'academy',
 'acase',
 'acccused',
 'accelerated',
 'accelerating',
 'accents',
 'accept',
 'acceptable',
 'acceptance',
 'accepted',
 'access',
 'accessible',
 'acciden',
 'accident',
 'accidentally',
 'accidently',
 'accommodation',
 'accompanied',
 'accompany',
 'accompanying',
 'accord',
 'accordance',
 'according',
 'accosted',
 'account',
 'accountant',
 'accounted',
 'accounts',
 'accredited',
 'accued',
 'accurate',
 'accurately',
 'accurred',
 'accus',
 'accusation',
 'accusations',
 'accusatory',
 'accuse',
 'accused',
 'accusedarresting',


In [77]:
vectorizer = PreProcess(df)._vectorize()
vectorizer.fit_transform(df.text.values.astype('U'))
vectorizer.get_feature_names()

NotFittedError: ignored

In [53]:
def preprocess_text(text):
    '''
    Custom text preprocessor before tokenization. Converts data to lowercase and
    removes special characters (punctuation and numbers)
    '''
    # to lowercase
    text = text.lower()
    # no digits
    text = re.sub(r'\d+', '', text)
    # strip punctuation
    text = re.sub(r'[^\w\s]', '', text)
    return text
vectorizer = CountVectorizer(max_df=0.8, min_df=2, preprocessor=preprocess_text)
vectorized_corpus = vectorizer.fit_transform(df.text.values.astype('U'))
vectorizer.get_feature_names()

['__',
 '___',
 '____',
 '_____',
 '___and',
 '__and',
 '__for',
 '__while',
 '_and',
 '_had',
 '_the',
 'aa',
 'aaccused',
 'abandoned',
 'abandonment',
 'abc',
 'abdomen',
 'abduct',
 'abducted',
 'abduction',
 'abetting',
 'abide',
 'ability',
 'abitch',
 'able',
 'abour',
 'about',
 'above',
 'abrasive',
 'abrupt',
 'abruptly',
 'absence',
 'absences',
 'absent',
 'abuse',
 'abused',
 'abuser',
 'abuses',
 'abusing',
 'abusive',
 'ac',
 'academy',
 'acase',
 'acccused',
 'accelerated',
 'accelerating',
 'accents',
 'accept',
 'acceptable',
 'acceptance',
 'accepted',
 'access',
 'accessible',
 'acciden',
 'accident',
 'accidentally',
 'accidently',
 'accommodation',
 'accompanied',
 'accompany',
 'accompanying',
 'accord',
 'accordance',
 'according',
 'accosted',
 'account',
 'accountant',
 'accounted',
 'accounts',
 'accredited',
 'accued',
 'accurate',
 'accurately',
 'accurred',
 'accus',
 'accusation',
 'accusations',
 'accusatory',
 'accuse',
 'accused',
 'accusedarresting',
