In [None]:
import base64
import numpy as np
import pandas as pd
import psycopg2
import re
import spacy
import nltk
import numexpr
nltk.download('punkt')

from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from wordcloud import WordCloud, STOPWORDS
from pprint import pprint
from nltk.corpus import stopwords
from matplotlib import pyplot as plt
%matplotlib inline

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

!pip install -U pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

  """)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


  from collections import Iterable


In [None]:
conn = psycopg2.connect(
    host="codd04.research.northwestern.edu",
    port = "5433",
    database="postgres",
    user="cpdbstudent",
    password="DataSci4AI")

In [None]:
cursor = conn.cursor()

In [None]:
data_allegation_summary = "select \
      da.crid, da.cr_text, doa.disciplined \
    from \
      (select crid, cr_text from data_allegation where cr_text is not null) as da \
        left join (select allegation_id, disciplined from data_officerallegation group by 1,2) as doa \
          on da.crid = doa.allegation_id \
"

In [None]:
cursor.execute(data_allegation_summary)
summaries = cursor.fetchall()
print("shape is: " + str(len(summaries))) # 186983


df_summaries = pd.DataFrame(summaries)
colnames = [desc[0] for desc in cursor.description]
df_summaries.columns = colnames
print(df_summaries.shape)

shape is: 11303
(11303, 3)


In [None]:
df_summaries.head

<bound method NDFrame.head of           crid                                            cr_text disciplined
0      1062172  Initial / Intake Allegation 1: The complainant...       False
1      1054276  Initial / Intake Allegation 1: The reporting p...       False
2      1049008  Initial / Intake Allegation 1: The reporting p...       False
3      1056461  Initial / Intake Allegation 1:  The reporting ...       False
4      1049161  Initial / Intake Allegation 1: The reporting p...       False
...        ...                                                ...         ...
11298  1053289  Initial / Intake Allegation 1:  The reporting ...        None
11299  1054371  Initial / Intake Allegation 1:  The reporting ...        None
11300  1053798  Initial / Intake Allegation 1:  The complainan...        None
11301  1059891  Initial / Intake Allegation 4: It is reported ...        None
11302  1049365  Initial / Intake Allegation 1:  The reporting ...        None

[11303 rows x 3 columns]>

In [None]:
df_summaries.disciplined.value_counts()

False    2875
True      197
Name: disciplined, dtype: int64

In [None]:
df_summaries_disciplined = df_summaries[df_summaries.disciplined == True]

df_summaries_disciplined.shape

(197, 3)

In [None]:
df_summaries_notdisciplined = df_summaries[df_summaries.disciplined != True]

df_summaries_notdisciplined.shape

(11106, 3)

In [None]:
data_discplined = df_summaries_disciplined['cr_text'].values.tolist()

data_discplined = [re.sub('\n+', ' ', sent) for sent in data_discplined]

In [None]:
data_notdiscplined = df_summaries_notdisciplined['cr_text'].values.tolist()

data_notdiscplined = [re.sub('\n+', ' ', sent) for sent in data_notdiscplined]

In [None]:
#Let’s tokenize each sentence into a list of words, removing punctuations and unnecessary characters altogether.
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

disciplined_words = list(sent_to_words(data_discplined))
notdisciplined_words = list(sent_to_words(data_notdiscplined))

pprint(disciplined_words[:1])
pprint(notdisciplined_words[:1])

[['initial',
  'intake',
  'allegation',
  'it',
  'is',
  'reported',
  'that',
  'the',
  'accused',
  'officers',
  'submitted',
  'false',
  'police',
  'report',
  'in',
  'that',
  'they',
  'identified',
  'officers',
  'who',
  'did',
  'not',
  'participate',
  'in',
  'an',
  'arrest',
  'and',
  'identified',
  'those',
  'officers',
  'as',
  'first',
  'arresting',
  'officer',
  'and',
  'second',
  'arresting',
  'officer',
  'it',
  'is',
  'reported',
  'that',
  'the',
  'accused',
  'officers',
  'submitted',
  'false',
  'police',
  'report',
  'in',
  'that',
  'they',
  'identified',
  'officers',
  'who',
  'did',
  'not',
  'participate',
  'in',
  'an',
  'arrest',
  'and',
  'identified',
  'those',
  'officers',
  'as',
  'first',
  'arresting',
  'officer',
  'and',
  'second',
  'arresting',
  'officer',
  'initial',
  'intake',
  'allegation',
  'it',
  'is',
  'reported',
  'that',
  'the',
  'accused',
  'officers',
  'submitted',
  'false',
  'police',


In [None]:
# Build the bigram and trigram models
bigram_disc = gensim.models.Phrases(disciplined_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram_disc = gensim.models.Phrases(bigram_disc[disciplined_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_disc_mod = gensim.models.phrases.Phraser(bigram_disc)
trigram_disc_mod = gensim.models.phrases.Phraser(trigram_disc)

# See trigram example
print(trigram_disc_mod[bigram_disc_mod[disciplined_words[0]]])



['initial', 'intake', 'allegation', 'it', 'is', 'reported', 'that', 'the', 'accused', 'officers', 'submitted', 'false', 'police', 'report', 'in', 'that', 'they', 'identified', 'officers', 'who', 'did', 'not', 'participate', 'in', 'an', 'arrest', 'and', 'identified', 'those', 'officers', 'as', 'first', 'arresting', 'officer', 'and', 'second', 'arresting', 'officer', 'it', 'is', 'reported', 'that', 'the', 'accused', 'officers', 'submitted', 'false', 'police', 'report', 'in', 'that', 'they', 'identified', 'officers', 'who', 'did', 'not', 'participate', 'in', 'an', 'arrest', 'and', 'identified', 'those', 'officers', 'as', 'first', 'arresting', 'officer', 'and', 'second', 'arresting', 'officer', 'initial', 'intake', 'allegation', 'it', 'is', 'reported', 'that', 'the', 'accused', 'officers', 'submitted', 'false', 'police', 'report', 'in', 'that', 'they', 'identified', 'officers', 'who', 'did', 'not', 'participate', 'in', 'an', 'arrest', 'and', 'identified', 'those', 'officers', 'as', 'first'

In [None]:
# Build the bigram and trigram models
bigram_notdisc = gensim.models.Phrases(disciplined_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram_notdisc = gensim.models.Phrases(bigram_notdisc[disciplined_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_notdisc_mod = gensim.models.phrases.Phraser(bigram_notdisc)
trigram_notdisc_mod = gensim.models.phrases.Phraser(trigram_notdisc)

# See trigram example
print(trigram_notdisc_mod[bigram_notdisc_mod[disciplined_words[0]]])



['initial', 'intake', 'allegation', 'it', 'is', 'reported', 'that', 'the', 'accused', 'officers', 'submitted', 'false', 'police', 'report', 'in', 'that', 'they', 'identified', 'officers', 'who', 'did', 'not', 'participate', 'in', 'an', 'arrest', 'and', 'identified', 'those', 'officers', 'as', 'first', 'arresting', 'officer', 'and', 'second', 'arresting', 'officer', 'it', 'is', 'reported', 'that', 'the', 'accused', 'officers', 'submitted', 'false', 'police', 'report', 'in', 'that', 'they', 'identified', 'officers', 'who', 'did', 'not', 'participate', 'in', 'an', 'arrest', 'and', 'identified', 'those', 'officers', 'as', 'first', 'arresting', 'officer', 'and', 'second', 'arresting', 'officer', 'initial', 'intake', 'allegation', 'it', 'is', 'reported', 'that', 'the', 'accused', 'officers', 'submitted', 'false', 'police', 'report', 'in', 'that', 'they', 'identified', 'officers', 'who', 'did', 'not', 'participate', 'in', 'an', 'arrest', 'and', 'identified', 'those', 'officers', 'as', 'first'

In [None]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts, bigram_mod):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts, trigram_mod, bigram_mod):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
stop_words = stopwords.words('english')
data_disc_nostops = remove_stopwords(disciplined_words)
data_notdisc_nostops = remove_stopwords(notdisciplined_words)

# Form Bigrams
data_disc_bigrams = make_bigrams(data_disc_nostops, bigram_disc)
data_notdisc_bigrams = make_bigrams(data_notdisc_nostops, bigram_notdisc)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_disc_lemmatized = lemmatization(data_disc_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
data_notdisc_lemmatized = lemmatization(data_notdisc_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

pprint(data_disc_lemmatized[:1])
pprint(data_notdisc_lemmatized[:1])

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.

[['initial',
  'intake',
  'allegation',
  'report',
  'accuse',
  'officer',
  'submit',
  'false',
  'police',
  'report',
  'identify',
  'officer',
  'participate',
  'arrest',
  'identify',
  'officer',
  'first',
  'arrest',
  'officer',
  'second',
  'arrest',
  'officer',
  'report',
  'accuse',
  'officer',
  'submit',
  'false',
  'police',
  'report',
  'identify',
  'officer',
  'participate',
  'arrest',
  'identify',
  'officer',
  'first',
  'arrest',
  'officer',
  'second',
  'arrest',
  'officer',
  'initial',
  'intake',
  'allegation',
  'report',
  'accuse',
  'officer',
  'submit',
  'false',
  'police',
  'report',
  'identify',
  'officer',
  'participate',
  'arrest',
  'identify',
  'officer',
  'first',
  'arrest',
  'officer',
  'second',
  'arrest',
  'officer',
  'report',
  'accuse',
  'officer',
  'submit',
  'false',
  'police',
  'report',
  'identify',
  'officer',
  'participate',
  'arrest',
  'identify',
  'officer',
  'first',
  'arrest',
  'offic

  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILAB

In [None]:
#The two main inputs to the LDA topic model are the dictionary(id2word) and the corpus. Let’s create them.
# Create Dictionary
id2word_disc = corpora.Dictionary(data_disc_lemmatized)
id2word_notdisc = corpora.Dictionary(data_notdisc_lemmatized)

# Create Corpus
texts_disc = data_disc_lemmatized
texts_notdisc = data_notdisc_lemmatized

# Term Document Frequency
corpus_disc = [id2word_disc.doc2bow(text) for text in texts_disc]
corpus_notdisc = [id2word_notdisc.doc2bow(text) for text in texts_notdisc]

# View
print(corpus_disc[:1])
print(corpus_notdisc[:1])

[[(0, 8), (1, 4), (2, 3), (3, 2), (4, 1), (5, 5), (6, 13), (7, 3), (8, 1), (9, 4), (10, 1), (11, 1), (12, 1), (13, 4), (14, 4), (15, 5), (16, 8), (17, 4), (18, 1), (19, 2), (20, 2), (21, 2), (22, 23), (23, 3), (24, 4), (25, 4), (26, 8), (27, 1), (28, 14), (29, 1), (30, 4), (31, 4), (32, 4), (33, 1), (34, 10)]]
[[(0, 2), (1, 2), (2, 2), (3, 2), (4, 1), (5, 2), (6, 1), (7, 2), (8, 2), (9, 1), (10, 2), (11, 2), (12, 2)]]


In [None]:
[[(id2word_disc[id], freq) for id, freq in cp] for cp in corpus_disc[:1]]

[[(id2word_notdisc[id], freq) for id, freq in cp] for cp in corpus_notdisc[:1]]

[[('accuse', 2),
  ('allegation', 2),
  ('allege', 2),
  ('complainant', 2),
  ('enter', 1),
  ('feel', 2),
  ('find', 1),
  ('initial', 2),
  ('intake', 2),
  ('none', 1),
  ('officer', 2),
  ('profile', 2),
  ('racially', 2)]]

In [None]:
#you need to provide the number of topics as well.
# Build LDA model
lda_model_disc = gensim.models.ldamodel.LdaModel(corpus=corpus_disc,
                                           id2word=id2word_disc,
                                           num_topics=5, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)


  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

In [None]:
#you need to provide the number of topics as well.
# Build LDA model
lda_model_notdisc = gensim.models.ldamodel.LdaModel(corpus=corpus_notdisc,
                                           id2word=id2word_notdisc,
                                           num_topics=5, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad 

In [None]:
!pip install pandas --upgrade



In [None]:
pyLDAvis.enable_notebook()
vis_disc = gensimvis.prepare(lda_model_disc, corpus_disc, id2word_disc)
vis_disc

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [None]:
pyLDAvis.enable_notebook()
vis_notdisc = gensimvis.prepare(lda_model_notdisc, corpus_notdisc, id2word_notdisc)
vis_notdisc

  by='saliency', ascending=False).head(R).drop('saliency', 1)
