# Setup

In [1]:
# !pip install --upgrade gensim
from gensim import corpora, models, similarities, matutils
# sklearn
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
from sklearn.decomposition import LatentDirichletAllocation

In [3]:
import pandas as pd
import numpy as np

In [4]:
#!pip install mglearn

In [151]:
import mglearn
from collections import Counter

# Clean Data

In [6]:
df = pd.read_csv("data/Death_Row_Data.csv", encoding = "latin1")

In [7]:
words = df['Last Statement']
print("Number of executions: {}".format(len(words)))

Number of executions: 549


In [8]:
words = words.dropna()
print("Number of statements: {}".format(len(words)))

Number of statements: 447


## Extra symbols

Webscraping gave me none english characters. clean up text

In [9]:
words = [doc.replace("\x99","") for doc in words]

In [10]:
words = [doc.replace("\x98","") for doc in words]

In [11]:
words = [doc.replace("\x93","") for doc in words]

In [12]:
words = [doc.replace("\x80","") for doc in words]

In [13]:
words = [doc.replace("\x9d","") for doc in words]

In [14]:
words = [doc.replace("\x9c","") for doc in words]

In [15]:
words = [doc.replace("â","") for doc in words]

In [16]:
words = [doc.replace("\'","") for doc in words]

In [17]:
words = [doc.replace("\n","") for doc in words]

In [18]:
words = [doc.replace("\r","") for doc in words]

## Drop "No statement"

In [20]:
words = remove_all_values(words,'None')

In [154]:
words = remove_all_values(words,'None.')

In [156]:
words = remove_all_values(words,'No')

In [21]:
words = remove_all_values(words,'This offender declined to make a last statement.  ')

In [22]:
words = remove_all_values(words,'This offender declined to make a last statement.')

In [126]:
words = remove_all_values(words,'No, I have no final statement. ')

In [131]:
words = remove_all_values(words,'No last statement.')

In [158]:
print("Number of statements: {}".format(len(words)))

Number of statements: 436


## Length of statements

In [170]:
#checking length of statements

lengths = []
for doc in words:
    lengths.append(len(doc.split()))
    if len(doc.split())<=4:
        print(doc)
lengths.sort()
Counter(lengths)

I love you Israel. 
 Bye, Im Ready. 
 Profanity directed toward staff.  
 Santajaib Singh Ji.  
Goodbye.
Yes, I do.
Ill see you.
Peace.
High Flight (aviation poem)
Thanked his family.
Im ready, Warden.


Counter({1: 2,
         3: 6,
         4: 3,
         5: 3,
         6: 2,
         7: 4,
         8: 5,
         9: 2,
         10: 3,
         11: 4,
         12: 3,
         13: 4,
         14: 5,
         15: 2,
         16: 4,
         17: 8,
         18: 3,
         19: 4,
         20: 6,
         21: 3,
         22: 4,
         23: 2,
         24: 6,
         25: 6,
         26: 1,
         27: 5,
         28: 6,
         29: 4,
         30: 2,
         31: 3,
         32: 8,
         33: 4,
         34: 2,
         35: 7,
         36: 2,
         37: 1,
         38: 1,
         39: 2,
         40: 4,
         42: 3,
         43: 4,
         44: 2,
         45: 7,
         46: 3,
         47: 3,
         48: 4,
         49: 2,
         50: 2,
         51: 2,
         52: 2,
         53: 2,
         54: 1,
         55: 1,
         56: 5,
         57: 2,
         58: 3,
         59: 1,
         60: 1,
         61: 2,
         62: 3,
         63: 2,
         64: 2,
         65: 4,


In [172]:
# create list of shorter than X words statements

words_short = list(filter(lambda x: len(x.split()) < 240, words))
print("Number of statements: {}".format(len(words_short)))

Number of statements: 405


In [173]:
words = words_short

# Parse words

In [174]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [175]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [176]:
# Create a TfidfVectorizer for parsing/counting words
tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=3, stop_words=ENGLISH_STOP_WORDS)

In [177]:
x = tfidf.fit_transform(words)

In [178]:
feature_names = tfidf.get_feature_names()
len(feature_names)

1082

In [None]:
# Create a CountVectorizer for parsing/counting words
vect = CountVectorizer(ngram_range=(1,3), min_df=3, stop_words=ENGLISH_STOP_WORDS,strip_accents='unicode')

In [None]:
x = vect.fit_transform(words)

In [None]:
feature_names = vect.get_feature_names()
len(feature_names)

# LDA

In [179]:
num_topics = 5

In [180]:
lda = LatentDirichletAllocation(n_topics=num_topics, max_iter = 15, random_state=0, n_jobs=-1)

In [181]:
document_topics = lda.fit_transform(x)



In [182]:
print("lda.components_.shape: {}".format(lda.components_.shape))

lda.components_.shape: (5, 1082)


In [183]:
sorting = np.argsort(lda.components_,axis=1)[:,::-1]
feature_names = np.array(tfidf.get_feature_names())
mglearn.tools.print_topics(topics=range(num_topics),feature_names=feature_names,
                           sorting=sorting,topics_per_chunk=6,n_words=14)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
12            love          thy           thou          proceed warden
profanity     family        unto          shall         proceed       
staff         thank         god allah     lots          final         
oh lord       forgive       theres        leadeth       blessing      
oh            sorry         brothers sisterssin           warden jones  
hands         im            return        committed     jones         
amen          know          sisters       guilty        ready         
spirit        say           allah         crime         dont say      
lord          tell          brothers      paths         dont          
goodbye       like          continue      head oil      warden        
love mom      god           strong        enemies       say           
love          want          tell          valley        yall          
mom 

In [184]:
topics = np.array([range(num_topics)])

In [185]:
topic_4 = np.argsort(document_topics[:,2])

for i in topic_4[:5]:
    print(i,'.'.join(words[i].split(".")[:4]))

205  Can you hear me, Chris?  The Lord is my Shepherd; I shall not want.  He makes me to lie down in green pastures; He leads me beside the still waters.  He restores my soul;  He leads me in the paths of righteousness for His names sake.  Yea, though I walk through the valley of the shadow of death, I will fear no evil; for Thou art with me;  Thy rod and Thy staff comfort me
90 Collins family, I know your not going to get the closure you are looking for tonight. I wish you the best. I prayed for yall every day and every night. I have only the warmest wishes
227 The Lord is my Shepherd, I shall not want. He maketh me lie down in green pastures; He leadeth me beside the still waters, He restoreth my soul. He leadeth me in the paths of righteousness for His names sake. Yea, though I walk through the valley of the shadow of death, I will fear no evil; for Thou art with me
240  Yes sir, members of Mrs. Sanchezs family, I dont know who you are and other people present.  As I said, Im taking

In [136]:
topic = np.argsort(document_topics[:,1])

for i in topic[:10]:
    print(i,'  ','.'.join(words[i].split(".")[:4]))

244     Well, I dont have anything to say.  I am just sorry about what I did to Mr. Peters.  Thats all
178     Yes sir, I do.  To the victims family.  I hope it helps a little.  I do not know how, but I hope it helps
68    Youre not about to witness an execution, you are about to witness a murder. I am strapped down for something Marcus Rhodes did. I never killed anybody, ever. I love you, Mom
408    Thanked his family.
147     Yes, for all of those that want this to happen, I hope that you get what you want and it makes you feel better and that it gives you some kind of relief.  I dont know what else to say.  For those that I have hurt, I hope after a while it gets better.  I love you, I love you
334    Keep it brief here. Just want to say, uh, family, take care of yourselves. Uh, look at this as a learning experience. Everything happens for a reason
219    The statement that I would like to make is, none of this should have happened and now that Im dying, there is nothing left to wor