# Death Row Last Statements analysis.

We will go over last statements of executed Texas convicts, and analyse topics and try to infer the sentiment of each text.

In [1]:
%%capture
!pip install contractions
!pip install --upgrade spacy
!python -m spacy download en_core_web_md

In [2]:
from google.colab import drive
import os
import warnings
import requests
from bs4 import BeautifulSoup
from lxml import etree
import pandas as pd

import spacy
import contractions
from gensim.models.phrases import Phraser
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_short, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, stem_text
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

warnings.filterwarnings('ignore')
# This will prompt for authorization.
drive.mount('/content/drive')

# Change to assignment directory
path_to_folder = '/content/drive/My Drive/slef'
os.chdir(path_to_folder)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Scraping and data loading
We will scrape the webpage using BeautifulSoup, and we will compose a pandas Dataframe with last statements of each inmate. The code ignores inmates that did not provide a last statement.

In [None]:

# Read HTML page using requests
url = 'https://www.tdcj.texas.gov/death_row/dr_executed_offenders.html'
html = requests.get(url,verify=False).content

# Parse HTML using Beautiful Soup
soup = BeautifulSoup(html, 'html.parser')
dom =  etree.HTML(str(soup))
last_stmt_link = dom.xpath('//a[contains(text(), "Last Statement")]')

output = []

for elem in last_stmt_link:
    try:
        link = elem.get('href')
        # Read HTML page for current prisoner's last statement
        html = requests.get(f'https://www.tdcj.texas.gov/death_row/{link}',verify=False).content
        soup = BeautifulSoup(html, 'html.parser')
        # Get prisoner's last statement text
        text = soup.select('p:nth-child(11)')[0].get_text().strip()
        # Assign 'None' if prisoner said nothing
        if text != '' and text != 'No last statement given.':
            output.append(text)
    except:
        print(f"page {link} not found")
        text = 'None'

page dr_info/greengarylast.html not found
page dr_info/runnelstravislast.html not found
page dr_info/youngchristopherlast.html not found
page dr_info/bibledannylast.html not found
page /death_row/dr_info/cardenasrubenlast.html not found
page /death_row/dr_info/pruettrobertlast.html not found
page dr_info/vasquezpablolast.html not found


In [None]:
statement_df = pd.DataFrame(output, columns=['Last Statement'])

In [None]:
statement_df.to_csv('data.csv')

Saving to csv and loading.

In [3]:
df = pd.read_csv('data.csv')

In [4]:
df=df[df['Last Statement'] != 'No last statement given.']

In [5]:
df.head()
df.shape

(475, 2)

## Preprocessing

In [6]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk.corpus.reader.wordnet import VERB, NOUN, ADJ, ADV
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
FILTERS=[lambda x: x.lower(), strip_tags, strip_punctuation,
    strip_multiple_whitespaces, strip_numeric,
    remove_stopwords, strip_short]
py_lemmatizer = WordNetLemmatizer()
dict_pos_map = {
    # Look for NN in the POS tag because all nouns begin with NN
    'NN': NOUN,
    # Look for VB in the POS tag because all nouns begin with VB
    'VB':VERB,
    # Look for JJ in the POS tag because all nouns begin with JJ
    'JJ' : ADJ,
    # Look for RB in the POS tag because all nouns begin with RB
    'RB':ADV,
}


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
def remove_names(text):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    clean_text = ""

    # Iterate over each sentence
    for sentence in sentences:
        # Tokenize the sentence into words
        words = word_tokenize(sentence)

        words = [contractions.fix(word) for word in words]

        # Tag the words with their parts of speech
        tagged_words = pos_tag(words)

        # Filter out proper nouns (tags starting with 'NNP')
        filtered_words = [py_lemmatizer.lemmatize(word,dict_pos_map[tag[:2]]) for word, tag in tagged_words if (tag[:2] in dict_pos_map.keys() and tag !='NNP')]

        # Join the words back into a sentence
        cleaned_sentence = " ".join(filtered_words)

        # Append the cleaned sentence to the final text
        clean_text += cleaned_sentence + " "

    clean_text = preprocess_string(clean_text, FILTERS)
    return clean_text

In [8]:
df['Tokenized'] = df['Last Statement'].apply(lambda doc: remove_names(doc)) # Lemmatizes

In [9]:
print("------ Before Text Preprocessing ------\n")
print(df['Last Statement'][9])
print('')
print("------ After Text Preprocessing ------\n")
print(df['Tokenized'][9])

------ Before Text Preprocessing ------

No statement was made.

------ After Text Preprocessing ------

['statement']


## Word2Vec

In [20]:
import gensim.downloader as api
model=api.load("word2vec-google-news-300")
vec_size=300



In [14]:
import multiprocessing
from gensim.models import Word2Vec
vec_size=300
w2v_model = Word2Vec(df['Tokenized'].tolist(),
                     min_count=3,
                     window=4,
                     vector_size=vec_size,
                     sample=1e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     workers=multiprocessing.cpu_count()-1)
model = w2v_model.wv

In [25]:
import numpy as np
words = set(model.index_to_key )
X_vect = np.array([np.array([model[i] for i in ls if i in words]) # Get embeddings for each word
                         for ls in df['Tokenized'].tolist()])
X_vect.shape

(475,)

In [26]:
X_train_vect_avg = []
for v in X_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0)) # Average across each embedding dimension
    else:
        X_train_vect_avg.append(np.zeros(vec_size, dtype=float)) # If size is zero, just append zeroes

In [34]:
from sklearn.cluster import KMeans
km = KMeans(n_clusters=2, max_iter=1000, random_state=True, n_init=50).fit(X=X_train_vect_avg)
positive_cluster_center = km.cluster_centers_[0]
negative_cluster_center = km.cluster_centers_[1]

In [33]:
model.similar_by_vector(km.cluster_centers_[0], topn=10, restrict_vocab=None)

[('love', 0.9849041700363159),
 ('people', 0.9407002329826355),
 ('know', 0.9296285510063171),
 ('life', 0.9207703471183777),
 ('come', 0.9147742390632629),
 ('want', 0.9036694169044495),
 ('forgive', 0.8843216896057129),
 ('family', 0.8750388622283936),
 ('thing', 0.8601243495941162),
 ('continue', 0.8590933680534363)]

In [32]:
model.similar_by_vector(km.cluster_centers_[1], topn=10, restrict_vocab=None) # basura

[('love', 0.9992219805717468),
 ('people', 0.9202014207839966),
 ('know', 0.9010292887687683),
 ('life', 0.8999360799789429),
 ('come', 0.8968741297721863),
 ('want', 0.8849157691001892),
 ('forgive', 0.8675839900970459),
 ('family', 0.8430299162864685),
 ('thing', 0.8412821292877197),
 ('continue', 0.8402407765388489)]

## Topic modeling

In [None]:
## Inutil completamente. Borrar.
corpus=df.Tokenized.values.tolist()
from gensim.corpora import Dictionary #Dictionary creation

D = Dictionary(corpus)
n_tokens = len(D)
no_below = 4
no_above = 0.8

D.filter_extremes(no_below=no_below,no_above=no_above)#Filtering
n_tokens = len(D)

print('The dictionary contains', n_tokens, 'terms')

The dictionary contains 421 terms


In [None]:
questions_bow = [D.doc2bow(doc) for doc in corpus] #BOW

n_question = 10
print(' '.join(corpus[n_question]))

print(questions_bow[n_question])

print(list(map(lambda x: (D[x[0]], x[1]), questions_bow[n_question])))

lay sleep die justice soul wake truly regret kill family thankful thought prayer family day love
[(10, 1), (11, 1), (35, 2), (42, 1), (73, 1), (74, 1), (100, 1), (118, 1), (119, 1), (120, 1), (121, 1), (122, 1), (123, 1), (124, 1)]
[('justice', 1), ('kill', 1), ('family', 2), ('love', 1), ('day', 1), ('die', 1), ('regret', 1), ('lay', 1), ('prayer', 1), ('sleep', 1), ('soul', 1), ('thankful', 1), ('truly', 1), ('wake', 1)]


In [None]:
from gensim.models.ldamodel import LdaModel
num_topics = 20

ldag = LdaModel(corpus=questions_bow, id2word=D, num_topics=num_topics)

