## WORK IN PROGRESS!!!

---



# Applying Bi-LSTM Model to Political Speech - Predicting Sentiment Scores (Sample Data)
### POLI 179 Project - Brenna Farris and Eden Stewart

Further examining this research: https://doi.org/10.1080/10584609.2021.1952497

Data accessed from: https://github.com/ccochrane/emotionTranscripts

In [1]:
# import libraries and packages
import numpy as np
import pandas as pd
import requests
from io import BytesIO
from zipfile import ZipFile
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from keras.models import Sequential
from keras.layers import LSTM, Bidirectional, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# load dataset containing human coder sentiment scores
coding_decisions_df = pd.read_csv(
    'https://raw.githubusercontent.com/bafarris/speech-sentiment-bilstm/main/data/fullCodingData.csv',
    sep=','
)

# examine human coder dataset
print(coding_decisions_df.columns)
print(coding_decisions_df.head())

Index(['Unnamed: 0.1', 'Video', 't1Act1', 't1Act2', 't1ActAvg', 't1Sent1',
       't1Sent2', 't1SentAvg', 't2Act1', 't2Act2', 't2ActAvg', 't2Sent1',
       't2Sent2', 't2SentAvg', 't3Act1', 't3Act2', 't3ActAvg', 't3Sent1',
       't3Sent2', 't3SentAvg', 'Unnamed: 0', 'v1Act1', 'v1Act2', 'v1ActAvg',
       'v1Sent1', 'v1Sent2', 'v1SentAvg', 'v2Act1', 'v2Act2', 'v2ActAvg',
       'v2Sent1', 'v2Sent2', 'v2SentAvg', 'v3Act1', 'v3Act2', 'v3ActAvg',
       'v3Sent1', 'v3Sent2', 'v3SentAvg'],
      dtype='object')
   Unnamed: 0.1         Video  t1Act1  t1Act2  t1ActAvg  t1Sent1  t1Sent2  \
0             0  2017 12 13 0       7       6       6.5        2        3   
1             1  2017 12 13 1       6       6       6.0        4        4   
2             2  2017 12 13 2       6       7       6.5        7        8   
3             3  2017 12 13 3       7       8       7.5        0        0   
4             4  2017 12 13 4       6       7       6.5        7        4   

   t1SentAvg  t2Act1  t2

In [3]:
# select columns related to text sentiment scores
text_sentiment_columns = [col for col in coding_decisions_df.columns if col.startswith('t') and 'Sent' in col]
text_sentiment_df = coding_decisions_df[text_sentiment_columns]

# examine those columns
print(text_sentiment_df.head())

   t1Sent1  t1Sent2  t1SentAvg  t2Sent1  t2Sent2  t2SentAvg  t3Sent1  t3Sent2  \
0        2        3        2.5      1.0        2        1.5      3.0      3.0   
1        4        4        4.0      7.0        5        6.0      4.0      4.0   
2        7        8        7.5      6.0        7        6.5      5.0      6.0   
3        0        0        0.0      1.0        1        1.0      2.0      3.0   
4        7        4        5.5      8.0        3        5.5      6.0      6.0   

   t3SentAvg  
0        3.0  
1        4.0  
2        5.5  
3        2.5  
4        6.0  


In [4]:
# calculate and add average sentiment score for each row and examine
text_sentiment_df['avg_sentiment'] = text_sentiment_df.mean(axis=1)
print(text_sentiment_df[['avg_sentiment']].head())

   avg_sentiment
0       2.333333
1       4.666667
2       6.500000
3       1.166667
4       5.666667


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_sentiment_df['avg_sentiment'] = text_sentiment_df.mean(axis=1)


In [5]:
# download and load text corpus
dropbox_link = "https://www.dropbox.com/s/4xzw3rscu7x7xn3/hansardExtractedSpeechesFull.csv.zip?dl=1"
response = requests.get(dropbox_link)
with ZipFile(BytesIO(response.content)) as zipfile:
    file_name = zipfile.namelist()[0]
    with zipfile.open(file_name) as file:
        corpus_df = pd.read_csv(file, delimiter='\t')

# examine corpus dataframe
print(corpus_df.head())

   Unnamed: 0  parliamentNumber  parliamentSession orderOfBusinessRubric  \
0           0                39                  1                 Other   
1           1                39                  1                 Other   
2           2                39                  1                 Other   
3           3                39                  1                 Other   
4           4                39                  1                 Other   

  subjectOfBusinessTitle  subjectOfBusinessID subjectOfBusinessQualifier  \
0                    NaN            1498168.0                        NaN   
1    Election of Speaker            1498174.0                        NaN   
2    Election of Speaker            1498174.0                        NaN   
3    Election of Speaker            1498174.0                        NaN   
4    Election of Speaker            1498174.0                        NaN   

           speechId  interventionId           date  ... floorLanguage  \
0  2006-4-3-1

In [6]:
# calculate and examine the average length of speeches
average_length = corpus_df['speech'].fillna('').apply(len).mean()
print(f'average speech length: {average_length}')

# print number of rows and columns in the corpus dataframe
print(f"rows: {corpus_df.shape[0]}, columns: {corpus_df.shape[1]}")

average speech length: 1314.084397233906
rows: 350675, columns: 47


In [7]:
# preprocess a 10% subset of the data to manage RAM
sample_size = int(corpus_df.shape[0] * 0.1)
corpus_df_sample = corpus_df.sample(n=sample_size, random_state=88)

In [8]:
# load and define stop words for preprocessing
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """tokenize text, remove stop words and non-alphanumeric characters"""
    words = word_tokenize(text.lower())
    words = [word for word in words if word.isalnum() and word not in stop_words]
    return words

In [9]:
# apply preprocessing to the 'speech' column
corpus_df_sample['tokens'] = corpus_df_sample['speech'].fillna('').apply(preprocess_text)

In [10]:
# prepare sentences for word2vec model training
sentences_sample = corpus_df_sample['tokens'].tolist()

In [11]:
# train word2vec model on the preprocessed subset
word2vec_model_sample = Word2Vec(sentences_sample, vector_size=300, window=6, min_count=10, epochs=5)
word2vec_model_sample.save("word2vec_sample.model")

# print model vocabulary
print(word2vec_model_sample.wv.key_to_index)



In [12]:
# adjust sample size to match both dataframes
sample_size = min(corpus_df.shape[0], text_sentiment_df.shape[0], sample_size)
corpus_df_sample = corpus_df.sample(n=sample_size, random_state=88)
text_sentiment_sample = text_sentiment_df.sample(n=sample_size, random_state=88)

In [13]:
# create target variable from the sampled sentiment scores
y = text_sentiment_sample['avg_sentiment'].values

In [14]:
# replace missing values in 'speech' column with empty strings
corpus_df_sample['speech'] = corpus_df_sample['speech'].fillna('')

In [15]:
# initialize tokenizer
tokenizer = Tokenizer()

# create a token dictionary from the 'speech' column
tokenizer.fit_on_texts(corpus_df_sample['speech'])

# convert texts to sequences of integers
sequences = tokenizer.texts_to_sequences(corpus_df_sample['speech'])

In [16]:
# pad sentences to max length
max_sequence_length = max(len(x) for x in sequences)
X = pad_sequences(sequences, maxlen=max_sequence_length)

In [17]:
# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=88)

In [18]:
# build bi-lstm model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=300, input_length=max_sequence_length))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(1, activation='sigmoid'))

# compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [19]:
# train the model with training data
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7c8f980ca7d0>

In [20]:
# evaluate the model with testing data
loss, accuracy = model.evaluate(X_test, y_test)



In [21]:
# print the accuracy of the model
print(f'model accuracy: {accuracy}')

model accuracy: 0.0
