### WORK IN PROGRESS!!!

# Applying Bi-LSTM Model to Political Speech - Predicting Sentiment Scores (Sample Data)
### POLI 179 Project - Brenna Farris and Eden Stewart

Further examining this research: https://doi.org/10.1080/10584609.2021.1952497

Data accessed from: https://github.com/ccochrane/emotionTranscripts

### Loading Libraries and Dataset(s)

In [10]:
# import libraries and packages
import numpy as np
import pandas as pd
import requests
from io import BytesIO
from zipfile import ZipFile
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from keras.models import Sequential
from keras.layers import LSTM, Bidirectional, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
# load dataset containing human coder sentiment scores and sections of corpus with assigned sentiment scores
full_df = pd.read_csv(
    'https://raw.githubusercontent.com/bafarris/speech-sentiment-bilstm/main/data/w2vScores.csv',
    sep=','
)

# examine human coder dataset
print(full_df.columns)
print(full_df.head())

Index(['Unnamed: 0', 'IDMain', 'countedWords', 'date', 'english', 'floor',
       'french', 'label', 'party', 'seconds', 'sentencePolarity', 'sentiment',
       'speaker', 'timeStamp', 'wordPolaritySummed', 'youTube'],
      dtype='object')
   Unnamed: 0        IDMain  countedWords        date  \
0           0  2017 12 13 0            13  2017-12-13   
1           1  2017 12 13 1             8  2017-12-13   
2           2  2017 12 13 2            16  2017-12-13   
3           3  2017 12 13 3            30  2017-12-13   
4           4  2017 12 13 4             7  2017-12-13   

                                             english  \
0  I thought we usually hired an investigator to ...   
1  I hope they will be getting better than that i...   
2  We told Canadians that we would run deficits, ...   
3  The problem was that they fired 700 people in ...   
4  We did not create the Phoenix problem, but we ...   

                                               floor  french  label party  \
0 

### Pre-Processing

In [12]:
# load and define stop words for preprocessing
stop_words = set(stopwords.words('english'))

# tokenize
def preprocess_text(text):
    words = word_tokenize(text.lower())
    words = [word for word in words if word.isalnum() and word not in stop_words]
    return words

In [13]:
# apply preprocessing to the 'speech' column
full_df['tokens'] = full_df['english'].fillna('').apply(preprocess_text)

In [14]:
# prepare sentences for word2vec model training
sentences = full_df['tokens'].tolist()

In [15]:
# train word2vec model
word2vec_model_sample = Word2Vec(sentences, vector_size=300, window=6, min_count=10, epochs=5)
word2vec_model_sample.save("word2vec_sample.model")

# print model vocabulary
print(word2vec_model_sample.wv.key_to_index)

{'minister': 0, 'speaker': 1, 'government': 2, 'canadians': 3, 'canada': 4, 'prime': 5, 'canadian': 6, 'work': 7, 'would': 8, 'liberals': 9, 'house': 10, 'people': 11, 'jobs': 12, 'new': 13, 'member': 14, 'us': 15, 'tax': 16, 'support': 17, 'members': 18, 'public': 19, 'plan': 20, 'going': 21, 'like': 22, 'get': 23, 'make': 24, 'know': 25, 'liberal': 26, 'one': 27, 'help': 28, 'working': 29, 'families': 30, 'said': 31, 'conservatives': 32, 'communities': 33, 'put': 34, 'national': 35, 'quebec': 36, 'also': 37, 'process': 38, 'country': 39, 'health': 40, 'years': 41, 'finance': 42, 'continue': 43, 'important': 44, 'year': 45, 'bill': 46, 'ensure': 47, 'take': 48, 'every': 49, 'commissioner': 50, 'need': 51, 'money': 52, 'across': 53, 'yesterday': 54, 'million': 55, 'system': 56, 'economy': 57, 'access': 58, 'ndp': 59, 'want': 60, 'last': 61, 'trade': 62, 'question': 63, 'conservative': 64, 'future': 65, 'many': 66, 'budget': 67, 'way': 68, 'women': 69, 'first': 70, 'indigenous': 71, 'ma

### Apply Bi-LSTM Model

In [16]:
# convert tokens to sequences of integers
def tokens_to_sequence(tokens, word2vec_model):
    return [word2vec_model.wv.key_to_index[word] for word in tokens if word in word2vec_model.wv.key_to_index]

full_df['sequence'] = full_df['tokens'].apply(lambda x: tokens_to_sequence(x, word2vec_model_sample))

In [17]:
# create training and test sets

X = pad_sequences(full_df['sequence'], maxlen=100)  # assuming max length of sequences is 100
y = full_df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
# build the Bi-LSTM Model

model = Sequential()
model.add(Embedding(input_dim=len(word2vec_model_sample.wv), output_dim=300, weights=[word2vec_model_sample.wv.vectors], input_length=100, trainable=False))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(1, activation='linear'))  # linear activation for regression task

model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_squared_error'])