Examining data from Cochrane, Christopher, et al. “The Automatic Analysis of Emotion in Political Speech Based on Transcripts.” Political Communication, vol. 39, no. 1, 2022, pp. 98–121, https://doi.org/10.1080/10584609.2021.1952497.

Datasets loaded from here: https://github.com/ccochrane/emotionTranscripts

In [1]:
# libraries
import numpy as np
import scipy
import pandas as pd
import time
import random
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
from keras.preprocessing.sequence import TimeseriesGenerator
from keras.models import Sequential
from keras.layers import LSTM, Bidirectional, Dense, Embedding
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from io import BytesIO
from zipfile import ZipFile
from google.colab import drive

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
# Read the human coder sentiment scores dataset (a comma-separated file) from its raw GitHub URL.
coding_decisions_df = pd.read_csv(
    'https://raw.githubusercontent.com/bafarris/speech-sentiment-bilstm/main/data/fullCodingData.csv',
    sep=',',
)

# Quick look: the column index first, then the leading rows.
print(coding_decisions_df.columns, coding_decisions_df.head(), sep='\n')

Index(['Unnamed: 0.1', 'Video', 't1Act1', 't1Act2', 't1ActAvg', 't1Sent1',
       't1Sent2', 't1SentAvg', 't2Act1', 't2Act2', 't2ActAvg', 't2Sent1',
       't2Sent2', 't2SentAvg', 't3Act1', 't3Act2', 't3ActAvg', 't3Sent1',
       't3Sent2', 't3SentAvg', 'Unnamed: 0', 'v1Act1', 'v1Act2', 'v1ActAvg',
       'v1Sent1', 'v1Sent2', 'v1SentAvg', 'v2Act1', 'v2Act2', 'v2ActAvg',
       'v2Sent1', 'v2Sent2', 'v2SentAvg', 'v3Act1', 'v3Act2', 'v3ActAvg',
       'v3Sent1', 'v3Sent2', 'v3SentAvg'],
      dtype='object')
   Unnamed: 0.1         Video  t1Act1  t1Act2  t1ActAvg  t1Sent1  t1Sent2  \
0             0  2017 12 13 0       7       6       6.5        2        3   
1             1  2017 12 13 1       6       6       6.0        4        4   
2             2  2017 12 13 2       6       7       6.5        7        8   
3             3  2017 12 13 3       7       8       7.5        0        0   
4             4  2017 12 13 4       6       7       6.5        7        4   

   t1SentAvg  t2Act1  t2

In [3]:
# Keep only the text sentiment scores: columns whose name starts with 't' and contains 'Sent'.
text_sentiment_columns = [
    name
    for name in coding_decisions_df.columns
    if name.startswith('t') and 'Sent' in name
]
text_sentiment_df = coding_decisions_df.loc[:, text_sentiment_columns]

# preview the leading rows
print(text_sentiment_df.head())

   t1Sent1  t1Sent2  t1SentAvg  t2Sent1  t2Sent2  t2SentAvg  t3Sent1  t3Sent2  \
0        2        3        2.5      1.0        2        1.5      3.0      3.0   
1        4        4        4.0      7.0        5        6.0      4.0      4.0   
2        7        8        7.5      6.0        7        6.5      5.0      6.0   
3        0        0        0.0      1.0        1        1.0      2.0      3.0   
4        7        4        5.5      8.0        3        5.5      6.0      6.0   

   t3SentAvg  
0        3.0  
1        4.0  
2        5.5  
3        2.5  
4        6.0  


In [4]:
# Load the corpus of speeches into a data frame.

# dropbox shared link to the zipped corpus
dropbox_link = "https://www.dropbox.com/s/4xzw3rscu7x7xn3/hansardExtractedSpeechesFull.csv.zip?dl=1"

# Download the archive. The timeout keeps a stalled connection from hanging the
# cell, and raise_for_status() turns an HTTP error into an immediate, readable
# failure instead of a BadZipFile raised later on an error page.
response = requests.get(dropbox_link, timeout=60)
response.raise_for_status()

# Open the archive in memory and read its first member.
with ZipFile(BytesIO(response.content)) as zipfile:
    file_name = zipfile.namelist()[0]
    # the member is parsed as tab-delimited text
    with zipfile.open(file_name) as file:
        corpus_df = pd.read_csv(file, delimiter='\t')

# examine corpus_df
print(corpus_df.head())

# average length (in characters) of the text in the 'speech' column;
# missing speeches count as empty strings
average_length = corpus_df['speech'].fillna('').apply(len).mean()
print(average_length)

# number of rows and columns
print("Number of rows:", corpus_df.shape[0])
print("Number of columns:", corpus_df.shape[1])

   Unnamed: 0  parliamentNumber  parliamentSession orderOfBusinessRubric  \
0           0                39                  1                 Other   
1           1                39                  1                 Other   
2           2                39                  1                 Other   
3           3                39                  1                 Other   
4           4                39                  1                 Other   

  subjectOfBusinessTitle  subjectOfBusinessID subjectOfBusinessQualifier  \
0                    NaN            1498168.0                        NaN   
1    Election of Speaker            1498174.0                        NaN   
2    Election of Speaker            1498174.0                        NaN   
3    Election of Speaker            1498174.0                        NaN   
4    Election of Speaker            1498174.0                        NaN   

           speechId  interventionId           date  ... floorLanguage  \
0  2006-4-3-1

# Work in progress: the cells below have not yet been run end to end

In [None]:
# pre-process corpus data

# define preprocessing

# Stop-word sets that have already been loaded, keyed by language name.
_STOP_WORD_SETS = {}


def _stop_word_set(language='english'):
    """Return the stop words for `language` as a frozenset, loading them only once."""
    if language not in _STOP_WORD_SETS:
        _STOP_WORD_SETS[language] = frozenset(stopwords.words(language))
    return _STOP_WORD_SETS[language]


def preprocess_text(text):
    """Tokenize one speech and drop stop words and non-alphanumeric tokens.

    The text is lower-cased and tokenized with `word_tokenize`; a token is kept
    only if it is alphanumeric and is not an English stop word.

    Args:
        text: The raw text. Any non-string value (for example None or NaN from
            a missing cell) is treated as an empty text.

    Returns:
        list: The kept tokens, in their original order.
    """
    if not isinstance(text, str):
        return []
    # Look the stop words up once per call (and load them once overall) rather
    # than re-reading the stop-word list for every single token.
    stop_words = _stop_word_set('english')
    return [
        word
        for word in word_tokenize(text.lower())
        if word.isalnum() and word not in stop_words
    ]

# Tokenize every entry of the 'speech' column; missing speeches are treated as empty strings.
corpus_df['tokens'] = (
    corpus_df['speech']
    .fillna('')
    .apply(preprocess_text)
)

# One token list per speech: the sentence input prepared for word2vec.
sentences = corpus_df['tokens'].to_list()

In [None]:
# train word2vec embeddings on the tokenized speeches
word2vec_model = Word2Vec(
    sentences,
    vector_size=300,  # 300 dimensions
    window=6,         # 6-word window
    min_count=10,     # 10 min word count
    epochs=5,         # 5 iterations
)

# save model
word2vec_model.save("word2vec.model")

# check model: report the vocabulary size and a short sample of words instead of
# printing the entire word -> index mapping, which floods the cell output
print("Vocabulary size:", len(word2vec_model.wv.key_to_index))
print("First 20 words:", word2vec_model.wv.index_to_key[:20])

In [None]:
# Sanity check: compare the token count and vocabulary size after pre-processing
# with the figures reported by the other researchers.

# flatten the per-speech token lists into one list
all_tokens = []
for speech_tokens in corpus_df['tokens']:
    all_tokens.extend(speech_tokens)

# total number of tokens
number_of_tokens = len(all_tokens)
print("Number of tokens:", number_of_tokens)

# distinct words in the vocabulary
unique_words = set(all_tokens)
vocabulary_size = len(unique_words)
print("Number of unique words:", vocabulary_size)

After Pre-Processing:

Number of tokens: 37,829,396

Number of unique words: 93,475

Original research:

Number of tokens: 49,713,429

Number of unique words: 40,597

In [None]:
# initialize tokenizer
tokenizer = Tokenizer()

# Join each token list back into one space-separated string. This is done once
# and reused both for fitting the tokenizer and for building the sequences.
corpus_df['sequences'] = corpus_df['tokens'].apply(' '.join)

# Fit the tokenizer incrementally, `batch_size` rows at a time, so that no single
# list covering the whole corpus is handed to fit_on_texts (an attempt to keep
# RAM usage down).
batch_size = 10000
for i in range(0, len(corpus_df), batch_size):
    batch_texts = corpus_df['sequences'][i:i + batch_size].tolist()
    tokenizer.fit_on_texts(batch_texts)

# convert texts to integer sequences
sequences = tokenizer.texts_to_sequences(corpus_df['sequences'].tolist())

# Fixed sequence length shared by the padding step and the embedding layer
# (previously the embedding layer referenced this name without it being defined).
max_seq_length = 100

# pad sequences
X = pad_sequences(sequences, maxlen=max_seq_length)

# NEED TO PUT IN SENTIMENT COLUMNS
# TODO: 'INSERT_SENTIMENT_COLUMNS' is a placeholder, so this lookup fails until a
# real column name is supplied. X is built from corpus_df while y comes from
# text_sentiment_df, so the two presumably still need to be aligned row for row
# before they can be split together.
y = text_sentiment_df['INSERT_SENTIMENT_COLUMNS'].values

# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# define bi-lstm: embedding -> bidirectional LSTM -> single linear output
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_seq_length))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(1, activation='linear'))

# compile model
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])

# generator: feed data to model in batches
def data_generator(X_data, y_data, batch_size):
    """Return an endless iterator over aligned (X, y) batches.

    The data is walked from start to end in slices of `batch_size` items (the
    last slice of a pass may be shorter), and the walk restarts from the
    beginning after every full pass.

    Args:
        X_data: Sliceable sequence of inputs that supports len().
        y_data: Sliceable sequence of targets with the same length as X_data.
        batch_size: Number of items per batch; must be positive.

    Returns:
        An iterator yielding `(X_data[start:end], y_data[start:end])` tuples forever.

    Raises:
        ValueError: If batch_size is not positive, the data is empty, or the
            two sequences differ in length.
    """
    total_samples = len(X_data)
    # Validate up front: with empty data or a negative batch size the endless
    # loop below would spin forever without ever yielding a batch.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    if total_samples == 0:
        raise ValueError("X_data is empty; there is nothing to batch")
    if len(y_data) != total_samples:
        raise ValueError(
            f"X_data and y_data differ in length: {total_samples} vs {len(y_data)}"
        )

    def _batches():
        # Endless cycle over the data, one slice per step.
        while True:
            for start in range(0, total_samples, batch_size):
                end = min(start + batch_size, total_samples)
                yield X_data[start:end], y_data[start:end]

    return _batches()

# mini-batch size shared by the training and validation generators
train_batch_size = 64

train_gen = data_generator(X_train, y_train, train_batch_size)
val_gen = data_generator(X_test, y_test, train_batch_size)

# Ceiling division: the generators also yield the final, shorter batch of each
# pass, so rounding the step count down would skip it, let the generator drift
# out of step with the epochs, and give 0 steps for sets smaller than one batch.
train_steps = -(-len(X_train) // train_batch_size)
val_steps = -(-len(X_test) // train_batch_size)

# train model using generator
model.fit(
    train_gen,
    steps_per_epoch=train_steps,
    epochs=10,
    validation_data=val_gen,
    validation_steps=val_steps,
)

# evaluate: the model is compiled with a mean-squared-error loss and a
# mean-absolute-error metric, so evaluate() reports those two values (this is a
# regression model; there is no accuracy to report)
test_loss, test_mae = model.evaluate(X_test, y_test)
print(f"Test loss (MSE): {test_loss:.4f} | Test MAE: {test_mae:.4f}")