In [1]:
# Load all packages and modules
import os
import glob
import re
from gensim.models import Word2Vec
from gensim.models.phrases import Phraser, Phrases
from gensim.models import FastText
import pandas as pd
import numpy as np



In [2]:
# Set directory to the folder with the Democrat transcripts
os.chdir("/Users/nicholasaoki/Desktop/debate_transcripts/Democrat")

In [3]:
# Empty list where the sentences will be appended
all_sentences = []

# Loop through each file in the directory
for filename in glob.glob('*.txt'):

    # Open each file
    with open(filename, 'r') as file:
        
        # Print the filename to keep track of which file is being read
        print(filename)

        # Read each file
        data = file.read()
        
        # Change the spelling of titles to take out the punctuation marks
        data = data.replace("Mr.", "Mister")
        data = data.replace("Mrs.", "Missus")
        data = data.replace("Ms.", "Miss")

        # Take out the final space of each transcript and make everything lowercase
        data = data[:-1]
        data = data.lower()

        # Save all of the punctuation marks to be removed in a single variable
        punctuation = ';:\,$–'

        # Remove these punctuation marks
        for ele in data:
            if ele in punctuation:
                data = data.replace(ele, "")

        # Split the data by periods, question marks, and exclamation marks
        words = re.split("\?|\.|!", data)

        words_no_initial_space = []

        # Loop through each of the sentences and remove the initial space
        for i in words:

            if i[0] == ' ':
                j = i[1:]
                words_no_initial_space += [j]

            else:
                words_no_initial_space += [i]

        # Split the sentences into words
        for k in words_no_initial_space:

            split_sentence = k.split(' ')
            all_sentences += [split_sentence]

2008_1_Obama_Democrat.txt
2020_2_Harris_Democrat.txt
2020_3_Biden_Democrat.txt
1988_3_Dukakis_Democrat.txt
2004_4_Kerry_Democrat.txt
1992_1_1_Clinton_Democrat.txt
1988_2_Bentsen_Democrat.txt
2016_1_Clinton_Democrat.txt
2004_2_Kerry_Democrat.txt
2008_4_Obama_Democrat.txt
1976_2_Carter_Democrat.txt
2016_4_Clinton_Democrat.txt
1976_3_Carter_Democrat.txt
2008_2_Biden_Democrat.txt
1976_1_Carter_Democrat.txt
2012_3_Obama_Democrat.txt
1984_2_Ferraro_Democrat.txt
1960_4_Kennedy_Democrat.txt
1980_2_Carter_Democrat.txt
1960_1_Kennedy_Democrat.txt
1992_3_1_Clinton_Democrat.txt
2004_1_Kerry_Democrat.txt
1996_3_Clinton_Democrat.txt
1984_3_Mondale_Democrat.txt
2012_1_Obama_Democrat.txt
2000_1_Gore_Democrat.txt
2016_2_Kaine_Democrat.txt
1996_2_Gore_Democrat.txt
1960_2 Kennedy_Democrat.txt
1992_3_2_Clinton_Democrat.txt
2000_2_Lieberman_Democrat.txt
2000_4_Gore_Democrat.txt
1988_1_Dukakis_Democrat.txt
1992_2_Gore_Democrat.txt
2004_3_Edwards_Democrat.txt
1992_1_2_Clinton_Democrat.txt
1960_3_Kennedy_Demo

In [4]:
# Print the first two sentences
all_sentences[0:2]

[['well',
  'thank',
  'you',
  'very',
  'much',
  'jim',
  'and',
  'thanks',
  'to',
  'the',
  'commission',
  'and',
  'the',
  'university',
  'of',
  'mississippi',
  'ole',
  'miss',
  'for',
  'hosting',
  'us',
  'tonight'],
 ['i',
  'can’t',
  'think',
  'of',
  'a',
  'more',
  'important',
  'time',
  'for',
  'us',
  'to',
  'talk',
  'about',
  'the',
  'future',
  'of',
  'the',
  'country']]

In [5]:
# Empty list where the words will be appended
words = []

# Loop through each of the sentences and add the words to the list
for i in all_sentences:
    
    for j in i:
        words += [j]
    
# Print the first few words
print(words[0:10])

['well', 'thank', 'you', 'very', 'much', 'jim', 'and', 'thanks', 'to', 'the']


In [6]:
# Find the number of tokens and types
print(len(words))
print(len(set(words)))

277871
10857


In [7]:
# Run FastText on the Democrat transcripts
democrat_model = FastText(all_sentences)

In [8]:
# Read the csv file containing the Affective Norms for English Words (ANEW) data set
df = pd.read_csv('~/Desktop/anew.csv')
df.head

<bound method NDFrame.head of            term   pleasure    arousal  dominance
0     abduction  31.292517  62.698413  39.569161
1      abortion  39.682540  61.111111  52.040816
2        absurd  48.299320  49.433107  53.628118
3     abundance  74.716553  62.471655  65.759637
4         abuse  20.408163  77.437642  41.836735
...         ...        ...        ...        ...
1029      yacht  78.798186  63.605442  69.160998
1030     yellow  63.605442  50.226757  62.018141
1031      young  78.117914  63.945578  60.090703
1032      youth  76.530612  64.285714  57.936508
1033       zest  76.984127  63.378685  68.027211

[1034 rows x 4 columns]>

In [9]:
# List the top 50 words with the lowest pleasure scores
negative = df.sort_values('pleasure')['term'].head(50)
negative = negative.tolist()
negative

['rape',
 'suicide',
 'funeral',
 'rejected',
 'cancer',
 'murderer',
 'torture',
 'suffocate',
 'unhappy',
 'sad',
 'loneliness',
 'death',
 'slaughter',
 'infection',
 'poverty',
 'betray',
 'syphilis',
 'terrorist',
 'grief',
 'failure',
 'terrified',
 'disaster',
 'rabies',
 'ulcer',
 'tragedy',
 'abuse',
 'mutilate',
 'depressed',
 'slave',
 'pollute',
 'depression',
 'gloom',
 'killer',
 'sick',
 'hurt',
 'nightmare',
 'drown',
 'morgue',
 'misery',
 'terrible',
 'disloyal',
 'dead',
 'distressed',
 'jail',
 'cruel',
 'paralysis',
 'poison',
 'hatred',
 'toothache',
 'bankrupt']

In [10]:
# List the top 50 words with highest pleasure scores
positive = df.sort_values('pleasure', ascending=False)['term'].head(50)
positive = positive.tolist()
positive

['triumphant',
 'paradise',
 'love',
 'loved',
 'joy',
 'miracle',
 'humor',
 'laughter',
 'champion',
 'friendly',
 'sweetheart',
 'affection',
 'mother',
 'excellence',
 'win',
 'comedy',
 'fun',
 'cash',
 'victory',
 'orgasm',
 'romantic',
 'success',
 'pleasure',
 'treasure',
 'free',
 'delight',
 'kiss',
 'joyful',
 'baby',
 'happy',
 'promotion',
 'graduate',
 'lucky',
 'terrific',
 'vacation',
 'god',
 'rainbow',
 'music',
 'valentine',
 'joke',
 'cheer',
 'thrill',
 'sex',
 'passion',
 'millionaire',
 'beach',
 'proud',
 'sexy',
 'rollercoaster',
 'diploma']

In [11]:
# Empty vector where counts will be stored
neg_counts = []

# For each word in the "negative" list, count the number of times that word appeared in the corpus
for neg_word in negative:
    count = words.count(neg_word)
    neg_counts.append(count)

# Print the first few count values
neg_counts[0:5]

[9, 3, 1, 6, 12]

In [12]:
# Empty vector where counts will be stored
pos_counts = []

# For each word in the "positive" list, count the number of times that word appeared in the corpus
for pos_word in positive:
    count = words.count(pos_word)
    pos_counts.append(count)

# Print the first few count values
pos_counts[0:5]

[0, 0, 33, 7, 0]

In [13]:
# Find the most common negative words in the Democrat corpus
neg_df = pd.DataFrame(
    {'word': negative,
     'count': neg_counts})
neg_df.sort_values('count', ascending=False).head(10)

Unnamed: 0,word,count
14,poverty,33
11,death,29
19,failure,25
30,depression,24
17,terrorist,24
41,dead,23
49,bankrupt,21
39,terrible,21
34,hurt,21
33,sick,17


In [14]:
# Find the most common positive words in the Democrat corpus
pos_df = pd.DataFrame(
    {'word': positive,
     'count': pos_counts})
pos_df.sort_values('count', ascending=False).head(10)

Unnamed: 0,word,count
46,proud,72
24,free,66
14,win,36
2,love,33
29,happy,29
35,god,26
12,mother,26
21,success,24
43,passion,11
28,baby,10


In [15]:
# Find the mean cosine similarity between "democrat" and the top 10 negative words in the Democrat corpus
democrat_negative = [democrat_model.wv.similarity('democrat', 'poverty'),
                          democrat_model.wv.similarity('democrat', 'death'),
                          democrat_model.wv.similarity('democrat', 'failure'),
                          democrat_model.wv.similarity('democrat', 'depression'),
                          democrat_model.wv.similarity('democrat', 'terrorist'),
                          democrat_model.wv.similarity('democrat', 'dead'),
                          democrat_model.wv.similarity('democrat', 'bankrupt'),
                          democrat_model.wv.similarity('democrat', 'terrible'),
                          democrat_model.wv.similarity('democrat', 'hurt'),
                          democrat_model.wv.similarity('democrat', 'sick')]
np.mean(democrat_negative)

0.90582407

In [16]:
# Find the mean cosine similarity between "republican" and the top 10 negative words in the Democrat corpus
republican_negative = [democrat_model.wv.similarity('republican', 'poverty'),
                          democrat_model.wv.similarity('republican', 'death'),
                          democrat_model.wv.similarity('republican', 'failure'),
                          democrat_model.wv.similarity('republican', 'depression'),
                          democrat_model.wv.similarity('republican', 'terrorist'),
                          democrat_model.wv.similarity('republican', 'dead'),
                          democrat_model.wv.similarity('republican', 'bankrupt'),
                          democrat_model.wv.similarity('republican', 'terrible'),
                          democrat_model.wv.similarity('republican', 'hurt'),
                          democrat_model.wv.similarity('republican', 'sick')]
np.mean(republican_negative)

0.8484914

In [17]:
# Find the mean cosine similarity between "democrat" and the top 10 positive words in the Democrat corpus
democrat_positive = [democrat_model.wv.similarity('democrat', 'proud'),
                          democrat_model.wv.similarity('democrat', 'free'),
                          democrat_model.wv.similarity('democrat', 'win'),
                          democrat_model.wv.similarity('democrat', 'love'),
                          democrat_model.wv.similarity('democrat', 'happy'),
                          democrat_model.wv.similarity('democrat', 'god'),
                          democrat_model.wv.similarity('democrat', 'mother'),
                          democrat_model.wv.similarity('democrat', 'success'),
                          democrat_model.wv.similarity('democrat', 'passion'),
                          democrat_model.wv.similarity('democrat', 'baby')]
np.mean(democrat_positive)

0.8233612

In [18]:
# Find the mean cosine similarity between "republican" and the top 10 negative words in the Democrat corpus
republican_positive = [democrat_model.wv.similarity('republican', 'proud'),
                          democrat_model.wv.similarity('republican', 'free'),
                          democrat_model.wv.similarity('republican', 'win'),
                          democrat_model.wv.similarity('republican', 'love'),
                          democrat_model.wv.similarity('republican', 'happy'),
                          democrat_model.wv.similarity('republican', 'god'),
                          democrat_model.wv.similarity('republican', 'mother'),
                          democrat_model.wv.similarity('republican', 'success'),
                          democrat_model.wv.similarity('republican', 'passion'),
                          democrat_model.wv.similarity('republican', 'baby')]
np.mean(republican_positive)

0.76201755

The code below repeats the same process above for the "Republican corpus", so no comments are included here.

In [19]:
os.chdir("/Users/nicholasaoki/Desktop/debate_transcripts/Republican")

In [20]:
all_sentences = []

for filename in glob.glob('*.txt'):

    with open(filename, 'r') as file:
        
        print(filename)

        data = file.read()

        data = data.replace("Mr.", "Mister")
        data = data.replace("Mrs.", "Missus")
        data = data.replace("Ms.", "Miss")

        data = data[:-1]

        data = data.lower()

        punctuation = ';:\,$–'

        for ele in data:
            if ele in punctuation:
                data = data.replace(ele, "")

        # split data
        words = re.split("\?|\.|!", data)

        words_no_initial_space = []

        for i in words:

            if i[0] == ' ':
                j = i[1:]
                words_no_initial_space += [j]

            else:
                words_no_initial_space += [i]

        for k in words_no_initial_space:

            split_sentence = k.split(' ')
            all_sentences += [split_sentence]

2008_1_McCain_Republican.txt
2004_1_Bush_Republican.txt
1976_1_Ford_Republican.txt
1984_2_Bush_Republican.txt
1976_3_Ford_Republican.txt
2004_3_Cheney_Republican.txt
1976_2_Ford_Republican.txt
2004_2_Bush_Republican.txt
2004_4_Bush_Republican.txt
2008_3_McCain_Republican.txt
1992_4_Bush_Republican.txt
2012_4_Romney_Republican.txt
2000_3_Bush_Republican.txt
1988_3_Bush_Republican.txt
1960_1_Nixon_Republican.txt
2020_1_Trump_Republican.txt
1960_4_Nixon_Republican.txt
2016_4_Trump_Republican.txt
2012_2_Ryan_Republican.txt
2000_1_Bush_Republican.txt
1988_1_Bush_Republican.txt
2016_1_Trump_Republican.txt
2016_2_Pence_Republican.txt
1988_2_Quayle_Republican.txt
2000_4_Bush_Republican.txt
1980_2_Reagan_Republican.txt
2020_2_Pence_Republican.txt
1992_2_Quayle_Republican.txt
2000_2_Cheney_Republican.txt
1996_2_Kemp_Republican.txt
1960_2_Nixon_Republican.txt
1984_3_Reagan_Republican.txt
1980_1_Reagan_Republican.txt
1992_3_1_Bush_Republican.txt
2008_4_McCain_Republican.txt
1996_3_Dole_Republican.

In [21]:
all_sentences[0:2]

[['well', 'thank', 'you', 'jim'], ['and', 'thanks', 'to', 'everybody']]

In [22]:
words = []

for i in all_sentences:
    
    for j in i:
        words += [j]
        
print(words[0:10])

['well', 'thank', 'you', 'jim', 'and', 'thanks', 'to', 'everybody', 'and', 'i']


In [23]:
print(len(words))
print(len(set(words)))

279728
11228


In [24]:
republican_model = FastText(all_sentences)

In [25]:
neg_counts = []

for neg_word in negative:
    count = words.count(neg_word)
    neg_counts.append(count)

neg_counts[0:5]

[5, 6, 0, 7, 2]

In [26]:
pos_counts = []

for pos_word in positive:
    count = words.count(pos_word)
    pos_counts.append(count)

pos_counts[0:5]

[0, 1, 66, 9, 2]

In [27]:
neg_df = pd.DataFrame(
    {'word': negative,
     'count': neg_counts})
neg_df.sort_values('count', ascending=False).head(10)

Unnamed: 0,word,count
21,disaster,47
17,terrorist,37
14,poverty,34
11,death,28
34,hurt,27
39,terrible,25
24,tragedy,19
43,jail,18
19,failure,15
41,dead,14


In [28]:
pos_df = pd.DataFrame(
    {'word': positive,
     'count': pos_counts})
pos_df.sort_values('count', ascending=False).head(10)

Unnamed: 0,word,count
24,free,140
46,proud,78
2,love,66
14,win,66
12,mother,31
35,god,30
29,happy,29
21,success,20
18,victory,15
28,baby,12


In [29]:
import numpy as np
democrat_negative = [republican_model.wv.similarity('democrat', 'disaster'),
                          republican_model.wv.similarity('democrat', 'terrorist'),
                          republican_model.wv.similarity('democrat', 'poverty'),
                          republican_model.wv.similarity('democrat', 'death'),
                          republican_model.wv.similarity('democrat', 'hurt'),
                          republican_model.wv.similarity('democrat', 'terrible'),
                          republican_model.wv.similarity('democrat', 'tragedy'),
                          republican_model.wv.similarity('democrat', 'jail'),
                          republican_model.wv.similarity('democrat', 'failure'),
                          republican_model.wv.similarity('democrat', 'dead')]
np.mean(democrat_negative)

0.91852844

In [30]:
republican_negative = [republican_model.wv.similarity('republican', 'poverty'),
                          republican_model.wv.similarity('republican', 'death'),
                          republican_model.wv.similarity('republican', 'failure'),
                          republican_model.wv.similarity('republican', 'depression'),
                          republican_model.wv.similarity('republican', 'terrorist'),
                          republican_model.wv.similarity('republican', 'dead'),
                          republican_model.wv.similarity('republican', 'bankrupt'),
                          republican_model.wv.similarity('republican', 'terrible'),
                          republican_model.wv.similarity('republican', 'hurt'),
                          republican_model.wv.similarity('republican', 'sick')]
np.mean(republican_negative)

0.9567429

In [31]:
democrat_positive = [republican_model.wv.similarity('democrat', 'free'),
                          republican_model.wv.similarity('democrat', 'proud'),
                          republican_model.wv.similarity('democrat', 'love'),
                          republican_model.wv.similarity('democrat', 'win'),
                          republican_model.wv.similarity('democrat', 'mother'),
                          republican_model.wv.similarity('democrat', 'god'),
                          republican_model.wv.similarity('democrat', 'happy'),
                          republican_model.wv.similarity('democrat', 'success'),
                          republican_model.wv.similarity('democrat', 'victory'),
                          republican_model.wv.similarity('democrat', 'baby')]
np.mean(democrat_positive)

0.8346173

In [32]:
republican_positive = [republican_model.wv.similarity('republican', 'free'),
                          republican_model.wv.similarity('republican', 'proud'),
                          republican_model.wv.similarity('republican', 'love'),
                          republican_model.wv.similarity('republican', 'win'),
                          republican_model.wv.similarity('republican', 'mother'),
                          republican_model.wv.similarity('republican', 'god'),
                          republican_model.wv.similarity('republican', 'happy'),
                          republican_model.wv.similarity('republican', 'success'),
                          republican_model.wv.similarity('republican', 'victory'),
                          republican_model.wv.similarity('republican', 'baby')]
np.mean(republican_positive)

0.90169716