# Compare Experiments

In [1]:
# load packages
import pickle
import numpy as np
from sklearn.svm import SVC

## Train Classifier

In [2]:
# load BERT embeddings
embedded_tweets = np.load('bert_embeddings.npy')

In [3]:
# instantiate classification algorithm

# round 1 winner
svc = SVC(C = 1, kernel = 'rbf', probability = True)

class1_train_indices = list(range(100))
class0_train_indices = list(range(280,380))

train_X = embedded_tweets[[class1_train_indices + class0_train_indices],:][0]

hundred_ones = [1]*100
hundred_zeros = [0]*100
train_Y = hundred_ones + hundred_zeros

# fit SVC model on training subset of tweet embeddings
svc.fit(train_X, train_Y)

SVC(C=1, probability=True)

## Disaggregate correct/incorrect predictions

In [4]:
# get test set
test_indices = list(range(100,280))
test_X = embedded_tweets[[test_indices],:][0]

# model predictions for test set
y_hat = svc.predict(test_X)

In [5]:
# correct or not
y_bool = y_hat==1

## Load list of explanations

In [6]:
# read in explanations list
with open('bert_explanation_list', 'rb') as f:
    explanations = pickle.load(f)

In [7]:
explanations[0]

[('buy', 0.051218169268151266),
 ('today', 0.04564912805488858),
 ('recovered', 0.03684671343606235),
 ('coronavirus', 0.03167357242077659),
 ('2700', 0.030506518954913325),
 ('stocks', 0.029581742274224374),
 ('Reasons', 0.028743866428244537),
 ('deaths', 0.027860092852483344),
 ('virus', 0.025716989358801717),
 ('markets', 0.022864906695504512),
 ('1500', 0.02231262230616804),
 ('60', 0.021412463259315447),
 ('portfolio', 0.016436537908448373),
 ('back', 0.01619796505973802),
 ('new', 0.013312404914688995),
 ('cases', 0.01311502314673957),
 ('to', -0.01199561987952933)]

In [8]:
y_bool

array([ True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True, False, False,  True,  True,  True, False,  True,  True,
       False,  True, False,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [9]:
y_boolF = y_bool==False
y_boolF

array([False, False, False, False, False, False,  True, False, False,
       False, False, False, False, False, False, False, False,  True,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False,  True, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False,  True, False,
       False,  True,  True, False, False, False,  True, False, False,
        True, False,  True, False, False, False, False, False, False,
        True, False, False, False, False, False, False,  True, False,
       False, False, False, False, False, False, False, False, False,
       False,  True, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [10]:
# separate by correctness of prediction
from itertools import compress
correctPred = list(compress(explanations, y_bool))
wrongPred = list(compress(explanations, y_boolF))

In [11]:
wrongPred[0]

[('meth', 0.030987877447655538),
 ('your', 0.025280368484429324),
 ('for', 0.023607036528363858),
 ('This', 0.021277121458002424),
 ('with', 0.02115291971117502),
 ('coronavirus', 0.020399463946433768),
 ('contaminated', 0.020293806094854105),
 ('Florida', 0.01895198653657757),
 ('police', 0.01784072679125687),
 ('Is', 0.017506689596506714),
 ('will', 0.0156057483026649),
 ('dept', 0.011637419697194519),
 ('test', 0.011128987757850867),
 ('free', 0.005326991149549698),
 ('it', 0.002893041845161037)]

In [12]:
# get unique words from all explanations
with open('tweets_vocab_list', 'rb') as f:
    vocab_list = pickle.load(f)

len(vocab_list)

1659

In [13]:
# unique words from explanations for correctly predicted unreliable tweets
vocab_correct = []
for subList in correctPred:
    for el in subList:
        if el[0] not in vocab_correct:
            vocab_correct.append(el[0])
len(vocab_correct)

1530

In [14]:
# unique words from explanations for incorrectly predicted unreliable tweets
vocab_wrong = []
for subList in wrongPred:
    for el in subList:
        if el[0] not in vocab_wrong:
            vocab_wrong.append(el[0])
len(vocab_wrong)

271

## Experiment 1: Table 1 words

In [15]:
# define dictionary
t1 = [
    'blame',
    'accuse',
    'refuse',
    'catastrophe',
    'chaos',
    'evil',
    'fight',
    'danger',
    'hysteria',
    'panic',
    'paranoia',
    'laugh',
    'stupidity',
    'hear',
    'see',
    'feel',
    'suppose',
    'perceive',
    'look',
    'appear',
    'suggest',
    'believe',
    'pretend',
    'martial',
    'kill',
    'die',
    'weapon',
    'weaponizing',
    'ussr',
    'japan',
    'fukushima',
    'chernobyl',
    'wuhan',
    'china',
    'foreigners',
    'cats',
    'dogs',
    'i',
    'me',
    'mine',
    'my',
    'you',
    'your',
    'we',
    'our',
    'propaganda',
    'fake',
    'conspiracy',
    'claim',
    'misleading',
    'hoax',
    'cure',
    'breakthrough',
    'bitch',
    'wtf',
    'dogbreath',
    'zombie',
    'junkies',
    'hell',
    'screwed',
    'secular',
    'bible',
    'maga',
    'magat',
    'genetic',
    'hillary',
    'chinese',
    'fundamentalist',
    'market',
    'communist',
    'nazi',
    'stock',
    'economy',
    'money',
    'cost',
    'costs',
    'election',
    'campaign',
    'presidential',
    'impeachment',
    'rallies',
    'base',
    'trump',
    'war',
    'iran'
]

### Disaggregated by correctness of prediction

#### Correctly predicted

In [16]:
# get filtered vocab based on analysis words
filtered_vocab = [el for el in vocab_correct if el.lower() in t1]

In [17]:
# make dictionary from filtered vocab
myDict = dict()

for i in range(len(filtered_vocab)):
    myDict[filtered_vocab[i]] = i

In [18]:
# matrix of class associations of filtered vocab
my_matrix = np.zeros((len(correctPred), len(vocab_correct)))

for i in range(len(correctPred)):
    expl = correctPred[i]
    for j in range(len(expl)):
        word = expl[j][0]
        val = expl[j][1]
        
        if word in filtered_vocab:
            if val > 0:
                my_matrix[i, myDict[word]] = 1

            if val < 0:
                my_matrix[i, myDict[word]] = -1

In [19]:
# no penalty
tweet_scores = []
totalNonZero = 0
for row in my_matrix:
    rowSum = 0
    nonZero = 0
    for el in row:
        if el > 0:
            rowSum += el
        if el != 0:
            nonZero += 1
            totalNonZero += 1
    
    if nonZero > 0:
        score = rowSum/nonZero
    else:
        score = 0
    
    tweet_scores.append(score)

In [20]:
totalNonZero

206

In [21]:
np.array(tweet_scores).mean()

0.7320261437908496

In [22]:
# penalty for wrong class association of word
tweet_scores = []
totalNonZero = 0
for row in my_matrix:
    rowSum = 0
    nonZero = 0
    for el in row:
        rowSum += el
        if el != 0:
            nonZero += 1
            totalNonZero += 1
    
    if nonZero > 0:
        score = rowSum/nonZero
    else:
        score = 0
    
    tweet_scores.append(score)

In [23]:
np.array(tweet_scores).mean()

0.7320261437908496

#### Incorrectly predicted

In [24]:
# get filtered vocab based on analysis words
filtered_vocab = [el for el in vocab_wrong if el.lower() in t1]

In [25]:
# make dictionary from filtered vocab
myDict = dict()

for i in range(len(filtered_vocab)):
    myDict[filtered_vocab[i]] = i

In [26]:
# matrix of class associations of filtered vocab
my_matrix = np.zeros((len(wrongPred), len(vocab_wrong)))

for i in range(len(wrongPred)):
    expl = wrongPred[i]
    for j in range(len(expl)):
        word = expl[j][0]
        val = expl[j][1]
        
        if word in filtered_vocab:
            if val > 0:
                my_matrix[i, myDict[word]] = 1

            if val < 0:
                my_matrix[i, myDict[word]] = -1

In [27]:
# no penalty
tweet_scores = []
totalNonZero = 0
for row in my_matrix:
    rowSum = 0
    nonZero = 0
    for el in row:
        if el > 0:
            rowSum += el
        if el != 0:
            nonZero += 1
            totalNonZero += 1
    
    if nonZero > 0:
        score = rowSum/nonZero
    else:
        score = 0
    
    tweet_scores.append(score)

In [28]:
np.array(tweet_scores).mean()

0.6296296296296297

In [29]:
# penalty for wrong class association of word
tweet_scores = []
totalNonZero = 0
for row in my_matrix:
    rowSum = 0
    nonZero = 0
    for el in row:
        rowSum += el
        if el != 0:
            nonZero += 1
            totalNonZero += 1
    
    if nonZero > 0:
        score = rowSum/nonZero
    else:
        score = 0
    
    tweet_scores.append(score)

In [30]:
np.array(tweet_scores).mean()

0.6296296296296297

### Aggregated

In [31]:
# get filtered vocab based on analysis words
filtered_vocab = [el for el in vocab_list if el.lower() in t1]

In [32]:
filtered_vocab

['you',
 'we',
 'Trump',
 'You',
 'your',
 'look',
 'TRUMP',
 'We',
 'My',
 'fight',
 'evil',
 'Iran',
 'chaos',
 'refuse',
 'me',
 'Misleading',
 'junkies',
 'Zombie',
 'Market',
 'market',
 'stock',
 'Wuhan',
 'China',
 'conspiracy',
 'panic',
 'Stupidity',
 'Hell',
 'Paranoia',
 'weaponizing',
 'hoax',
 'campaign',
 'my',
 'I',
 'our',
 'rallies',
 'base',
 'screwed',
 'weapon',
 'Rallies',
 'laugh',
 'cure',
 'DIE',
 'Hoax',
 'economy',
 'Chinese',
 'see',
 'pretend',
 'suggest',
 'cost',
 'Communist',
 'kill',
 'hysteria',
 'Fake',
 'war',
 'impeachment',
 'Conspiracy',
 'Cure',
 'FAKE',
 'fake',
 'ECONOMY',
 'MARKET',
 'STOCK',
 'money',
 'Chernobyl',
 'USSR',
 'Fukushima',
 'Japan',
 'KILL',
 'MY',
 'feel',
 'believe',
 'MAGA',
 'Propaganda']

In [33]:
len(filtered_vocab)

73

In [34]:
# make dictionary from filtered vocab
myDict = dict()

for i in range(len(filtered_vocab)):
    myDict[filtered_vocab[i]] = i

In [35]:
# matrix of class associations of filtered vocab
my_matrix = np.zeros((len(explanations), len(vocab_list)))

for i in range(len(explanations)):
    expl = explanations[i]
    for j in range(len(expl)):
        word = expl[j][0]
        val = expl[j][1]
        
        if word in filtered_vocab:
            if val > 0:
                my_matrix[i, myDict[word]] = 1

            if val < 0:
                my_matrix[i, myDict[word]] = -1

In [36]:
# no penalty
tweet_scores = []
totalNonZero = 0
for row in my_matrix:
    rowSum = 0
    nonZero = 0
    for el in row:
        if el > 0:
            rowSum += el
        if el != 0:
            nonZero += 1
            totalNonZero += 1
    
    if nonZero > 0:
        score = rowSum/nonZero
    else:
        score = 0
    
    tweet_scores.append(score)

In [37]:
totalNonZero

232

In [38]:
np.array(tweet_scores).mean()

0.7166666666666667

In [39]:
# penalty for wrong class association of word
tweet_scores = []
totalNonZero = 0
for row in my_matrix:
    rowSum = 0
    nonZero = 0
    for el in row:
        rowSum += el
        if el != 0:
            nonZero += 1
            totalNonZero += 1
    
    if nonZero > 0:
        score = rowSum/nonZero
    else:
        score = 0
    
    tweet_scores.append(score)

In [40]:
np.array(tweet_scores).mean()

0.7166666666666667

## Experiment 2:  Table 1 + manual additions

In [41]:
# define dictionary
t1PlusManual = [
    'blame',
    'accuse',
    'refuse',
    'catastrophe',
    'emergency',
    'chaos',
    'crisis',
    'evil',
    'fight',
    'danger',
    'hysteria',
    'panic',
    'paranoia',
    'fear',
    'fears',
    'laugh',
    'stupidity',
    'hear',
    'see',
    'feel',
    'suppose',
    'perceive',
    'look',
    'appear',
    'suggest',
    'believe',
    'believed',
    'pretend',
    'martial',
    'kill',
    'killing',
    'kills',
    'killed',
    'die',
    'death',
    'dies',
    'dying',
    'dead',
    'died',
    'threat',
    'weapon',
    'weaponize',
    'weaponizing',
    'knife',
    'ussr',
    'japan',
    'chernobyl',
    'wuhan',
    'china',
    'foreigners',
    'cat',
    'cats',
    'dog',
    'dogs',
    'i',
    'me',
    'mine',
    'my',
    'you',
    'yours',
    'your',
    'we',
    'our',
    'propaganda',
    'fake',
    'conspiracy',
    'claim',
    'claims',
    'claiming',
    'claimed',
    'misleading',
    'hoax',
    'cure',
    'breakthrough',
    'bitch',
    'wtf',
    'dogbreath',
    'zombie',
    'junkies',
    'hell',
    'screwed',
    'fuck',
    'fucking',
    'fucked',
    'fuckin',
    'wth',
    'secular',
    'bible',
    'maga',
    'magat',
    'genetic',
    'hillary',
    'clinton',
    'fundamentalist',
    'market',
    'communist',
    'nazi',
    'stock',
    'bank',
    'economy',
    'economic',
    'money',
    'cost',
    'costs',
    'election',
    'campaign',
    'presidential',
    'impeachment',
    'rally',
    'rallies',
    'base',
    'president',
    'trump',
    'war',
    'wwiii',
    'asteroid',
    'banknotes',
    'dangerous',
    'invent',
    'invented',
    'iran',
    'lie',
    'lies',
    'lying',
    'lied',
    'liar',
    'liars',
    'lmfao',
    'lmfaoooooo',
    'misinformation',
    'news',
    'media',
    'financial',
    'propagandawars',
    'antidote'
]

### Disaggregated by correctness of prediction

#### Correctly predicted

In [42]:
# get filtered vocab based on analysis words
filtered_vocab = [el for el in vocab_correct if el.lower() in t1PlusManual]

In [43]:
# make dictionary from filtered vocab
myDict = dict()

for i in range(len(filtered_vocab)):
    myDict[filtered_vocab[i]] = i

In [44]:
# matrix of class associations of filtered vocab
my_matrix = np.zeros((len(correctPred), len(vocab_correct)))

for i in range(len(correctPred)):
    expl = correctPred[i]
    for j in range(len(expl)):
        word = expl[j][0]
        val = expl[j][1]
        
        if word in filtered_vocab:
            if val > 0:
                my_matrix[i, myDict[word]] = 1

            if val < 0:
                my_matrix[i, myDict[word]] = -1

In [45]:
# no penalty
tweet_scores = []
totalNonZero = 0
for row in my_matrix:
    rowSum = 0
    nonZero = 0
    for el in row:
        if el > 0:
            rowSum += el
        if el != 0:
            nonZero += 1
            totalNonZero += 1
    
    if nonZero > 0:
        score = rowSum/nonZero
    else:
        score = 0
    
    tweet_scores.append(score)

In [46]:
totalNonZero

301

In [47]:
np.array(tweet_scores).mean()

0.8235294117647058

In [48]:
# penalty for wrong class association of word
tweet_scores = []
totalNonZero = 0
for row in my_matrix:
    rowSum = 0
    nonZero = 0
    for el in row:
        rowSum += el
        if el != 0:
            nonZero += 1
            totalNonZero += 1
    
    if nonZero > 0:
        score = rowSum/nonZero
    else:
        score = 0
    
    tweet_scores.append(score)

In [49]:
np.array(tweet_scores).mean()

0.8235294117647058

#### Incorrectly predicted

In [50]:
# get filtered vocab based on analysis words
filtered_vocab = [el for el in vocab_wrong if el.lower() in t1PlusManual]

In [51]:
# make dictionary from filtered vocab
myDict = dict()

for i in range(len(filtered_vocab)):
    myDict[filtered_vocab[i]] = i

In [52]:
# matrix of class associations of filtered vocab
my_matrix = np.zeros((len(wrongPred), len(vocab_wrong)))

for i in range(len(wrongPred)):
    expl = wrongPred[i]
    for j in range(len(expl)):
        word = expl[j][0]
        val = expl[j][1]
        
        if word in filtered_vocab:
            if val > 0:
                my_matrix[i, myDict[word]] = 1

            if val < 0:
                my_matrix[i, myDict[word]] = -1

In [53]:
# no penalty
tweet_scores = []
totalNonZero = 0
for row in my_matrix:
    rowSum = 0
    nonZero = 0
    for el in row:
        if el > 0:
            rowSum += el
        if el != 0:
            nonZero += 1
            totalNonZero += 1
    
    if nonZero > 0:
        score = rowSum/nonZero
    else:
        score = 0
    
    tweet_scores.append(score)

In [54]:
np.array(tweet_scores).mean()

0.7777777777777778

In [55]:
# penalty for wrong class association of word
tweet_scores = []
totalNonZero = 0
for row in my_matrix:
    rowSum = 0
    nonZero = 0
    for el in row:
        rowSum += el
        if el != 0:
            nonZero += 1
            totalNonZero += 1
    
    if nonZero > 0:
        score = rowSum/nonZero
    else:
        score = 0
    
    tweet_scores.append(score)

In [56]:
np.array(tweet_scores).mean()

0.7777777777777778

### Aggregated

In [57]:
# get filtered vocab based on analysis words
filtered_vocab = [el for el in vocab_list if el.lower() in t1PlusManual]

In [58]:
filtered_vocab

['you',
 'we',
 'President',
 'Trump',
 'You',
 'your',
 'look',
 'media',
 'TRUMP',
 'We',
 'WWIII',
 'dying',
 'asteroid',
 'BANKNOTES',
 'dangerous',
 'economic',
 'My',
 'believed',
 'knife',
 'fight',
 'NEWS',
 'EMERGENCY',
 'evil',
 'invented',
 'Iran',
 'chaos',
 'refuse',
 'lying',
 'me',
 'Misleading',
 'junkies',
 'Zombie',
 'Market',
 'market',
 'stock',
 'Wuhan',
 'claims',
 'China',
 'conspiracy',
 'killing',
 'LMFAOOOOOO',
 'lied',
 'threat',
 'crisis',
 'Bank',
 'panic',
 'Stupidity',
 'Hell',
 'Death',
 'died',
 'Claims',
 'Fears',
 'Paranoia',
 'weaponizing',
 'hoax',
 'claiming',
 'Fear',
 'campaign',
 'Kills',
 'my',
 'death',
 'I',
 'fucked',
 'our',
 'rallies',
 'base',
 'screwed',
 'emergency',
 'weapon',
 'Rallies',
 'Clinton',
 'DEAD',
 'laugh',
 'cure',
 'DIE',
 'lie',
 'Hoax',
 'lies',
 'economy',
 'dog',
 'see',
 'news',
 'pretend',
 'misinformation',
 'suggest',
 'cost',
 'Communist',
 'fuckin',
 'kill',
 'fear',
 'financial',
 'fucking',
 'hysteria',
 'kill

In [59]:
len(filtered_vocab)

125

In [60]:
# make dictionary from filtered vocab
myDict = dict()

for i in range(len(filtered_vocab)):
    myDict[filtered_vocab[i]] = i

In [61]:
# matrix of class associations of filtered vocab
my_matrix = np.zeros((len(explanations), len(vocab_list)))

for i in range(len(explanations)):
    expl = explanations[i]
    for j in range(len(expl)):
        word = expl[j][0]
        val = expl[j][1]
        
        if word in filtered_vocab:
            if val > 0:
                my_matrix[i, myDict[word]] = 1

            if val < 0:
                my_matrix[i, myDict[word]] = -1

In [62]:
# no penalty
tweet_scores = []
totalNonZero = 0
for row in my_matrix:
    rowSum = 0
    nonZero = 0
    for el in row:
        if el > 0:
            rowSum += el
        if el != 0:
            nonZero += 1
            totalNonZero += 1
    
    if nonZero > 0:
        score = rowSum/nonZero
    else:
        score = 0
    
    tweet_scores.append(score)

In [63]:
totalNonZero

341

In [64]:
np.array(tweet_scores).mean()

0.8166666666666667

In [65]:
# penalty for wrong class association of word
tweet_scores = []
totalNonZero = 0
for row in my_matrix:
    rowSum = 0
    nonZero = 0
    for el in row:
        rowSum += el
        if el != 0:
            nonZero += 1
            totalNonZero += 1
    
    if nonZero > 0:
        score = rowSum/nonZero
    else:
        score = 0
    
    tweet_scores.append(score)

In [66]:
np.array(tweet_scores).mean()

0.8166666666666667

## Experiment 3: Table 1 words + stemming

In [67]:
from nltk.stem.porter import *

In [68]:
stemmer = PorterStemmer()

In [69]:
# define dictionary
stems = [stemmer.stem(word) for word in t1]
stems = list(set(stems))

In [70]:
stems

['antidot',
 'catastroph',
 'financi',
 'appear',
 'mine',
 'fuck',
 'wwiii',
 'look',
 'wuhan',
 'elect',
 'our',
 'liar',
 'china',
 'conspiraci',
 'bitch',
 'refus',
 'campaign',
 'clinton',
 'suppos',
 'perceiv',
 'fuckin',
 'die',
 'feel',
 'believ',
 'i',
 'fear',
 'hoax',
 'evil',
 'cat',
 'accus',
 'ralli',
 'iran',
 'magat',
 'dog',
 'death',
 'your',
 'threat',
 'propagandawar',
 'ussr',
 'market',
 'my',
 'fake',
 'me',
 'hell',
 'secular',
 'wth',
 'cure',
 'wtf',
 'blame',
 'pretend',
 'nazi',
 'emerg',
 'banknot',
 'lmfao',
 'genet',
 'we',
 'chernobyl',
 'news',
 'econom',
 'impeach',
 'you',
 'misinform',
 'breakthrough',
 'trump',
 'suggest',
 'zombi',
 'chao',
 'presidenti',
 'kill',
 'hysteria',
 'danger',
 'hear',
 'asteroid',
 'base',
 'crisi',
 'see',
 'bank',
 'mislead',
 'fight',
 'war',
 'money',
 'screw',
 'communist',
 'hillari',
 'martial',
 'maga',
 'cost',
 'lie',
 'panic',
 'paranoia',
 'dead',
 'propaganda',
 'junki',
 'economi',
 'laugh',
 'presid',
 'm

### Disaggregated by correctness of prediction

#### Correctly predicted

In [71]:
# get filtered vocab based on analysis words
filtered_vocab = [el for el in vocab_correct if stemmer.stem(el).lower() in stems]

In [72]:
# make dictionary from filtered vocab
myDict = dict()

for i in range(len(filtered_vocab)):
    myDict[filtered_vocab[i]] = i

In [73]:
# matrix of class associations of filtered vocab
my_matrix = np.zeros((len(correctPred), len(vocab_correct)))

for i in range(len(correctPred)):
    expl = correctPred[i]
    for j in range(len(expl)):
        word = expl[j][0]
        val = expl[j][1]
        
        if word in filtered_vocab:
            if val > 0:
                my_matrix[i, myDict[word]] = 1

            if val < 0:
                my_matrix[i, myDict[word]] = -1

In [74]:
# no penalty
tweet_scores = []
totalNonZero = 0
for row in my_matrix:
    rowSum = 0
    nonZero = 0
    for el in row:
        if el > 0:
            rowSum += el
        if el != 0:
            nonZero += 1
            totalNonZero += 1
    
    if nonZero > 0:
        score = rowSum/nonZero
    else:
        score = 0
    
    tweet_scores.append(score)

In [75]:
totalNonZero

326

In [76]:
np.array(tweet_scores).mean()

0.8562091503267973

In [77]:
# penalty for wrong class association of word
tweet_scores = []
totalNonZero = 0
for row in my_matrix:
    rowSum = 0
    nonZero = 0
    for el in row:
        rowSum += el
        if el != 0:
            nonZero += 1
            totalNonZero += 1
    
    if nonZero > 0:
        score = rowSum/nonZero
    else:
        score = 0
    
    tweet_scores.append(score)

In [78]:
np.array(tweet_scores).mean()

0.8562091503267973

#### Incorrectly predicted

In [79]:
# get filtered vocab based on analysis words
filtered_vocab = [el for el in vocab_wrong if stemmer.stem(el).lower() in stems]

In [80]:
# make dictionary from filtered vocab
myDict = dict()

for i in range(len(filtered_vocab)):
    myDict[filtered_vocab[i]] = i

In [81]:
# matrix of class associations of filtered vocab
my_matrix = np.zeros((len(wrongPred), len(vocab_wrong)))

for i in range(len(wrongPred)):
    expl = wrongPred[i]
    for j in range(len(expl)):
        word = expl[j][0]
        val = expl[j][1]
        
        if word in filtered_vocab:
            if val > 0:
                my_matrix[i, myDict[word]] = 1

            if val < 0:
                my_matrix[i, myDict[word]] = -1

In [82]:
# no penalty
tweet_scores = []
totalNonZero = 0
for row in my_matrix:
    rowSum = 0
    nonZero = 0
    for el in row:
        if el > 0:
            rowSum += el
        if el != 0:
            nonZero += 1
            totalNonZero += 1
    
    if nonZero > 0:
        score = rowSum/nonZero
    else:
        score = 0
    
    tweet_scores.append(score)

In [83]:
np.array(tweet_scores).mean()

0.8148148148148148

In [84]:
# penalty for wrong class association of word
tweet_scores = []
totalNonZero = 0
for row in my_matrix:
    rowSum = 0
    nonZero = 0
    for el in row:
        rowSum += el
        if el != 0:
            nonZero += 1
            totalNonZero += 1
    
    if nonZero > 0:
        score = rowSum/nonZero
    else:
        score = 0
    
    tweet_scores.append(score)

In [85]:
np.array(tweet_scores).mean()

0.8148148148148148

### Aggregated

In [86]:
# get filtered vocab based on analysis words
filtered_vocab = [el for el in vocab_list if stemmer.stem(el).lower() in stems]

In [87]:
filtered_vocab

['stocks',
 'deaths',
 'markets',
 'you',
 'we',
 'President',
 'Trump',
 'You',
 'your',
 'look',
 'media',
 'TRUMP',
 'We',
 'WWIII',
 'dying',
 'asteroid',
 'BANKNOTES',
 'dangerous',
 'economic',
 'suggests',
 'My',
 'believed',
 'refused',
 'knife',
 'fight',
 'EMERGENCY',
 'evil',
 'invented',
 'Iran',
 'chaos',
 'refuse',
 'lying',
 'me',
 'Misleading',
 'junkies',
 'Zombie',
 'Fundamentalists',
 'Market',
 'market',
 'stock',
 'Wuhan',
 'claims',
 'China',
 'conspiracy',
 'killing',
 'LMFAOOOOOO',
 'lied',
 'threat',
 'crisis',
 'Bank',
 'panic',
 'Stupidity',
 'Hell',
 'Death',
 'died',
 'Impeach',
 'Claims',
 'Fears',
 'Paranoia',
 'weaponizing',
 'hoax',
 'claiming',
 'Fear',
 'campaign',
 'Kills',
 'my',
 'death',
 'I',
 'fucked',
 'refuses',
 'our',
 'rallies',
 'base',
 'screwed',
 'emergency',
 'weapon',
 'Rallies',
 'Clinton',
 'DEAD',
 'laugh',
 'cure',
 'DIE',
 'lie',
 'Hoax',
 'lies',
 'economy',
 'dog',
 'see',
 'news',
 'pretend',
 'foreign',
 'misinformation',
 's

In [88]:
len(filtered_vocab)

140

In [89]:
# make dictionary from filtered vocab
myDict = dict()

for i in range(len(filtered_vocab)):
    myDict[filtered_vocab[i]] = i

In [90]:
# matrix of class associations of filtered vocab
my_matrix = np.zeros((len(explanations), len(vocab_list)))

for i in range(len(explanations)):
    expl = explanations[i]
    for j in range(len(expl)):
        word = expl[j][0]
        val = expl[j][1]
        
        if word in filtered_vocab:
            if val > 0:
                my_matrix[i, myDict[word]] = 1

            if val < 0:
                my_matrix[i, myDict[word]] = -1

In [91]:
# no penalty
tweet_scores = []
totalNonZero = 0
for row in my_matrix:
    rowSum = 0
    nonZero = 0
    for el in row:
        if el > 0:
            rowSum += el
        if el != 0:
            nonZero += 1
            totalNonZero += 1
    
    if nonZero > 0:
        score = rowSum/nonZero
    else:
        score = 0
    
    tweet_scores.append(score)

In [92]:
totalNonZero

365

In [93]:
np.array(tweet_scores).mean()

0.85

In [94]:
# penalty for wrong class association of word
tweet_scores = []
totalNonZero = 0
for row in my_matrix:
    rowSum = 0
    nonZero = 0
    for el in row:
        rowSum += el
        if el != 0:
            nonZero += 1
            totalNonZero += 1
    
    if nonZero > 0:
        score = rowSum/nonZero
    else:
        score = 0
    
    tweet_scores.append(score)

In [95]:
np.array(tweet_scores).mean()

0.85