# Compare Experiments

In [41]:
# load packages
import pickle
import numpy as np
from sklearn.svm import SVC

## Train Classifier

In [42]:
# load ICA embeddings
# NOTE(review): presumably a 2-D array with one row per tweet (later cells index it by row) — TODO confirm
embedded_tweets = np.load('tweet_embed_250.npy')

In [43]:
# instantiate classification algorithm

# round 1 winner
svc = SVC(kernel = 'rbf', C = 1, probability = True)

# rows 0-99 serve as the class-1 training examples, rows 280-379 as class-0
class1_train_indices = list(range(100))
class0_train_indices = list(range(280,380))

# integer-list indexing picks the 200 training rows, class 1 first, then class 0
train_indices = class1_train_indices + class0_train_indices
train_X = embedded_tweets[train_indices, :]

# labels in the same order as the rows of train_X
hundred_ones = [1]*100
hundred_zeros = [0]*100
train_Y = hundred_ones + hundred_zeros

# fit SVC model on training subset of tweet embeddings
svc.fit(train_X, train_Y)

SVC(C=1, probability=True)

## Disaggregate correct/incorrect predictions

In [44]:
# get test set
# rows 100-279, i.e. the rows lying between the two training ranges
test_indices = list(range(100,280))
test_X = embedded_tweets[test_indices, :]

# model predictions for test set
y_hat = svc.predict(test_X)

In [45]:
# correct or not
# True where the model predicted class 1.
# NOTE(review): this equals "prediction is correct" only if every test tweet's true label is 1 — TODO confirm
y_bool = np.equal(y_hat, 1)

## Load list of explanations

In [46]:
# read in explanations list
# NOTE(review): pickle.load can execute arbitrary code; only load this file if it comes from a trusted source
with open('explanation_list', 'rb') as f:
    # the sample displayed in the next cell shows one explanation as a list of (word, weight) pairs
    explanations = pickle.load(f)

In [47]:
# peek at the first explanation: a list of (word, weight) pairs
explanations[0]

[('cases', -0.20405363785689484),
 ('new', -0.155064717978664),
 ('deaths', -0.10207417820647408),
 ('coronavirus', -0.06698967143404387),
 ('60', 0.05479825610293619),
 ('buy', 0.037730810152627504),
 ('stocks', 0.030866492206280807),
 ('back', 0.02862044153289094),
 ('portfolio', 0.019228230730050148),
 ('virus', -0.01764508279082087),
 ('Reasons', 0.01762324495375404),
 ('markets', 0.017075460328202865),
 ('1500', 0.015832371859270025),
 ('recovered', 0.015536797813199194),
 ('2700', 0.015119529843808394),
 ('to', -0.002003116629506429),
 ('today', 0.0003091368069083442)]

In [48]:
# show the boolean mask computed from the predictions (True = predicted class 1)
y_bool

array([False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True, False,  True,  True, False,  True,  True,  True,
        True,  True,

In [49]:
# element-wise negation of y_bool: True where the prediction was not class 1
y_boolF = ~y_bool
y_boolF

array([ True, False, False, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False,  True, False, False,  True, False, False, False,
       False, False,

In [50]:
# separate by correctness of prediction
from itertools import compress
# compress keeps the explanations whose mask entry is truthy and stops at the
# shorter input, so explanations is assumed to be aligned index-by-index with
# y_bool — TODO confirm
correctPred, wrongPred = (
    list(compress(explanations, mask)) for mask in (y_bool, y_boolF)
)

In [51]:
# peek at the first explanation among the tweets not predicted as class 1
wrongPred[0]

[('cases', -0.20405363785689484),
 ('new', -0.155064717978664),
 ('deaths', -0.10207417820647408),
 ('coronavirus', -0.06698967143404387),
 ('60', 0.05479825610293619),
 ('buy', 0.037730810152627504),
 ('stocks', 0.030866492206280807),
 ('back', 0.02862044153289094),
 ('portfolio', 0.019228230730050148),
 ('virus', -0.01764508279082087),
 ('Reasons', 0.01762324495375404),
 ('markets', 0.017075460328202865),
 ('1500', 0.015832371859270025),
 ('recovered', 0.015536797813199194),
 ('2700', 0.015119529843808394),
 ('to', -0.002003116629506429),
 ('today', 0.0003091368069083442)]

In [52]:
# get unique words from all explanations
# Order-preserving de-duplication: dict keys keep first-seen order, and the hash
# lookup avoids re-scanning a growing list for every explanation word.
vocab_list = list(dict.fromkeys(el[0] for subList in explanations for el in subList))
len(vocab_list)

1659

In [53]:
# unique words from explanations for correctly predicted unreliable tweets
# Order-preserving de-duplication: dict keys keep first-seen order, and the hash
# lookup avoids re-scanning a growing list for every explanation word.
vocab_correct = list(dict.fromkeys(el[0] for subList in correctPred for el in subList))
len(vocab_correct)

1615

In [54]:
# unique words from explanations for incorrectly predicted unreliable tweets
# Order-preserving de-duplication: dict keys keep first-seen order, and the hash
# lookup avoids re-scanning a growing list for every explanation word.
vocab_wrong = list(dict.fromkeys(el[0] for subList in wrongPred for el in subList))
len(vocab_wrong)

120

## Experiment 1: Table 1 words

In [107]:
# define dictionary
# Lower-case analysis words ("Table 1"). The filtering cells compare the
# lower-cased explanation words against this list.
t1 = [
    'blame', 'accuse', 'refuse', 'catastrophe', 'chaos', 'evil', 'fight', 'danger',
    'hysteria', 'panic', 'paranoia', 'laugh', 'stupidity',
    'hear', 'see', 'feel', 'suppose', 'perceive', 'look', 'appear', 'suggest', 'believe', 'pretend',
    'martial', 'kill', 'die', 'weapon', 'weaponizing',
    'ussr', 'japan', 'fukushima', 'chernobyl', 'wuhan', 'china', 'foreigners',
    'cats', 'dogs',
    'i', 'me', 'mine', 'my', 'you', 'your', 'we', 'our',
    'propaganda', 'fake', 'conspiracy', 'claim', 'misleading', 'hoax', 'cure', 'breakthrough',
    'bitch', 'wtf', 'dogbreath', 'zombie', 'junkies', 'hell', 'screwed',
    'secular', 'bible', 'maga', 'magat', 'genetic', 'hillary', 'chinese', 'fundamentalist',
    'market', 'communist', 'nazi', 'stock', 'economy', 'money', 'cost', 'costs',
    'election', 'campaign', 'presidential', 'impeachment', 'rallies', 'base', 'trump', 'war', 'iran',
]

### Disaggregated by correctness of prediction

#### Correctly predicted

In [108]:
# get filtered vocab based on analysis words
# Keeps the words of vocab_correct (original casing) whose lower-cased form is in t1.
filtered_vocab = [
    token
    for token in vocab_correct
    if token.lower() in t1
]

In [109]:
# make dictionary from filtered vocab
# Maps every filtered-vocabulary word to its position in filtered_vocab; the
# matrix cell uses that position as the word's column index.
myDict = {word: col for col, word in enumerate(filtered_vocab)}

In [110]:
# matrix of class associations of filtered vocab
# One row per explanation in correctPred and one column per word in
# filtered_vocab (column index taken from myDict). An entry is +1 if the word's
# explanation weight is positive, -1 if it is negative, and stays 0 if the word
# is absent or its weight is exactly 0.
# Sized to len(filtered_vocab) because those are the only columns myDict can
# address; a full-vocabulary width would only add always-zero columns.
my_matrix = np.zeros((len(correctPred), len(filtered_vocab)))

for i, expl in enumerate(correctPred):
    for word, val in expl:
        col = myDict.get(word)  # O(1) lookup; None means the word is not in the filtered vocab
        if col is None:
            continue
        if val > 0:
            my_matrix[i, col] = 1
        elif val < 0:
            my_matrix[i, col] = -1

In [111]:
# no penalty
# Score per tweet = sum of the positive entries / number of non-zero entries in
# its row, so negatively associated words only enlarge the denominator.
# Rows without any non-zero entry score 0.
tweet_scores = []
totalNonZero = 0  # non-zero entries summed over all rows
for assoc_row in my_matrix:
    matched = int(np.count_nonzero(assoc_row))
    totalNonZero += matched
    positive_sum = assoc_row[assoc_row > 0].sum()
    tweet_scores.append(positive_sum / matched if matched else 0)

In [112]:
# number of non-zero matrix entries counted by the scoring loop above
totalNonZero

228

In [113]:
# average per-tweet score (tweets without a matched word count as 0)
np.mean(tweet_scores)

0.5341130604288499

In [114]:
# penalty for wrong class association of word
# Score per tweet = sum of all entries in its row / number of non-zero entries,
# so each negatively associated word (-1) cancels a positively associated one (+1).
# Rows without any non-zero entry score 0.
tweet_scores = []
totalNonZero = 0  # non-zero entries summed over all rows
for assoc_row in my_matrix:
    matched = int(np.count_nonzero(assoc_row))
    totalNonZero += matched
    tweet_scores.append(assoc_row.sum() / matched if matched else 0)

In [115]:
# average per-tweet score (tweets without a matched word count as 0)
np.mean(tweet_scores)

0.3313840155945419

#### Incorrectly predicted

In [116]:
# get filtered vocab based on analysis words
# Keeps the words of vocab_wrong (original casing) whose lower-cased form is in t1.
filtered_vocab = [
    token
    for token in vocab_wrong
    if token.lower() in t1
]

In [117]:
# make dictionary from filtered vocab
# Maps every filtered-vocabulary word to its position in filtered_vocab; the
# matrix cell uses that position as the word's column index.
myDict = {word: col for col, word in enumerate(filtered_vocab)}

In [118]:
# matrix of class associations of filtered vocab
# One row per explanation in wrongPred and one column per word in
# filtered_vocab (column index taken from myDict). An entry is +1 if the word's
# explanation weight is positive, -1 if it is negative, and stays 0 if the word
# is absent or its weight is exactly 0.
# Sized to len(filtered_vocab) because those are the only columns myDict can
# address; a full-vocabulary width would only add always-zero columns.
my_matrix = np.zeros((len(wrongPred), len(filtered_vocab)))

for i, expl in enumerate(wrongPred):
    for word, val in expl:
        col = myDict.get(word)  # O(1) lookup; None means the word is not in the filtered vocab
        if col is None:
            continue
        if val > 0:
            my_matrix[i, col] = 1
        elif val < 0:
            my_matrix[i, col] = -1

In [119]:
# no penalty
# Score per tweet = sum of the positive entries / number of non-zero entries in
# its row, so negatively associated words only enlarge the denominator.
# Rows without any non-zero entry score 0.
tweet_scores = []
totalNonZero = 0  # non-zero entries summed over all rows
for assoc_row in my_matrix:
    matched = int(np.count_nonzero(assoc_row))
    totalNonZero += matched
    positive_sum = assoc_row[assoc_row > 0].sum()
    tweet_scores.append(positive_sum / matched if matched else 0)

In [120]:
# average per-tweet score (tweets without a matched word count as 0)
np.mean(tweet_scores)

0.2777777777777778

In [121]:
# penalty for wrong class association of word
# Score per tweet = sum of all entries in its row / number of non-zero entries,
# so each negatively associated word (-1) cancels a positively associated one (+1).
# Rows without any non-zero entry score 0.
tweet_scores = []
totalNonZero = 0  # non-zero entries summed over all rows
for assoc_row in my_matrix:
    matched = int(np.count_nonzero(assoc_row))
    totalNonZero += matched
    tweet_scores.append(assoc_row.sum() / matched if matched else 0)

In [122]:
# average per-tweet score (tweets without a matched word count as 0)
np.mean(tweet_scores)

0.2222222222222222

### Aggregated

In [123]:
# get filtered vocab based on analysis words
# Keeps the words of vocab_list (original casing) whose lower-cased form is in t1.
filtered_vocab = [
    token
    for token in vocab_list
    if token.lower() in t1
]

In [124]:
# make dictionary from filtered vocab
# Maps every filtered-vocabulary word to its position in filtered_vocab; the
# matrix cell uses that position as the word's column index.
myDict = {word: col for col, word in enumerate(filtered_vocab)}

In [125]:
# matrix of class associations of filtered vocab
# One row per explanation in explanations and one column per word in
# filtered_vocab (column index taken from myDict). An entry is +1 if the word's
# explanation weight is positive, -1 if it is negative, and stays 0 if the word
# is absent or its weight is exactly 0.
# Sized to len(filtered_vocab) because those are the only columns myDict can
# address; a full-vocabulary width would only add always-zero columns.
my_matrix = np.zeros((len(explanations), len(filtered_vocab)))

for i, expl in enumerate(explanations):
    for word, val in expl:
        col = myDict.get(word)  # O(1) lookup; None means the word is not in the filtered vocab
        if col is None:
            continue
        if val > 0:
            my_matrix[i, col] = 1
        elif val < 0:
            my_matrix[i, col] = -1

In [126]:
# no penalty
# Score per tweet = sum of the positive entries / number of non-zero entries in
# its row, so negatively associated words only enlarge the denominator.
# Rows without any non-zero entry score 0.
tweet_scores = []
totalNonZero = 0  # non-zero entries summed over all rows
for assoc_row in my_matrix:
    matched = int(np.count_nonzero(assoc_row))
    totalNonZero += matched
    positive_sum = assoc_row[assoc_row > 0].sum()
    tweet_scores.append(positive_sum / matched if matched else 0)

In [127]:
# number of non-zero matrix entries counted by the scoring loop above
totalNonZero

232

In [128]:
# average per-tweet score (tweets without a matched word count as 0)
np.mean(tweet_scores)

0.5212962962962963

In [129]:
# penalty for wrong class association of word
# Score per tweet = sum of all entries in its row / number of non-zero entries,
# so each negatively associated word (-1) cancels a positively associated one (+1).
# Rows without any non-zero entry score 0.
tweet_scores = []
totalNonZero = 0  # non-zero entries summed over all rows
for assoc_row in my_matrix:
    matched = int(np.count_nonzero(assoc_row))
    totalNonZero += matched
    tweet_scores.append(assoc_row.sum() / matched if matched else 0)

In [130]:
# average per-tweet score (tweets without a matched word count as 0)
np.mean(tweet_scores)

0.3259259259259259

## Experiment 2: Table 1 words + manual additions

In [131]:
# define dictionary
# Table 1 words plus manually added variants and related terms, all lower-case.
# The filtering cells compare the lower-cased explanation words against this list.
# 'fukushima' and 'chinese' are part of the Table 1 list (t1) and are included
# here so that this dictionary is a true superset of Table 1.
t1PlusManual = [
    'blame', 'accuse', 'refuse', 'catastrophe', 'emergency', 'chaos', 'crisis', 'evil',
    'fight', 'danger', 'hysteria', 'panic', 'paranoia', 'fear', 'fears', 'laugh',
    'stupidity', 'hear', 'see', 'feel', 'suppose', 'perceive', 'look', 'appear',
    'suggest', 'believe', 'believed', 'pretend', 'martial',
    'kill', 'killing', 'kills', 'killed', 'die', 'death', 'dies', 'dying', 'dead', 'died',
    'threat', 'weapon', 'weaponize', 'weaponizing', 'knife',
    'ussr', 'japan', 'fukushima', 'chernobyl', 'wuhan', 'china', 'foreigners',
    'cat', 'cats', 'dog', 'dogs',
    'i', 'me', 'mine', 'my', 'you', 'yours', 'your', 'we', 'our',
    'propaganda', 'fake', 'conspiracy', 'claim', 'claims', 'claiming', 'claimed',
    'misleading', 'hoax', 'cure', 'breakthrough',
    'bitch', 'wtf', 'dogbreath', 'zombie', 'junkies', 'hell', 'screwed',
    'fuck', 'fucking', 'fucked', 'fuckin', 'wth',
    'secular', 'bible', 'maga', 'magat', 'genetic', 'hillary', 'clinton', 'chinese', 'fundamentalist',
    'market', 'communist', 'nazi', 'stock', 'bank', 'economy', 'economic', 'money', 'cost', 'costs',
    'election', 'campaign', 'presidential', 'impeachment', 'rally', 'rallies', 'base', 'president', 'trump',
    'war', 'wwiii', 'asteroid', 'banknotes', 'dangerous', 'invent', 'invented', 'iran',
    'lie', 'lies', 'lying', 'lied', 'liar', 'liars', 'lmfao', 'lmfaoooooo',
    'misinformation', 'news', 'media', 'financial', 'propagandawars', 'antidote',
]

### Disaggregated by correctness of prediction

#### Correctly predicted

In [132]:
# get filtered vocab based on analysis words
# Keeps the words of vocab_correct (original casing) whose lower-cased form is in t1PlusManual.
filtered_vocab = [
    token
    for token in vocab_correct
    if token.lower() in t1PlusManual
]

In [133]:
# make dictionary from filtered vocab
# Maps every filtered-vocabulary word to its position in filtered_vocab; the
# matrix cell uses that position as the word's column index.
myDict = {word: col for col, word in enumerate(filtered_vocab)}

In [134]:
# matrix of class associations of filtered vocab
# One row per explanation in correctPred and one column per word in
# filtered_vocab (column index taken from myDict). An entry is +1 if the word's
# explanation weight is positive, -1 if it is negative, and stays 0 if the word
# is absent or its weight is exactly 0.
# Sized to len(filtered_vocab) because those are the only columns myDict can
# address; a full-vocabulary width would only add always-zero columns.
my_matrix = np.zeros((len(correctPred), len(filtered_vocab)))

for i, expl in enumerate(correctPred):
    for word, val in expl:
        col = myDict.get(word)  # O(1) lookup; None means the word is not in the filtered vocab
        if col is None:
            continue
        if val > 0:
            my_matrix[i, col] = 1
        elif val < 0:
            my_matrix[i, col] = -1

In [135]:
# no penalty
# Score per tweet = sum of the positive entries / number of non-zero entries in
# its row, so negatively associated words only enlarge the denominator.
# Rows without any non-zero entry score 0.
tweet_scores = []
totalNonZero = 0  # non-zero entries summed over all rows
for assoc_row in my_matrix:
    matched = int(np.count_nonzero(assoc_row))
    totalNonZero += matched
    positive_sum = assoc_row[assoc_row > 0].sum()
    tweet_scores.append(positive_sum / matched if matched else 0)

In [136]:
# number of non-zero matrix entries counted by the scoring loop above
totalNonZero

331

In [137]:
# average per-tweet score (tweets without a matched word count as 0)
np.mean(tweet_scores)

0.5933723196881092

In [138]:
# penalty for wrong class association of word
# Score per tweet = sum of all entries in its row / number of non-zero entries,
# so each negatively associated word (-1) cancels a positively associated one (+1).
# Rows without any non-zero entry score 0.
tweet_scores = []
totalNonZero = 0  # non-zero entries summed over all rows
for assoc_row in my_matrix:
    matched = int(np.count_nonzero(assoc_row))
    totalNonZero += matched
    tweet_scores.append(assoc_row.sum() / matched if matched else 0)

In [139]:
# average per-tweet score (tweets without a matched word count as 0)
np.mean(tweet_scores)

0.35633528265107217

#### Incorrectly predicted

In [140]:
# get filtered vocab based on analysis words
# Keeps the words of vocab_wrong (original casing) whose lower-cased form is in t1PlusManual.
filtered_vocab = [
    token
    for token in vocab_wrong
    if token.lower() in t1PlusManual
]

In [141]:
# make dictionary from filtered vocab
# Maps every filtered-vocabulary word to its position in filtered_vocab; the
# matrix cell uses that position as the word's column index.
myDict = {word: col for col, word in enumerate(filtered_vocab)}

In [142]:
# matrix of class associations of filtered vocab
# One row per explanation in wrongPred and one column per word in
# filtered_vocab (column index taken from myDict). An entry is +1 if the word's
# explanation weight is positive, -1 if it is negative, and stays 0 if the word
# is absent or its weight is exactly 0.
# Sized to len(filtered_vocab) because those are the only columns myDict can
# address; a full-vocabulary width would only add always-zero columns.
my_matrix = np.zeros((len(wrongPred), len(filtered_vocab)))

for i, expl in enumerate(wrongPred):
    for word, val in expl:
        col = myDict.get(word)  # O(1) lookup; None means the word is not in the filtered vocab
        if col is None:
            continue
        if val > 0:
            my_matrix[i, col] = 1
        elif val < 0:
            my_matrix[i, col] = -1

In [143]:
# no penalty
# Score per tweet = sum of the positive entries / number of non-zero entries in
# its row, so negatively associated words only enlarge the denominator.
# Rows without any non-zero entry score 0.
tweet_scores = []
totalNonZero = 0  # non-zero entries summed over all rows
for assoc_row in my_matrix:
    matched = int(np.count_nonzero(assoc_row))
    totalNonZero += matched
    positive_sum = assoc_row[assoc_row > 0].sum()
    tweet_scores.append(positive_sum / matched if matched else 0)

In [144]:
# average per-tweet score (tweets without a matched word count as 0)
np.mean(tweet_scores)

0.31481481481481477

In [145]:
# penalty for wrong class association of word
# Score per tweet = sum of all entries in its row / number of non-zero entries,
# so each negatively associated word (-1) cancels a positively associated one (+1).
# Rows without any non-zero entry score 0.
tweet_scores = []
totalNonZero = 0  # non-zero entries summed over all rows
for assoc_row in my_matrix:
    matched = int(np.count_nonzero(assoc_row))
    totalNonZero += matched
    tweet_scores.append(assoc_row.sum() / matched if matched else 0)

In [146]:
# average per-tweet score (tweets without a matched word count as 0)
np.mean(tweet_scores)

0.07407407407407408

### Aggregated

In [147]:
# get filtered vocab based on analysis words
# Keeps the words of vocab_list (original casing) whose lower-cased form is in t1PlusManual.
filtered_vocab = [
    token
    for token in vocab_list
    if token.lower() in t1PlusManual
]

In [148]:
# make dictionary from filtered vocab
# Maps every filtered-vocabulary word to its position in filtered_vocab; the
# matrix cell uses that position as the word's column index.
myDict = {word: col for col, word in enumerate(filtered_vocab)}

In [149]:
# matrix of class associations of filtered vocab
# One row per explanation in explanations and one column per word in
# filtered_vocab (column index taken from myDict). An entry is +1 if the word's
# explanation weight is positive, -1 if it is negative, and stays 0 if the word
# is absent or its weight is exactly 0.
# Sized to len(filtered_vocab) because those are the only columns myDict can
# address; a full-vocabulary width would only add always-zero columns.
my_matrix = np.zeros((len(explanations), len(filtered_vocab)))

for i, expl in enumerate(explanations):
    for word, val in expl:
        col = myDict.get(word)  # O(1) lookup; None means the word is not in the filtered vocab
        if col is None:
            continue
        if val > 0:
            my_matrix[i, col] = 1
        elif val < 0:
            my_matrix[i, col] = -1

In [150]:
# no penalty
# Score per tweet = sum of the positive entries / number of non-zero entries in
# its row, so negatively associated words only enlarge the denominator.
# Rows without any non-zero entry score 0.
tweet_scores = []
totalNonZero = 0  # non-zero entries summed over all rows
for assoc_row in my_matrix:
    matched = int(np.count_nonzero(assoc_row))
    totalNonZero += matched
    positive_sum = assoc_row[assoc_row > 0].sum()
    tweet_scores.append(positive_sum / matched if matched else 0)

In [151]:
# number of non-zero matrix entries counted by the scoring loop above
totalNonZero

341

In [152]:
# average per-tweet score (tweets without a matched word count as 0)
np.mean(tweet_scores)

0.5794444444444445

In [153]:
# penalty for wrong class association of word
# Score per tweet = sum of all entries in its row / number of non-zero entries,
# so each negatively associated word (-1) cancels a positively associated one (+1).
# Rows without any non-zero entry score 0.
tweet_scores = []
totalNonZero = 0  # non-zero entries summed over all rows
for assoc_row in my_matrix:
    matched = int(np.count_nonzero(assoc_row))
    totalNonZero += matched
    tweet_scores.append(assoc_row.sum() / matched if matched else 0)

In [154]:
# average per-tweet score (tweets without a matched word count as 0)
np.mean(tweet_scores)

0.3422222222222222

## Experiment 3: Table 1 words + stemming

In [334]:
# explicit import instead of a wildcard: PorterStemmer is the only name the
# cells below take from this module
from nltk.stem.porter import PorterStemmer

In [335]:
# Porter stemmer instance shared by the Experiment 3 cells
stemmer = PorterStemmer()

In [336]:
# define dictionary
# Stem every Table 1 word and keep each distinct stem once.
# The list is built from a set, so its order is arbitrary; the cells below only
# use it for membership tests.
stems = list({stemmer.stem(word) for word in t1})

In [337]:
# show the distinct stems derived from the Table 1 words
stems

['fake',
 'feel',
 'base',
 'elect',
 'fundamentalist',
 'me',
 'look',
 'we',
 'economi',
 'conspiraci',
 'impeach',
 'blame',
 'hear',
 'zombi',
 'ralli',
 'dogbreath',
 'foreign',
 'weapon',
 'danger',
 'hoax',
 'hillari',
 'my',
 'genet',
 'secular',
 'wuhan',
 'i',
 'die',
 'screw',
 'appear',
 'stupid',
 'chernobyl',
 'market',
 'cat',
 'wtf',
 'presidenti',
 'suppos',
 'believ',
 'nazi',
 'iran',
 'panic',
 'trump',
 'hysteria',
 'maga',
 'stock',
 'our',
 'bitch',
 'japan',
 'bibl',
 'fukushima',
 'mislead',
 'cost',
 'laugh',
 'chao',
 'breakthrough',
 'accus',
 'pretend',
 'china',
 'see',
 'magat',
 'catastroph',
 'refus',
 'perceiv',
 'junki',
 'kill',
 'mine',
 'war',
 'communist',
 'ussr',
 'evil',
 'propaganda',
 'fight',
 'claim',
 'cure',
 'suggest',
 'dog',
 'chines',
 'paranoia',
 'your',
 'you',
 'campaign',
 'hell',
 'martial',
 'money']

### Disaggregated by correctness of prediction

#### Correctly predicted

In [338]:
# get filtered vocab based on analysis words
# Keeps the words of vocab_correct (original form) whose lower-cased Porter stem is in stems.
filtered_vocab = [
    token
    for token in vocab_correct
    if stemmer.stem(token).lower() in stems
]

In [339]:
# make dictionary from filtered vocab
# Maps every filtered-vocabulary word to its position in filtered_vocab; the
# matrix cell uses that position as the word's column index.
myDict = {word: col for col, word in enumerate(filtered_vocab)}

In [340]:
# matrix of class associations of filtered vocab
# One row per explanation in correctPred and one column per word in
# filtered_vocab (column index taken from myDict). An entry is +1 if the word's
# explanation weight is positive, -1 if it is negative, and stays 0 if the word
# is absent or its weight is exactly 0.
# Sized to len(filtered_vocab) because those are the only columns myDict can
# address; a full-vocabulary width would only add always-zero columns.
my_matrix = np.zeros((len(correctPred), len(filtered_vocab)))

for i, expl in enumerate(correctPred):
    for word, val in expl:
        col = myDict.get(word)  # O(1) lookup; None means the word is not in the filtered vocab
        if col is None:
            continue
        if val > 0:
            my_matrix[i, col] = 1
        elif val < 0:
            my_matrix[i, col] = -1

In [341]:
# no penalty
# Score per tweet = sum of the positive entries / number of non-zero entries in
# its row, so negatively associated words only enlarge the denominator.
# Rows without any non-zero entry score 0.
tweet_scores = []
totalNonZero = 0  # non-zero entries summed over all rows
for assoc_row in my_matrix:
    matched = int(np.count_nonzero(assoc_row))
    totalNonZero += matched
    positive_sum = assoc_row[assoc_row > 0].sum()
    tweet_scores.append(positive_sum / matched if matched else 0)

In [342]:
# number of non-zero matrix entries counted by the scoring loop above
totalNonZero

267

In [343]:
# average per-tweet score (tweets without a matched word count as 0)
np.mean(tweet_scores)

0.5927875243664718

In [344]:
# penalty for wrong class association of word
# Score per tweet = sum of all entries in its row / number of non-zero entries,
# so each negatively associated word (-1) cancels a positively associated one (+1).
# Rows without any non-zero entry score 0.
tweet_scores = []
totalNonZero = 0  # non-zero entries summed over all rows
for assoc_row in my_matrix:
    matched = int(np.count_nonzero(assoc_row))
    totalNonZero += matched
    tweet_scores.append(assoc_row.sum() / matched if matched else 0)

In [345]:
# average per-tweet score (tweets without a matched word count as 0)
np.mean(tweet_scores)

0.3961013645224172

#### Incorrectly predicted

In [346]:
# get filtered vocab based on analysis words
# Keeps the words of vocab_wrong (original form) whose lower-cased Porter stem is in stems.
filtered_vocab = [
    token
    for token in vocab_wrong
    if stemmer.stem(token).lower() in stems
]

In [347]:
# make dictionary from filtered vocab
# Maps every filtered-vocabulary word to its position in filtered_vocab; the
# matrix cell uses that position as the word's column index.
myDict = {word: col for col, word in enumerate(filtered_vocab)}

In [348]:
# matrix of class associations of filtered vocab
# One row per explanation in wrongPred and one column per word in
# filtered_vocab (column index taken from myDict). An entry is +1 if the word's
# explanation weight is positive, -1 if it is negative, and stays 0 if the word
# is absent or its weight is exactly 0.
# Sized to len(filtered_vocab) because those are the only columns myDict can
# address; a full-vocabulary width would only add always-zero columns.
my_matrix = np.zeros((len(wrongPred), len(filtered_vocab)))

for i, expl in enumerate(wrongPred):
    for word, val in expl:
        col = myDict.get(word)  # O(1) lookup; None means the word is not in the filtered vocab
        if col is None:
            continue
        if val > 0:
            my_matrix[i, col] = 1
        elif val < 0:
            my_matrix[i, col] = -1

In [349]:
# no penalty
# Score per tweet = sum of the positive entries / number of non-zero entries in
# its row, so negatively associated words only enlarge the denominator.
# Rows without any non-zero entry score 0.
tweet_scores = []
totalNonZero = 0  # non-zero entries summed over all rows
for assoc_row in my_matrix:
    matched = int(np.count_nonzero(assoc_row))
    totalNonZero += matched
    positive_sum = assoc_row[assoc_row > 0].sum()
    tweet_scores.append(positive_sum / matched if matched else 0)

In [350]:
# average per-tweet score (tweets without a matched word count as 0)
np.mean(tweet_scores)

0.5

In [351]:
# penalty for wrong class association of word
# Score per tweet = sum of all entries in its row / number of non-zero entries,
# so each negatively associated word (-1) cancels a positively associated one (+1).
# Rows without any non-zero entry score 0.
tweet_scores = []
totalNonZero = 0  # non-zero entries summed over all rows
for assoc_row in my_matrix:
    matched = int(np.count_nonzero(assoc_row))
    totalNonZero += matched
    tweet_scores.append(assoc_row.sum() / matched if matched else 0)

In [352]:
# average per-tweet score (tweets without a matched word count as 0)
np.mean(tweet_scores)

0.4444444444444444

### Aggregated

In [353]:
# get filtered vocab based on analysis words
# Keeps the words of vocab_list (original form) whose lower-cased Porter stem is in stems.
filtered_vocab = [
    token
    for token in vocab_list
    if stemmer.stem(token).lower() in stems
]

In [None]:
# show which explanation words matched a Table 1 stem
filtered_vocab

In [354]:
# make dictionary from filtered vocab
# Maps every filtered-vocabulary word to its position in filtered_vocab; the
# matrix cell uses that position as the word's column index.
myDict = {word: col for col, word in enumerate(filtered_vocab)}

In [355]:
# matrix of class associations of filtered vocab
# One row per explanation in explanations and one column per word in
# filtered_vocab (column index taken from myDict). An entry is +1 if the word's
# explanation weight is positive, -1 if it is negative, and stays 0 if the word
# is absent or its weight is exactly 0.
# Sized to len(filtered_vocab) because those are the only columns myDict can
# address; a full-vocabulary width would only add always-zero columns.
my_matrix = np.zeros((len(explanations), len(filtered_vocab)))

for i, expl in enumerate(explanations):
    for word, val in expl:
        col = myDict.get(word)  # O(1) lookup; None means the word is not in the filtered vocab
        if col is None:
            continue
        if val > 0:
            my_matrix[i, col] = 1
        elif val < 0:
            my_matrix[i, col] = -1

In [356]:
# no penalty
# Score per tweet = sum of the positive entries / number of non-zero entries in
# its row, so negatively associated words only enlarge the denominator.
# Rows without any non-zero entry score 0.
tweet_scores = []
totalNonZero = 0  # non-zero entries summed over all rows
for assoc_row in my_matrix:
    matched = int(np.count_nonzero(assoc_row))
    totalNonZero += matched
    positive_sum = assoc_row[assoc_row > 0].sum()
    tweet_scores.append(positive_sum / matched if matched else 0)

In [357]:
# number of non-zero matrix entries counted by the scoring loop above
totalNonZero

276

In [358]:
# average per-tweet score (tweets without a matched word count as 0)
np.mean(tweet_scores)

0.5881481481481482

In [359]:
# penalty for wrong class association of word
# Score per tweet = sum of all entries in its row / number of non-zero entries,
# so each negatively associated word (-1) cancels a positively associated one (+1).
# Rows without any non-zero entry score 0.
tweet_scores = []
totalNonZero = 0  # non-zero entries summed over all rows
for assoc_row in my_matrix:
    matched = int(np.count_nonzero(assoc_row))
    totalNonZero += matched
    tweet_scores.append(assoc_row.sum() / matched if matched else 0)

In [360]:
# average per-tweet score (tweets without a matched word count as 0)
np.mean(tweet_scores)

0.39851851851851844

## Experiment 4: Table 1 words + slang dictionary

In [361]:
# define dictionary
# NOTE(review): this list is still empty, so no explanation word can match it
# and every Experiment 4 score below is 0. The section title suggests that
# Table 1 words plus slang terms were intended — TODO populate and confirm.
slang = list()

### Disaggregated by correctness of prediction

#### Correctly predicted

In [362]:
# get filtered vocab based on analysis words
# Keeps the words of vocab_correct (original casing) whose lower-cased form is in slang.
filtered_vocab = [
    token
    for token in vocab_correct
    if token.lower() in slang
]

In [363]:
# make dictionary from filtered vocab
# Maps every filtered-vocabulary word to its position in filtered_vocab; the
# matrix cell uses that position as the word's column index.
myDict = {word: col for col, word in enumerate(filtered_vocab)}

In [364]:
# matrix of class associations of filtered vocab
# One row per explanation in correctPred and one column per word in
# filtered_vocab (column index taken from myDict). An entry is +1 if the word's
# explanation weight is positive, -1 if it is negative, and stays 0 if the word
# is absent or its weight is exactly 0.
# Sized to len(filtered_vocab) because those are the only columns myDict can
# address; a full-vocabulary width would only add always-zero columns.
my_matrix = np.zeros((len(correctPred), len(filtered_vocab)))

for i, expl in enumerate(correctPred):
    for word, val in expl:
        col = myDict.get(word)  # O(1) lookup; None means the word is not in the filtered vocab
        if col is None:
            continue
        if val > 0:
            my_matrix[i, col] = 1
        elif val < 0:
            my_matrix[i, col] = -1

In [365]:
# no penalty
# Score per tweet = sum of the positive entries / number of non-zero entries in
# its row, so negatively associated words only enlarge the denominator.
# Rows without any non-zero entry score 0.
tweet_scores = []
totalNonZero = 0  # non-zero entries summed over all rows
for assoc_row in my_matrix:
    matched = int(np.count_nonzero(assoc_row))
    totalNonZero += matched
    positive_sum = assoc_row[assoc_row > 0].sum()
    tweet_scores.append(positive_sum / matched if matched else 0)

In [366]:
# number of non-zero matrix entries counted by the scoring loop above
totalNonZero

0

In [367]:
# average per-tweet score (tweets without a matched word count as 0)
np.mean(tweet_scores)

0.0

In [368]:
# penalty for wrong class association of word
# Score per tweet = sum of all entries in its row / number of non-zero entries,
# so each negatively associated word (-1) cancels a positively associated one (+1).
# Rows without any non-zero entry score 0.
tweet_scores = []
totalNonZero = 0  # non-zero entries summed over all rows
for assoc_row in my_matrix:
    matched = int(np.count_nonzero(assoc_row))
    totalNonZero += matched
    tweet_scores.append(assoc_row.sum() / matched if matched else 0)

In [369]:
# average per-tweet score (tweets without a matched word count as 0)
np.mean(tweet_scores)

0.0

#### Incorrectly predicted

In [370]:
# get filtered vocab based on analysis words
# Keeps the words of vocab_wrong (original casing) whose lower-cased form is in slang.
filtered_vocab = [
    token
    for token in vocab_wrong
    if token.lower() in slang
]

In [371]:
# make dictionary from filtered vocab
# Maps every filtered-vocabulary word to its position in filtered_vocab; the
# matrix cell uses that position as the word's column index.
myDict = {word: col for col, word in enumerate(filtered_vocab)}

In [372]:
# matrix of class associations of filtered vocab
# One row per explanation in wrongPred and one column per word in
# filtered_vocab (column index taken from myDict). An entry is +1 if the word's
# explanation weight is positive, -1 if it is negative, and stays 0 if the word
# is absent or its weight is exactly 0.
# Sized to len(filtered_vocab) because those are the only columns myDict can
# address; a full-vocabulary width would only add always-zero columns.
my_matrix = np.zeros((len(wrongPred), len(filtered_vocab)))

for i, expl in enumerate(wrongPred):
    for word, val in expl:
        col = myDict.get(word)  # O(1) lookup; None means the word is not in the filtered vocab
        if col is None:
            continue
        if val > 0:
            my_matrix[i, col] = 1
        elif val < 0:
            my_matrix[i, col] = -1

In [373]:
# no penalty
# Score per tweet = sum of the positive entries / number of non-zero entries in
# its row, so negatively associated words only enlarge the denominator.
# Rows without any non-zero entry score 0.
tweet_scores = []
totalNonZero = 0  # non-zero entries summed over all rows
for assoc_row in my_matrix:
    matched = int(np.count_nonzero(assoc_row))
    totalNonZero += matched
    positive_sum = assoc_row[assoc_row > 0].sum()
    tweet_scores.append(positive_sum / matched if matched else 0)

In [374]:
# average per-tweet score (tweets without a matched word count as 0)
np.mean(tweet_scores)

0.0

In [375]:
# penalty for wrong class association of word
# Score per tweet = sum of all entries in its row / number of non-zero entries,
# so each negatively associated word (-1) cancels a positively associated one (+1).
# Rows without any non-zero entry score 0.
tweet_scores = []
totalNonZero = 0  # non-zero entries summed over all rows
for assoc_row in my_matrix:
    matched = int(np.count_nonzero(assoc_row))
    totalNonZero += matched
    tweet_scores.append(assoc_row.sum() / matched if matched else 0)

In [376]:
# average per-tweet score (tweets without a matched word count as 0)
np.mean(tweet_scores)

0.0

### Aggregated

In [377]:
# get filtered vocab based on analysis words
# Keeps the words of vocab_list (original casing) whose lower-cased form is in slang.
filtered_vocab = [
    token
    for token in vocab_list
    if token.lower() in slang
]

In [378]:
# make dictionary from filtered vocab
# Maps every filtered-vocabulary word to its position in filtered_vocab; the
# matrix cell uses that position as the word's column index.
myDict = {word: col for col, word in enumerate(filtered_vocab)}

In [379]:
# matrix of class associations of filtered vocab
# One row per explanation in explanations and one column per word in
# filtered_vocab (column index taken from myDict). An entry is +1 if the word's
# explanation weight is positive, -1 if it is negative, and stays 0 if the word
# is absent or its weight is exactly 0.
# Sized to len(filtered_vocab) because those are the only columns myDict can
# address; a full-vocabulary width would only add always-zero columns.
my_matrix = np.zeros((len(explanations), len(filtered_vocab)))

for i, expl in enumerate(explanations):
    for word, val in expl:
        col = myDict.get(word)  # O(1) lookup; None means the word is not in the filtered vocab
        if col is None:
            continue
        if val > 0:
            my_matrix[i, col] = 1
        elif val < 0:
            my_matrix[i, col] = -1

In [380]:
# no penalty
# Score per tweet = sum of the positive entries / number of non-zero entries in
# its row, so negatively associated words only enlarge the denominator.
# Rows without any non-zero entry score 0.
tweet_scores = []
totalNonZero = 0  # non-zero entries summed over all rows
for assoc_row in my_matrix:
    matched = int(np.count_nonzero(assoc_row))
    totalNonZero += matched
    positive_sum = assoc_row[assoc_row > 0].sum()
    tweet_scores.append(positive_sum / matched if matched else 0)

In [381]:
# number of non-zero matrix entries counted by the scoring loop above
totalNonZero

0

In [382]:
# average per-tweet score (tweets without a matched word count as 0)
np.mean(tweet_scores)

0.0

In [383]:
# penalty for wrong class association of word
# Score per tweet = sum of all entries in its row / number of non-zero entries,
# so each negatively associated word (-1) cancels a positively associated one (+1).
# Rows without any non-zero entry score 0.
tweet_scores = []
totalNonZero = 0  # non-zero entries summed over all rows
for assoc_row in my_matrix:
    matched = int(np.count_nonzero(assoc_row))
    totalNonZero += matched
    tweet_scores.append(assoc_row.sum() / matched if matched else 0)

In [384]:
# average per-tweet score (tweets without a matched word count as 0)
np.mean(tweet_scores)

0.0