# Binary Classification with Tweet Embeddings

In [95]:
# import modules
import numpy as np
import pandas as pd
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.base import TransformerMixin
from sklearn.decomposition import FastICA
from sklearn.preprocessing import StandardScaler
import re
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer

## Load data

In [96]:
xls = pd.ExcelFile('../../../ForLastExpt.xlsx')

In [97]:
class1_train = pd.read_excel(xls, 'Class=1 Training')
class1_tune = pd.read_excel(xls, 'Class=1 Validation')
class1_test_pt1 = pd.read_excel(xls, 'Class=1 Test Part-1')
class1_test_pt2 = pd.read_excel(xls, 'Class=1 Test Part-2')
class0_train = pd.read_excel(xls, 'Class=0 Training')
class0_tune = pd.read_excel(xls, 'Class=0 Validation')
class0_test_pt1 = pd.read_excel(xls, 'Class=0 Test Part-1')
class0_test_pt2 = pd.read_excel(xls, 'Class=0 Test Part-2')

In [98]:
train = pd.concat([class1_train, class0_train], ignore_index = True)
tune = pd.concat([class1_tune, class0_tune], ignore_index = True)
test_pt1 = pd.concat([class1_test_pt1, class0_test_pt1], ignore_index = True)
test_pt2 = pd.concat([class1_test_pt2, class0_test_pt2], ignore_index = True)
test = pd.concat([test_pt1, test_pt2], ignore_index = True)
train_tune = pd.concat([train, tune], ignore_index = True)

In [99]:
class1_train

Unnamed: 0,Is_Unreliable,Category,Tweet
0,1,"1, 3, 6, 9",We are living in scary times in Canada. Gov’t ...
1,1,"1, 6, 8, 9","Just as bad in Canada. In fact, our government..."
2,1,"1, 4, 9",It was only a matter of time before the mainst...
3,1,"6, 8",Russia's taking no chances: Foreigners infecte...
4,1,"6, 8, 9",Although there is now a presumptive confirmed ...
...,...,...,...
95,1,"11, 10",EOIR has ordered immigration court staff to re...
96,1,10,President - who said last night that #coronavi...
97,1,11,"Mashhad, NE #Iran Man collapses possibly due t..."
98,1,"10, 11",Pence caught in lies about insurance covering ...


In [100]:
train

Unnamed: 0,Is_Unreliable,Category,Tweet
0,1,"1, 3, 6, 9",We are living in scary times in Canada. Gov’t ...
1,1,"1, 6, 8, 9","Just as bad in Canada. In fact, our government..."
2,1,"1, 4, 9",It was only a matter of time before the mainst...
3,1,"6, 8",Russia's taking no chances: Foreigners infecte...
4,1,"6, 8, 9",Although there is now a presumptive confirmed ...
...,...,...,...
195,0,,Short term financial relief overseas during Co...
196,0,,Coronavirus: Physician tested positive for COV...
197,0,,Coronavirus: Toronto March break camps prepare...
198,0,,Follow live news coverage from Italy on the CO...


## Derive text vectors from word embeddings

In [101]:
embeddings = pd.read_csv('word_embed_50.csv', index_col = 0)
embeddings

Unnamed: 0,Comp 1,Comp 2,Comp 3,Comp 4,Comp 5,Comp 6,Comp 7,Comp 8,Comp 9,Comp 10,...,Comp 41,Comp 42,Comp 43,Comp 44,Comp 45,Comp 46,Comp 47,Comp 48,Comp 49,Comp 50
!,-0.012094,0.000935,0.000976,-0.009518,-0.083408,0.012114,-0.030388,0.020018,0.015333,-0.003408,...,-0.028262,0.036965,-0.017945,-0.020004,0.008269,-0.069600,-0.055546,-0.028868,0.008643,-0.048048
#,0.030090,-0.011546,-0.027933,0.004141,-0.020924,-0.011667,0.028737,0.025169,0.036350,0.117241,...,-0.025798,-0.027569,0.047431,0.084039,0.044840,-0.010098,-0.050790,0.018669,-0.002559,0.039910
(,-0.017210,0.003451,-0.013769,0.016300,-0.011089,0.059428,0.205738,0.022734,-0.092807,-0.003518,...,0.014060,-0.000427,-0.001940,-0.031249,0.007885,-0.006893,0.013552,-0.103300,0.012017,-0.004168
),-0.015370,0.005453,-0.015889,0.016313,-0.010720,0.060504,0.207101,0.022378,-0.093391,-0.001944,...,0.012235,0.000202,-0.003083,-0.031148,0.006398,-0.006262,0.014583,-0.115351,0.009816,-0.012142
",",0.009391,-0.028855,0.000247,0.023929,-0.018705,-0.001752,-0.002686,0.021136,-0.036904,0.031643,...,0.105091,0.072078,-0.036567,0.075362,0.052916,-0.075958,-0.021310,-0.000011,-0.008623,-0.073998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
‘,-0.091469,-0.038311,-0.017282,0.009905,0.011328,-0.017742,0.040392,-0.018368,0.026076,-0.018067,...,0.004947,-0.021305,-0.020591,0.017108,0.047809,0.010627,0.020588,0.003104,0.014091,-0.015461
’,-0.007426,-0.033304,0.013391,0.049407,-0.040433,-0.105308,-0.007539,-0.007917,0.023227,0.011218,...,-0.004001,0.021299,-0.044304,0.040301,0.057719,-0.007113,-0.048529,-0.034061,-0.135474,-0.025738
“,0.009738,-0.022400,-0.024158,0.014539,-0.011919,0.039461,-0.019486,-0.026052,-0.049054,-0.014180,...,0.012043,-0.008504,0.020894,0.004209,0.007849,0.005359,-0.100265,-0.012615,-0.028282,-0.042011
”,0.005733,-0.021793,-0.021662,0.003647,-0.017282,0.040848,-0.024902,-0.011127,-0.030509,0.000597,...,0.005433,-0.012980,-0.006990,0.009010,-0.000684,0.020456,0.002314,-0.008129,-0.027064,0.014475


In [102]:
embeddings.shape

(2327, 50)

In [103]:
embeddings.index.tolist()

['!',
 '#',
 '(',
 ')',
 ',',
 '-',
 '--',
 '.',
 '..',
 '...',
 '....',
 '1',
 '1,600',
 '1,975',
 '1-',
 '10',
 '10,000',
 '100',
 '1000',
 '10:30',
 '10:30am',
 '10th',
 '11',
 '115',
 '118,000',
 '12',
 '133',
 '14',
 '14:21',
 '1500',
 '16-18',
 '17',
 '170',
 '18',
 '19',
 '1900',
 '19th',
 '1:30',
 '1b',
 '1st',
 '2',
 '2-',
 '2.5',
 '20',
 '20-40s',
 '200',
 '2019',
 '2019-ncovid',
 '2019ncov',
 '2020',
 '2021',
 '21',
 '22',
 '23',
 '24',
 '24thminute',
 '25',
 '26',
 '27',
 '2700',
 '29th',
 '2\uf30e1',
 '3',
 '3,000',
 '3.4',
 '30',
 '30.8',
 '32',
 '3200',
 '36',
 '366',
 '38',
 '39',
 '4',
 '40',
 '40-70',
 '409k',
 '42nd',
 '43',
 '4600',
 '48',
 '5',
 '50',
 '54,000',
 '56,000',
 '57',
 '577',
 '5:30pm',
 '5g',
 '5th',
 '6',
 '6,000',
 '60',
 '600',
 '60m',
 '61k',
 '66',
 '680news',
 '680newsweather',
 '691',
 '6ixside',
 '7',
 '7,800',
 '71',
 '75',
 '76',
 '7th',
 '8',
 '8,400',
 '8.3',
 '80',
 '800',
 '80yrs',
 '88-year-old',
 '8th',
 '9',
 '90',
 '90,000',
 '99',
 '

In [104]:
embeddings.loc['!'].values.shape

(50,)

In [105]:
zeros_array = np.zeros(50)
zeros_array.shape

(50,)

In [106]:
np.vstack((zeros_array, embeddings.loc['!'].values))

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [-0.01209355,  0.00093517,  0.0009762 , -0.00951836, -0.08340804,
         0.0121143 , -0.03038843,  0.02001789,  0.01533262, -0.00340765,
         0.04601693, -0.00884506, -0.03352376,  0.03479964, -0.0032449 ,
        -0.08065358, -0.00060672,  0.05827247, -0.

In [107]:
# to convert contractions picked up by word_tokenize() into full words
contractions = {
    "n't": 'not',
    "'ve": 'have',
    "'s": 'is', # note that this will include possessive nouns
    'gonna': 'going to',
    'gotta': 'got to',
    "'d": 'would',
    "'ll": 'will',
    "'re": 'are',
    "'m": 'am',
    'wanna': 'want to'
}

# to convert nltk_pos tags to wordnet-compatible PoS tags
def convert_pos_wordnet(tag):
    tag_abbr = tag[0].upper()
    tag_dict = {
        'J': wordnet.ADJ,
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'R': wordnet.ADV
    }
                
    if tag_abbr in tag_dict:
        return tag_dict[tag_abbr]

In [108]:
def get_text_vectors(word_embeddings, # pandas dataframe with row index = vocabulary
                     text_list, # list of strings
                     remove_stopwords = True,
                     lowercase = True,
                     lemmatize = True,
                     add_start_end_tokens = True):
    
    lemmatizer = WordNetLemmatizer()
    
    for k in range(len(text_list)):
        text = text_list[k]
        text = re.sub(r'[_~`@$%^&*[\]+=\|}{\"\'<>/]+', '', text)
        text_vec = np.zeros(word_embeddings.shape[1])
        words = word_tokenize(text)
        tracker = 0 # to track whether we've encountered a word for which we have an embedding (in each tweet)
        
        if remove_stopwords:
            clean_words = []
            for word in words:
                if word.lower() not in set(stopwords.words('english')):
                    clean_words.append(word)
            words = clean_words

        if lowercase:
            clean_words = []
            for word in words:
                clean_words.append(word.lower())

            words = clean_words

        if lemmatize:
            clean_words = []
            for word in words:
                PoS_tag = pos_tag([word])[0][1]

                # to change contractions to full word form
                if word in contractions:
                    word = contractions[word]

                if PoS_tag[0].upper() in 'JNVR':
                    word = lemmatizer.lemmatize(word, convert_pos_wordnet(PoS_tag))
                else:
                    word = lemmatizer.lemmatize(word)

                clean_words.append(word)

            words = clean_words

        if add_start_end_tokens:
            words = ['<START>'] + words + ['<END>']
        
        for i in range(len(words)):
            word = words[i]
            if word in word_embeddings.index.tolist():
                word_embed_vec = word_embeddings.loc[word].values
                if tracker == 0:
                    text_matrix = word_embed_vec
                else:
                    text_matrix = np.vstack((text_matrix, word_embed_vec))
                    
                # only increment if we have come across a word in the embeddings dictionary
                tracker += 1
                    
        for j in range(len(text_vec)):
            text_vec[j] = text_matrix[:,j].mean()
            
        if k == 0:
            full_matrix = text_vec
        else:
            full_matrix = np.vstack((full_matrix, text_vec))
            
    return full_matrix

In [109]:
train_X = get_text_vectors(embeddings, train['Tweet'])
train_y = train['Is_Unreliable']

tune_X = get_text_vectors(embeddings, tune['Tweet'])
tune_y = tune['Is_Unreliable']

test_pt1_X = get_text_vectors(embeddings, test_pt1['Tweet'])
test_pt1_y = test_pt1['Is_Unreliable']

test_pt2_X = get_text_vectors(embeddings, test_pt2['Tweet'])
test_pt2_y = test_pt2['Is_Unreliable']

test_X = get_text_vectors(embeddings, test['Tweet'])
test_y = test['Is_Unreliable']

train_tune_X = get_text_vectors(embeddings, train_tune['Tweet'])
train_tune_y = train_tune['Is_Unreliable']

## Binary classification: two-fold CV

In [110]:
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

In [111]:
# SVC hyperparams to optimize
kernel = ['rbf', 'linear', 'poly', 'sigmoid']
C = [0.001, 0.01, 0.1, 1, 10]

### Fold 1

In [112]:
# training & tuning
models = []
tune_auc = []
for ker in kernel:
    for el in C:
        svc = SVC(C = el, kernel = ker, probability = True)
        svc.fit(train_X, train_y)
        models.append(svc)
        tune_predict = svc.predict_proba(tune_X)[:,1]
        auc = roc_auc_score(tune_y, tune_predict)
        tune_auc.append(auc)
        
opt_model = models[tune_auc.index(max(tune_auc))]
opt_model

SVC(C=1, probability=True)

In [113]:
tune_auc

[0.06125,
 0.06125,
 0.9381250000000001,
 0.9668749999999999,
 0.944375,
 0.5,
 0.068125,
 0.068125,
 0.19468749999999999,
 0.9440625,
 0.5,
 0.08281250000000001,
 0.9221875,
 0.941875,
 0.9175,
 0.5,
 0.5,
 0.933125,
 0.9512499999999999,
 0.954375]

In [114]:
# testing
test_scores = dict()
test_predict = opt_model.predict(test_X)
test_predict_proba = opt_model.predict_proba(test_X)[:,1]
test_scores['auc'] = [roc_auc_score(test_y, test_predict_proba)]
test_scores['accuracy'] = [accuracy_score(test_y, test_predict)]
test_scores['recall'] = [recall_score(test_y, test_predict)]
test_scores['precision'] = [precision_score(test_y, test_predict)]
test_scores['f1'] = [f1_score(test_y, test_predict)]
test_scores

{'auc': [0.9155102040816326],
 'accuracy': [0.7928571428571428],
 'recall': [0.9285714285714286],
 'precision': [0.7303370786516854],
 'f1': [0.8176100628930818]}

### Fold 2

In [115]:
# training & tuning
models = []
tune_auc = []
for ker in kernel:
    for el in C:
        svc = SVC(C = el, kernel = ker, probability = True)
        svc.fit(test_pt1_X, test_pt1_y)
        models.append(svc)
        tune_predict = svc.predict_proba(test_pt2_X)[:,1]
        auc = roc_auc_score(test_pt2_y, tune_predict)
        tune_auc.append(auc)
        
opt_model = models[tune_auc.index(max(tune_auc))]
opt_model

SVC(C=10, kernel='sigmoid', probability=True)

In [116]:
tune_auc

[0.5,
 0.043125,
 0.95625,
 0.969375,
 0.923125,
 0.036250000000000004,
 0.036250000000000004,
 0.036250000000000004,
 0.5,
 0.965,
 0.5,
 0.076875,
 0.921875,
 0.9456249999999999,
 0.885625,
 0.5,
 0.03,
 0.970625,
 0.9668749999999999,
 0.9706250000000001]

In [117]:
# testing
test_predict = opt_model.predict(train_tune_X)
test_predict_proba = opt_model.predict_proba(train_tune_X)[:,1]
test_scores['auc'].append(roc_auc_score(train_tune_y, test_predict_proba))
test_scores['accuracy'].append(accuracy_score(train_tune_y, test_predict))
test_scores['recall'].append(recall_score(train_tune_y, test_predict))
test_scores['precision'].append(precision_score(train_tune_y, test_predict))
test_scores['f1'].append(f1_score(train_tune_y, test_predict))
test_scores

{'auc': [0.9155102040816326, 0.8897448979591838],
 'accuracy': [0.7928571428571428, 0.8142857142857143],
 'recall': [0.9285714285714286, 0.75],
 'precision': [0.7303370786516854, 0.860655737704918],
 'f1': [0.8176100628930818, 0.8015267175572519]}

### Average results

In [118]:
sum(test_scores['auc'])/2

0.9026275510204083

In [119]:
sum(test_scores['accuracy'])/2

0.8035714285714286

In [120]:
sum(test_scores['recall'])/2

0.8392857142857143

In [121]:
sum(test_scores['precision'])/2

0.7954964081783017

In [122]:
sum(test_scores['f1'])/2

0.8095683902251669