# Binary Classification with Tweet Embeddings

In [1]:
# import modules
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

## Load data

In [2]:
class1_train_indices = list(range(100))
class1_tune_indices = list(range(100,140))
class1_test_pt1_indices = list(range(140,240))
class1_test_pt2_indices = list(range(240,280))
class0_train_indices = list(range(280,380))
class0_tune_indices = list(range(380,420))
class0_test_pt1_indices = list(range(420,520))
class0_test_pt2_indices = list(range(520,560))

In [3]:
# load array of tweet embeddings according to index of original tweet file
num_embed = 150
fname = '../tweet_embeddings/tweet_embed_{}.npy'.format(num_embed)
tweet_embeddings = np.load(fname)
tweet_embeddings

array([[-6.88441002e-03,  2.14434517e-02, -9.09390624e-03, ...,
         1.66813656e-02, -8.59501441e-04, -6.45038951e-03],
       [-6.03031685e-03,  1.93254061e-02,  4.38044860e-03, ...,
         1.76256832e-02,  1.71151973e-02, -1.42828664e-02],
       [-2.45981463e-04,  5.12543027e-03,  1.68124867e-02, ...,
         3.01599815e-03, -6.57744718e-05,  4.66380666e-04],
       ...,
       [ 1.95957059e-02,  2.15798522e-02,  1.50377897e-03, ...,
        -1.95135041e-03, -2.81859175e-03, -2.11670430e-02],
       [-1.90048927e-03,  9.09224229e-03,  2.02165611e-03, ...,
        -3.54798182e-03,  1.20675428e-02, -3.10609596e-03],
       [ 2.70635669e-03,  1.13606831e-02,  1.73400811e-03, ...,
         7.21464538e-03,  6.52971204e-03, -6.73105757e-03]])

In [5]:
tweet_embeddings[class0_train_indices,:]

array([[ 0.00384834, -0.00068701,  0.01302268, ...,  0.00274137,
         0.00282781, -0.00061727],
       [ 0.01526974,  0.00280597,  0.00431867, ..., -0.00017089,
        -0.01230695, -0.00339119],
       [-0.00434156,  0.00867312,  0.00375309, ...,  0.00192461,
         0.01184405, -0.00179008],
       ...,
       [ 0.00602631,  0.00616558, -0.00060007, ...,  0.00271803,
        -0.00164902,  0.0197803 ],
       [ 0.01911258,  0.01042606, -0.00505949, ..., -0.00127946,
         0.03334516,  0.00702249],
       [-0.00542337,  0.00374834, -0.00736484, ...,  0.00543827,
         0.00601246, -0.01452185]])

## Derive text vectors from word embeddings

In [6]:
forty_ones = [1]*40
forty_zeros = [0]*40

hundred_ones = [1]*100
hundred_zeros = [0]*100

In [7]:
train_fold1_X = tweet_embeddings[[class1_train_indices + class0_train_indices],:][0]
train_fold1_y = hundred_ones + hundred_zeros

tune_fold1_X = tweet_embeddings[[class1_tune_indices + class0_tune_indices],:][0]
tune_fold1_y = forty_ones + forty_zeros

test_fold1_X = tweet_embeddings[[class1_test_pt1_indices + class1_test_pt2_indices + class0_test_pt1_indices + class0_test_pt2_indices],:][0]
test_fold1_y = hundred_ones + forty_ones + hundred_zeros + forty_zeros


train_fold2_X = tweet_embeddings[[class1_test_pt1_indices + class0_test_pt1_indices],:][0]
train_fold2_y = hundred_ones + hundred_zeros

tune_fold2_X = tweet_embeddings[[class1_test_pt2_indices + class0_test_pt2_indices],:][0]
tune_fold2_y = forty_ones + forty_zeros

test_fold2_X = tweet_embeddings[[class1_train_indices + class1_tune_indices + class0_train_indices + class0_tune_indices],:][0]
test_fold2_y = hundred_ones + forty_ones + hundred_zeros + forty_zeros

In [8]:
train_fold1_X.shape

(200, 150)

## Binary classification: two-fold CV

In [9]:
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

In [10]:
# SVC hyperparams to optimize
kernel = ['rbf', 'linear', 'poly', 'sigmoid']
C = [0.001, 0.01, 0.1, 1, 10]

### Fold 1

In [11]:
# training & tuning
models = []
tune_auc = []
for ker in kernel:
    for el in C:
        svc = SVC(C = el, kernel = ker, probability = True)
        svc.fit(train_fold1_X, train_fold1_y)
        models.append(svc)
        tune_predict = svc.predict_proba(tune_fold1_X)[:,1]
        auc = roc_auc_score(tune_fold1_y, tune_predict)
        tune_auc.append(auc)
        
opt_model = models[tune_auc.index(max(tune_auc))]
opt_model

SVC(C=1, probability=True)

In [12]:
tune_auc

[0.07187500000000001,
 0.07187500000000001,
 0.91625,
 0.936875,
 0.925,
 0.07500000000000001,
 0.07500000000000001,
 0.07500000000000001,
 0.07500000000000001,
 0.934375,
 0.08937500000000001,
 0.08937500000000001,
 0.9031250000000001,
 0.909375,
 0.90125,
 0.079375,
 0.079375,
 0.9225,
 0.9356249999999999,
 0.87875]

In [13]:
# testing
test_scores = dict()
test_predict = opt_model.predict(test_fold1_X)
test_predict_proba = opt_model.predict_proba(test_fold1_X)[:,1]
test_scores['auc'] = [roc_auc_score(test_fold1_y, test_predict_proba)]
test_scores['accuracy'] = [accuracy_score(test_fold1_y, test_predict)]
test_scores['recall'] = [recall_score(test_fold1_y, test_predict, average = 'macro')]
test_scores['precision'] = [precision_score(test_fold1_y, test_predict, average = 'macro')]
test_scores['f1'] = [f1_score(test_fold1_y, test_predict, average = 'macro')]
test_scores

{'auc': [0.908061224489796],
 'accuracy': [0.7857142857142857],
 'recall': [0.7857142857142857],
 'precision': [0.8084379819343468],
 'f1': [0.7816934352097302]}

### Fold 2

In [14]:
# training & tuning
models = []
tune_auc = []
for ker in kernel:
    for el in C:
        svc = SVC(C = el, kernel = ker, probability = True)
        svc.fit(train_fold2_X, train_fold2_y)
        models.append(svc)
        tune_predict = svc.predict_proba(tune_fold2_X)[:,1]
        auc = roc_auc_score(tune_fold2_y, tune_predict)
        tune_auc.append(auc)
        
opt_model = models[tune_auc.index(max(tune_auc))]
opt_model

SVC(C=0.1, kernel='sigmoid', probability=True)

In [15]:
tune_auc

[0.5,
 0.09500000000000001,
 0.88875,
 0.9075,
 0.8899999999999999,
 0.07,
 0.5,
 0.07,
 0.08812499999999998,
 0.918125,
 0.5,
 0.12437499999999999,
 0.12687500000000002,
 0.8943750000000001,
 0.8506250000000001,
 0.5,
 0.06125,
 0.9384375,
 0.9293750000000001,
 0.8606250000000001]

In [16]:
# testing
test_predict = opt_model.predict(test_fold2_X)
test_predict_proba = opt_model.predict_proba(test_fold2_X)[:,1]
test_scores['auc'].append(roc_auc_score(test_fold2_y, test_predict_proba))
test_scores['accuracy'].append(accuracy_score(test_fold2_y, test_predict))
test_scores['recall'].append(recall_score(test_fold2_y, test_predict, average = 'macro'))
test_scores['precision'].append(precision_score(test_fold2_y, test_predict, average = 'macro'))
test_scores['f1'].append(f1_score(test_fold2_y, test_predict, average = 'macro'))
test_scores

{'auc': [0.908061224489796, 0.9031122448979592],
 'accuracy': [0.7857142857142857, 0.8035714285714286],
 'recall': [0.7857142857142857, 0.8035714285714286],
 'precision': [0.8084379819343468, 0.8119920297834409],
 'f1': [0.7816934352097302, 0.8022370330418256]}

### Average results

In [17]:
sum(test_scores['auc'])/2

0.9055867346938775

In [18]:
sum(test_scores['accuracy'])/2

0.7946428571428572

In [19]:
sum(test_scores['recall'])/2

0.7946428571428572

In [20]:
sum(test_scores['precision'])/2

0.8102150058588938

In [21]:
sum(test_scores['f1'])/2

0.7919652341257779