# Binary Classification with Tweet Embeddings

In [1]:
# import modules
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

## Load data

In [2]:
class1_train_indices = list(range(100))
class1_tune_indices = list(range(100,140))
class1_test_pt1_indices = list(range(140,240))
class1_test_pt2_indices = list(range(240,280))
class0_train_indices = list(range(280,380))
class0_tune_indices = list(range(380,420))
class0_test_pt1_indices = list(range(420,520))
class0_test_pt2_indices = list(range(520,560))

In [3]:
# load array of tweet embeddings according to index of original tweet file
num_embed = 250
fname = '../tweet_embeddings/tweet_embed_{}.npy'.format(num_embed)
tweet_embeddings = np.load(fname)
tweet_embeddings

array([[-0.00448299,  0.01827452, -0.00187211, ...,  0.00711912,
        -0.01379659,  0.00315301],
       [-0.00912592, -0.00560592, -0.0038466 , ...,  0.00752772,
        -0.00832159,  0.00503022],
       [-0.00103838,  0.01104937, -0.00787435, ...,  0.00973806,
        -0.00132974,  0.00159357],
       ...,
       [-0.0032521 ,  0.01050532,  0.01424049, ...,  0.00640552,
        -0.01221106, -0.00418963],
       [-0.00364014,  0.0061394 ,  0.00616415, ...,  0.00794342,
        -0.00125844,  0.00397816],
       [ 0.00254795,  0.00633778,  0.00547869, ...,  0.00541811,
        -0.00373036,  0.00213037]])

In [4]:
tweet_embeddings[class0_train_indices,:]

array([[ 0.00535308,  0.0034574 ,  0.00726551, ..., -0.00041058,
        -0.00596488, -0.0008041 ],
       [-0.00671662,  0.01341865,  0.00987666, ...,  0.00402134,
        -0.00413588, -0.01807066],
       [-0.00229121,  0.00853273,  0.00939611, ..., -0.01098966,
        -0.00649302,  0.00242957],
       ...,
       [-0.00350634,  0.00274706,  0.00933622, ...,  0.0117901 ,
        -0.00333024, -0.00169415],
       [-0.00316775,  0.01304326,  0.01624512, ...,  0.01267831,
         0.00066741,  0.00192733],
       [-0.00554315,  0.00917482,  0.02088506, ...,  0.00894483,
        -0.00261615, -0.01814533]])

## Derive text vectors from word embeddings

In [5]:
forty_ones = [1]*40
forty_zeros = [0]*40

hundred_ones = [1]*100
hundred_zeros = [0]*100

In [6]:
train_fold1_X = tweet_embeddings[[class1_train_indices + class0_train_indices],:][0]
train_fold1_y = hundred_ones + hundred_zeros

tune_fold1_X = tweet_embeddings[[class1_tune_indices + class0_tune_indices],:][0]
tune_fold1_y = forty_ones + forty_zeros

test_fold1_X = tweet_embeddings[[class1_test_pt1_indices + class1_test_pt2_indices + class0_test_pt1_indices + class0_test_pt2_indices],:][0]
test_fold1_y = hundred_ones + forty_ones + hundred_zeros + forty_zeros


train_fold2_X = tweet_embeddings[[class1_test_pt1_indices + class0_test_pt1_indices],:][0]
train_fold2_y = hundred_ones + hundred_zeros

tune_fold2_X = tweet_embeddings[[class1_test_pt2_indices + class0_test_pt2_indices],:][0]
tune_fold2_y = forty_ones + forty_zeros

test_fold2_X = tweet_embeddings[[class1_train_indices + class1_tune_indices + class0_train_indices + class0_tune_indices],:][0]
test_fold2_y = hundred_ones + forty_ones + hundred_zeros + forty_zeros

In [7]:
train_fold1_X.shape

(200, 250)

## Binary classification: two-fold CV

In [8]:
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

In [9]:
# SVC hyperparams to optimize
kernel = ['rbf', 'linear', 'poly', 'sigmoid']
C = [0.001, 0.01, 0.1, 1, 10]

### Fold 1

In [10]:
# training & tuning
models = []
tune_auc = []
for ker in kernel:
    for el in C:
        svc = SVC(C = el, kernel = ker, probability = True)
        svc.fit(train_fold1_X, train_fold1_y)
        models.append(svc)
        tune_predict = svc.predict_proba(tune_fold1_X)[:,1]
        auc = roc_auc_score(tune_fold1_y, tune_predict)
        tune_auc.append(auc)
        
opt_model = models[tune_auc.index(max(tune_auc))]
opt_model

SVC(C=1, probability=True)

In [11]:
tune_auc

[0.07437499999999998,
 0.5,
 0.9196874999999999,
 0.9337500000000001,
 0.9237500000000001,
 0.07999999999999999,
 0.07999999999999999,
 0.07999999999999999,
 0.9199999999999999,
 0.9293750000000001,
 0.08437499999999999,
 0.08437499999999999,
 0.1409375,
 0.9125,
 0.905,
 0.5,
 0.07937499999999999,
 0.9199999999999999,
 0.928125,
 0.8293750000000001]

In [12]:
# testing
test_scores = dict()
test_predict = opt_model.predict(test_fold1_X)
test_predict_proba = opt_model.predict_proba(test_fold1_X)[:,1]
test_scores['auc'] = [roc_auc_score(test_fold1_y, test_predict_proba)]
test_scores['accuracy'] = [accuracy_score(test_fold1_y, test_predict)]
test_scores['recall'] = [recall_score(test_fold1_y, test_predict, average = 'macro')]
test_scores['precision'] = [precision_score(test_fold1_y, test_predict, average = 'macro')]
test_scores['f1'] = [f1_score(test_fold1_y, test_predict, average = 'macro')]
test_scores

{'auc': [0.8986989795918369],
 'accuracy': [0.7821428571428571],
 'recall': [0.7821428571428571],
 'precision': [0.8215303215303216],
 'f1': [0.7752602007921157]}

### Fold 2

In [13]:
# training & tuning
models = []
tune_auc = []
for ker in kernel:
    for el in C:
        svc = SVC(C = el, kernel = ker, probability = True)
        svc.fit(train_fold2_X, train_fold2_y)
        models.append(svc)
        tune_predict = svc.predict_proba(tune_fold2_X)[:,1]
        auc = roc_auc_score(tune_fold2_y, tune_predict)
        tune_auc.append(auc)
        
opt_model = models[tune_auc.index(max(tune_auc))]
opt_model

SVC(C=1, kernel='sigmoid', probability=True)

In [14]:
tune_auc

[0.10125000000000002,
 0.10125000000000002,
 0.5,
 0.894375,
 0.8890625,
 0.09062500000000001,
 0.5,
 0.09062500000000001,
 0.414375,
 0.90375,
 0.5,
 0.15250000000000002,
 0.15250000000000002,
 0.8806250000000001,
 0.8718750000000001,
 0.5,
 0.08687500000000001,
 0.25,
 0.906875,
 0.820625]

In [15]:
# testing
test_predict = opt_model.predict(test_fold2_X)
test_predict_proba = opt_model.predict_proba(test_fold2_X)[:,1]
test_scores['auc'].append(roc_auc_score(test_fold2_y, test_predict_proba))
test_scores['accuracy'].append(accuracy_score(test_fold2_y, test_predict))
test_scores['recall'].append(recall_score(test_fold2_y, test_predict, average = 'macro'))
test_scores['precision'].append(precision_score(test_fold2_y, test_predict, average = 'macro'))
test_scores['f1'].append(f1_score(test_fold2_y, test_predict, average = 'macro'))
test_scores

{'auc': [0.8986989795918369, 0.9089030612244898],
 'accuracy': [0.7821428571428571, 0.8321428571428572],
 'recall': [0.7821428571428571, 0.8321428571428571],
 'precision': [0.8215303215303216, 0.8322954417844929],
 'f1': [0.7752602007921157, 0.8321235856156958]}

### Average results

In [16]:
sum(test_scores['auc'])/2

0.9038010204081633

In [17]:
sum(test_scores['accuracy'])/2

0.8071428571428572

In [18]:
sum(test_scores['recall'])/2

0.8071428571428572

In [19]:
sum(test_scores['precision'])/2

0.8269128816574072

In [20]:
sum(test_scores['f1'])/2

0.8036918932039057