# Binary Classification with Tweet Embeddings

In [1]:
# import modules
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

## Load data

In [2]:
# Row indices into the embedding matrix for every class/split combination.
# The eight splits are consecutive half-open ranges delimited by the
# boundaries below: the class-1 splits come first, then the class-0 splits.
split_bounds = [0, 100, 140, 240, 280, 380, 420, 520, 560]
(
    class1_train_indices,
    class1_tune_indices,
    class1_test_pt1_indices,
    class1_test_pt2_indices,
    class0_train_indices,
    class0_tune_indices,
    class0_test_pt1_indices,
    class0_test_pt2_indices,
) = (list(range(lo, hi)) for lo, hi in zip(split_bounds, split_bounds[1:]))

In [3]:
# Read the saved tweet-embedding matrix from disk; rows follow the index of
# the original tweet file.
# NOTE(review): num_embed presumably is the embedding dimensionality (it
# selects which saved file is loaded) -- confirm against the export script.
num_embed = 100
fname = str.format('../tweet_embeddings/tweet_embed_{}.npy', num_embed)
tweet_embeddings = np.load(file=fname)
tweet_embeddings

array([[ 2.14904746e-03,  2.45077669e-02,  2.47014707e-02, ...,
        -7.25606271e-04,  2.67015765e-03,  4.03673065e-03],
       [-9.46452087e-04,  4.87960232e-03,  4.47070048e-02, ...,
        -7.95188148e-03,  7.33467039e-04, -1.57167224e-03],
       [-8.54947205e-04, -2.91345826e-03,  3.21994375e-03, ...,
        -7.52294886e-03,  4.42561397e-03, -4.54045879e-03],
       ...,
       [ 9.93205735e-04, -7.32190760e-03,  9.56053944e-03, ...,
         7.96010609e-03, -5.14358391e-03, -1.66861473e-02],
       [ 6.52102690e-03, -1.42260668e-03, -1.53443495e-02, ...,
        -1.39974437e-02,  6.00576922e-03, -1.51548927e-02],
       [ 2.71978112e-03,  4.16686129e-04, -8.77459056e-05, ...,
        -1.76468980e-03,  4.27055557e-04,  5.15986099e-03]])

In [4]:
# Peek at the embedding rows selected by the class-0 training indices.
tweet_embeddings[class0_train_indices]

array([[ 2.39348722e-04, -2.46133933e-03,  1.11936424e-03, ...,
        -8.21334638e-03,  3.44486518e-03,  1.86145575e-02],
       [ 3.00271422e-03, -9.77105527e-03,  4.60251036e-04, ...,
        -2.60485232e-04, -8.52634432e-03,  6.42468966e-03],
       [ 1.58959762e-03, -4.21894793e-03, -5.41214795e-03, ...,
         6.52759229e-03, -3.11762314e-02, -1.05364188e-02],
       ...,
       [-7.33872639e-03,  2.52321950e-02, -1.46316556e-02, ...,
        -1.52138813e-05, -3.44659406e-02, -1.61631449e-02],
       [-8.87662151e-03,  2.47644495e-02, -1.65147756e-02, ...,
        -6.06074517e-03, -2.04247780e-02, -1.82096387e-02],
       [-1.67504883e-04,  3.54172567e-03,  1.11666813e-02, ...,
        -4.59883172e-03, -8.77279082e-03, -1.43644654e-02]])

## Build train / tune / test sets from the tweet embeddings

In [5]:
# Constant label vectors (1 = class 1, 0 = class 0) in the two lengths used
# when assembling the y lists for each split.
hundred_ones = [1 for _ in range(100)]
hundred_zeros = [0 for _ in range(100)]

forty_ones = [1 for _ in range(40)]
forty_zeros = [0 for _ in range(40)]

In [6]:
# Assemble the feature matrices (X) and label lists (y) for two-fold CV.
# Each X is selected with a flat list of row indices, which yields an
# (n_rows, n_features) array directly; wrapping the list in another list would
# add a leading axis that then has to be stripped again with [0].
# In every split the class-1 rows come first, followed by the class-0 rows,
# and the label lists are concatenated in that same order.

# Fold 1: train/tune on the "train"/"tune" rows, test on both "test" parts.
train_fold1_X = tweet_embeddings[class1_train_indices + class0_train_indices]
train_fold1_y = hundred_ones + hundred_zeros

tune_fold1_X = tweet_embeddings[class1_tune_indices + class0_tune_indices]
tune_fold1_y = forty_ones + forty_zeros

test_fold1_X = tweet_embeddings[
    class1_test_pt1_indices + class1_test_pt2_indices
    + class0_test_pt1_indices + class0_test_pt2_indices
]
test_fold1_y = hundred_ones + forty_ones + hundred_zeros + forty_zeros


# Fold 2: roles swapped -- train/tune on the two "test" parts, test on the
# "train" and "tune" rows.
train_fold2_X = tweet_embeddings[class1_test_pt1_indices + class0_test_pt1_indices]
train_fold2_y = hundred_ones + hundred_zeros

tune_fold2_X = tweet_embeddings[class1_test_pt2_indices + class0_test_pt2_indices]
tune_fold2_y = forty_ones + forty_zeros

test_fold2_X = tweet_embeddings[
    class1_train_indices + class1_tune_indices
    + class0_train_indices + class0_tune_indices
]
test_fold2_y = hundred_ones + forty_ones + hundred_zeros + forty_zeros

# Sanity check: every feature matrix must have exactly one label per row.
assert all(
    len(features) == len(labels)
    for features, labels in (
        (train_fold1_X, train_fold1_y),
        (tune_fold1_X, tune_fold1_y),
        (test_fold1_X, test_fold1_y),
        (train_fold2_X, train_fold2_y),
        (tune_fold2_X, tune_fold2_y),
        (test_fold2_X, test_fold2_y),
    )
), "feature/label length mismatch in a CV split"

In [7]:
# Confirm the fold-1 training matrix dimensions (rows, features).
np.shape(train_fold1_X)

(200, 100)

## Binary classification: two-fold CV

In [8]:
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

In [9]:
# Hyperparameter grid for the SVC search: every kernel is tried with every
# value of the regularisation parameter C.
kernel = [
    'rbf',
    'linear',
    'poly',
    'sigmoid',
]
C = [
    0.001,
    0.01,
    0.1,
    1,
    10,
]

### Fold 1

In [10]:
# training & tuning
models = []
tune_auc = []
for ker in kernel:
    for el in C:
        svc = SVC(C = el, kernel = ker, probability = True)
        svc.fit(train_fold1_X, train_fold1_y)
        models.append(svc)
        tune_predict = svc.predict_proba(tune_fold1_X)[:,1]
        auc = roc_auc_score(tune_fold1_y, tune_predict)
        tune_auc.append(auc)
        
opt_model = models[tune_auc.index(max(tune_auc))]
opt_model

SVC(C=10, kernel='linear', probability=True)

In [11]:
# Tuning-set ROC AUC for every (kernel, C) pair tried in the search loop, in
# loop order: all C values for the first kernel, then all C values for the
# next kernel, and so on.
tune_auc

[0.06687500000000002,
 0.06687500000000002,
 0.933125,
 0.94125,
 0.9275,
 0.06437500000000002,
 0.5,
 0.06437500000000002,
 0.2625,
 0.9562499999999999,
 0.5,
 0.10437500000000002,
 0.914375,
 0.9281249999999999,
 0.9103125,
 0.06625,
 0.06625,
 0.93375,
 0.95,
 0.8975]

In [12]:
# testing
test_scores = dict()
test_predict = opt_model.predict(test_fold1_X)
test_predict_proba = opt_model.predict_proba(test_fold1_X)[:,1]
test_scores['auc'] = [roc_auc_score(test_fold1_y, test_predict_proba)]
test_scores['accuracy'] = [accuracy_score(test_fold1_y, test_predict)]
test_scores['recall'] = [recall_score(test_fold1_y, test_predict, average = 'macro')]
test_scores['precision'] = [precision_score(test_fold1_y, test_predict, average = 'macro')]
test_scores['f1'] = [f1_score(test_fold1_y, test_predict, average = 'macro')]
test_scores

{'auc': [0.9173979591836734],
 'accuracy': [0.7928571428571428],
 'recall': [0.7928571428571429],
 'precision': [0.8283001601464196],
 'f1': [0.7871113208536522]}

### Fold 2

In [13]:
# training & tuning
models = []
tune_auc = []
for ker in kernel:
    for el in C:
        svc = SVC(C = el, kernel = ker, probability = True)
        svc.fit(train_fold2_X, train_fold2_y)
        models.append(svc)
        tune_predict = svc.predict_proba(tune_fold2_X)[:,1]
        auc = roc_auc_score(tune_fold2_y, tune_predict)
        tune_auc.append(auc)
        
opt_model = models[tune_auc.index(max(tune_auc))]
opt_model

SVC(C=0.1, kernel='sigmoid', probability=True)

In [14]:
# Tuning-set ROC AUC for every (kernel, C) pair tried in the search loop, in
# loop order: all C values for the first kernel, then all C values for the
# next kernel, and so on.
tune_auc

[0.09875000000000002,
 0.09875000000000002,
 0.90125,
 0.9175,
 0.891875,
 0.06,
 0.06,
 0.5,
 0.15,
 0.93375,
 0.5,
 0.10125,
 0.89875,
 0.9125,
 0.8468749999999999,
 0.5,
 0.5,
 0.9437500000000001,
 0.9400000000000001,
 0.868125]

In [15]:
# testing
test_predict = opt_model.predict(test_fold2_X)
test_predict_proba = opt_model.predict_proba(test_fold2_X)[:,1]
test_scores['auc'].append(roc_auc_score(test_fold2_y, test_predict_proba))
test_scores['accuracy'].append(accuracy_score(test_fold2_y, test_predict))
test_scores['recall'].append(recall_score(test_fold2_y, test_predict, average = 'macro'))
test_scores['precision'].append(precision_score(test_fold2_y, test_predict, average = 'macro'))
test_scores['f1'].append(f1_score(test_fold2_y, test_predict, average = 'macro'))
test_scores

{'auc': [0.9173979591836734, 0.9037244897959185],
 'accuracy': [0.7928571428571428, 0.8],
 'recall': [0.7928571428571429, 0.8],
 'precision': [0.8283001601464196, 0.8050425399460469],
 'f1': [0.7871113208536522, 0.7991700394487422]}

### Average results

In [16]:
# Mean test AUC over all folds recorded in test_scores (divides by the actual
# number of entries rather than a hard-coded fold count).
sum(test_scores['auc'])/len(test_scores['auc'])

0.9105612244897959

In [17]:
# Mean test accuracy over all folds recorded in test_scores (divides by the
# actual number of entries rather than a hard-coded fold count).
sum(test_scores['accuracy'])/len(test_scores['accuracy'])

0.7964285714285715

In [18]:
# Mean macro-averaged test recall over all folds recorded in test_scores
# (divides by the actual number of entries rather than a hard-coded fold count).
sum(test_scores['recall'])/len(test_scores['recall'])

0.7964285714285715

In [19]:
# Mean macro-averaged test precision over all folds recorded in test_scores
# (divides by the actual number of entries rather than a hard-coded fold count).
sum(test_scores['precision'])/len(test_scores['precision'])

0.8166713500462333

In [20]:
# Mean macro-averaged test F1 over all folds recorded in test_scores
# (divides by the actual number of entries rather than a hard-coded fold count).
sum(test_scores['f1'])/len(test_scores['f1'])

0.7931406801511972