# Binary Classification with Tweet Embeddings

In [1]:
# import modules
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

## Load data

In [2]:
class1_train_indices = list(range(100))
class1_tune_indices = list(range(100,140))
class1_test_pt1_indices = list(range(140,240))
class1_test_pt2_indices = list(range(240,280))
class0_train_indices = list(range(280,380))
class0_tune_indices = list(range(380,420))
class0_test_pt1_indices = list(range(420,520))
class0_test_pt2_indices = list(range(520,560))

In [3]:
# load array of tweet embeddings according to index of original tweet file
num_embed = 200
fname = '../tweet_embeddings/tweet_embed_{}.npy'.format(num_embed)
tweet_embeddings = np.load(fname)
tweet_embeddings

array([[ 0.00593588,  0.00939591, -0.00662292, ..., -0.01922544,
         0.01135051, -0.01881703],
       [ 0.00667498, -0.01665263, -0.00706223, ..., -0.00810334,
         0.00158554, -0.00864557],
       [ 0.00702469, -0.00557805, -0.00507662, ..., -0.00726061,
         0.00634164, -0.02551629],
       ...,
       [ 0.00263419,  0.00553107, -0.0111273 , ..., -0.03626432,
        -0.00115585, -0.06540261],
       [ 0.00715289,  0.00460695, -0.00207928, ...,  0.0028719 ,
         0.00146786, -0.00983326],
       [ 0.00281343, -0.00274252, -0.00334287, ..., -0.00280212,
         0.0042161 , -0.0038069 ]])

In [4]:
tweet_embeddings[class0_train_indices,:]

array([[ 0.00574072,  0.00465161, -0.01033126, ..., -0.00891994,
         0.00185855, -0.01406886],
       [ 0.00783427,  0.00525873, -0.01145834, ...,  0.0015645 ,
         0.00276725, -0.00392904],
       [-0.01308348, -0.00039047, -0.00901186, ...,  0.00098442,
         0.00340062, -0.01465853],
       ...,
       [-0.00392546,  0.00928786, -0.00503153, ..., -0.00533025,
        -0.00054813, -0.0027769 ],
       [ 0.00814459,  0.00579188, -0.00407937, ..., -0.00582345,
         0.0059936 ,  0.00379572],
       [-0.00411265,  0.00691   , -0.005913  , ..., -0.0213258 ,
        -0.00173161, -0.04867859]])

## Derive text vectors from word embeddings

In [5]:
forty_ones = [1]*40
forty_neg_ones = [-1]*40

hundred_ones = [1]*100
hundred_neg_ones = [-1]*100

In [6]:
train_fold1_X = tweet_embeddings[[class0_train_indices],:][0]
train_fold1_y = hundred_ones

tune_fold1_X = tweet_embeddings[[class1_train_indices + class1_tune_indices + class0_tune_indices],:][0]
tune_fold1_y = hundred_neg_ones + forty_neg_ones + forty_ones

test_fold1_X = tweet_embeddings[[class1_test_pt1_indices + class1_test_pt2_indices + class0_test_pt1_indices + class0_test_pt2_indices],:][0]
test_fold1_y = hundred_neg_ones + forty_neg_ones + hundred_ones + forty_ones


train_fold2_X = tweet_embeddings[[class0_test_pt1_indices],:][0]
train_fold2_y = hundred_ones + hundred_neg_ones

tune_fold2_X = tweet_embeddings[[class1_test_pt1_indices + class1_test_pt2_indices + class0_test_pt2_indices],:][0]
tune_fold2_y = hundred_neg_ones + forty_neg_ones + forty_ones

test_fold2_X = tweet_embeddings[[class1_train_indices + class1_tune_indices + class0_train_indices + class0_tune_indices],:][0]
test_fold2_y = hundred_neg_ones + forty_neg_ones + hundred_ones + forty_ones

In [7]:
train_fold1_X.shape

(100, 200)

## Binary classification: two-fold CV

In [8]:
from sklearn.svm import OneClassSVM
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

In [9]:
# OCSVC hyperparams to optimize
kernel = ['rbf', 'linear', 'poly', 'sigmoid']

### Fold 1

In [10]:
# training & tuning
models = []
tune_f1 = []
for ker in kernel:
    svc = OneClassSVM(kernel = ker)
    svc.fit(train_fold1_X, train_fold1_y)
    models.append(svc)
    tune_predict = svc.predict(tune_fold1_X)
    f1 = f1_score(tune_fold1_y, tune_predict, average = 'macro')
    tune_f1.append(f1)
        
opt_model = models[tune_f1.index(max(tune_f1))]
opt_model

OneClassSVM(kernel='sigmoid')

In [11]:
tune_f1

[0.6683628706790665,
 0.6946631290504981,
 0.6616541353383458,
 0.7044334975369457]

In [12]:
# testing
test_scores = dict()
test_predict = opt_model.predict(test_fold1_X)
test_predict_proba = opt_model.decision_function(test_fold1_X)
test_scores['auc'] = [roc_auc_score(test_fold1_y, test_predict_proba, average = 'macro')]
test_scores['accuracy'] = [accuracy_score(test_fold1_y, test_predict)]
test_scores['recall'] = [recall_score(test_fold1_y, test_predict, average = 'macro')]
test_scores['precision'] = [precision_score(test_fold1_y, test_predict, average = 'macro')]
test_scores['f1'] = [f1_score(test_fold1_y, test_predict, average = 'macro')]
test_scores

{'auc': [0.7068877551020408],
 'accuracy': [0.6107142857142858],
 'recall': [0.6107142857142858],
 'precision': [0.6366584797531332],
 'f1': [0.5913175055905944]}

### Fold 2

In [13]:
# training & tuning
models = []
tune_f1 = []
for ker in kernel:
    svc = OneClassSVM(kernel = ker)
    svc.fit(train_fold2_X, train_fold2_y)
    models.append(svc)
    tune_predict = svc.predict(tune_fold2_X)
    f1 = f1_score(tune_fold2_y, tune_predict, average = 'macro')
    tune_f1.append(f1)
        
opt_model = models[tune_f1.index(max(tune_f1))]
opt_model

OneClassSVM(kernel='linear')

In [14]:
tune_f1

[0.5291634160892111,
 0.5833333333333333,
 0.5157162543473981,
 0.5830244625648628]

In [15]:
# testing
test_predict = opt_model.predict(test_fold2_X)
test_predict_proba = opt_model.decision_function(test_fold2_X)
test_scores['auc'].append(roc_auc_score(test_fold2_y, test_predict_proba, average = 'macro'))
test_scores['accuracy'].append(accuracy_score(test_fold2_y, test_predict))
test_scores['recall'].append(recall_score(test_fold2_y, test_predict, average = 'macro'))
test_scores['precision'].append(precision_score(test_fold2_y, test_predict, average = 'macro'))
test_scores['f1'].append(f1_score(test_fold2_y, test_predict))
test_scores

{'auc': [0.7068877551020408, 0.7427040816326531],
 'accuracy': [0.6107142857142858, 0.6607142857142857],
 'recall': [0.6107142857142858, 0.6607142857142857],
 'precision': [0.6366584797531332, 0.6983752125448706],
 'f1': [0.5913175055905944, 0.5662100456621004]}

### Average results

In [16]:
sum(test_scores['auc'])/2

0.7247959183673469

In [17]:
sum(test_scores['accuracy'])/2

0.6357142857142857

In [18]:
sum(test_scores['recall'])/2

0.6357142857142857

In [19]:
sum(test_scores['precision'])/2

0.6675168461490019

In [20]:
sum(test_scores['f1'])/2

0.5787637756263474