# Binary Classification with Tweet Embeddings

In [1]:
# import modules
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

## Load data

In [2]:
class1_train_indices = list(range(100))
class1_tune_indices = list(range(100,140))
class1_test_pt1_indices = list(range(140,240))
class1_test_pt2_indices = list(range(240,280))
class0_train_indices = list(range(280,380))
class0_tune_indices = list(range(380,420))
class0_test_pt1_indices = list(range(420,520))
class0_test_pt2_indices = list(range(520,560))

In [3]:
# load array of tweet embeddings according to index of original tweet file
num_embed = 500
fname = '../tweet_embeddings/tweet_embed_{}.npy'.format(num_embed)
tweet_embeddings = np.load(fname)
tweet_embeddings

array([[-0.00354477, -0.00215981, -0.00191002, ...,  0.00391414,
         0.00166403,  0.01324248],
       [-0.00296756, -0.00414907, -0.0029761 , ...,  0.00191059,
         0.00211693, -0.00103907],
       [-0.0025717 , -0.0033901 , -0.00370144, ...,  0.00282674,
         0.00194122,  0.00399626],
       ...,
       [-0.00143696, -0.00132692, -0.0021579 , ...,  0.00317285,
        -0.00236608,  0.01905248],
       [-0.00256621, -0.00164586, -0.00146364, ..., -0.00216716,
         0.00028311,  0.00383734],
       [-0.00139214, -0.00330151, -0.00095572, ..., -0.00303369,
         0.00858005,  0.00300258]])

In [4]:
tweet_embeddings[class0_train_indices,:]

array([[-0.00095106, -0.0036389 , -0.0021154 , ...,  0.00070693,
         0.00157908,  0.00390358],
       [-0.00064968, -0.00345125, -0.00151379, ...,  0.00115335,
         0.00278355,  0.00376189],
       [-0.00315136, -0.0067061 , -0.00094254, ..., -0.00178109,
         0.00448061,  0.00450257],
       ...,
       [-0.00080168,  0.0012146 ,  0.00035715, ...,  0.00061424,
        -0.00353658,  0.00038339],
       [-0.00183575, -0.01003218, -0.00118522, ..., -0.01033933,
         0.00869172,  0.00329064],
       [-0.00232136, -0.00077835, -0.00245735, ...,  0.00233888,
        -0.00163723,  0.01819277]])

## Set up train/tune/test sets

In [5]:
forty_ones = [1]*40
forty_neg_ones = [-1]*40

hundred_ones = [1]*100
hundred_neg_ones = [-1]*100

In [6]:
train_fold1_X = tweet_embeddings[[class0_train_indices],:][0]
train_fold1_y = hundred_ones

tune_fold1_X = tweet_embeddings[[class1_train_indices + class1_tune_indices + class0_tune_indices],:][0]
tune_fold1_y = hundred_neg_ones + forty_neg_ones + forty_ones

test_fold1_X = tweet_embeddings[[class1_test_pt1_indices + class1_test_pt2_indices + class0_test_pt1_indices + class0_test_pt2_indices],:][0]
test_fold1_y = hundred_neg_ones + forty_neg_ones + hundred_ones + forty_ones


train_fold2_X = tweet_embeddings[[class0_test_pt1_indices],:][0]
train_fold2_y = hundred_ones + hundred_neg_ones

tune_fold2_X = tweet_embeddings[[class1_test_pt1_indices + class1_test_pt2_indices + class0_test_pt2_indices],:][0]
tune_fold2_y = hundred_neg_ones + forty_neg_ones + forty_ones

test_fold2_X = tweet_embeddings[[class1_train_indices + class1_tune_indices + class0_train_indices + class0_tune_indices],:][0]
test_fold2_y = hundred_neg_ones + forty_neg_ones + hundred_ones + forty_ones

In [7]:
train_fold1_X.shape

(100, 500)

## One-class classification: two-fold CV

In [8]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

In [9]:
# Isolation Forest hyperparams to optimize
n_trees = [5, 25, 50, 100, 250, 500]

### Fold 1

In [10]:
# training & tuning
models = []
tune_f1 = []
for tree in n_trees:
    iforest = IsolationForest(n_estimators = tree)
    iforest.fit(train_fold1_X, train_fold1_y)
    models.append(iforest)
    tune_predict = iforest.predict(tune_fold1_X)
    f1 = f1_score(tune_fold1_y, tune_predict, average = 'macro')
    tune_f1.append(f1)
        
opt_model = models[tune_f1.index(max(tune_f1))]
opt_model

IsolationForest(n_estimators=5)

In [11]:
tune_f1

[0.22801177826142618,
 0.1975707455743636,
 0.1818181818181818,
 0.1897406004080443,
 0.1897406004080443,
 0.1897406004080443]

In [12]:
# testing
test_scores = dict()
test_predict = opt_model.predict(test_fold1_X)
test_predict_proba = opt_model.decision_function(test_fold1_X)
test_scores['auc'] = [roc_auc_score(test_fold1_y, test_predict_proba, average = 'macro')]
test_scores['accuracy'] = [accuracy_score(test_fold1_y, test_predict)]
test_scores['recall'] = [recall_score(test_fold1_y, test_predict, average = 'macro')]
test_scores['precision'] = [precision_score(test_fold1_y, test_predict, average = 'macro')]
test_scores['f1'] = [f1_score(test_fold1_y, test_predict, average = 'macro')]
test_scores

{'auc': [0.5309183673469388],
 'accuracy': [0.4857142857142857],
 'recall': [0.4857142857142857],
 'precision': [0.39629629629629626],
 'f1': [0.344390243902439]}

### Fold 2

In [13]:
# training & tuning
models = []
tune_f1 = []
for tree in n_trees:
    iforest = IsolationForest(n_estimators = tree)
    iforest.fit(train_fold2_X, train_fold2_y)
    models.append(iforest)
    tune_predict = iforest.predict(tune_fold2_X)
    f1 = f1_score(tune_fold2_y, tune_predict, average = 'macro')
    tune_f1.append(f1)
        
opt_model = models[tune_f1.index(max(tune_f1))]
opt_model

IsolationForest(n_estimators=5)

In [14]:
tune_f1

[0.21562390661280575,
 0.1897406004080443,
 0.1818181818181818,
 0.1818181818181818,
 0.1818181818181818,
 0.1818181818181818]

In [15]:
# testing
test_predict = opt_model.predict(test_fold2_X)
test_predict_proba = opt_model.decision_function(test_fold2_X)
test_scores['auc'].append(roc_auc_score(test_fold2_y, test_predict_proba, average = 'macro'))
test_scores['accuracy'].append(accuracy_score(test_fold2_y, test_predict))
test_scores['recall'].append(recall_score(test_fold2_y, test_predict, average = 'macro'))
test_scores['precision'].append(precision_score(test_fold2_y, test_predict, average = 'macro'))
test_scores['f1'].append(f1_score(test_fold2_y, test_predict))
test_scores

{'auc': [0.5309183673469388, 0.5238010204081633],
 'accuracy': [0.4857142857142857, 0.5214285714285715],
 'recall': [0.4857142857142857, 0.5214285714285715],
 'precision': [0.39629629629629626, 0.612781954887218],
 'f1': [0.344390243902439, 0.6699507389162561]}

### Average results

In [16]:
sum(test_scores['auc'])/2

0.5273596938775511

In [17]:
sum(test_scores['accuracy'])/2

0.5035714285714286

In [18]:
sum(test_scores['recall'])/2

0.5035714285714286

In [19]:
sum(test_scores['precision'])/2

0.5045391255917571

In [20]:
sum(test_scores['f1'])/2

0.5071704914093476