# Binary Classification with Tweet Embeddings

In [3]:
# import modules
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold

## Load data

In [4]:
# Load tweets
tweets = pd.read_excel('COVID19_Dataset-CM-ZB-complete with sources.xlsx')
tweets

Unnamed: 0,Is_Unreliable,Category,Tweet
0,1,"1, 3, 6, 9",We are living in scary times in Canada. Gov’t ...
1,1,"1, 6, 8, 9","Just as bad in Canada. In fact, our government..."
2,1,"1, 4, 9",It was only a matter of time before the mainst...
3,1,"6, 8",Russia's taking no chances: Foreigners infecte...
4,1,"6, 8, 9",Although there is now a presumptive confirmed ...
...,...,...,...
555,0,,BREAKING: Harvard classes will move online sta...
556,0,,Singularity University is hosting a FREE Virtu...
557,0,,Coronavirus: how does it spread and what are t...
558,0,,Stanford just cancelled classes for the rest o...


In [5]:
# Load tweet embeddings
fname = 'tweet_embed_{}.npy'
fname_A = fname.format('A')
tweet_embeddings_A = np.load(fname_A)
tweet_embeddings_BERT = np.load('bert_embeddings.npy')

In [6]:
# list of embeddings to iterate over
#embeddings = [tweet_embeddings_A, tweet_embeddings_S, tweet_embeddings_BERT]

# target y
target = np.array(tweets['Is_Unreliable'])

## Binary classification: five-fold CV

In [7]:
# SVC hyperparams to optimize
kernel = ['rbf', 'linear', 'poly', 'sigmoid']
C = [0.001, 0.01, 0.1, 1, 10]

In [8]:
tune_num = int(tweets.shape[0]/5)
tune_num

112

In [9]:
# Compute the folds
num_folds = 5
kf = KFold(n_splits = num_folds, shuffle = True, random_state = 1)
splits = kf.split(tweet_embeddings_BERT) # use any set of embeddings to get train/test indices splits

training_sets = []
testing_sets = []
for train_idx, test_idx in splits:
    training_sets.append(train_idx)
    testing_sets.append(test_idx)

# Construct tuning sets from training sets (20% of data ~ 1 fold) &
# write over training sets (60% of data ~ 3 folds)
tuning_sets = []
for i in range(len(training_sets)):
    train_set = training_sets[i]
    np.random.seed(i)
    tune_idx = np.random.choice(train_set,
                                size = tune_num,
                                replace = False)
    tuning_sets.append(tune_idx)
    new_train_set = train_set[~np.in1d(train_set, tune_idx)]
    training_sets[i] = new_train_set

In [10]:
# Define function to create results dictionary
#
# Inputs: numpy array
# Outputs: dictionary containing model performance stats

def get_results(np_array):
    # Initialize list/dict to store all model stats
    performance = dict()

    # Loop over folds
    for i in range(num_folds):
        key1 = 'Fold {}'.format(i+1) # key for the performance dict

        train_idx = training_sets[i]
        test_idx = testing_sets[i]
        tune_idx = tuning_sets[i]

        y_train = target[train_idx]
        y_test = target[test_idx]
        y_tune = target[tune_idx]

        X_train = np_array[train_idx]
        X_test = np_array[test_idx]
        X_tune = np_array[tune_idx]

        # Training & tuning
        models = [] # store list of models in order to retrieve optimal model
        tune_auc = [] # tune based on AUC
        model_dict = dict() # to store model params & performance metric values

        for ker in kernel:
            for el in C:
                # Training
                svc = SVC(C = el, kernel = ker, probability = True)
                svc.fit(X_train, y_train)
                models.append(svc)

                # Tuning
                tune_predict_proba = svc.predict_proba(X_tune)[:,1] # check on this subscripting
                auc = roc_auc_score(y_tune, tune_predict_proba)
                tune_auc.append(auc)

        # Get optimal model based on hyperparameter tuning
        opt_model = models[tune_auc.index(max(tune_auc))] # tune based on AUC
        opt_model_params = opt_model.get_params()
        model_dict['params'] = opt_model_params # store optimal values for model hyperparameters

        # Save training scores
        train_scores = dict() # to store all training scores
        train_predict = opt_model.predict(X_train)
        train_predict_proba = opt_model.predict_proba(X_train)[:,1] # check on this subscripting
        train_scores['auc'] = roc_auc_score(y_train, train_predict_proba)
        train_scores['accuracy'] = accuracy_score(y_train, train_predict)
        train_scores['recall_macro'] = recall_score(y_train, train_predict, average = 'macro')
        train_scores['precision_macro'] = precision_score(y_train, train_predict, average = 'macro')
        train_scores['f1_macro'] = f1_score(y_train, train_predict, average = 'macro')

        # Save training scores dictionary to model dictionary
        model_dict['training'] = train_scores

        # Save tuning scores
        tune_scores = dict() # to store all tuning scores
        tune_predict = opt_model.predict(X_tune)
        tune_predict_proba = opt_model.predict_proba(X_tune)[:,1] # check on this subscripting
        tune_scores['auc'] = roc_auc_score(y_tune, tune_predict_proba)
        tune_scores['accuracy'] = accuracy_score(y_tune, tune_predict)
        tune_scores['recall_macro'] = recall_score(y_tune, tune_predict, average = 'macro')
        tune_scores['precision_macro'] = precision_score(y_tune, tune_predict, average = 'macro')
        tune_scores['f1_macro'] = f1_score(y_tune, tune_predict, average = 'macro')

        # Save tuning scores dictionary to model dictionary
        model_dict['tuning'] = tune_scores

        # Testing
        test_scores = dict() # to store all testing scores
        test_predict = opt_model.predict(X_test)
        test_predict_proba = opt_model.predict_proba(X_test)[:,1]
        test_scores['auc'] = roc_auc_score(y_test, test_predict_proba)
        test_scores['accuracy'] = accuracy_score(y_test, test_predict)
        test_scores['recall_macro'] = recall_score(y_test, test_predict, average = 'macro')
        test_scores['precision_macro'] = precision_score(y_test, test_predict, average = 'macro')
        test_scores['f1_macro'] = f1_score(y_test, test_predict, average = 'macro')

        # Save test scores dictionary to model dictionary
        model_dict['testing'] = test_scores

        # Save model dictionary to overall dictionary
        performance[key1] = model_dict
    
    return performance

In [11]:
# get results for BERT embeddings
BERT_results = get_results(tweet_embeddings_BERT)

In [12]:
# get results for A embeddings
A_results = get_results(tweet_embeddings_A)

In [13]:
A_results

{'Fold 1': {'params': {'C': 1,
   'break_ties': False,
   'cache_size': 200,
   'class_weight': None,
   'coef0': 0.0,
   'decision_function_shape': 'ovr',
   'degree': 3,
   'gamma': 'scale',
   'kernel': 'rbf',
   'max_iter': -1,
   'probability': True,
   'random_state': None,
   'shrinking': True,
   'tol': 0.001,
   'verbose': False},
  'training': {'auc': 0.9983338060124787,
   'accuracy': 0.9821428571428571,
   'recall_macro': 0.9824163357912649,
   'precision_macro': 0.9821428571428572,
   'f1_macro': 0.982140326009922},
  'tuning': {'auc': 0.9115384615384614,
   'accuracy': 0.8392857142857143,
   'recall_macro': 0.8294871794871794,
   'precision_macro': 0.8655761024182076,
   'f1_macro': 0.8328358208955223},
  'testing': {'auc': 0.8998724489795918,
   'accuracy': 0.8303571428571429,
   'recall_macro': 0.8303571428571428,
   'precision_macro': 0.833011893281903,
   'f1_macro': 0.830018372074447}},
 'Fold 2': {'params': {'C': 1,
   'break_ties': False,
   'cache_size': 200,
   '

In [14]:
# Define function to create results df from nested dictionary
def create_df(input_dict):
    df = pd.DataFrame(input_dict)
    df = df.transpose()
    
    df_params = df['params'].apply(pd.Series)
    
    df_training = df['training'].apply(pd.Series)
    df_training.columns = ['train_' + str(col) for col in df_training.columns]
    
    df_tuning = df['tuning'].apply(pd.Series)
    df_tuning.columns = ['tune_' + str(col) for col in df_tuning.columns]
    
    df_testing = df['testing'].apply(pd.Series)
    df_testing.columns = ['test_' + str(col) for col in df_testing.columns]
    
    final_df = pd.concat([df_training, df_tuning, df_testing, df_params], axis = 1).reset_index()
    final_df = final_df.rename({'index': 'fold_num'}, axis = 1)
    
    return final_df

In [15]:
# Define function to get means for test results from dataframe of full results
def get_test_means(df):
    filter_cols = [col for col in df if col.startswith('test_')]
    df_test = df[filter_cols]
    df_test_mean = pd.DataFrame(df_test.mean(axis = 0)).transpose()
    
    return df_test_mean

In [16]:
# Save BERT embedding results
BERT_full = create_df(BERT_results)
BERT_full.to_csv('BERT_svm_full_results.csv')

BERT_test_mean = get_test_means(BERT_full)
BERT_test_mean.to_csv('BERT_svm_testmean_results.csv')

In [17]:
# Save A embedding results
A_full = create_df(A_results)
A_full.to_csv('A_svm_full_results.csv')

A_test_mean = get_test_means(A_full)
A_test_mean.to_csv('A_svm_testmean_results.csv')