## Set up

In [1]:
import numpy as np
import pandas as pd
import scipy.sparse
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support
import pickle

In [2]:
# notebook-wide seed, handed to whatever needs a random state further down
random_seed: int = 466

## Load data

In [3]:
def load_tweet_df(dataset_name):
    """
    Read the supervised tweet CSV for one dataset split into a dataframe.

    The file is looked up at ../data/derived/tweets_supervised_<dataset_name>.csv
    and parsed with pandas' default CSV settings.
    """
    return pd.read_csv('../data/derived/tweets_supervised_{}.csv'.format(dataset_name))

def load_tweet_vectors(vector_name, dataset_name):
    """
    Read the stored feature vectors for one vector type / dataset split.

    Vector names containing 'lsi' are read from a comma-delimited CSV with
    np.loadtxt; every other name is read from an NPZ file with
    scipy.sparse.load_npz. Both live under ../data/derived/vectors/.
    """
    # both storage formats share everything except the file extension
    path_stem = '../data/derived/vectors/vector{}_{}'.format(vector_name, dataset_name)

    # non-LSI vectors were saved as sparse arrays
    if 'lsi' not in vector_name:
        return scipy.sparse.load_npz(path_stem + '.npz')

    # LSI vectors were saved as a plain array in a CSV file
    return np.loadtxt(path_stem + '.csv', delimiter=',')

## Train classifier

In [4]:
def train_multilayer_perceptron(vectors, df, alpha, random_seed, vector_name):
    """
    Train multilayer perceptron with provided features, labels, alpha and random seed

    The classifier is fit on `vectors` against df['label'], pickled to
    ../data/derived/models/mlp_vector<vector_name>_alpha<alpha>.pkl, and returned.
    """

    # only alpha and random_state are set; every other MLPClassifier argument keeps its default
    multilayer_perceptron = MLPClassifier(alpha = alpha, random_state = random_seed)

    # train classifier
    multilayer_perceptron.fit(X = vectors, y = df['label'])

    # write to file; the context manager flushes and closes the handle even if
    # pickling raises, instead of leaving that to garbage collection
    filepath_out = f'../data/derived/models/mlp_vector{vector_name}_alpha{alpha}.pkl'
    with open(filepath_out, 'wb') as file_out:
        pickle.dump(multilayer_perceptron, file_out)

    return multilayer_perceptron

## Make predictions

In [5]:
def predict_multilayer_perceptron(multilayer_perceptron, vectors, df, dataset_name, vector_name):
    """
    Predict labels for `vectors` with a fitted classifier and save them beside the true labels.

    A CSV with tweet_id, label and prediction columns is written under
    ../data/derived/predictions/; the file name encodes the vector name, the
    classifier's alpha and the dataset name. The predictions are returned.
    """
    predictions = multilayer_perceptron.predict(X = vectors)

    # one row per record: record ID, true label, predicted label
    output_columns = {
        'tweet_id': df['tweet_id'],
        'label': df['label'],
        'prediction': predictions,
    }
    predictions_df = pd.DataFrame(data=output_columns)

    # save without the index column
    predictions_df.to_csv(
        f'../data/derived/predictions/mlp_vector{vector_name}_alpha{multilayer_perceptron.alpha}_{dataset_name}.csv',
        index=False,
    )

    return predictions

## Evaluate predictions

In [6]:
def evaluate_multilayer_perceptron(labels, predictions, alpha, dataset_name, vector_name):
    """
    Score predictions against the true labels and save the scores as a two-column CSV.

    Stored metrics: the four confusion-matrix cells, accuracy, and precision /
    recall / F1 under macro, micro and binary averaging. The CSV is written under
    ../data/derived/performance/ and the dataframe ('metric', 'value') is returned.
    """
    # unpacking into exactly four cells means this only works for a two-class problem
    tn, fp, fn, tp = confusion_matrix(y_true = labels, y_pred = predictions).ravel()

    # confusion-matrix cells plus accuracy derived from them
    metrics = {
        'true_positives':  tp,
        'false_positives': fp,
        'true_negatives':  tn,
        'false_negatives': fn,
        'accuracy':        (tp + tn) / (tp + tn + fp + fn),
    }

    # precision, recall and F1 once per averaging scheme (support is discarded)
    for averaging in ('macro', 'micro', 'binary'):
        precision, recall, f1, _ = precision_recall_fscore_support(y_true = labels,
                                                                   y_pred = predictions,
                                                                   average = averaging)
        metrics[f'{averaging}_recall']    = recall
        metrics[f'{averaging}_precision'] = precision
        metrics[f'{averaging}_f1']        = f1

    # long format: one row per metric
    metrics_df = pd.DataFrame(data=list(metrics.items()), columns=['metric','value'])

    metrics_df.to_csv(
        path_or_buf = f'../data/derived/performance/mlp_vector{vector_name}_alpha{alpha}_{dataset_name}.csv',
        index = False,
    )

    return metrics_df

## Train, predict and evaluate in one function

In [7]:
def multilayer_perceptron(vector_name, alpha, random_seed):
    """
    Run the whole pipeline for one vector type and one alpha.

    Loads the train and dev dataframes and vectors, fits the classifier on the
    train split, then predicts and evaluates on both splits. The helpers called
    here write the model, predictions and metrics to disk; nothing is returned.
    """
    splits = ('train', 'dev')

    # labelled tweets and their vectors, keyed by split
    frames  = {split: load_tweet_df(split) for split in splits}
    vectors = {split: load_tweet_vectors(vector_name, split) for split in splits}

    # fit on the train split only
    classifier = train_multilayer_perceptron(vectors['train'], frames['train'], alpha, random_seed, vector_name)

    # predict for every split before any evaluation starts
    predictions = {
        split: predict_multilayer_perceptron(classifier, vectors[split], frames[split], split, vector_name)
        for split in splits
    }

    # evaluate each split; the metrics are written to CSV by the helper
    for split in splits:
        evaluate_multilayer_perceptron(frames[split]['label'], predictions[split], alpha, split, vector_name)

    return None

## Fit models with multiple vectors and regularization parameters

In [8]:
# regularization parameters (alpha) to sweep over
alpha_list = [1e-4, 1e-2, 1, 10]

# vector names to sweep over: count, tfidf, then the LSI variants
vector_name_list = ['count', 'tfidf'] + [f'lsi{suffix}' for suffix in (5, 10, 50, 100)]

In [9]:
# visit every (alpha, vector name) pair; alpha changes slowest
for alpha, vector_name in [(a, v) for a in alpha_list for v in vector_name_list]:

    # train, predict and evaluate for this pair, then report progress
    multilayer_perceptron(vector_name, alpha, random_seed)
    print(f'Multilayer perceptron with alpha {alpha} and vector {vector_name} complete.')

Multilayer perceptron with alpha 0.0001 and vector count complete.
Multilayer perceptron with alpha 0.0001 and vector tfidf complete.
Multilayer perceptron with alpha 0.0001 and vector lsi5 complete.
Multilayer perceptron with alpha 0.0001 and vector lsi10 complete.
Multilayer perceptron with alpha 0.0001 and vector lsi50 complete.
Multilayer perceptron with alpha 0.0001 and vector lsi100 complete.
Multilayer perceptron with alpha 0.01 and vector count complete.
Multilayer perceptron with alpha 0.01 and vector tfidf complete.
Multilayer perceptron with alpha 0.01 and vector lsi5 complete.
Multilayer perceptron with alpha 0.01 and vector lsi10 complete.
Multilayer perceptron with alpha 0.01 and vector lsi50 complete.
Multilayer perceptron with alpha 0.01 and vector lsi100 complete.
Multilayer perceptron with alpha 1 and vector count complete.
Multilayer perceptron with alpha 1 and vector tfidf complete.
Multilayer perceptron with alpha 1 and vector lsi5 complete.
Multilayer perceptron w