## Set up

Set up libraries and load data.

In [1]:
import numpy as np
import pandas as pd
import scipy.sparse
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support
import pickle

In [2]:
# tweet dataframes for the train and dev splits
train_df = pd.read_csv('../data/derived/tweets_supervised_train.csv')
dev_df   = pd.read_csv('../data/derived/tweets_supervised_dev.csv')

# count vectors for the same two splits, stored as scipy sparse .npz files
train_vectors = scipy.sparse.load_npz('../data/derived/tweets_supervised_train_vectors_count.npz')
dev_vectors   = scipy.sparse.load_npz('../data/derived/tweets_supervised_dev_vectors_count.npz')

Confirm that the data size and format are as expected.

In [3]:
# report how many records each dataframe holds (train first, then dev)
for df in (train_df, dev_df):
    print(f'{len(df)} records.')

# report the dimensions and container type of each vector matrix
for vectors in (train_vectors, dev_vectors):
    print(f'{vectors.shape}    {type(vectors)}')

6497 records.
1393 records.
(6497, 14740)    <class 'scipy.sparse.csr.csr_matrix'>
(1393, 14740)    <class 'scipy.sparse.csr.csr_matrix'>


Define a single random seed to reuse wherever a model needs one, so that results are reproducible.

In [4]:
# global random seed; passed to classifiers as their random_state so that
# repeated runs of the notebook produce the same models
random_seed = 466

## Train decision tree classifier

In [11]:
def train_decision_tree(vectors, df, max_depth, random_seed):
    """
    Train a decision tree classifier and pickle the fitted model to disk.

    Parameters
    ----------
    vectors : feature matrix passed to ``fit`` as ``X``
    df : dataframe whose 'label' column supplies the target labels
        (assumed to be row-aligned with ``vectors`` -- confirm against caller)
    max_depth : forwarded to ``DecisionTreeClassifier(max_depth=...)``;
        also embedded in the output filename
    random_seed : forwarded to the classifier as ``random_state``

    Returns
    -------
    The fitted ``DecisionTreeClassifier``.

    Side effects
    ------------
    Writes ``../data/derived/model_decisiontree_maxdepth{max_depth}.pkl``.
    """

    # initialize classifier with the requested depth limit and random seed
    decision_tree = DecisionTreeClassifier(max_depth=max_depth, random_state=random_seed)

    # train classifier
    decision_tree.fit(X=vectors, y=df['label'])

    # write to file; the context manager guarantees the handle is closed
    # (and the data flushed) even if pickling raises
    filepath_out = f'../data/derived/model_decisiontree_maxdepth{max_depth}.pkl'
    with open(filepath_out, 'wb') as file_out:
        pickle.dump(decision_tree, file_out)

    return decision_tree

In [12]:
# fit a decision tree on the training split with no depth limit
decision_tree = train_decision_tree(
    vectors=train_vectors,
    df=train_df,
    max_depth=None,
    random_seed=random_seed,
)

## Predict with decision tree classifier

In [19]:
def predict_decision_tree(decision_tree, vectors, df, dataset_name):
    """
    Generate predictions with a trained decision tree classifier and save them as CSV.

    Parameters
    ----------
    decision_tree : fitted classifier; its ``predict`` method and ``max_depth``
        attribute are used
    vectors : feature matrix passed to ``predict`` as ``X``
    df : dataframe providing the 'tweet_id' and 'label' columns for the output
        (assumed to be row-aligned with ``vectors`` -- confirm against caller)
    dataset_name : tag embedded in the output filename (e.g. 'train', 'dev')

    Returns
    -------
    The predictions exactly as returned by ``decision_tree.predict``.

    Side effects
    ------------
    Writes a CSV with columns tweet_id, label, prediction to
    ``../data/derived/predictions_decisiontree_maxdepth{max_depth}_{dataset_name}.csv``.
    """
    # generate predictions with classifier
    predictions = decision_tree.predict(X=vectors)

    # create dataframe with record IDs, labels, and predicted labels
    # (bound to its own name so the input ``df`` parameter is not shadowed)
    predictions_df = pd.DataFrame(
        data={'tweet_id': df['tweet_id'], 'label': df['label'], 'prediction': predictions}
    )

    # write dataframe to file; the content is CSV, so the extension is .csv
    # (previously the CSV was written under a misleading .pkl extension)
    filepath_out = (
        f'../data/derived/predictions_decisiontree_maxdepth{decision_tree.max_depth}_{dataset_name}.csv'
    )
    predictions_df.to_csv(filepath_out, index=False)

    return predictions

In [20]:
# run the fitted tree over both splits; each call also saves its predictions to disk
predictions_train = predict_decision_tree(
    decision_tree=decision_tree,
    vectors=train_vectors,
    df=train_df,
    dataset_name='train',
)
predictions_dev = predict_decision_tree(
    decision_tree=decision_tree,
    vectors=dev_vectors,
    df=dev_df,
    dataset_name='dev',
)

## Evaluate predictions

In [22]:
def evaluate_predictions(labels, predictions, model_name, max_depth, dataset_name):
    """
    Score predictions against true labels, save the metrics to CSV, and return them.

    Metrics, in output row order: confusion-matrix counts (true/false
    positives and negatives), accuracy, then recall, precision and F1 under
    macro, micro and binary averaging.

    The confusion matrix is unpacked into exactly four cells, so this expects
    a two-class problem.

    Returns a two-column dataframe ('metric', 'value') and writes the same
    table to ``../data/derived/performance_{model_name}_{max_depth}_{dataset_name}.csv``.
    """
    # flatten the 2x2 confusion matrix into its four cells
    true_neg, false_pos, false_neg, true_pos = confusion_matrix(
        y_true=labels, y_pred=predictions
    ).ravel()

    # counts first, then accuracy as the share of correct predictions
    metrics = {
        'true_positives': true_pos,
        'false_positives': false_pos,
        'true_negatives': true_neg,
        'false_negatives': false_neg,
        'accuracy': (true_pos + true_neg) / (true_pos + true_neg + false_pos + false_neg),
    }

    # same three scores under each averaging scheme; recall is stored ahead
    # of precision so the row order of the output table stays unchanged
    for averaging in ('macro', 'micro', 'binary'):
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true=labels, y_pred=predictions, average=averaging
        )
        metrics[f'{averaging}_recall'] = recall
        metrics[f'{averaging}_precision'] = precision
        metrics[f'{averaging}_f1'] = f1

    # one row per metric
    metrics_df = pd.DataFrame(data=list(metrics.items()), columns=['metric', 'value'])

    # persist alongside the other derived data
    filepath_out = f'../data/derived/performance_{model_name}_{max_depth}_{dataset_name}.csv'
    metrics_df.to_csv(path_or_buf=filepath_out, index=False)

    return metrics_df

In [23]:
# score the decision tree on both splits; each call also writes its metrics to CSV
metrics_train = evaluate_predictions(
    labels=train_df['label'],
    predictions=predictions_train,
    model_name='decisiontree',
    max_depth=None,
    dataset_name='train',
)
metrics_dev = evaluate_predictions(
    labels=dev_df['label'],
    predictions=predictions_dev,
    model_name='decisiontree',
    max_depth=None,
    dataset_name='dev',
)