## Set up

Set up libraries and load data.

In [1]:
import numpy as np
import pandas as pd
import scipy.sparse
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support
import pickle

In [2]:
# load count vectors (stored as scipy sparse matrices in .npz files)
train_vectors = scipy.sparse.load_npz('../data/derived/tweets_supervised_train_vectors_count.npz')
dev_vectors   = scipy.sparse.load_npz('../data/derived/tweets_supervised_dev_vectors_count.npz')

# load tweet dataframes
train_df = pd.read_csv('../data/derived/tweets_supervised_train.csv')
dev_df   = pd.read_csv('../data/derived/tweets_supervised_dev.csv')

Confirm data size and format are as expected.

In [3]:
# report the number of records in each tweet dataframe (train first, then dev)
for tweets in (train_df, dev_df):
    n_records = len(tweets)
    print(f'{n_records} records.')

# report the shape and container type of each matrix of count vectors
for matrix in (train_vectors, dev_vectors):
    print(f'{matrix.shape}    {type(matrix)}')

6497 records.
1393 records.
(6497, 14740)    <class 'scipy.sparse.csr.csr_matrix'>
(1393, 14740)    <class 'scipy.sparse.csr.csr_matrix'>


Create random seed to use as needed.

In [4]:
# global random seed; handed to train_decision_tree, which uses it as the
# classifier's random_state
random_seed = 466

## Train decision tree classifier

In [5]:
def train_decision_tree(features, labels, random_seed):
    """
    Fit a decision tree classifier on the given features and labels

    The classifier keeps its default arguments, except that random_state is set
    to random_seed. The fitted classifier is returned.
    """

    # default arguments throughout; only the random state is pinned
    classifier = DecisionTreeClassifier(random_state = random_seed)

    # fit on the supplied training data
    classifier.fit(X = features, y = labels)

    return classifier

In [6]:
# fit the decision tree on the training vectors and their labels
decision_tree = train_decision_tree(
    features = train_vectors,
    labels = train_df['label'],
    random_seed = random_seed,
)

In [7]:
# save decision tree; the with-block closes the file handle as soon as the dump
# finishes, so the pickle is fully flushed to disk before later cells run
with open('../data/derived/decision_tree.pkl', 'wb') as model_file:
    pickle.dump(decision_tree, model_file)

## Generate and evaluate predictions with decision tree classifier

In [8]:
def predict_decision_tree(decision_tree, features):
    """
    Return the predictions of a trained decision tree classifier for the given features
    """

    # delegate straight to the classifier's own predict method
    return decision_tree.predict(X = features)

def save_predictions(tweet_ids, labels, predictions, filepath_out):
    """
    Save record IDs, labels and predicted labels side by side in a CSV file
    """

    # one column per input, in the order tweet_id, label, prediction
    columns = {'tweet_id':tweet_ids, 'label':labels, 'prediction':predictions}

    # build the table and write it to file without the index column
    pd.DataFrame(data=columns).to_csv(filepath_out, index=False)

def evaluate_predictions(labels, predictions, filepath_out):
    """
    Calculate confusion matrix counts, accuracy, precision, recall and F1 score and write to CSV

    Precision, recall and F1 score are each reported with macro, micro and binary averaging.

    Parameters:
        labels: true labels
        predictions: predicted labels, in the same order as labels
        filepath_out: path of the CSV file the metrics are written to

    Returns:
        dataframe with one row per metric, in columns 'metric' and 'value'

    Note: the confusion matrix is unpacked into four counts, so labels and predictions
    together must contain exactly two classes; otherwise the unpacking raises ValueError.
    """

    # initialize empty dictionary to store metrics (insertion order sets the CSV row order)
    metrics = dict()

    # calculate and store confusion matrix
    # for two classes the flattened matrix is ordered tn, fp, fn, tp
    tn, fp, fn, tp = confusion_matrix(y_true = labels, y_pred = predictions).ravel()
    metrics['true_positives']  = tp
    metrics['false_positives'] = fp
    metrics['true_negatives']  = tn
    metrics['false_negatives'] = fn

    # calculate and store accuracy: share of all records that were predicted correctly
    metrics['accuracy'] = (tp + tn) / (tp + tn + fp + fn)

    # calculate and store precision, recall and F1 score for each averaging mode;
    # the fourth return value (support) is not used
    for average in ('macro', 'micro', 'binary'):
        precision, recall, f1, _ = precision_recall_fscore_support(y_true = labels,
                                                                   y_pred = predictions,
                                                                   average = average)
        metrics[f'{average}_recall']    = recall
        metrics[f'{average}_precision'] = precision
        metrics[f'{average}_f1']        = f1

    # create dataframe and write to CSV
    metrics_df = pd.DataFrame(data=list(metrics.items()), columns=['metric','value'])
    metrics_df.to_csv(path_or_buf=filepath_out, index=False)

    return metrics_df

In [9]:
# output locations for the per-split predictions and performance metrics
train_predictions_path = '../data/derived/tweets_supervised_train_predictions_decision_tree.csv'
dev_predictions_path   = '../data/derived/tweets_supervised_dev_predictions_decision_tree.csv'
train_metrics_path     = '../data/derived/performance_train_decision_tree.csv'
dev_metrics_path       = '../data/derived/performance_dev_decision_tree.csv'

# predict labels for both splits with the trained tree
train_predictions = predict_decision_tree(decision_tree = decision_tree, features = train_vectors)
dev_predictions   = predict_decision_tree(decision_tree = decision_tree, features = dev_vectors)

# store IDs, labels and predictions for each split
save_predictions(
    tweet_ids = train_df['tweet_id'],
    labels = train_df['label'],
    predictions = train_predictions,
    filepath_out = train_predictions_path,
)
save_predictions(
    tweet_ids = dev_df['tweet_id'],
    labels = dev_df['label'],
    predictions = dev_predictions,
    filepath_out = dev_predictions_path,
)

# compute the metrics for each split; each call also writes them to CSV
train_metrics = evaluate_predictions(
    labels = train_df['label'],
    predictions = train_predictions,
    filepath_out = train_metrics_path,
)
dev_metrics = evaluate_predictions(
    labels = dev_df['label'],
    predictions = dev_predictions,
    filepath_out = dev_metrics_path,
)

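Check out evaluation metrics for training and development datasets. The tree fits the training data perfectly (accuracy 1.0) but reaches only about 0.857 accuracy on the development data, which indicates that the default-parameter tree is overfitting.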

In [10]:
# display the metrics computed on the training data
train_metrics

Unnamed: 0,metric,value
0,true_positives,2685.0
1,false_positives,0.0
2,true_negatives,3812.0
3,false_negatives,0.0
4,accuracy,1.0
5,macro_recall,1.0
6,macro_precision,1.0
7,macro_f1,1.0
8,micro_recall,1.0
9,micro_precision,1.0


In [11]:
# display the metrics computed on the development data
dev_metrics

Unnamed: 0,metric,value
0,true_positives,463.0
1,false_positives,86.0
2,true_negatives,731.0
3,false_negatives,113.0
4,accuracy,0.857143
5,macro_recall,0.849278
6,macro_precision,0.854733
7,macro_f1,0.851652
8,micro_recall,0.857143
9,micro_precision,0.857143
