## Set up

Set up libraries and load data.

In [1]:
import numpy as np
import pandas as pd
import scipy.sparse
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support
import pickle

In [None]:
def load_tweet_df(dataset_name):
    """
    Read the supervised tweet CSV for one dataset split into a dataframe.

    The file is expected at ../data/derived/tweets_supervised_<dataset_name>.csv,
    relative to the current working directory.
    """
    return pd.read_csv(f'../data/derived/tweets_supervised_{dataset_name}.csv')

def load_tweet_vectors(vector_name, dataset_name):
    """
    Read the saved feature vectors for one vector type and dataset split.

    Vector names containing 'lsi' are read with np.loadtxt from a
    comma-delimited CSV file; every other vector name is read with
    scipy.sparse.load_npz from an NPZ file. Both live under
    ../data/derived/vectors/ relative to the current working directory.
    """

    # anything that is not LSI was saved as a sparse array in an NPZ file
    if 'lsi' not in vector_name:
        return scipy.sparse.load_npz(f'../data/derived/vectors/vector{vector_name}_{dataset_name}.npz')

    # LSI vectors were saved as a plain array in a CSV file
    return np.loadtxt(f'../data/derived/vectors/vector{vector_name}_{dataset_name}.csv', delimiter=',')

In [2]:
# tweet dataframes for the train and dev splits
train_df, dev_df = (load_tweet_df(split) for split in ('train', 'dev'))

# count vectors for the train and dev splits
train_vector_count, dev_vector_count = (
    load_tweet_vectors('count', split) for split in ('train', 'dev')
)

# TF-IDF vectors for the train and dev splits
train_vector_tfidf, dev_vector_tfidf = (
    load_tweet_vectors('tfidf', split) for split in ('train', 'dev')
)

Confirm that the data size and format are as expected.

In [3]:
# report the number of records in each dataframe
for df in (train_df, dev_df):
    print(f'{len(df)} records.')

# report the shape and container type of each set of vectors
for vectors in (train_vector_count, train_vector_tfidf, dev_vector_count, dev_vector_tfidf):
    print(f'{vectors.shape}    {type(vectors)}')

6497 records.
1393 records.
(6497, 14740)    <class 'scipy.sparse.csr.csr_matrix'>
(6497, 14740)    <class 'scipy.sparse.csr.csr_matrix'>
(1393, 14740)    <class 'scipy.sparse.csr.csr_matrix'>
(1393, 14740)    <class 'scipy.sparse.csr.csr_matrix'>


Create random seed to use as needed.

In [4]:
# global random seed; passed as the random_seed argument when fitting the
# decision trees below, where it becomes the classifier's random_state
random_seed = 466

## Train classifier

In [5]:
def train_decision_tree(vectors, df, max_depth, random_seed, vector_name):
    """
    Train a decision tree classifier and pickle the fitted model to disk.

    Parameters:
        vectors:     feature matrix passed to the classifier as X
        df:          dataframe whose 'label' column supplies the targets
        max_depth:   maximum tree depth handed to DecisionTreeClassifier
        random_seed: value used as the classifier's random_state
        vector_name: name of the vector type; only used in the output filename

    Returns:
        The fitted DecisionTreeClassifier.

    The model is written to
    ../data/derived/models/decisiontree_vector<vector_name>_maxdepth<max_depth>.pkl
    """

    # initialize classifier with the requested depth limit and seed
    # (all other arguments keep their defaults)
    decision_tree = DecisionTreeClassifier(max_depth = max_depth, random_state = random_seed)

    # train classifier
    decision_tree.fit(X = vectors, y = df['label'])

    # write to file; the context manager guarantees the handle is flushed and
    # closed even if pickling fails part-way through
    filepath_out = f'../data/derived/models/decisiontree_vector{vector_name}_maxdepth{max_depth}.pkl'
    with open(filepath_out, 'wb') as file_out:
        pickle.dump(decision_tree, file_out)

    return decision_tree

## Make predictions

In [6]:
def predict_decision_tree(decision_tree, vectors, df, dataset_name, vector_name):
    """
    Predict labels with a trained decision tree and save them next to the true labels.

    A table with the columns tweet_id, label and prediction is written in CSV
    format; the predictions themselves are returned.
    """
    # run the classifier on the supplied feature vectors
    predictions = decision_tree.predict(X = vectors)

    # NOTE(review): the output is CSV text but the filename ends in '.pkl';
    # left unchanged because other code may read this exact path -- confirm
    filepath_out = f'../data/derived/predictions/decisiontree_vector{vector_name}_maxdepth{decision_tree.max_depth}_{dataset_name}.pkl'

    # pair each record ID and true label with its predicted label, then save
    predictions_df = pd.DataFrame(
        data={'tweet_id':df['tweet_id'], 'label':df['label'], 'prediction':predictions}
    )
    predictions_df.to_csv(filepath_out, index=False)

    return predictions

## Evaluate predictions

In [7]:
def evaluate_decision_tree(labels, predictions, max_depth, dataset_name, vector_name):
    """
    Score predictions against the true labels and save the metrics as a CSV.

    The metrics are the four confusion matrix counts, accuracy, and the
    macro, micro and binary averaged recall, precision and F1 score. They
    are returned as a dataframe with one row per metric and the columns
    'metric' and 'value'. max_depth, dataset_name and vector_name are only
    used to build the output filename.
    """

    # confusion matrix counts; unpacking four values relies on the matrix being 2x2
    tn, fp, fn, tp = confusion_matrix(y_true = labels, y_pred = predictions).ravel()

    # macro, micro and binary averaged scores (the support value is discarded)
    macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(
        y_true = labels, y_pred = predictions, average='macro')
    micro_precision, micro_recall, micro_f1, _ = precision_recall_fscore_support(
        y_true = labels, y_pred = predictions, average='micro')
    binary_precision, binary_recall, binary_f1, _ = precision_recall_fscore_support(
        y_true = labels, y_pred = predictions, average='binary')

    # one (metric, value) pair per row, in the order they appear in the CSV
    metric_rows = [
        ('true_positives',   tp),
        ('false_positives',  fp),
        ('true_negatives',   tn),
        ('false_negatives',  fn),
        ('accuracy',         (tp + tn) / (tp + tn + fp + fn)),
        ('macro_recall',     macro_recall),
        ('macro_precision',  macro_precision),
        ('macro_f1',         macro_f1),
        ('micro_recall',     micro_recall),
        ('micro_precision',  micro_precision),
        ('micro_f1',         micro_f1),
        ('binary_recall',    binary_recall),
        ('binary_precision', binary_precision),
        ('binary_f1',        binary_f1),
    ]
    metrics_df = pd.DataFrame(data=metric_rows, columns=['metric','value'])

    # save the table alongside the other performance files
    filepath_out = f'../data/derived/performance/decisiontree_vector{vector_name}_maxdepth{max_depth}_{dataset_name}.csv'
    metrics_df.to_csv(path_or_buf = filepath_out, index = False)

    return metrics_df

## Train, predict and evaluate in one function

In [8]:
def decision_tree(train_vectors, train_df, dev_vectors, dev_df, max_depth, random_seed, vector_name):
    """
    Train a decision tree, then predict and evaluate on the train and dev sets.

    Parameters:
        train_vectors: feature vectors used to fit the tree and scored as 'train'
        train_df:      dataframe for the training set (its 'label' column is
                       used for evaluation)
        dev_vectors:   feature vectors scored as 'dev'
        dev_df:        dataframe for the dev set (its 'label' column is used
                       for evaluation)
        max_depth:     maximum tree depth
        random_seed:   random seed passed on to training
        vector_name:   name of the vector type, passed on for output filenames

    Returns:
        Tuple (metrics_train, metrics_dev) holding the metrics dataframes
        produced by evaluate_decision_tree for each set.

    The model, predictions and metrics files are written by the helper
    functions called here.
    """

    # train decision tree (named `model` so the local does not shadow this function)
    model = train_decision_tree(train_vectors, train_df, max_depth, random_seed, vector_name)

    # generate predictions for decision tree
    predictions_train = predict_decision_tree(model, train_vectors, train_df, 'train', vector_name)
    predictions_dev   = predict_decision_tree(model, dev_vectors,   dev_df,   'dev',   vector_name)

    # evaluate predictions for decision tree and write to CSV
    metrics_train = evaluate_decision_tree(train_df['label'], predictions_train, max_depth, 'train', vector_name)
    metrics_dev   = evaluate_decision_tree(dev_df['label'],   predictions_dev,   max_depth, 'dev',   vector_name)

    # hand the metrics back instead of discarding them
    return metrics_train, metrics_dev

## Fit models with multiple vectors and maximum depths

In [9]:
# maximum tree depths to try (None leaves the depth unlimited)
max_depth_list = [10, 50, 100, None]

# vector sets to fit for every depth: (name, train vectors, dev vectors)
vector_sets = (
    ('count', train_vector_count, dev_vector_count),
    ('tfidf', train_vector_tfidf, dev_vector_tfidf),
)

# fit one tree per combination of maximum depth and vector set
for max_depth in max_depth_list:
    for vector_name, train_vectors, dev_vectors in vector_sets:
        decision_tree(train_vectors, train_df, dev_vectors, dev_df, max_depth, random_seed, vector_name)