## Set up

Set up libraries and load data.

In [1]:
import numpy as np
import pandas as pd
import scipy.sparse
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support
import pickle

In [2]:
# sparse count-vector matrices for the train and dev splits
train_vectors, dev_vectors = (
    scipy.sparse.load_npz('../data/derived/tweets_supervised_train_vectors_count.npz'),
    scipy.sparse.load_npz('../data/derived/tweets_supervised_dev_vectors_count.npz'),
)

# tweet dataframes for the same two splits
train_df, dev_df = (
    pd.read_csv('../data/derived/tweets_supervised_train.csv'),
    pd.read_csv('../data/derived/tweets_supervised_dev.csv'),
)

Confirm that the data size and format are as expected.

In [3]:
# print the number of records in the train and dev dataframes
for df in [train_df, dev_df]:
    print(f'{len(df)} records.')

# print the shape and container type of the train and dev vectors;
# the row counts should match the record counts printed above
for vectors in [train_vectors, dev_vectors]:
    print(f'{vectors.shape}    {type(vectors)}')

6497 records.
1393 records.
(6497, 14740)    <class 'scipy.sparse.csr.csr_matrix'>
(1393, 14740)    <class 'scipy.sparse.csr.csr_matrix'>


Create random seed to use as needed.

In [4]:
# global random seed; passed as the random_seed argument when training classifiers below
random_seed = 466

## Train decision tree classifiers with different maximum depths

In [5]:
def train_decision_tree(features, labels, max_depth, random_seed):
    """
    Train a decision tree classifier and write the fitted model to disk

    Parameters
    ----------
    features : feature matrix, passed to ``fit`` as ``X``
    labels : target labels, passed to ``fit`` as ``y``
    max_depth : passed to ``DecisionTreeClassifier`` as ``max_depth``; also
        embedded in the output filename
    random_seed : passed to ``DecisionTreeClassifier`` as ``random_state``

    Returns
    -------
    The fitted ``DecisionTreeClassifier``

    The model is pickled to
    ``../data/derived/model_decisiontree_maxdepth{max_depth}.pkl``.
    """

    # initialize classifier with specified arguments
    decision_tree = DecisionTreeClassifier(max_depth = max_depth, random_state = random_seed)

    # train classifier
    decision_tree.fit(X = features, y = labels)

    # write classifier to file; the context manager guarantees the handle is
    # flushed and closed even if pickling raises
    filepath = f'../data/derived/model_decisiontree_maxdepth{max_depth}.pkl'
    with open(filepath, 'wb') as file_out:
        pickle.dump(decision_tree, file_out)

    return decision_tree

In [None]:
def predict(model, vectors, df, model_name, dataset_name):
    """
    Generate predictions for provided model, vectors and dataframe

    Parameters
    ----------
    model : trained classifier exposing ``predict(X=...)``
    vectors : feature matrix to predict on
    df : dataframe (or mapping) with ``tweet_id`` and ``label`` columns
    model_name : currently unused; kept so existing calls keep working
    dataset_name : currently unused; kept so existing calls keep working

    Returns
    -------
    Dictionary with keys ``tweet_id``, ``label`` and ``predictions``

    NOTE(review): nothing is written to disk here because no output path is
    defined for this function; callers can persist the returned dictionary.
    """
    # generate predictions
    predictions = model.predict(X = vectors)

    # collect record IDs, labels and predictions side by side
    data = {'tweet_id':df['tweet_id'], 'label':df['label'], 'predictions':predictions}

    # return the result instead of silently discarding it
    return data

In [6]:
# maximum-depth hyperparameter values to compare; one decision tree is trained per entry below
max_depth_list = [10, 50, 100, None]

In [7]:
# train one decision tree per maximum depth with train_decision_tree;
# the resulting models are in the same order as max_depth_list
decision_tree_list = [train_decision_tree(features = train_vectors,
                                          labels = train_df['label'],
                                          max_depth = max_depth,
                                          random_seed = random_seed)
                      for max_depth in max_depth_list]

In [None]:
def generate_predictions(model, features):
    """
    Run a trained classifier over the provided features and return its predictions
    """

    # delegate directly to the classifier's own predict method
    return model.predict(X = features)

def save_predictions(tweet_ids, labels, predictions, filepath_out):
    """
    Save record IDs, labels and predicted labels as a CSV file
    """

    # one output column per input, in this order
    columns = {'tweet_id':tweet_ids, 'label':labels, 'prediction':predictions}

    # build the table and write it without the index column
    pd.DataFrame(data=columns).to_csv(filepath_out, index=False)

## Generate predictions with classifiers

In [None]:
def predict(model_list, max_depth_list, features, tweet_ids, labels,
            filepath='../data/derived/predictions_decisiontree.csv'):
    """
    Generate predictions for models in list and write them to a single CSV

    Parameters
    ----------
    model_list : trained classifiers, each exposing ``predict(X=...)``
    max_depth_list : maximum depth used for each model, in the same order as
        ``model_list``; used to name each model's prediction column
    features : feature matrix to predict on
    tweet_ids : record IDs, written as the ``tweet_id`` column
    labels : true labels, written as the ``label`` column
    filepath : where to write the CSV; defaults to the decision tree
        predictions file under ``../data/derived``

    Returns
    -------
    Dataframe with ``tweet_id``, ``label`` and one
    ``prediction_maxdepth{max_depth}`` column per model

    Raises
    ------
    ValueError if ``model_list`` and ``max_depth_list`` differ in length
    """

    # each model needs exactly one depth to label its column
    if len(model_list) != len(max_depth_list):
        raise ValueError('model_list and max_depth_list must have the same length')

    # create dictionary with record IDs and labels; predicted labels are added below
    predictions_dict = {'tweet_id':tweet_ids, 'label':labels}

    # iterate over models, adding one prediction column per model
    for model, max_depth in zip(model_list, max_depth_list):
        predictions_dict[f'prediction_maxdepth{max_depth}'] = model.predict(X = features)

    # create dataframe from the collected columns
    predictions_df = pd.DataFrame(data=predictions_dict)

    # write dataframe to file
    predictions_df.to_csv(filepath, index=False)

    return predictions_df

def save_predictions(tweet_ids, labels, predictions, filepath_out):
    """
    Write record IDs, true labels and predicted labels to a CSV file
    """

    # assemble the three columns into a single table
    table = pd.DataFrame(
        data={'tweet_id':tweet_ids, 'label':labels, 'prediction':predictions}
    )

    # save the table, leaving out the index column
    table.to_csv(filepath_out, index=False)