## Set up

Set up libraries and load data.

In [1]:
import numpy as np
import pandas as pd
import scipy.sparse
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support
import pickle

In [2]:
# sparse count-vector feature matrices for the train and dev splits
train_vectors, dev_vectors = (
    scipy.sparse.load_npz(vectors_path)
    for vectors_path in (
        '../data/derived/tweets_supervised_train_vectors_count.npz',
        '../data/derived/tweets_supervised_dev_vectors_count.npz',
    )
)

# tweet dataframes for the same two splits
train_df, dev_df = (
    pd.read_csv(tweets_path)
    for tweets_path in (
        '../data/derived/tweets_supervised_train.csv',
        '../data/derived/tweets_supervised_dev.csv',
    )
)

Confirm that the data size and format are as expected.

In [3]:
# check size of dataframes
for df in [train_df, dev_df]:
    print(f'{len(df)} records.')

# check shape and format of vectors
for vectors in [train_vectors, dev_vectors]:
    print(f'{vectors.shape}    {type(vectors)}')

# fail fast instead of relying on eyeballing the printout above:
# each split needs exactly one feature row per dataframe record, and both
# splits must share the same number of feature columns for prediction to work
assert train_vectors.shape[0] == len(train_df), 'train vectors and train dataframe differ in row count'
assert dev_vectors.shape[0] == len(dev_df), 'dev vectors and dev dataframe differ in row count'
assert train_vectors.shape[1] == dev_vectors.shape[1], 'train and dev vectors differ in feature count'

6497 records.
1393 records.
(6497, 14740)    <class 'scipy.sparse.csr.csr_matrix'>
(1393, 14740)    <class 'scipy.sparse.csr.csr_matrix'>


Define a fixed random seed to pass to any step that involves randomness, so that results are reproducible.

In [4]:
# global random seed; passed as random_state when training models below, for reproducibility
random_seed = 466

## Train decision tree classifier

In [11]:
def train_decision_tree(vectors, df, max_depth, random_seed):
    """
    Train decision tree classifier with provided features, labels, maximum depth, and random seed

    Parameters:
        vectors:     feature matrix, passed to the classifier's fit() as X
        df:          dataframe whose 'label' column is passed to fit() as y
        max_depth:   forwarded to DecisionTreeClassifier(max_depth=...); also
                     interpolated into the output filename, so None produces
                     '...maxdepthNone.pkl'
        random_seed: forwarded to DecisionTreeClassifier(random_state=...)

    Returns:
        the fitted DecisionTreeClassifier

    Side effect:
        pickles the fitted classifier to
        '../data/derived/model_decisiontree_maxdepth{max_depth}.pkl'
    """

    # initialize classifier; only max_depth and random_state are set explicitly
    decision_tree = DecisionTreeClassifier(max_depth = max_depth, random_state = random_seed)

    # train classifier
    decision_tree.fit(X = vectors, y = df['label'])

    # write to file; the with-block closes the handle even if pickling raises,
    # instead of leaving an unclosed file object behind
    filepath_out = f'../data/derived/model_decisiontree_maxdepth{max_depth}.pkl'
    with open(filepath_out, 'wb') as file_out:
        pickle.dump(decision_tree, file_out)

    return decision_tree

In [12]:
# fit a tree with no depth limit (max_depth = None) on the training split;
# the fitted model is also pickled to disk by train_decision_tree
decision_tree = train_decision_tree(vectors = train_vectors,
                                    df = train_df,
                                    max_depth = None,
                                    random_seed = random_seed)

## Predict with decision tree classifier

In [None]:
def generate_predictions(model, features):
    """
    Return the labels that the trained model predicts for the given features
    """

    # delegate straight to the model's predict method
    return model.predict(X = features)

def save_predictions(tweet_ids, labels, predictions, filepath_out):
    """
    Save record IDs, labels, and predicted labels side by side as a CSV file
    """

    # one column per input, in the order they appear in the output file
    columns = {'tweet_id':tweet_ids, 'label':labels, 'prediction':predictions}

    # build the table and write it out without the index column
    pd.DataFrame(data=columns).to_csv(filepath_out, index=False)

In [None]:
# predict labels for both splits with the fitted decision tree
train_predictions_decision_tree = generate_predictions(decision_tree, train_vectors)
dev_predictions_decision_tree = generate_predictions(decision_tree, dev_vectors)

# store tweet IDs, true labels, and predicted labels for each split as CSV
save_predictions(
    train_df['tweet_id'],
    train_df['label'],
    train_predictions_decision_tree,
    '../data/derived/tweets_supervised_train_predictions_decision_tree.csv',
)
save_predictions(
    dev_df['tweet_id'],
    dev_df['label'],
    dev_predictions_decision_tree,
    '../data/derived/tweets_supervised_dev_predictions_decision_tree.csv',
)