## Set up

Set up libraries and load data.

In [1]:
import numpy as np
import pandas as pd
import scipy.sparse
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support
import pickle

In [2]:
# sparse count-vector matrices for the train and dev splits, loaded in that order
# (tuple unpacking consumes the lazy map, so both files are read right here)
train_vectors, dev_vectors = map(scipy.sparse.load_npz, (
    '../data/derived/tweets_supervised_train_vectors_count.npz',
    '../data/derived/tweets_supervised_dev_vectors_count.npz',
))

# tweet dataframes for the same two splits, loaded in the same order
train_df, dev_df = map(pd.read_csv, (
    '../data/derived/tweets_supervised_train.csv',
    '../data/derived/tweets_supervised_dev.csv',
))

Confirm that the data sizes and formats are as expected.

In [3]:
# report how many records each split's dataframe holds (train first, then dev)
for df in (train_df, dev_df):
    print(f'{len(df)} records.')

# report each split's matrix dimensions and container type (train first, then dev)
for vectors in (train_vectors, dev_vectors):
    print(f'{vectors.shape}    {type(vectors)}')

6497 records.
1393 records.
(6497, 14740)    <class 'scipy.sparse.csr.csr_matrix'>
(1393, 14740)    <class 'scipy.sparse.csr.csr_matrix'>


Define a random seed to reuse wherever one is needed.

In [4]:
# create global variable to use for random seed as needed
# (handed to the classifier as its random_state so that runs are repeatable)
random_seed = 466

## Train decision tree classifier

In [5]:
def train_decision_tree(features, labels, random_seed):
    """
    Fit a decision tree classifier and return the fitted estimator.

    features:    feature matrix handed to DecisionTreeClassifier.fit as X
    labels:      target labels handed to DecisionTreeClassifier.fit as y
    random_seed: value used as the classifier's random_state

    All other classifier settings are left at their defaults.
    """
    # fit() returns the estimator itself, so construction and training chain together
    return DecisionTreeClassifier(random_state = random_seed).fit(features, labels)

In [6]:
# fit the decision tree on the training count vectors and their labels,
# using the shared seed
decision_tree = train_decision_tree(
    features = train_vectors,
    labels = train_df['label'],
    random_seed = random_seed,
)

In [7]:
# save decision tree
# write through a context manager so the file is flushed and closed as soon as
# the dump finishes, rather than leaving an open handle for the garbage collector
with open('../data/derived/decision_tree.pkl', 'wb') as model_file:
    pickle.dump(decision_tree, model_file)