## Load required libraries and packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt #needed for reading in data, dataframe/variable manipulation, preprocessing

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn import tree #needed for developing and evaluating decision tree model

import seaborn as sns #needed to visualize confusion matrix
import graphviz #needed to visualize decisiont tree plot --> this needs to be installed for Jupyter notebooks 

# In the Anaconda terminal, run 'pip install graphviz' (rendering may also require the Graphviz program itself to be installed)

## Calculate a priori (naive) prediction accuracy

In [None]:
# Count the observations in each class of the outcome column to find the most
# common class (value_counts lists the most frequent class first by default).
# NOTE(review): 'DV' looks like a placeholder for the outcome column name — confirm.
df['DV'].value_counts()

In [None]:
# A priori (naive) accuracy: the share of observations that belong to the most
# common outcome class, i.e. the accuracy of always predicting that class.
# Computed directly from the data; the previous placeholder string divided by
# len(df) raised a TypeError when the cell was run as-is.
a = df['DV'].value_counts().max() / len(df)
print("apriori accuracy = ", a)

# Develop Decision Model

## Assign IVs to 'x' and DV to 'y', create dummy variables for categorical IVs, and split into training and test sets

In [None]:
# Predictors: every column except the identifier and the outcome, with
# categorical columns expanded into dummy variables.
# NOTE(review): 'primary key' and 'DV' look like placeholder column names — confirm.
x = pd.get_dummies(
    data=df.drop(columns=['primary key', 'DV']),
    drop_first=False,  # keep every dummy level; no reference group is left out
)

# Outcome column.
y = df['DV']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=100,
)

In [None]:
# Build the tree and fit it to the training set in one step.
# Tunable settings: criterion ("gini", "entropy", or "log_loss"), max_depth,
# min_samples_leaf, and min_impurity_decrease.
model = DecisionTreeClassifier(
    criterion="gini",
    max_depth=5,
    min_samples_leaf=8,
    min_impurity_decrease=0.004,
    random_state=100,
).fit(x_train, y_train)  # fit() returns the fitted estimator itself

model  # display the fitted model

## Visualize decision tree model

In [None]:
# Export the fitted tree as DOT source and render it with graphviz.
#
# class_names must be listed in the same order as model.classes_ (ascending
# class labels). Taking them from y.value_counts() orders them by frequency
# instead, which can attach the wrong class name to the tree nodes.
dot_data = tree.export_graphviz(
    model,
    out_file=None,  # return the DOT source as a string instead of writing a file
    feature_names=list(x_train.columns),  # the columns the model was fit on
    class_names=[str(label) for label in model.classes_],
    filled=True,
)

# Draw graph
graph = graphviz.Source(dot_data, format="png")
graph

# Use the model to predict on the test set (x_test)

In [None]:
# Predict a class label for every row of the held-out test predictors.
predictions = model.predict(x_test)

## View predictions against actual observations in datatable

In [None]:
# Build a table of the test predictors alongside predicted and actual classes.
# Work on a copy: a plain assignment would only alias x_test, so the two
# columns added below would also be added to x_test, which would then no
# longer match the features the model was fit on.
df_pred = x_test.copy()

df_pred['predicted_class'] = predictions  # model output, in x_test row order
df_pred['actual_class'] = y_test  # aligned on the row index shared with x_test

df_pred

## Generate confusion matrix with heatmapping to view predictions vs actual observations

In [None]:
# Cross-tabulate actual vs. predicted classes to form the confusion matrix.
conf = df_pred[['actual_class', 'predicted_class']]
confusion_matrix = pd.crosstab(
    conf['actual_class'],
    conf['predicted_class'],
    rownames=['Actual'],
    colnames=['Predicted'],
)

# Set the figure size BEFORE drawing: the rc setting only affects figures
# created afterwards, so setting it after the heatmap had no effect on it.
sns.set(rc={'figure.figsize': (12, 10)})  # change to whatever size you want

# fmt='d' prints the cell counts as whole numbers rather than in the default
# float format (which abbreviates larger counts, e.g. 1.2e+02).
sns.heatmap(confusion_matrix, annot=True, fmt='d')
plt.show()

### Calculate overall accuracy from confusion matrix

In [None]:
# Overall accuracy = correctly classified observations / all observations in
# the confusion matrix. Correct predictions are the cells whose row (actual)
# label equals the column (predicted) label, i.e. the diagonal.
# Computed from the matrix itself instead of hand-typed cell values, and
# divided by the number of observations in the matrix (the test set), not len(df).
shared_labels = confusion_matrix.index.intersection(confusion_matrix.columns)
correct = sum(confusion_matrix.at[label, label] for label in shared_labels)

acc = correct / confusion_matrix.values.sum()
print("overall accuracy = ", acc)

### Use classification_report to view classification accuracy, recall, precision and $f_1$ score

In [None]:
# Print scikit-learn's text report (per-class precision, recall, F1 score and
# support, plus overall accuracy) comparing the test labels with the predictions.
print(classification_report(y_test, predictions))

# Use best performing model to predict onto unlabeled data

In [None]:
# Read the unlabeled data from a CSV file (path is relative to the working directory).
df_2 = pd.read_csv("unlabled.csv")
df_2.head()  # preview the first rows

In [None]:
# Build the predictor matrix for the unlabeled data: drop the identifier
# column and create dummy variables for categorical IVs.
# (.drop is required here; calling the DataFrame itself raises a TypeError.)
# NOTE(review): 'primary key' looks like a placeholder column name — confirm.
x = df_2.drop(['primary key'], axis=1)
x = pd.get_dummies(data=x, drop_first=False)

# Align the columns with the ones the model was trained on: a category missing
# from the unlabeled data gets an all-zero dummy column, and columns the model
# never saw are dropped, so model.predict receives the expected features.
x = x.reindex(columns=x_train.columns, fill_value=0)

In [None]:
# Predict a class for every row of the unlabeled predictors.
# Note: this rebinds `predictions`, replacing the test-set predictions.
predictions = model.predict(x)

### Also provide predicted probability for the predicted outcome class

In [None]:
# For each row, keep the largest of the per-class probabilities, i.e. the
# probability of the class the model predicts for that row.
proba = model.predict_proba(x).max(axis=1)

## View both predicted outcome and its probability in datatable

In [None]:
# Attach the model output to the unlabeled data so both can be viewed together.
df_2['Predicted_Class'] = predictions  # predicted outcome class for each row
df_2['Predicted_Prob'] = proba  # probability of that predicted class
df_2  # display the table

## Group By: counts of predicted classes by route

In [None]:
# Count rows for each (route, predicted class) pair, then move the routes into
# the columns so each row is a predicted class and each column a route.
(
    df_2
    .groupby(['routeid', 'Predicted_Class'])['routeid']
    .count()
    .unstack(level=0)
)