# Decision Tree Model

### Imports

In [1]:
! pip install graphviz
! pip install dl8.5
! pip install chefboost



In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time as t

! pip install graphviz

from IPython.display import SVG
from graphviz import Source

from sklearn import tree
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report



### Helper Functions

In [3]:
def accuracy(y_actual, y_predicted):
    if(not len(y_actual) == len(y_predicted)):
        print("Lengths don't match")
        return 0
    correct = 0
    for i in range(len(y_actual)):
        if y_actual[i] == y_predicted[i]:
            correct += 1
    return 1.0 * correct / len(y_actual)

def clean_df(df):
    for (field, vals) in df.iteritems():
#         print("{:} | {:}".format(field, type(vals[0])))
        if type(vals[0]) == str:
            strings = []
            for s_ind in vals.index:
                if vals[s_ind] not in strings:
                    strings += [vals[s_ind]]
                df.at[s_ind, field] = strings.index(vals[s_ind])
#             print(strings)
    return df

### Data Importing and Processing

In [4]:
df = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

# print(df)

### Data Processing

In [5]:
# Remove NaN Columns

df = df.dropna()
df_test = df_test.dropna()

# Make string values into numbers

clean_df(df)
clean_df(df_test)

# Not 10 like normal since we already have a test partition
num_folds = 9 

cv_length = 1.0 * df.shape[0] / num_folds

partitions = [df.iloc[int(i * cv_length):int((i+1) * cv_length),:] for i in range(num_folds)]
# print([len(partitions[i]) for i in range(len(partitions))])


### Model

In [6]:
# Rotate the folds to tune the parameters

results = []
max_tree_depths = range(1, 50)

for i in range(num_folds):
    validation = partitions[i]
    training = pd.concat([j for j in partitions if not j.equals(validation)])
    
    y_train = training['Decision']
    X_train = training.loc[:, df.columns != 'Decision']
    
    y_valid = validation['Decision']
    X_valid = validation.loc[:, df.columns != 'Decision']
    
    results_for_fold = []
    
    for d in max_tree_depths:
        clf = tree.DecisionTreeClassifier()
        clf.criterion = 'entropy'
        clf.max_tree_depth = d
        clf.fit(X_train, y_train)
#         print("Prediction", clf.predict(X_valid))
#         print("Actual", np.array(y_valid))
        a = accuracy(np.array(y_valid), clf.predict(X_valid))
        results_for_fold += [a]
#         print("Accuracy for depth {:} in fold {:} is {:}".format(d, i, a))
    
    results += [results_for_fold]

# print(results)

results_for_d = np.array(results).T

# print(results_for_d)

min_acc = []
max_acc = []
avg_acc = []

for ind_d in range(len(max_tree_depths)):
    min_acc += [np.min(results_for_d[ind_d])]
    max_acc += [np.max(results_for_d[ind_d])]
    avg_acc += [np.mean(results_for_d[ind_d])]
    print("For depth {:3d}: Avg: {:4f} | Min: {:4f} | Max {:4f}".format(max_tree_depths[ind_d], avg_acc[ind_d], min_acc[ind_d], max_acc[ind_d]))
        
optimal_depth = max_tree_depths[avg_acc.index(np.max(avg_acc))]
print("Optimal max tree depth:", optimal_depth)


For depth   1: Avg: 0.651668 | Min: 0.625812 | Max 0.680297
For depth   2: Avg: 0.656211 | Min: 0.626741 | Max 0.688662
For depth   3: Avg: 0.652908 | Min: 0.623027 | Max 0.680297
For depth   4: Avg: 0.653837 | Min: 0.628598 | Max 0.683086
For depth   5: Avg: 0.653318 | Min: 0.620818 | Max 0.676880
For depth   6: Avg: 0.653011 | Min: 0.629526 | Max 0.686803
For depth   7: Avg: 0.653321 | Min: 0.632312 | Max 0.684015
For depth   8: Avg: 0.654351 | Min: 0.628598 | Max 0.675651
For depth   9: Avg: 0.654869 | Min: 0.632312 | Max 0.684015
For depth  10: Avg: 0.651152 | Min: 0.624884 | Max 0.676580
For depth  11: Avg: 0.652597 | Min: 0.622098 | Max 0.675651
For depth  12: Avg: 0.657860 | Min: 0.629526 | Max 0.689591
For depth  13: Avg: 0.654249 | Min: 0.632312 | Max 0.677509
For depth  14: Avg: 0.651152 | Min: 0.623027 | Max 0.681227
For depth  15: Avg: 0.652184 | Min: 0.625812 | Max 0.680297
For depth  16: Avg: 0.652598 | Min: 0.620241 | Max 0.696097
For depth  17: Avg: 0.654559 | Min: 0.63

### Test

In [7]:
X_test = df_test.loc[:, df_test.columns != 'Decision']

y_train = df['Decision']
X_train = df.loc[:, df.columns != 'Decision']

clf = tree.DecisionTreeClassifier()
clf.criterion = 'entropy'
clf.max_tree_depth = optimal_depth

clf.fit(X_train, y_train)
results = clf.predict(X_test)

### Format for Output

In [8]:
df_test['Decision'] = results
df_test = df_test[['id', 'Decision']]
print(df_test)

df_test.to_csv(r"../data/decision_tree_results.csv", index = False)

        id  Decision
0        1         0
1        2         1
2        3         0
3        4         1
4        5         1
...    ...       ...
2495  2496         1
2496  2497         0
2497  2498         0
2498  2499         1
2499  2500         0

[2390 rows x 2 columns]
