# Decision Trees

In [None]:
%run Preprocessing.ipynb

In [3]:
# Import packages
from sklearn import tree
import sklearn.feature_selection
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix
import seaborn as sns
import numpy as np
import statistics
import graphviz
import pandas as pd

In [4]:
x_train = x_train
y_train = y_train
x_test = x_test #final valication with the kfold
y_test = y_test #final valication with the kfold

**Feature Selection**
- Build the model using the pre-processed data

In order to avoid overfitting and slow computing (due to the increase in features from dummying as well as increasing dimensionality), selecting the most important features is important. 


In [5]:
# select k best is a univariate method for feature selection:
# looks at the outcome and the relationship with each feature and selects k number of best features 
select = sklearn.feature_selection.SelectKBest(k=20)
selected_features = select.fit(x_train, y_train)
indices_selected = selected_features.get_support(indices=True)
colnames_selected = [x.columns[I] for I in indices_selected]

x_train_selected = x_train[colnames_selected]
x_test_selected = x_test[colnames_selected]

colnames_selected # 20 features selected

['PctKids2Par',
 'racePctWhite',
 'PctKidsBornNeverMar',
 'PctFam2Par',
 'PctYoungKids2Par',
 'PctTeen2Par',
 'racepctblack',
 'pctWInvInc',
 'pctWPubAsst',
 'PctPersOwnOccup',
 'PctPopUnderPov',
 'FemalePctDiv',
 'PctNotHSGrad',
 'PctHousNoPhone',
 'TotalPctDiv',
 'MalePctDivorce',
 'PctPersDenseHous',
 'PctHousOwnOcc',
 'PctHousLess3BR',
 'medFamInc']

**Build Decision tree #1**

Firstly, we create a basic DecisionTreeClassifier, and then will slowly tune the parameters, firstly, we need to look at the **max depth**. The cell below will calcaulate the accuracy score for a max depth up to *n*. This is then plotted in a bar graph.

For each parameter, we need to create a k-means loop, that then plots the overall figure. 


In [6]:
kf =KFold(n_splits=j, shuffle=True, random_state=42)
score = cross_val_score(tree.DecisionTreeClassifier(criterion='entropy', max_depth=i), x_train, y_train, cv= kf, scoring="accuracy")
print(score.mean())

kf =KFold(n_splits=j, shuffle=True, random_state=42)
score = cross_val_score(tree.DecisionTreeClassifier(criterion='gini', max_depth=i), x_train, y_train, cv= kf, scoring="accuracy")
print(score.mean())

NameError: name 'j' is not defined

In [None]:
def get_max_depth(n):
    scores ={}
    x = list()
    y = list()
    error = list()
    
    for i in range(3,n): #first create a for loop for each max depth
        value_of_j = [] #this will take the average score for each value of k
        
        for j in range(2,10): #create a for loop for each kfold
            kf =KFold(n_splits=j, shuffle=True, random_state=42)
            score = cross_val_score(tree.DecisionTreeClassifier(criterion='gini', max_depth=i), x_train, y_train, cv= kf, scoring="accuracy")
            value_of_j.append(score.mean())
            
        res = {str(i): value_of_j}
        scores.update(res) # for each max_depth, the average k_fold score
  
    return scores
            
d_max_depth = get_max_depth(10)

x =[]
y =[]
error = []
for key, value in d_max_depth.items():
    x.append(key)
    y.append(sum(value)/len(value))
    error.append(statistics.stdev(value))
    
xfit = np.linspace(0, 10, 1000)
# plt.plot(x, error, 'or')
plt.bar(x, y, color=(0.2, 0.4, 0.6, 0.6))
plt.errorbar(x, y, yerr = error, barsabove = False, fmt='o', capsize=3, color='red')
#plt.errorbar(x, y, yerr = error, ls='-.', fmt='o', capsize=3)
plt.xlabel('Max_Depth', fontsize=15)
plt.ylabel('Accuracy', fontsize=15)
plt.title('Change in Accuracy as Max Depth increases')
plt.ylim(0.8, 0.86)

For this dataset, we have shown that the max_depth at 6 provides the highest accuracy. Therefore, the current model is:

In [None]:
scores ={}
x = list()
y = list()
error = list()

min_samples_splits = np.linspace(0.1, 1.0, 10, endpoint=True)

for i in min_samples_splits: #first create a for loop for each max depth
    value_of_j = [] #this will take the average score for each value of k

    for j in range(2,10): #create a for loop for each kfold
        kf =KFold(n_splits=j, shuffle=True, random_state=42)
        score = cross_val_score(tree.DecisionTreeClassifier(criterion='gini', splitter = 'best', max_depth=3, min_samples_split = i), x_train, y_train, cv= kf, scoring="accuracy")
        value_of_j.append(score.mean())

    res = {str(i): value_of_j}
    scores.update(res) # for each max_depth, the average k_fold score

get_min_sample_split = scores



In [None]:
x =[]
y =[]
error = []
for key, value in get_min_sample_split.items():
    x.append(key)
    y.append(sum(value)/len(value))
    error.append(statistics.stdev(value))
    
x = [round(float(i),2) for i in x]
    
# plt.plot(x, error, 'or')
plt.plot(x, y, color=(0.2, 0.4, 0.6, 0.6), lw=3)
plt.errorbar(x, y, yerr = error, barsabove = False, fmt='o', capsize=3, color='red')
#plt.errorbar(x, y, yerr = error, ls='-.', fmt='o', capsize=3)
plt.xlabel('Min_samples_split', fontsize=15)
plt.ylabel('Accuracy', fontsize=15)
plt.title('Change in Accuracy as min_samples_split increases')
plt.ylim(0.8, 0.86)
plt.xlim(0,1.1)

In [None]:
dtree = DecisionTreeClassifier(criterion='gini', splitter = 'best', max_depth=6, min_samples_split =0.1)
dtree = dtree.fit(x_train, y_train)
#tree.plot_tree(dtree, feature_names=data.columns) 

# model can then be used to predict !
# crime_predict = dtree.predict([[0,1]])

y_pred = dtree.predict(x_test)  # Predict the response for test dataset
print("Accuracy:",accuracy_score(y_test, y_pred))

In [None]:
# Get the tree
features = []
df = pd.DataFrame(x_train)
for i in df[1:]:
    features.append(str(i))
dot_data = tree.export_graphviz(dtree, out_file=None, feature_names=features, filled=True, rounded=True, special_characters=True)
graph = graphviz.Source(dot_data)
graph

code adapted from https://medium.com/@mohtedibf/indepth-parameter-tuning-for-decision-tree-6753118a03c3

In [None]:
scores ={}
x = list()
y = list()
error = list()

features = []
df = pd.DataFrame(x_train)
for i in df[1:]:
    features.append(str(i))

max_features = len(features)

for i in range(1, max_features): #first create a for loop for each max depth
    value_of_j = [] #this will take the average score for each value of k

    for j in range(2,10): #create a for loop for each kfold
        kf =KFold(n_splits=j, shuffle=True, random_state=42)
        score = cross_val_score(tree.DecisionTreeClassifier(criterion='gini', splitter = 'best', max_depth=3, min_samples_split = 0.1, max_features=i), x_train, y_train, cv= kf, scoring="accuracy")
        value_of_j.append(score.mean())

    res = {str(i): value_of_j}
    scores.update(res) # for each max_depth, the average k_fold score

get_min_sample_split = scores

In [None]:
get_min_sample_split

In [None]:

x =[]
y =[]
error = []
for key, value in get_min_sample_split.items():
    x.append(key)
    y.append(sum(value)/len(value))
    error.append(statistics.stdev(value))
    
x = [round(float(i),2) for i in x]

y_above = []
y_below = []
for i in range(len(error)):
    y_above.append(y[i]+(error[i]*0.5))
    y_below.append(y[i]-(error[i]*0.5))
    
# plt.plot(x, error, 'or')
plt.plot(x, y, color=(0.2, 0.4, 0.6, 0.6), lw=3)
#plt.errorbar(x, y, yerr = error, barsabove = False, fmt='o', capsize=3, color='red')

#plt.fill_between(range(100), y-y_diff, y+y_diff, alpha=0.5)
#plt.errorbar(x, y, yerr = error, ls='-.', fmt='o', capsize=3)
plt.xlabel('Max_Features', fontsize=15)
plt.ylabel('Accuracy', fontsize=15)
plt.title('Change in Accuracy as max_features increases')
plt.ylim(0.8, 0.86)
plt.xlim(0,100)#plot the outputs

z1 = np.array(y_above)
z2 = np.array(y_below)

plt.fill_between(x,y_above,y_below,where=z1>=z2,color='grey',alpha=0.5, interpolate=True)


In [None]:
kf =KFold(n_splits=4, shuffle=True, random_state=42)
score = cross_val_score(tree.DecisionTreeClassifier(criterion='gini', splitter = 'best', max_depth=9, min_samples_split = 0.6, ccp_alpha = 0.012, max_features = 100), x, y, cv= kf, scoring="accuracy")
print(f'Scores for each fold are: {score}')
print(f'Average score: {"{:.2f}".format(score.mean())}')

In [None]:
scores ={}
x = list()
y = list()
error = list()

ccp_alpha_vals = np.arange(0,0.3,0.001)


for i in ccp_alpha_vals: #first create a for loop for each max depth
    value_of_j = [] #this will take the average score for each value of k

    for j in range(2,10): #create a for loop for each kfold
        kf =KFold(n_splits=j, shuffle=True, random_state=42)
        score = cross_val_score(tree.DecisionTreeClassifier(criterion='gini',splitter = 'best', max_depth=3, min_samples_split = 0.1, max_features=100, ccp_alpha=i), x_train, y_train, cv= kf, scoring="accuracy")
        value_of_j.append(score.mean())

    res = {str(i): value_of_j}
    scores.update(res) # for each max_depth, the average k_fold score

get_min_sample_split = scores

In [None]:
x =[]
y =[]
error = []
for key, value in get_min_sample_split.items():
    x.append(key)
    y.append(sum(value)/len(value))
    error.append(statistics.stdev(value))
    
x = [round(float(i),2) for i in x]
    
# plt.plot(x, error, 'or')
plt.plot(x, y, color=(0.2, 0.4, 0.6, 0.6), lw=3)
#plt.errorbar(x, y, yerr = error, barsabove = False, fmt='o', capsize=3, color='red')
#plt.errorbar(x, y, yerr = error, ls='-.', fmt='o', capsize=3)
plt.xlabel('ccp_alpha', fontsize=15)
plt.ylabel('Accuracy', fontsize=15)
plt.title('Change in Accuracy as ccp_alpha increases')
plt.ylim(0.8, 0.86)
plt.xlim(0,0.15)#plot the outputs


**Build the Decision tree #2** - applying pre-pruning techniques

**K-Fold Classification** 
This will increase the number of times the test and train data is split, then will average out the accuracy scores. The cells below will increase the number of K up to 12, then plot the accuracy for the data.

In [None]:
kfold_scores = {}
for i in range(2, 12):
    kf =KFold(n_splits=i, shuffle=True, random_state=42)
    score = cross_val_score(tree.DecisionTreeClassifier(criterion='gini', splitter = 'best', max_depth=3, min_samples_split = 0.1, max_features=100, ccp_alpha=0.01), x_test, y_test, cv= kf, scoring="accuracy")
    res = {str(i):list(score)}
    kfold_scores.update(res)

x =[]
y =[]
error = []
for key, value in kfold_scores.items():
    x.append(key)
    y.append(sum(value)/len(value))
    error.append(statistics.stdev(value))
    
xfit = np.linspace(0, 10, 1000)
# plt.plot(x, error, 'or')
plt.bar(x, y, color=(0.2, 0.4, 0.6, 0.6))
plt.errorbar(x, y, yerr = error, barsabove = False, fmt='o', capsize=3, color='red')
#plt.errorbar(x, y, yerr = error, ls='-.', fmt='o', capsize=3)
plt.xlabel('K', fontsize=15)
plt.ylabel('Accuracy', fontsize=15)
plt.title('Change in Accuracy as K increases')
plt.ylim(0.7, 0.9)

In [None]:
kf =KFold(n_splits=5, shuffle=True, random_state=42)
score = cross_val_score(tree.DecisionTreeClassifier(criterion='gini', max_depth=3, min_samples_split = 0.1, max_features=100, ccp_alpha=0.01, splitter = 'best'), x_test, y_test, cv= kf, scoring="accuracy")
print(f'Scores for each fold are: {score}')
print(f'Average score: {"{:.2f}".format(score.mean())}')

In [None]:
x_test.shape

In [None]:
dt = tree.DecisionTreeClassifier(criterion='gini', max_depth=4, min_samples_split = 0.1, max_features=100, ccp_alpha=0.01, splitter = 'best')
dt.fit(x_train, y_train)
y_train_predict = dt.predict(x_train)
y_test_predict = dt.predict(x_test)

In [None]:
# Plot confusion matrix
def plot_confusionmatrix(train_p, train, dom):
    cf = confusion_matrix(train_p,train)
    sns.heatmap(cf,annot=True,yticklabels=classes,xticklabels=classes,cmap='Blues', fmt='g')
    plt.title(f'{dom} Confusion matrix')
    plt.tight_layout()
    # plt.savefig(f'{dom} Confusion matrix.png')
    plt.show()
    
plot_confusionmatrix(y_train_predict,y_train,dom='Train')
plot_confusionmatrix(y_test_predict,y_test,dom='Test')