# GTI770 - Systèmes Intelligents et Apprentissage Machine

### Alessandro L. Koerich

## Notebook Jupyter - 5_Decision_Trees - Simpsons Dataset

### with "model persistence" - Save the model into a file to reuse it later
 
##### Created: May 2018
##### Revised: Jan 2019 

In [None]:
# Imports
import numpy as np
# to visualize the tree you must install this library
# conda install python-graphviz OR pip install graphviz
import graphviz
from sklearn import tree

In [None]:
# Read the training data from a comma-separated text file.
# Each row holds 4 features extracted from Simpsons, followed by one
# more column that is used as the class label further below.
num_features = 4
data_train = np.loadtxt(
    "simpsons_train_4features.csv",
    delimiter=",",
)

In [None]:
# Split the loaded array into the training inputs and targets:
# the first `num_features` columns are the features, and the column
# right after them is the class label.
X_train = data_train[:, :num_features]
Y_train = data_train[:, num_features]

## Scikit-Learn Decision Trees Documentation

http://scikit-learn.org/stable/modules/tree.html

http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier

In [None]:
# Build a Decision Tree classifier (entropy split criterion, at least
# 10 samples per leaf) and fit it on the training set
model = tree.DecisionTreeClassifier(
    criterion='entropy',
    min_samples_leaf=10,
)
model = model.fit(X_train, Y_train)

In [None]:
# Display the fitted Decision Tree estimator; its printed representation
# lists the parameters of the model.
# All of these parameters can be changed when the classifier is created
# (see the scikit-learn documentation).
model

In [None]:
# Visualize the tree in Jupyter and render it to disk in SVG format
# NOTE(review): class_names are presumably listed in the same order as the
# numeric class labels used in the CSV file -- confirm against the dataset.
dot_data = tree.export_graphviz(
    model,
    out_file=None,
    feature_names=['orange', 'white', 'blue', 'beige'],
    class_names=['bart', 'home', 'lisa', 'magg', 'marg', 'fami', 'othe', 'scho'],
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
graph.format = 'svg'
graph.render("Simpsons_Data")
graph

In [None]:
# Use the fitted model to predict the class of each sample.
# Note that the predictions are made on the training set itself,
# i.e. on the same data the model was fitted on.
Y_train_pred = model.predict(X_train)
Y_train_pred

In [None]:
# The model can also predict the probability of each class for every sample
# (again computed on the training set)
Y_train_pred_prob = model.predict_proba(X_train)
Y_train_pred_prob

In [None]:
# Evaluation metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [None]:
# Fraction of training samples whose predicted class equals the true class,
# reported as a percentage
acc_simpsons_train = accuracy_score(Y_train, Y_train_pred)
print(
    "Correct classification rate for train dataset = {}%".format(
        acc_simpsons_train * 100
    )
)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class report on the training set, using readable class names
# NOTE(review): the names are presumably in the same order as the numeric
# class labels in the data -- confirm against the dataset.
target_names = ['bart', 'home', 'lisa', 'magg', 'marg', 'fami', 'othe', 'scho']
print(
    classification_report(
        Y_train,
        Y_train_pred,
        target_names=target_names,
    )
)
# This works, but we have labels with no predicted samples

In [None]:
# Confusion matrix comparing the true and the predicted labels of the
# training set; it is plotted further below.
cm_simpsons_train = confusion_matrix(Y_train, Y_train_pred )
cm_simpsons_train

In [None]:
import itertools
import matplotlib.pyplot as plt

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it on the current matplotlib figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are drawn along the "True label" axis and
        columns along the "Predicted label" axis.
    classes : sequence of str
        Tick labels for the rows/columns of `cm`.
    normalize : bool, default False
        If True, every row is divided by its sum so that each row shows
        fractions instead of counts. Rows whose sum is zero (classes with
        no true samples) are shown as zeros instead of NaN.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used to shade the cells.

    Returns
    -------
    None
        The plot is drawn with pyplot; call `plt.show()` to display it.
    """
    # Accept plain nested lists as well as NumPy arrays.
    cm = np.asarray(cm)

    if normalize:
        row_sums = cm.sum(axis=1, keepdims=True)
        # Divide only where the row sum is non-zero; the other cells keep
        # the zeros of `out`, which avoids NaN values and a RuntimeWarning.
        cm = np.divide(cm, row_sums,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_sums != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as-is; anything else (normalized values or
    # a float matrix passed by the caller) is printed with two decimals,
    # because the 'd' format code only accepts integers.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2. if cm.size else 0.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Adjust the layout last so that the axis labels are taken into account.
    plt.tight_layout()

In [None]:
# Print NumPy floating-point values with 2 digits of precision
# (affects the matrices printed by the cells below)
np.set_printoptions(precision=2)

In [None]:
# Draw the raw (non-normalized) confusion matrix on a new figure
plt.figure()
plot_confusion_matrix(
    cm_simpsons_train,
    classes=['bart', 'home', 'lisa', 'magg', 'marg', 'fami', 'othe', 'scho'],
    title='Confusion matrix, without normalization',
)

In [None]:
# Display the figure(s) drawn so far
plt.show()

In [None]:
# Draw the normalized confusion matrix on a new figure
plt.figure()
plot_confusion_matrix(
    cm_simpsons_train,
    classes=['bart', 'home', 'lisa', 'magg', 'marg', 'fami', 'othe', 'scho'],
    normalize=True,
    title='Confusion matrix, with normalization',
)

In [None]:
# Display the figure(s) drawn so far
plt.show()

***

## This is the "persistence" part where the model is saved into a .pkl file

***

In [None]:
# Model persistence: serialize the fitted model to a .pkl file for later reuse
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23; joblib is now imported as a standalone package (it is installed
# together with scikit-learn). The fallback keeps very old environments working.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# NOTE: the file is a pickle; only load .pkl files from trusted sources, and
# preferably with the same scikit-learn version that was used to save them.
joblib.dump(model, 'dectree_simpsons_4feat.pkl')

In [None]:
# Final marker printed once the last cell has been executed
print("Notebook ended")