In [48]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import (load_iris, load_breast_cancer)
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# ___Decision Trees___
---------------

In [9]:
# Decision trees are a supervised learning models, that can be used for classification & regression.
# These are easy to use and understand and are helpful in figuring out influential features in datasets exploratively.
# Decision trees learn a series of "if then" rules that result in a decision that predicts a target value.

In [10]:
# An analogy for decision trees.

# Say one person guesses an object.
# The next person has to find out the guess, asking as few questions as possible!

# => Person 1 guesses a cat.
# Q1 => Is it alive? -> yes
# Q2 => Is it a biped? -> no
# Q3 => Is it a pet? -> yes
# Q4 => Does it bark? -> yes

In [11]:
# Broader questions help eliminate a large set of elements from the possible choices.
# Specific questions are useful later down in the decision trees, one we have narrowed down our choices.
# Then we could ask very specific questions to pin down the answer?

# e.g. once we know it is a quadruped mammal that is a domestic pet, we could ask does it have long whiskers and a furry tail?

In [12]:
# These decision trees can be represented by a tree, with nodes representing questions.
# And yes & no answers as the left and right branches from that node.

# The node where the tree starts => root node.
# At the bottom of the tree, where the branches terminate => leaf nodes.

In [13]:
# An object is identified by the path from the root node to the given leaf node, in terms of a set of yes/no asnwers to a set of questions.

# e.g.

# Alive? yes
# Quadruped? yes
# Pet? yes
# Bark? no
# Furry coat? yes

# Then, that's a cat

# Alive? yes
# Quadruped? yes
# Pet? no
# Large? yes
# Furry coat? no
# Trunk? yes

# Then, that's an elephant.

## ___Decision Trees with Iris dataset___
--------------

In [14]:
iris = load_iris()

In [15]:
iris.data, iris.target, iris.target_names

(array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
        [5

In [16]:
# features -> - sepal length in cm, sepal width in cm, petal length in cm, petal width in cm
# labels -> species names.

iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [17]:
# There are 50 flowers from each Iris species.

iris.data.shape

(150, 4)

In [18]:
# The features here are continuous variables.
# Thus a decision tree cannot operate by simple binary yes/no questions like is the petal length 4.9 cm?
# Questions need to be based on bounds, like is the petal length greater than 3.0 cm and less than 6.29 cm?

In [19]:
# In decision tree, each decision splits the data into two groups.

# If petal length <= 2.35         at this point, sample size = 112, species frequency = [37, 34, 41]
#      then -> Iris setosa        here, sample size = 37, species frequency = [37, 0, 0]
# else if petal width > 0.2       sample size = (112 - 37), species frequency = [0, 34, 41]
#      then -> Iris versicolor    sample size = 36, species frequency = [0, 33, 3]
# else -> Iris virginica          sample size = 39, species frequency = [0, 1, 38]

In [20]:
# The goal of building a decision tree is to find a series of questions that would help accurately classify the data with fewest steps.
# the thresholds used in these questions are called split points. 
# e.g. if petal length >= 6.56 cm

# How informative these split points are matters a lot.
# If a split point is highly informative -> it is highly selective/exclusive
# It does a very good job at separating classes effectively without having groups with mixed members.

# e.g In the previous cell, the split point If petal length <= 2.35  does a good job at isolating I. setosa exclusively.
# However, else if petal width > 0.2 isn't as efficient since in the group it classified as I.versicolor, we have 3 I. virginica members.
# Similarly the last splitting point (else) also has mixed members, 1 I. versicolor

In [21]:
# When building a decision tree, the algorithm finds out the question that leads to the most informative split.
# Even for the most informative splitting point, chances are that it still may classify few data points incorrectly.

# We can improve the accuracy of the process, by continuing splitting steps further down leveraging more criteria.
# These subsequent splits may generate more homogenous categories, which is desirable.
# These steps can be continued recursively, until all the leaf nodes are completely/almost homogenous.

In [22]:
# Trees where all leaf nodes have homogenous elements are called -> pure
# Trees with leaf nodes that have mixed elements are called -> mixed.

In [23]:
# To classify an unknown data point, we simply start at the root node, evaluate the question/criteria and make decisions accordingly.
# Keep iterating this with subsequent questions, until we get to a leaf node.

In [24]:
# In regression, the predicted value will be the mean of all the elements in a given leaf node.

In [25]:
train_x, test_x, train_y, test_y = train_test_split(iris.data, iris.target, train_size = 0.8)

In [26]:
dtClassifier = DecisionTreeClassifier().fit(train_x, train_y)

In [27]:
dtClassifier.score(train_x, train_y)

1.0

In [28]:
dtClassifier.score(test_x, test_y)

0.9666666666666667

In [29]:
# Note that the training data is predicted perfectly, with an accuracy of 1.0
# While there are some flawed predictions in the test set. -> likely overfitting of the tree

# Overfitting is a very common issue in tree based models.
# We keep adding rules/criteria until we end up with homogenous leaf nodes.
# This makes the trees overly complex and inherently attuned to the training data, the tree essentially memorizes the training data.
# Consequently, they may fail to generalize well to test datasets.

## ___Strategies to Control Decision Tree Complexity___
-----------------------

In [30]:
# There are many strategies avaliable to mitigate the increase in the decision tree complexity.
# One is to specify a maximum depth, at which the model will stop adding further split points.
# This is called pre-pruning.

# Another way is to build a complete tree, and then prune it to have just the maximum depth
# This process is caled post-pruning.

In [31]:
# Other restrictions that could be applied include ->
# specifying a maximum number for leaf nodes
# specifying a minimum number of elements to consider splitting into a leaf, at a split point.

In [32]:
dtClassifier = DecisionTreeClassifier(max_depth = 3, min_samples_leaf = 10).fit(train_x, train_y)

In [33]:
# Accuracy of train set predictions has gone down

dtClassifier.score(train_x, train_y)

0.9666666666666667

In [34]:
# Accuracy of test set predictions has improved.

dtClassifier.score(test_x, test_y)

0.9333333333333333

## ___Visualizing Decision Trees___
---------------

In [35]:
# One great advantage of decision trees is that they are easy to interpret.
# need graphviz
# not doing this for now :(

## ___Analyzing Trees___
---------------

In [36]:
# For larger trees, it can help to analyze which path majority of the data takes, instead of analyzing all the nodes.
# This can be accomplished by tracing the nodes with largest sample sizes in the tree.

# Another way is to do a feature importance evaluation.
# Feature importance is usually a value betwen 0 and 1.
# This is one of the most useful and widely used form of summary analysis to perform on trees.

In [37]:
# These values are assigned to individual features, indicating how important a feature is.
# A feature importance of 0 => indicated that the feature is not used at all in the prediction.
# A feature importance of 1 => that feature alone predicts the target perfectly.

# Typically feature importance numbers are always positive and are normalized to sum up to 1.0

In [43]:
# note the trailing underscore, indicating a learned model parameter.
dtClassifier.feature_importances_

array([0., 0., 0., 1.])

In [44]:
list(zip(iris.feature_names, dtClassifier.feature_importances_))

[('sepal length (cm)', 0.0),
 ('sepal width (cm)', 0.0),
 ('petal length (cm)', 0.0),
 ('petal width (cm)', 1.0)]

In [45]:
# In this example, for this particular train-test split,
# It appears that only the petal width feature has a deterministic effect in the predictions.

In [46]:
# Feature importance values do not tell us anything about which class of the feature has the deterministic effect in predictions
# They also do not indicate complex relationships that may exist between features.

In [47]:
# Since the feature importance values are dependent on the model's state, i.e train-test split
# It is common to compute averages of the feature importances over a range of train-test splits, similar to cross-validation.

In [49]:
bcancer = load_breast_cancer()

In [54]:
bcancer.data.shape

(569, 30)

In [50]:
train_x, test_x, train_y, test_y = train_test_split(bcancer.data, bcancer.target, train_size = 0.8)

In [64]:
dtClassifier = DecisionTreeClassifier(max_depth = 5, min_samples_leaf = 10).fit(train_x, train_y)

In [65]:
dtClassifier.score(train_x, train_y)

0.9582417582417583

In [66]:
dtClassifier.score(test_x, test_y)

0.9473684210526315