In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris

# ___Decision Trees___
---------------

In [1]:
# Decision trees are supervised learning models that can be used for classification & regression.
# They are easy to use and understand, and are helpful for exploring which features in a dataset are influential.
# Decision trees learn a series of "if then" rules that result in a decision that predicts a target value.

In [2]:
# An analogy for decision trees.

# Say one person guesses an object.
# The next person has to find out the guess, asking as few questions as possible!

# => Person 1 guesses a cat.
# Q1 => Is it alive? -> yes
# Q2 => Is it a biped? -> no
# Q3 => Is it a pet? -> yes
# Q4 => Does it bark? -> no

In [3]:
# Broader questions help eliminate a large set of elements from the possible choices.
# Specific questions are useful later down in the decision tree, once we have narrowed down our choices.
# Then we can ask very specific questions to pin down the answer.

# e.g. once we know it is a quadruped mammal that is a domestic pet, we could ask does it have long whiskers and a furry tail?

In [4]:
# These decision trees can be represented by a tree, with nodes representing questions.
# And yes & no answers as the left and right branches from that node.

# The node where the tree starts => root node.
# At the bottom of the tree, where the branches terminate => leaf nodes.

In [6]:
# An object is identified by the path from the root node to the given leaf node, i.e. by a set of yes/no answers to a set of questions.

# e.g.

# Alive? yes
# Quadruped? yes
# Pet? yes
# Bark? no
# Furry coat? yes

# Then, that's a cat

# Alive? yes
# Quadruped? yes
# Pet? no
# Large? yes
# Furry coat? no
# Trunk? yes

# Then, that's an elephant.

## ___Decision Trees with Iris dataset___
--------------

In [8]:
# Load the Iris dataset using scikit-learn's load_iris (imported at the top of the notebook).
iris = load_iris()

In [10]:
# Inspect the loaded dataset object; it is dict-like, with the feature matrix stored under 'data'.
iris

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [13]:
# features -> sepal length, sepal width, petal length, petal width (all in cm)
# labels -> species names.

iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [15]:
# There are 50 flowers from each Iris species: 150 samples in total, each described by 4 features.

iris.data.shape

(150, 4)

In [16]:
# The features here are continuous variables.
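# Thus a decision tree cannot usefully ask exact-match yes/no questions like: is the petal length exactly 4.9 cm?
# Instead, each question compares one feature against a threshold, like: is the petal length <= 2.35 cm?
# A range such as 3.0 cm < petal length < 6.29 cm arises from chaining several such threshold questions along a path.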

In [None]:
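# In a decision tree, each decision splits the data into two groups.
# NOTE(review): the counts below start from 112 samples rather than all 150 -- presumably a training split; confirm.
# NOTE(review): a petal width threshold of 0.2 cm is unlikely to separate versicolor from virginica -- verify the second split against the fitted tree.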

# If petal length <= 2.35         at this point, sample size = 112, species frequency = [37, 34, 41]
#      then -> Iris setosa        here, sample size = 37, species frequency = [37, 0, 0]
# else if petal width > 0.2       sample size = (112 - 37), species frequency = [0, 34, 41]
#      then -> Iris versicolor    sample size = 36, species frequency = [0, 33, 3]
# else -> Iris virginica          sample size = 39, species frequency = [0, 1, 38]