In [1]:
# Decision-tree classifier on the iris dataset.

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
X = iris.data[:, 2:]  # keep only the last two feature columns: petal length and width
y = iris.target

# Pin random_state: scikit-learn randomly permutes the features at each split,
# so without a seed a fresh "Restart & Run All" is not guaranteed to rebuild
# the same tree when several splits are equally good.
tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
tree_clf.fit(X, y)

DecisionTreeClassifier(max_depth=2)

In [2]:
# A trained decision tree can be visualised with Graphviz: export it as a
# .dot file, which the Graphviz `dot` tool can then render to an image
# (e.g. `dot -Tpng iris_tree.dot -o iris_tree.png`).

from sklearn.tree import export_graphviz

export_graphviz(
    tree_clf,
    out_file='iris_tree.dot',
    # same two columns (petal length / width) that the tree was trained on
    feature_names=iris.feature_names[2:],
    # iris.target_names is a NumPy array; pass a plain list because some
    # scikit-learn releases (e.g. 1.3.0) validate class_names as a list and
    # reject an ndarray
    class_names=list(iris.target_names),
    rounded=True,  # draw node boxes with rounded corners
    filled=True,   # colour each node by its majority class
)

# note that gini measures impurity
# a node is pure (gini = 0) when every training instance it applies to belongs to the same class;
# if the node contains a mix of classes, gini will be greater than 0

# sklearn uses the cart algorithm which will only produce binary trees. id3 can produce
# trees with any number of children

# white box vs black box

# dt is white box model, as you can easily tell why it made the classification it did.
# random forests and neural networks are black box, because you can understand the computation
# but not really how the classification was made



In [4]:
# A tree can also report class probabilities: it finds the leaf node the
# sample falls into and returns, for each class, the number of training
# samples of that class in the leaf divided by the leaf's total.

petal_sample = [[5, 1.5]]  # 5 cm long, 1.5 cm wide
tree_clf.predict_proba(petal_sample)

array([[0.        , 0.90740741, 0.09259259]])

In [5]:
# predict returns only the class, not the per-class probabilities
petal_sample = [[5, 1.5]]
tree_clf.predict(petal_sample)

array([1])

In [6]:
# scikit uses cart (classification and regression tree) algorithm
# splits dataset into two subsets using a single feature k and a threshold tk
# the pair (k, tk) is chosen as the one that produces the purest subsets, weighted by their size
# does this recursively to build tree until stopping conditions satisfied
# greedy algorithm

# prediction is O(log(m)), while training is O(nmlog(m)) because it looks at all features
# on all samples at each node. for small datasets, older scikit versions could speed up training by presorting
# the data (presort = True), but this slows training down considerably for large sets
# (note: presort was deprecated in scikit-learn 0.22 and removed in 0.24)

# instead of gini impurity, we can measure with entropy by setting criterion hyperparameter to entropy.
# reduction of entropy is often called information gain. entropy close to 0 means everything is well ordered,
# as entropy originated from uncertainty of molecules in thermodynamics.
# a set's entropy is zero when it contains instances of only one class

# entropy tends to produce more balanced trees. gini has faster computation and isolates most frequent class in
# its own branch of the tree



In [7]:
# regularization

# dts will overfit very easily, as a dt is a non-parametric model, meaning the number of params is not determined
# prior to training. parametric is the opposite, like a linear model has a predetermined number of params.
# because parametric has limited degrees of freedom, it reduces the risk of overfitting but increases
# risk of underfitting

# one way is max_depth hyperparam. min_samples_split is minimum number of samples a node must
# have before it can split. min_samples_leaf is minimum number of samples a leaf node must have.
# min_weight_fraction_leaf is same as min samples leaf but expressed as a fraction of total number of 
# weighted instances. max_leaf_nodes is max number of leaf nodes, max_features is max number of features
# evaluated for splitting at each node
# increasing min_* or reducing max_* will regularize the model

# other algorithms build the tree without restrictions and then prune it. a node whose children are all leaf
# nodes is considered unnecessary if the purity improvement it provides is not statistically significant,
# which is checked using the chi squared test. the null hypothesis is that the improvement is purely chance.
# if the p-value is higher than a threshold, typically 5% (hyperparameter), then the node is considered
# unnecessary and its children are dropped



In [8]:
# Regression

# Train a regression tree on a noisy quadratic dataset with max depth of 2.

import numpy as np
from sklearn.tree import DecisionTreeRegressor

# Build the noisy quadratic data under its own names: fitting on the iris
# X / y would regress on class labels and would not match the description
# above; separate names also leave the iris arrays untouched.
rng = np.random.default_rng(42)  # seeded so the data is reproducible
X_quad = rng.random((200, 1)) - 0.5  # one feature, uniform in [-0.5, 0.5)
y_quad = X_quad[:, 0] ** 2 + 0.025 * rng.standard_normal(200)  # y = x^2 + Gaussian noise

tree_reg = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg.fit(X_quad, y_quad)

# the predicted value for each region is the average target value of the training instances
# in that region; each node also reports the mean squared error of that prediction

# the cart algorithm works similarly: instead of minimizing impurity it now tries to minimize mean squared error
# regression trees still overfit very badly, so remember to set the regularization hyperparameters


DecisionTreeRegressor(max_depth=2)

In [None]:
# Instability

# one issue is the nature of the decision boundary in dts to be orthogonal to an axis
# simple linearly separable dataset down the middle is rotated 45 degrees; now decision boundary
# looks like a staircase instead of a line. PCA can reduce this kind of issue.

# dts are very sensitive to small variations in the training data:
# removing the widest petaled iris versicolor created a whole different model.
# you can even get very different models based on the same training data, because the scikit dt
# algorithm is stochastic (random) unless you set the random_state hyperparameter

# Random Forests can limit this instability by averaging predictions of many trees