# Decision Tree Lab

In [None]:
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io import arff
from sklearn.model_selection import train_test_split
import urllib.request

## 1 Debug and Eval

### 1.1 (5%) Debug

- Train a DecisionTreeClassifier on the [Iris Dataset](https://raw.githubusercontent.com/cs472ta/CS472/master/datasets/iris.arff) using all default parameters.
- If using Dataframes you may want to change the class values from bytecodes to strings with
iris_df['class'] = iris_df['class'].str.decode('utf-8')

Expected Accuracy = [1.0]


In [None]:
# Debug: fit a default decision tree on Iris and print its held-out accuracy.
url = 'https://raw.githubusercontent.com/cs472ta/CS472/master/datasets/iris.arff'
urllib.request.urlretrieve(url, 'iris.arff')

# loadarff returns the records plus the ARFF metadata.
data, meta = arff.loadarff('iris.arff')
df = pd.DataFrame(data)

# Features are every column except the label; the label bytes are decoded
# to str in the same step.
X = df.drop(columns=['class'])
y = df['class'].str.decode('utf-8')

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so construction and training chain.
dtc = DecisionTreeClassifier().fit(X_train, y_train)

print(dtc.score(X_test, y_test))


1.0


### 1.2 (5%) Evaluation

- Train on the iris data set again but this time with max_depth = 3 and output the accuracy

In [None]:
# Evaluation: reuse the Iris split, but cap the tree at three levels.
dtc2 = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)

depth3_accuracy = dtc2.score(X_test, y_test)
print(depth3_accuracy)

1.0


#### Discussion
What did you see? What were the differences in accuracy between the two trained models? How do you account for the differences or no differences?

I didn't see any difference in accuracy between the two trained models; both came out at 100% accuracy. I believe this is because our dataset is so small and relatively simple (it has only 4 features), so the tree only needs a few splits in order to classify properly.

## 2. Missing Values, N-fold CV, and Decision Tree Items  

### 2.1 (15%) Handling missing values
- Use this [Voting Dataset with missing values](https://raw.githubusercontent.com/cs472ta/CS472/master/datasets/voting_with_missing.arff)
- This data set has missing data.  Create an extra feature value for each feature with missing data. For example, if the feature were color with possible values R, G, B, you would add a fourth value (e.g. U or ? for unknown).
- Do not use a stopping criteria. Induce the tree as far as it can go (until classes are pure or there are no more data or attributes to split on).
- SKlearn does not allow nominal features, which initially seems odd. However, SKlearn uses the binary CART algorithm where a nominal data value like color is broken down into blue or not blue, red or not red, etc.  It is thus natural to just use one-hot encoding for each nominal feature.
- Use an 80/20 train/test split.
- Report the training and test set accuracies.

In [None]:
# Learn Voting with missing values.
# Downloads the voting data, turns '?' into its own category, one-hot encodes
# the features, trains a fully grown tree and reports train/test accuracy.
# Defines X_train_votes, X_test_votes, y_train_votes, y_test_votes and
# dtc_vote, which the later cells rely on.
url = 'https://raw.githubusercontent.com/cs472ta/CS472/master/datasets/voting_with_missing.arff'
urllib.request.urlretrieve(url, 'voting_with_missing.arff')

data, meta = arff.loadarff('voting_with_missing.arff')
df = pd.DataFrame(data)

# Values are loaded as bytes; decode them to str.
df = df.map(lambda x: x.decode('utf-8'))

# Split off the label first so the missing-value handling only touches features.
X = df.drop(columns=['Class'])
y = df['Class']

# Explicit 0/1 indicator per feature: 1 where the vote is unknown ('?').
for col in list(X.columns):
    X[col + '_missing'] = (X[col] == '?').astype(int)

# Make "unknown" a regular category so one-hot encoding yields <feature>_U.
# NOTE: <feature>_U then carries the same information as <feature>_missing.
X = X.replace('?', 'U')

# CART in sklearn needs numeric inputs, so one-hot encode the nominal votes.
X = pd.get_dummies(X, dtype=int)

# 80/20 train/test split with a fixed seed.
X_train_votes, X_test_votes, y_train_votes, y_test_votes = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# No stopping criteria: the default tree grows until it cannot split further.
dtc_vote = DecisionTreeClassifier()
dtc_vote.fit(X_train_votes, y_train_votes)

print(f"Training set accuracy: {dtc_vote.score(X_train_votes, y_train_votes)}")
print(f"Test set accuracy: {dtc_vote.score(X_test_votes, y_test_votes)}")


Unnamed: 0,handicapped-infants_missing,handicapped-infants_U,handicapped-infants_n,handicapped-infants_y
0,0,0,1,0
1,0,0,1,0
2,1,1,0,0
3,0,0,1,0
4,0,0,0,1


#### Discussion
Report on your accuracies and include explaining how the missing values were handled by your model

I made sure to decode all the bytes into strings. Then, if there was a value with a question mark, I added a new feature `col_missing` for each feature if the value was missing (1 if missing, 0 if not missing) and then just replaced the ? with a "U". The final accuracies were 100% on the training set and 94.3% on the testing set

### 2.2 (15%)  N-fold Cross Validation
- Learn the [Cars Dataset](https://raw.githubusercontent.com/cs472ta/CS472/master/datasets/cars.arff) with the decision tree.
- Create a table with the 10-fold cross validation accuracies and show the average predicted accuracy.
- Try it again with 5-fold CV and create and show that table also.

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

# 10-fold and 5-fold cross validation of a decision tree on the Cars data,
# followed by an 80/20 fit that the later cells reuse (dtc_cars, *_cars splits).
url = 'https://raw.githubusercontent.com/cs472ta/CS472/master/datasets/cars.arff'
urllib.request.urlretrieve(url, 'cars.arff')

data, meta = arff.loadarff('cars.arff')
df = pd.DataFrame(data)

# Values are loaded as bytes; decode them to str.
df = df.map(lambda x: x.decode('utf-8'))

X = df.drop(columns=['class'])
y = df['class']

# One-hot encode the nominal features.
X = pd.get_dummies(X, dtype=int)

dtc_cars = DecisionTreeClassifier()

# An integer cv gives stratified folds WITHOUT shuffling. Earlier unshuffled
# runs scored far below the shuffled 80/20 split (about 0.74-0.84 vs 0.96),
# which suggests the rows are stored in a systematic order. Shuffle the folds
# (with a fixed seed) so each fold is representative.
cv_10 = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_5 = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores_for_10 = cross_val_score(dtc_cars, X, y, cv=cv_10)
scores_for_5 = cross_val_score(dtc_cars, X, y, cv=cv_5)

# One row per fold; fold numbers are derived from the number of scores.
df_10 = pd.DataFrame({"Fold": range(1, len(scores_for_10) + 1), "Accuracy": scores_for_10})
df_5 = pd.DataFrame({"Fold": range(1, len(scores_for_5) + 1), "Accuracy": scores_for_5})

# Cross validation does not leave a fitted model behind, so fit one on an
# 80/20 split for the cells that inspect the tree.
X_train_cars, X_test_cars, y_train_cars, y_test_cars = train_test_split(X, y, test_size=0.2, random_state=42)

dtc_cars.fit(X_train_cars, y_train_cars)

print(f"Average 10-Fold CV Accuracy: {scores_for_10.mean():.4f}")
print(f"Average 5-Fold CV Accuracy: {scores_for_5.mean():.4f}")

print("\n")
print("Table for 5-Fold CV:")
print(df_5.to_string(index=False))
print("\n")


print("Table for 10-Fold CV:")
print(df_10.to_string(index=False))

Average 10-Fold CV Accuracy: 0.8426
Average 5-Fold CV Accuracy: 0.7373


Table for 5-Fold CV:
 Fold  Accuracy
    1  0.647399
    2  0.731214
    3  0.748555
    4  0.753623
    5  0.805797


Table for 10-Fold CV:
 Fold  Accuracy
    1  0.757225
    2  0.780347
    3  0.930636
    4  0.676301
    5  0.838150
    6  0.924855
    7  0.895954
    8  0.861272
    9  0.912791
   10  0.848837


#### Discussion
Explain n-fold cross validation. Why do we do it? How is it useful? What does it reveal? Do we end up with a model? If so, which one?

Let me start from the beginning: a pretty good practice is to split the data into maybe 75% training and 25% testing, but that raises the question of which quarter should actually be the testing data and which three quarters should actually be the training data. Well, with n-fold cross validation, we don't really have to worry about that! Like we did above, rather than worrying about which split would be best, cross validation uses each fold as the test set, one at a time, and then outputs the averaged results. It reveals a more reliable estimate of performance on data that the model hasn't seen before. Using n-fold cross validation does not output a model (the training is done after) but just provides a better way than the classical 25/75 split to help us see how our model performs.

### 2.3 (10%) Decision Tree Intuition
For each of the two problems above (Voting and Cars):
- Print the full tree for each.  You may use tree.plot_tree(clf) or [another way](https://mljar.com/blog/visualize-decision-tree/) if you prefer.  tree.plot_tree has a number of parameters which you can try which let you print more informative trees which can help your discussion.
- Train both again with max_depth = 2 and print these smaller trees and include them in your report.
- Summarize in English what these 2 smaller decision trees have learned (i.e. look at the induced trees and describe what "rules" they discovered).
- Compare your thoughts on important features with the attribute feature_importances_

In [None]:
# Print induced trees for the voting and car data sets
from sklearn import tree


def _save_tree_figure(model, columns, filename):
    """Draw a fitted tree on a 20x10 figure and save it to `filename` at 300 dpi."""
    plt.figure(figsize=(20, 10))
    tree.plot_tree(model, filled=True, feature_names=columns, class_names=model.classes_)
    plt.savefig(filename, dpi=300)
    plt.close()


# Trees already fitted in the earlier cells.
_save_tree_figure(dtc_vote, X_train_votes.columns, "voting_full_tree.png")
_save_tree_figure(dtc_cars, X_train_cars.columns, "cars_full_tree.png")

# Depth-limited voting tree.
dtc_vote_2 = DecisionTreeClassifier(max_depth=2)
dtc_vote_2.fit(X_train_votes, y_train_votes)
_save_tree_figure(dtc_vote_2, X_train_votes.columns, "voting_depth2_tree.png")

# Depth-limited cars tree.
dtc_cars_2 = DecisionTreeClassifier(max_depth=2)
dtc_cars_2.fit(X_train_cars, y_train_cars)
_save_tree_figure(dtc_cars_2, X_train_cars.columns, "cars_depth2_tree.png")

In [None]:
from google.colab import files

# Download each saved tree figure, in the order the figures were written.
for png_name in (
    "voting_full_tree.png",
    "cars_full_tree.png",
    "voting_depth2_tree.png",
    "cars_depth2_tree.png",
):
    files.download(png_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

#### Discussion
Discuss what the Trees have learned on the 2 data sets (i.e. look at the induced trees and describe what "rules" they discovered). How do the important features you would think about correspond to the "feature_importances_"?

-- Voting Dataset --

Based off the trees (both the depth2 and full), the tree found that `physician-fee-freeze_y` is the best feature for classifying if someone is a democrat or republican. It starts off with a `gini = 0.477` and then if you voted yes, you were most likely a democrat with a `gini = 0.039`, which is pretty pure. If you voted no, you were most likely a republican with a `gini = 0.152`, which is also pretty pure. From there it split the tree pretty well. There were some gini scores of 0.0 as we went further down the tree; for instance, for `export-administration-act-south-africa_y` the gini was 0 (pure), which means that if you voted yes, you were in fact a democrat, else republican.


-- Cars Dataset --

Based off the trees, the tree found that `safety_low` is the most important feature. If the car wasn't safe, people were not going to buy it (this makes sense to me). After safety, the next most important feature is `persons_2`: the tree found that if the car only seats 2 people, most didn't want to buy that car (this also makes sense, as there are not many cars, i.e. convertibles, sports cars, smart cars, that only have two seats). From there, there was a mixture of gini scores and tons of branches, since different people value different things when buying a car. But we know that safety and number of passengers are a huge deal for anyone looking into purchasing a car.

Thing to note: doing this all the way down to the max depth can lead to overfitting.

### 2.4 (5%) Other Parameters
- For either of the data sets above experiment and discuss using a different split criterion (Compare Entropy and Log-loss with Gini)

In [None]:
# Experiment with criterion parameter
# Fits one tree per split criterion on the voting split and prints its test
# accuracy and fit time.
import time

# Fixed seed so that any difference comes from the criterion and not from the
# random tie-breaking between equally good splits.
dtc_gini = DecisionTreeClassifier(criterion='gini', random_state=42) # default
dtc_entropy = DecisionTreeClassifier(criterion='entropy', random_state=42)
# In scikit-learn 'entropy' and 'log_loss' both mean Shannon information gain.
dtc_log_loss = DecisionTreeClassifier(criterion='log_loss', random_state=42)

criterion_models = (
    ('gini', dtc_gini),
    ('entropy', dtc_entropy),
    ('log_loss', dtc_log_loss),
)

for position, (criterion_name, model) in enumerate(criterion_models):
    if position:
        # Blank separator between consecutive reports.
        print("\n")

    # perf_counter is the high-resolution clock meant for timing short intervals.
    time_start = time.perf_counter()
    model.fit(X_train_votes, y_train_votes)
    time_end = time.perf_counter()

    print(f"{criterion_name} score: {model.score(X_test_votes, y_test_votes)}")
    print(f"{criterion_name} time: {time_end - time_start}")


gini score: 0.9425287356321839
gini time: 0.007455587387084961


entropy score: 0.9425287356321839
entropy time: 0.004505634307861328


log_loss score: 0.9425287356321839
log_loss time: 0.0035316944122314453


#### Discussion
How does using different split criteria (entropy, log-loss, and gini) affect accuracy, tree structure, and feature importance?

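Based on the run shown above, all three split criteria (`gini`, `entropy`, and `log_loss`) produced exactly the same test accuracy (94.3%), so from an accuracy standpoint none of them is clearly preferred on this dataset. In an earlier run `entropy` scored slightly higher, but not by a ton, and a difference that small is likely run-to-run variation. In scikit-learn, `entropy` and `log_loss` both choose splits by Shannon information gain, so those two should behave the same. Maybe if the dataset were larger, we would see a bigger difference in scores...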

## 3 Overfit Avoidance with Decision Trees  

Above, you found typical training and test set scores for the Cars data set when the tree is induced as far as it can go (until classes are pure or there are no more data or attributes to split on).  This usually leads to great training set scores but can potentially overfit and get lower accuracy on the test set.  You will now experiment with methods which can help avoid overfit and which could lead to better test set accuracy (though training set accuracy may decrease).  

### 3.1 Smaller and Simpler Trees (20%)
- tree_: [Read about](https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html#sphx-glr-auto-examples-tree-plot-unveil-tree-structure-py) the tree_ attribute with its sub attributes and methods allowing you to interact with your learned tree.  You don't have to do any specific task for this part.
- Use an 80/20 train/test split for all experiments in this part and induce (learn/fit) the full tree for Cars.
- For the fully induced tree print out
    - Training set accuracy
    - Test set accuracy
    - Total number of nodes (clf.tree_.node_count)
    - Maximum tree depth (clf.tree_.max_depth)
- Experiment with the following parameters which lead to smaller and/or simpler trees which can help with overfit.  Try a few different values of each parameter and compare their train and test set accuracies and number of nodes and depth with the fully induced tree.  If you are not sure how parameters are actually working, print some trees to see their effect.  Due to the simplicity of the Cars data set you may not see as great of accuracy improvements as you would for cases where overfit is more prominent.  
    - min_samples_leaf
    - min_samples_split
    - min_impurity_decrease
- Try these parameters also, but note that they could lead to underfit
    - max_depth
    - max_leaf_nodes
    - max_features

In [None]:
# Explore different overfit parameters
# Baseline: a fully induced tree on an 80/20 split of the Cars data.
url = 'https://raw.githubusercontent.com/cs472ta/CS472/master/datasets/cars.arff'
urllib.request.urlretrieve(url, 'cars.arff')

data, meta = arff.loadarff('cars.arff')

# Build the frame and decode every bytes value to str in one chain.
df = pd.DataFrame(data).map(lambda raw: raw.decode('utf-8'))

# Label column, and the one-hot encoded remaining columns as features.
y = df['class']
X = pd.get_dummies(df.drop(columns=['class']), dtype=int)

X_train_cars, X_test_cars, y_train_cars, y_test_cars = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator, so the default tree is built and trained here.
dtc_cars = DecisionTreeClassifier().fit(X_train_cars, y_train_cars)

train_accuracy = dtc_cars.score(X_train_cars, y_train_cars)
test_accuracy = dtc_cars.score(X_test_cars, y_test_cars)
fitted_tree = dtc_cars.tree_

print(f"Training set accuracy: {train_accuracy}")
print(f"Test set accuracy: {test_accuracy}")
print(f"Total number of nodes: {fitted_tree.node_count}")
print(f"Maximum tree depth: {fitted_tree.max_depth}")

Training set accuracy: 1.0
Test set accuracy: 0.9595375722543352
Total number of nodes: 171
Maximum tree depth: 13


In [None]:
# Params that could help with overfitting: min_samples_leaf, min_samples_split, min_impurity_decrease

def _report_cars_tree(clf):
    """Print train/test accuracy, node count and depth of a tree fitted on the Cars split."""
    print(f"Training set accuracy: {clf.score(X_train_cars, y_train_cars)}")
    print(f"Test set accuracy: {clf.score(X_test_cars, y_test_cars)}")
    print(f"Total number of nodes: {clf.tree_.node_count}")
    print(f"Maximum tree depth: {clf.tree_.max_depth}")
    print("\n")


min_samples_leaf_values = [1, 5, 10, 20]
min_samples_split_values = [2, 10, 20, 50]
min_impurity_decrease_values = [0.0, 0.01, 0.02, 0.05]

# (banner, constructor keyword, values to try) for each experiment.
overfit_sweeps = [
    (" --------------------- Min Sample Leaf Values --------------------- ",
     "min_samples_leaf", min_samples_leaf_values),
    (" --------------------- Min Samples split values --------------------- ",
     "min_samples_split", min_samples_split_values),
    (" --------------------- Min Impurity Decrease Values --------------------- ",
     "min_impurity_decrease", min_impurity_decrease_values),
]

for banner, param_name, candidates in overfit_sweeps:
    print(banner)
    for candidate in candidates:
        # Fixed seed: an unseeded default tree gave slightly different test
        # accuracies from run to run, which would blur the comparison.
        dtc_cars = DecisionTreeClassifier(random_state=42, **{param_name: candidate})
        dtc_cars.fit(X_train_cars, y_train_cars)
        # Label each report so it is clear which value produced it.
        print(f"{param_name} = {candidate}")
        _report_cars_tree(dtc_cars)


 --------------------- Min Sample Leaf Values --------------------- 
Training set accuracy: 1.0
Test set accuracy: 0.9624277456647399
Total number of nodes: 171
Maximum tree depth: 13


Training set accuracy: 0.9725036179450073
Test set accuracy: 0.953757225433526
Total number of nodes: 107
Maximum tree depth: 11


Training set accuracy: 0.9392185238784371
Test set accuracy: 0.9161849710982659
Total number of nodes: 69
Maximum tree depth: 10


Training set accuracy: 0.9117221418234442
Test set accuracy: 0.9132947976878613
Total number of nodes: 41
Maximum tree depth: 9


 --------------------- Min Samples split values --------------------- 
Training set accuracy: 1.0
Test set accuracy: 0.9566473988439307
Total number of nodes: 171
Maximum tree depth: 13


Training set accuracy: 0.9761215629522432
Test set accuracy: 0.9479768786127167
Total number of nodes: 111
Maximum tree depth: 11


Training set accuracy: 0.9522431259044862
Test set accuracy: 0.9075144508670521
Total number of nodes:

In [None]:
# Params that could lead to underfit: max_depth, max_leaf_nodes, max_features

def _report_cars_tree(clf):
    """Print train/test accuracy, node count and depth of a tree fitted on the Cars split."""
    print(f"Training set accuracy: {clf.score(X_train_cars, y_train_cars)}")
    print(f"Test set accuracy: {clf.score(X_test_cars, y_test_cars)}")
    print(f"Total number of nodes: {clf.tree_.node_count}")
    print(f"Maximum tree depth: {clf.tree_.max_depth}")
    print("\n")


max_depth_list = [2,5,10,None]
max_leaf_nodes_list = [2, 5, 10, 20, None]
max_features_list = [None, 'sqrt', 'log2']

# (banner, constructor keyword, values to try) for each experiment.
underfit_sweeps = [
    (" --------------------- Max Depth --------------------- ",
     "max_depth", max_depth_list),
    (" --------------------- Max Leaf Nodes --------------------- ",
     "max_leaf_nodes", max_leaf_nodes_list),
    (" --------------------- Max Features --------------------- ",
     "max_features", max_features_list),
]

for banner, param_name, candidates in underfit_sweeps:
    print(banner)
    for candidate in candidates:
        # Fixed seed: max_features='sqrt'/'log2' samples features at random,
        # and even the default tree breaks ties randomly, so unseeded runs
        # are not comparable.
        dtc_cars = DecisionTreeClassifier(random_state=42, **{param_name: candidate})
        dtc_cars.fit(X_train_cars, y_train_cars)
        # Label each report so it is clear which value produced it.
        print(f"{param_name} = {candidate}")
        _report_cars_tree(dtc_cars)



 --------------------- Max Depth --------------------- 
Training set accuracy: 0.7727930535455861
Test set accuracy: 0.7976878612716763
Total number of nodes: 5
Maximum tree depth: 2


Training set accuracy: 0.8523878437047757
Test set accuracy: 0.8670520231213873
Total number of nodes: 19
Maximum tree depth: 5


Training set accuracy: 0.9876989869753979
Test set accuracy: 0.953757225433526
Total number of nodes: 147
Maximum tree depth: 10


Training set accuracy: 1.0
Test set accuracy: 0.9624277456647399
Total number of nodes: 171
Maximum tree depth: 13


 --------------------- Max Leaf Nodes --------------------- 
Training set accuracy: 0.7054992764109985
Test set accuracy: 0.6791907514450867
Total number of nodes: 3
Maximum tree depth: 1


Training set accuracy: 0.8024602026049205
Test set accuracy: 0.8179190751445087
Total number of nodes: 9
Maximum tree depth: 4


Training set accuracy: 0.869753979739508
Test set accuracy: 0.8988439306358381
Total number of nodes: 19
Maximum tree 

#### Discussion
How did the methods used above help avoid overfit? How do you know? How did they affect accuracy (training and test) and tree structure? Which parameters helped the most with each dataset? How do you know?

The best thing to look for when seeing if a method has helped with overfitting is whether the test and training accuracy are pretty close to each other. We started out with 100% train accuracy and 95% test accuracy, which shows some signs of overfitting. Based on the results above, the biggest help in avoiding overfit was `min_samples_leaf = 5` (the setting that produced the numbers below):


Training set accuracy: 0.9725036179450073
Test set accuracy: 0.953757225433526
Total number of nodes: 107
Maximum tree depth: 11


Also I did some underfitting with certain parameters, especially playing with max_leaf_nodes which is to be expected per the paragraph above.

### 3.2 (10%) Tree Reduction
Another approach to avoiding overfit is using pruning to reduce fully induced trees.  Induce the tree fully for Cars (no simplifying parameters such as max_depth).  Prune by setting the [ccp_alpha](https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html#sphx-glr-auto-examples-tree-plot-cost-complexity-pruning-py) parameter to a positive value. This parameter controls how aggressive the pruning is. Try some small values (e.g. .001, .005, etc.) and try to find and report the value which works the best.  Make a table with at least 5 ccp_alpha values and for each value include
- Training set accuracy (you choose the size of the train/test split)
- Test set accuracy
- Total number of nodes (clf.tree_.node_count)
- Maximum tree depth (clf.tree_.max_depth)

In [None]:
# Pruning
# Cost-complexity pruning sweep on the Cars split: one table row per ccp_alpha
# with train/test accuracy, node count and depth, then the best alpha.
ccp_alpha_values = [0.000, 0.001, 0.005, 0.01, 0.02, 0.05]

pruning_rows = []
for ccp_alpha in ccp_alpha_values:
    # Fixed seed so rows differ only because of the pruning strength.
    dtc_cars = DecisionTreeClassifier(ccp_alpha=ccp_alpha, random_state=42)
    dtc_cars.fit(X_train_cars, y_train_cars)
    pruning_rows.append({
        "ccp_alpha": ccp_alpha,
        "Training set accuracy": dtc_cars.score(X_train_cars, y_train_cars),
        "Test set accuracy": dtc_cars.score(X_test_cars, y_test_cars),
        "Total number of nodes": dtc_cars.tree_.node_count,
        "Maximum tree depth": dtc_cars.tree_.max_depth,
    })

pruning_table = pd.DataFrame(pruning_rows)
print(pruning_table.to_string(index=False))

# idxmax returns the first maximum, so ties go to the smaller ccp_alpha.
best_row = pruning_table.loc[pruning_table["Test set accuracy"].idxmax()]
print("\n")
print(f"Best ccp_alpha by test set accuracy: {best_row['ccp_alpha']}")


Training set accuracy: 1.0
Test set accuracy: 0.9624277456647399
Total number of nodes: 171
Maximum tree depth: 13


Training set accuracy: 0.9869753979739508
Test set accuracy: 0.9479768786127167
Total number of nodes: 107
Maximum tree depth: 12


Training set accuracy: 0.9261939218523878
Test set accuracy: 0.930635838150289
Total number of nodes: 35
Maximum tree depth: 8


Training set accuracy: 0.8835021707670043
Test set accuracy: 0.8901734104046243
Total number of nodes: 25
Maximum tree depth: 8


Training set accuracy: 0.7727930535455861
Test set accuracy: 0.7976878612716763
Total number of nodes: 5
Maximum tree depth: 2


Training set accuracy: 0.7727930535455861
Test set accuracy: 0.7976878612716763
Total number of nodes: 5
Maximum tree depth: 2




#### Discussion
How did the pruning parameter ccp_alpha affect accuracy and tree structure? How does that compare to the methods above?

When we keep our `ccp_alpha` values very low, it is good and we can see that we are not overfitting. The best value is `0.001`. When we start to be too aggressive and set the `ccp_alpha` value higher (0.01+), that is when we see our model underfitting. The difference between pruning and explicitly playing with the other params (like we did above) is that instead of setting various limits, through pruning we allow the tree to grow into its full state and then "cut off" the branches that provide the least decision info (impurity reduction) for their size, which reduces the tree's complexity.

## 4. Decision Tree Regression
### 4.1 (15%) Learn a real-world regression data set of your choice (not already used in this or previous labs)
- Report tree statistics (# of nodes, # of leaf nodes, max depth)
- Report MAE on the training and test set (you choose the size of the train/test split)
- Report the DT regressor score for the training and test set.  Note that for the DT regressor this score is the coefficient of determination. Google it if you are curious.

In [None]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [None]:
#Learn regression data set
# Fits a pruned decision tree regressor on a UCI dataset and reports R^2,
# MAE and tree statistics for an 80/20 split.
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from ucimlrepo import fetch_ucirepo

# UCI dataset id 242.
energy_efficiency = fetch_ucirepo(id=242)

X = energy_efficiency.data.features
# NOTE(review): targets may contain more than one column (heating and cooling
# load); if so, score() and MAE below are averaged over both - confirm that
# multi-output regression is intended.
y = energy_efficiency.data.targets

# print(energy_efficiency.metadata)
# print(energy_efficiency.variables)


# Cost-complexity pruning keeps the tree from growing to full depth; the
# fixed seed makes the reported statistics reproducible.
dtr = DecisionTreeRegressor(ccp_alpha=0.005, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
dtr.fit(X_train, y_train)

# For a regressor, score() is the coefficient of determination (R^2),
# not a classification accuracy.
train_score = dtr.score(X_train, y_train)
test_score = dtr.score(X_test, y_test)
mae_train = mean_absolute_error(y_train, dtr.predict(X_train))
mae_test = mean_absolute_error(y_test, dtr.predict(X_test))

print(f"Training set R^2 score: {train_score}")
print(f"Test set R^2 score: {test_score}")
print(f"MAE on training set: {mae_train}")
print(f"MAE on test set: {mae_test}")


num_nodes = dtr.tree_.node_count
num_leaves = dtr.get_n_leaves()
max_depth = dtr.tree_.max_depth

print(f"Number of nodes: {num_nodes}")
print(f"Number of leaves: {num_leaves}")
print(f"Max depth: {max_depth}")


Training set accuracy: 0.9974848071427982
Test set accuracy: 0.970320732829923
MAE on training set: 0.13631958662944008
MAE on test set: 0.8186558441558445
Number of nodes: 903
Number of leaves: 452
Max depth: 10


#### Discussion
Discuss your choice of dataset and regression feature. Also discuss the items listed above in 4.1

I picked the Energy Efficiency Dataset which predicts the heating or cooling load of a building based on structural and design features.

At first I didn't have any pruning, which proved to be a serious mistake: it produced a really deep tree with tons of nodes and leaves. I then set `ccp_alpha=0.005` and was able to get 99% train acc while still having 97% test acc. I was fiddling with the other params like up above in Question #3 but wasn't able to get better accuracy for both sets like I did with pruning.



## 5. (Optional 15% extra credit) Code up your own C4.5 Decision Tree Learner
Implement the C4.5 algorithm discussed in class and in the slides, NOT the CART algorithm.  Below is a scaffold you could use if you want. Requirements for this task:
- Your model should support the methods shown in the example scaffold below.
- Use standard information gain as your basic attribute evaluation metric.  Note that C4.5 would usually augment information gain with a mechanism to penalize statistically insignificant attribute splits to avoid overfit (e.g. early stopping, gain ratio, etc.), but you are not required to do that.
- Include the ability to handle unknown attributes by making "unknown" a new attribute value when needed.
- You do not need to handle real valued attributes.
- It is a good idea to use simple data set (like the pizza homework), which you can check by hand, to test each detailed step of your algorithm to make sure it works correctly.
- Run your algorithm on the voting data set above with unknown attributes and compare your results with CART.

Discussion

In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin

class DTClassifier(BaseEstimator,ClassifierMixin):
    """Decision tree for nominal features, grown by maximising information gain.

    Each internal node splits on one attribute and has one branch per value
    seen for that attribute in the node's training rows. Growth stops when a
    node is pure or no attributes are left. Missing feature values (None or
    NaN) are treated as one extra attribute value, "unknown".
    """

    # Stand-in category that replaces missing feature values.
    _UNKNOWN = "__unknown__"

    def __init__(self,counts=None):
        """ Initialize class with chosen hyperparameters.
        Args:
        Optional Args (Args we think will make your life easier):
            counts: A list of Ints that tell you how many types of each feature there are.
                Stored but not needed by this implementation: branches are
                created from the values observed in the data, and a value with
                no branch falls back to the node's majority class.
        Example:
            DT  = DTClassifier()
            or
            DT = DTClassifier(counts = [2,3,2,2])
            Dataset =
            [[0,1,0,0],
            [1,2,1,1],
            [0,1,1,0],
            [1,2,0,1],
            [0,0,1,1]]

        """
        # BaseEstimator.get_params()/clone() read every __init__ argument back
        # from an attribute of the same name, so it must be stored.
        self.counts = counts

    @classmethod
    def _as_nominal(cls, X):
        """Return X as a 2D object array with missing values replaced by the unknown category."""
        table = np.asarray(X, dtype=object)
        if table.ndim != 2:
            raise ValueError("X must be 2-dimensional (n_samples, n_features)")
        cleaned = table.copy()  # never modify the caller's data
        for position, value in np.ndenumerate(table):
            # NaN is the only float that is not equal to itself.
            if value is None or (isinstance(value, float) and value != value):
                cleaned[position] = cls._UNKNOWN
        return cleaned

    @staticmethod
    def _entropy(y):
        """Return the Shannon entropy (in bits) of the labels in y."""
        _, tallies = np.unique(y, return_counts=True)
        shares = tallies / tallies.sum()
        return float(-np.sum(shares * np.log2(shares)))

    @staticmethod
    def _partition(column):
        """Group row positions by attribute value: {value: [row, ...]}."""
        groups = {}
        for row, value in enumerate(column):
            groups.setdefault(value, []).append(row)
        return groups

    def _grow(self, X, y, features):
        """Recursively build the subtree for rows (X, y) using the unused attributes in features."""
        labels, tallies = np.unique(y, return_counts=True)
        majority = labels[np.argmax(tallies)]  # ties go to the first sorted label

        # Stop when the node is pure or there is nothing left to split on.
        if len(labels) == 1 or not features:
            return {"label": majority}

        node_entropy = self._entropy(y)
        best_feature, best_gain, best_groups = None, -1.0, None
        for feature in features:
            groups = self._partition(X[:, feature])
            # Entropy remaining after the split, weighted by branch size.
            remainder = sum(
                len(rows) / len(y) * self._entropy(y[rows])
                for rows in groups.values()
            )
            gain = node_entropy - remainder
            # Small tolerance so float noise does not reorder tied attributes.
            if gain > best_gain + 1e-12:
                best_feature, best_gain, best_groups = feature, gain, groups

        # A nominal attribute is used at most once along any path.
        remaining = [feature for feature in features if feature != best_feature]
        children = {
            value: self._grow(X[rows], y[rows], remaining)
            for value, rows in best_groups.items()
        }
        return {"feature": best_feature, "majority": majority, "children": children}

    def fit(self, X, y):
        """ Fit the data; Make the Decision tree

        Args:
            X (array-like): A 2D numpy array with the training data, excluding targets
            y (array-like): A 1D numpy array with the training targets

        Returns:
            self: this allows this to be chained, e.g. model.fit(X,y).predict(X_test)

        Raises:
            ValueError: if X is not 2D, is empty, or its row count differs from len(y)

        """
        table = self._as_nominal(X)
        targets = np.asarray(y).ravel()
        if table.shape[0] == 0:
            raise ValueError("cannot fit a tree on an empty dataset")
        if table.shape[0] != targets.shape[0]:
            raise ValueError("X and y must have the same number of rows")

        self.classes_ = np.unique(targets)
        self.n_features_in_ = table.shape[1]
        self.tree_ = self._grow(table, targets, list(range(table.shape[1])))

        return self

    def predict(self, X):
        """ Predict all classes for a dataset X

        Args:
            X (array-like): A 2D numpy array with the data to classify, excluding targets

        Returns:
            array, shape (n_samples,)
                Predicted target values per element in X.

        Raises:
            ValueError: if the model has not been fitted or X has the wrong number of columns
        """
        if not hasattr(self, "tree_"):
            raise ValueError("This DTClassifier instance is not fitted yet; call fit first")
        table = self._as_nominal(X)
        if table.shape[1] != self.n_features_in_:
            raise ValueError("X has a different number of features than the training data")

        predictions = []
        for row in table:
            node = self.tree_
            while "label" not in node:
                child = node["children"].get(row[node["feature"]])
                if child is None:
                    # Value never seen at this node during training:
                    # fall back to the node's majority class.
                    predictions.append(node["majority"])
                    break
                node = child
            else:
                predictions.append(node["label"])
        return np.array(predictions, dtype=self.classes_.dtype)


    def score(self, X, y):
        """ Return accuracy(Classification Acc) of model on a given dataset. Must implement own score function.

        Args:
            X (array-like): A 2D numpy array with data, excluding targets
            y (array-like): A 1D numpy array of the targets

        Returns:
            float: fraction of rows whose prediction equals the target
                (0.0 when y is empty)
        """
        targets = np.asarray(y).ravel()
        if targets.size == 0:
            return 0.0
        return float(np.mean(self.predict(X) == targets))

In [None]:
# Optional Debugging Dataset - Pizza Homework
# pizza_dataset = np.array([[1,2,0],[0,0,0],[0,1,1],[1,1,1],[1,0,0],[1,0,1],[0,2,1],[1,0,0],[0,2,0]])
# pizza_labels = np.array([2,0,1,2,1,2,1,1,0])