In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries
import time
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import MultipleLocator

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

import nltk

# Set the randomizer seed so results are the same each time.
np.random.seed(0)

data = pd.read_csv(r"..\..\data\covtype.csv")
train_df = pd.read_csv(r"..\..\data\train.csv")
test_df = pd.read_csv(r"..\..\data\test.csv")

# Build the np arrays
train_data = train_df.to_numpy()
test_data = test_df.to_numpy()

# Get last column for train labels
train_labels = train_data[ :,55]

#Remove last column from train_data because that is the labels
train_data = np.delete(train_data, 55, axis=1)

# Shuffle the input
shuffle = np.random.permutation(np.arange(train_data.shape[0]))
train_data, train_labels = train_data[shuffle], train_labels[shuffle]

# Set some variables to hold test, dev, and training data.
#test_data, test_labels = train_data[0:2000,:], train_labels[0:2000]
dev_data, dev_labels = train_data[2000:14000,:], train_labels[2000:14000]
mini_train_data, mini_train_labels = train_data[14000:15000,:], train_labels[14000:15000]

## Decision Tree

In [2]:
from sklearn.model_selection import cross_val_score
from sklearn import tree

def print_importances(clf):
    importances = np.round(clf.feature_importances_,4)
    features = train_df.columns[0:55].to_numpy()
    importances_df = pd.DataFrame({'feature':features,'importance':importances})
    importances_df = importances_df.sort_values('importance',ascending=False)
    importances_df = importances_df[(importances_df.sum(axis=1) != 0)]  
    print(importances_df)
    
def print_structure(clf):
    n_nodes = clf.tree_.node_count
    children_left = clf.tree_.children_left
    children_right = clf.tree_.children_right
    feature = clf.tree_.feature
    threshold = clf.tree_.threshold

    node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
    is_leaves = np.zeros(shape=n_nodes, dtype=bool)
    stack = [(0, 0)]  # start with the root node id (0) and its depth (0)
    while len(stack) > 0:
        # `pop` ensures each node is only visited once
        node_id, depth = stack.pop()
        node_depth[node_id] = depth

        # If the left and right child of a node is not the same we have a split
        # node
        is_split_node = children_left[node_id] != children_right[node_id]
        # If a split node, append left and right children and depth to `stack`
        # so we can loop through them
        if is_split_node:
            stack.append((children_left[node_id], depth + 1))
            stack.append((children_right[node_id], depth + 1))
        else:
            is_leaves[node_id] = True

    print("The binary tree structure has {n} nodes and has the following tree structure:\n".format(n=n_nodes))
    for i in range(n_nodes):
        if is_leaves[i]:
            values = clf.tree_.value[i]
            max_value = values.max()
            cover_type = np.argmax(values)
            print("{space}node={node} is a leaf node with max value of {value} and cover type of {ctype}"
                  .format(space=node_depth[i] * "\t", node=i, value=max_value, ctype=cover_type))
        else:
            print("{space}node={node} is a split node: go to node {left} if {feature} <= {threshold} else to node {right}.".format(
                      space=node_depth[i] * "\t",
                      node=i,
                      left=children_left[i],
                      feature=train_df.columns[feature[i]],
                      threshold=threshold[i],
                      right=children_right[i]))



In [4]:
clf = tree.DecisionTreeClassifier(max_depth=4, random_state=0)
clf = clf.fit(train_data, train_labels)
predicted_labels = clf.predict(dev_data)
tree_model_score = clf.score(dev_data, dev_labels)
depth = clf.get_depth()
num_leaves = clf.get_n_leaves()

print('Desicion Tree Score: %.2f%%' % (tree_model_score*100))   
print('Decision Tree Depth is %d and has %d leaves' % (depth, num_leaves))

#plt.figure(figsize=(75,10))
#tree.plot_tree(clf, feature_names=train_df.columns, proportion=True)
#plt.show()
#print_importances(clf)
#print_structure(clf)
print("Confusion Matrix:\n")
print(confusion_matrix(dev_labels, predicted_labels))



Desicion Tree Score: 64.74%
Decision Tree Depth is 4 and has 16 leaves
Confusion Matrix:

[[1068  286    0    0  148    6  187]
 [ 420  641    4    1  569   73   13]
 [   0    1  717  326  112  565    0]
 [   0    0   92 1529    0  103    0]
 [   0   81    7    0 1530   94    0]
 [   0    8  407  206  144  933    0]
 [ 371    1    0    0    6    0 1351]]


In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import plot_confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

depths = {"max_depth": [3, 4, 5, 10, 25, 30]}

#Grid search is not working as expected.  Keeps claming the second to last is better even though it is not...
grid_search_decision_tree = GridSearchCV(tree.DecisionTreeClassifier(), depths, scoring='accuracy')
grid_search_decision_tree.fit(train_data, train_labels)
predicted = grid_search_decision_tree.predict(dev_data)
optimal_depth = grid_search_decision_tree.best_params_['max_depth']
print("The optimal value for depth using GridSearchCV method is {depth} with accuracy of {accuracy}"
      .format(depth=optimal_depth, accuracy=metrics.accuracy_score(dev_labels, predicted)))

clf_final = tree.DecisionTreeClassifier(max_depth=25, random_state=0)
clf_final = clf_final.fit(train_data, train_labels)
test_predictions = clf_final.predict(test_df)

predictions_dt = pd.DataFrame(data = test_predictions, index = test_df.loc[:, "Id"], columns = ["Cover_Type"])
predictions_dt.to_csv("dt_predictions.csv")  #Kaggle Score of 0.59266
predictions_dt 

The optimal value for depth using GridSearchCV method is 30 with accuracy of 0.9995


Unnamed: 0_level_0,Cover_Type
Id,Unnamed: 1_level_1
15121,5
15122,5
15123,5
15124,5
15125,5
...,...
581008,3
581009,3
581010,3
581011,3


In [6]:
from sklearn.ensemble import AdaBoostClassifier

boosted_decision_tree = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=20), 
                              n_estimators=50, learning_rate=1,
                              algorithm="SAMME")
boosted_decision_tree.fit(train_data, train_labels)
boosted_tree_model_score = boosted_decision_tree.score(dev_data, dev_labels)
print("Boosted Model Score: ", boosted_tree_model_score)

boosted_predictions = boosted_decision_tree.predict(test_df)
predictions_bdt = pd.DataFrame(data = boosted_predictions, index = test_df.loc[:, "Id"], columns = ["Cover_Type"])
predictions_bdt.to_csv("bdt_predictions.csv")  #Kaggle Score of 0.69381 increase of 10%!!!
#predictions_bdt 


Boosted Model Score:  1.0


In [7]:
from sklearn.ensemble import RandomForestClassifier

rnd = RandomForestClassifier(criterion="entropy", max_depth=20, bootstrap=True, random_state=0)
rnd.fit(train_data, train_labels)
print(cross_val_score(rnd, dev_data, dev_labels, cv = 3).mean())

rf_predictions = rnd.predict(test_df)
predictions_rf = pd.DataFrame(data = rf_predictions, index = test_df.loc[:, "Id"], columns = ["Cover_Type"])
predictions_rf.to_csv("rf_predictions.csv")  #Kaggle Score of 0.71587


0.8543333333333333


In [8]:
### Warning - this runs VERY slow with n_estimators at 1000
from sklearn.ensemble import ExtraTreesClassifier
ex = ExtraTreesClassifier(random_state=0, n_estimators = 1000) 

ex.fit(train_data, train_labels)
print(cross_val_score(ex, dev_data, dev_labels, cv = 3).mean())

ex_predictions = ex.predict(test_df)
predictions_ex = pd.DataFrame(data = ex_predictions, index = test_df.loc[:, "Id"], columns = ["Cover_Type"])
predictions_ex.to_csv("ex_predictions.csv")  #Kaggle Score of 0.72549


0.8525
