# Decision Tree with Grid Search and Adaboost models

In [1]:
# Display the installed scikit-learn version so the results below can be tied
# to a specific library release.
import sklearn
sklearn.__version__

'0.23.1'

In [2]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# Import a bunch of libraries.
import time
import numpy as np
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_openml
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn import ensemble
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import linear_model
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from scipy import stats
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics.pairwise import euclidean_distances

# Seed NumPy's global random number generator so results are the same each time
# the notebook is run from a fresh kernel.
np.random.seed(0)

## Read the provided dataset and understand the observations

In [3]:
# Load the training and test tables supplied by the Kaggle competition.
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

# Report the row count and column count of the training table.
n_observations, n_columns = train_df.shape
summary_template = "The training dataset contains {0} observations with {1} features for each observation."
print(summary_template.format(n_observations, n_columns))

# Preview the first rows of the training table.
train_df.head()

The training dataset contains 15120 observations with 56 features for each observation.


Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,1,2596,51,3,258,0,510,221,232,148,...,0,0,0,0,0,0,0,0,0,5
1,2,2590,56,2,212,-6,390,220,235,151,...,0,0,0,0,0,0,0,0,0,5
2,3,2804,139,9,268,65,3180,234,238,135,...,0,0,0,0,0,0,0,0,0,2
3,4,2785,155,18,242,118,3090,238,238,122,...,0,0,0,0,0,0,0,0,0,2
4,5,2595,45,2,153,-1,391,220,234,150,...,0,0,0,0,0,0,0,0,0,5


## Split the training data into train and dev datasets

In [4]:
# Split the training data into train and dev data sets.
# Read the training data into X and y. The header line is consumed with
# readline() first so that np.loadtxt only sees the numeric rows. The context
# manager guarantees the file handle is closed once parsing is done.
with open("data/train.csv") as train_file:
    column_names_train = train_file.readline()
    data = np.loadtxt(train_file, delimiter=",")

# The last column is the label (stored as unsigned 8-bit ints); every column
# before it is kept as a feature.
y, X = data[:, -1].astype('u1'), data[:, :-1]

# Shuffle the data, but make sure that the features and accompanying labels stay in sync.
# Re-seeding here makes the permutation the same every time this cell is run.
np.random.seed(0)
shuffle = np.random.permutation(np.arange(X.shape[0]))
X, y = X[shuffle], y[shuffle]

# Split the training data into 90% training data and 10% dev data
train_size = int(X.shape[0] * 0.9)

# Discard 1st feature (ID number that doesn't provide info about the label)
y_train, X_train = y[:train_size], X[:train_size, 1:]
y_dev, X_dev = y[train_size:], X[train_size:, 1:]
print(X_dev.shape, X_train.shape)

# Read the test data, again skipping the header line and closing the file
# as soon as it has been parsed.
with open("data/test.csv") as test_file:
    column_names_test = test_file.readline()
    data_test = np.loadtxt(test_file, delimiter=",")

# Save the test data in X_test, dropping the ID column (column 0) exactly as
# was done for the training features. No label column is stripped here.
X_test = data_test[:, 1:]
print(X_test.shape)

(1512, 54) (13608, 54)
(565892, 54)


In [5]:
# Load the full training table from its relative path.
data = pd.read_csv("data/train.csv")

# Features: every column except the label ("Cover_Type") and the "Id" column.
is_feature_column = ~data.columns.isin(["Cover_Type", "Id"])
train_df = data.loc[:, is_feature_column]

# Labels: the "Cover_Type" column.
train_labels_df = data["Cover_Type"]

# Load the test table from its relative path.
test_data = pd.read_csv("data/test.csv")

# Test features: every column except "Id".
test_df = test_data.drop(columns="Id")

# Fit a 1-nearest-neighbour classifier (Euclidean distance) on the full
# training features and labels.
knn_model = KNeighborsClassifier(metric='euclidean', n_neighbors=1)
knn_model.fit(train_df, train_labels_df)

# Predict a label for every row of the test features.
predictions = knn_model.predict(test_df)

# Wrap the prediction array in a DataFrame indexed by the test "Id" values.
predictions_df = pd.DataFrame(
    data=predictions,
    index=test_data["Id"],
    columns=["Cover_Type"],
)

predictions_df

Unnamed: 0_level_0,Cover_Type
Id,Unnamed: 1_level_1
15121,2
15122,2
15123,1
15124,1
15125,1
...,...
581008,3
581009,3
581010,3
581011,3


# Decision Tree

In [6]:
# Candidate values for each decision tree hyper-parameter.
criterion = ['gini', 'entropy']
max_features = [2, 5, 10, 20, 50, 52, 54]
max_depth = [5, 10, 20, 30, 40]

# Map each hyper-parameter name to its list of candidate values. GridSearchCV
# evaluates every combination of these values.
param_grid = {'criterion': criterion, 'max_features': max_features, 'max_depth': max_depth}

# Find the best parameters for the decision tree classifier using the param_grid and grid search.
# Fit this decision tree using the training data.
# random_state is fixed on the searched estimator so that (a) re-running this
# cell selects the same hyper-parameters regardless of the global RNG state,
# and (b) candidates are scored with the same seed as the final model below.
best_param_DT = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid, scoring='accuracy')
best_param_DT.fit(X_train, y_train)

# Find optimal criterion between gini and entropy
optimal_criterion_DT = best_param_DT.best_params_['criterion']
print('The optimal criterion is {0}'.format(optimal_criterion_DT))

# Find optimal max_features
optimal_max_features_DT = best_param_DT.best_params_['max_features']
print('The optimal maximum number of features is {0}'.format(optimal_max_features_DT))

# Find optimal max depth for the decision tree
optimal_max_depth_DT = best_param_DT.best_params_['max_depth']
print('The optimal maximum depth of the tree is {0}'.format(optimal_max_depth_DT))

# Pass the optimal criterion, max_features, and max_depth to develop the model and fit to the train data
DT = DecisionTreeClassifier(criterion=optimal_criterion_DT, max_features=optimal_max_features_DT, 
                            max_depth=optimal_max_depth_DT, random_state=0)
DT.fit(X_train, y_train)

# Using the dev data predict the y using decision tree algorithm, then report
# per-class metrics followed by the overall accuracy.
y_dev_dec = DT.predict(X_dev)
print(metrics.classification_report(y_dev, y_dev_dec))
print(metrics.accuracy_score(y_dev, y_dev_dec))

The optimal criterion is entropy
The optimal maximum number of features is 52
The optimal maximum depth of the tree is 20
              precision    recall  f1-score   support

           1       0.69      0.64      0.66       220
           2       0.64      0.61      0.63       208
           3       0.79      0.74      0.76       220
           4       0.94      0.94      0.94       212
           5       0.88      0.93      0.90       227
           6       0.77      0.83      0.79       206
           7       0.89      0.94      0.92       219

    accuracy                           0.80      1512
   macro avg       0.80      0.80      0.80      1512
weighted avg       0.80      0.80      0.80      1512

0.8029100529100529


# Ada Boost

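AdaBoost is best known for boosting the performance of decision trees on binary classification problems. scikit-learn's `AdaBoostClassifier` also supports multi-class targets (via the SAMME algorithm), so it is applied here to the seven cover types.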

In [7]:
# Column names taken from the test file's header line, split on commas.
# Not used further in this cell.
name_features = column_names_test.split(",")

# Boost depth-30 decision trees with AdaBoost (500 rounds, learning rate 1.0).
# random_state is fixed so that re-running this cell yields the same ensemble
# instead of depending on the current state of the global RNG.
abc = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=30), n_estimators=500,
                         learning_rate=1.0, random_state=0)
abc.fit(X_train, y_train)

# Predict on the dev set once and derive both the accuracy and the per-class
# report from those predictions. abc.score(X_dev, y_dev) would return the same
# accuracy but would run all 500 estimators over X_dev a second time.
y_pred = abc.predict(X_dev)
print('Accuracy (adaboost with decision trees NORMAL new feature):', metrics.accuracy_score(y_dev, y_pred))
print(classification_report(y_dev, y_pred))

Accuracy (adaboost with decision trees NORMAL new feature): 0.8333333333333334
              precision    recall  f1-score   support

           1       0.77      0.74      0.75       220
           2       0.71      0.67      0.69       208
           3       0.78      0.78      0.78       220
           4       0.97      0.92      0.94       212
           5       0.87      0.90      0.88       227
           6       0.79      0.86      0.83       206
           7       0.94      0.95      0.95       219

    accuracy                           0.83      1512
   macro avg       0.83      0.83      0.83      1512
weighted avg       0.83      0.83      0.83      1512



In [8]:
# Refit the tuned decision tree and the AdaBoost ensemble on the complete
# training set, then predict the test set with each model.
DT.fit(train_df, train_labels_df)
predictions_DT = DT.predict(test_df)

abc.fit(train_df, train_labels_df)
predictions_abc = abc.predict(test_df)

# Turn each prediction array into a one-column DataFrame indexed by the test "Id" values.
test_ids = test_data.loc[:, "Id"]
predictions_DT_df = pd.DataFrame(data=predictions_DT, index=test_ids, columns=["Cover_Type"])
predictions_abc_df = pd.DataFrame(data=predictions_abc, index=test_ids, columns=["Cover_Type"])

# Write both prediction tables to CSV.
predictions_DT_df.to_csv("DT_predictions.csv")
predictions_abc_df.to_csv("abc_predictions.csv")

# Kaggle results:
#   - decision tree submission scored 0.672
#   - AdaBoost submission scored 0.70