In [27]:
import numpy as np
from numpy import mean,std
from scipy.stats import randint
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold,train_test_split,cross_val_score,cross_validate

In [4]:
# Read the four CSV exports under ../data/processed. In every file the first
# row is used as the header and the first column becomes the row index.
csv_options = {"header": 0, "index_col": 0}

# Training features / labels
X = pd.read_csv('../data/processed/X_train', **csv_options)
y = pd.read_csv('../data/processed/y_train', **csv_options)

# Final-test features / labels
X_final_test = pd.read_csv('../data/processed/X_final_test', **csv_options)
y_final_test = pd.read_csv('../data/processed/y_final_test', **csv_options)

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Feature Scaling

In [6]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# very same fitted scaler to both splits so the test split never influences
# the statistics.
sc = StandardScaler()
sc.fit(X_train)

X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Decision Tree Model Building

In [None]:
# Baseline: decision tree with default hyperparameters (no tuning).
# random_state is fixed so the fitted tree, and therefore the metrics printed
# below, are the same on every run (the train/test split is already seeded).
clf = DecisionTreeClassifier(random_state=42)

# Train the decision tree classifier (fit() trains the estimator in place)
clf.fit(X_train, y_train)

# Predict the response for the test split
y_pred = clf.predict(X_test)

# Confusion matrix, overall accuracy (as a percentage) and per-class report
cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy:', accuracy_score(y_test, y_pred)*100,'%')
print(classification_report(y_test,y_pred))

In [None]:
# Decision tree with hand-picked hyperparameters (entropy criterion, depth
# capped at 3) -- these values are set manually, not found by a search.
# random_state is fixed so the reported accuracy is repeatable.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=42)

# Train the decision tree classifier (fit() trains the estimator in place)
clf.fit(X_train, y_train)

# Predict the response for the test split
y_pred = clf.predict(X_test)

# Model accuracy (fraction of test rows classified correctly)
print("Accuracy:",accuracy_score(y_test, y_pred))

# Cross Fold Validation

In [None]:
# 10-fold cross-validation of the default (untuned) decision tree.
# Both the fold assignment and the tree itself are seeded; seeding only the
# KFold would still leave the mean accuracy free to vary between runs.
cv = KFold(n_splits=10, random_state=42, shuffle=True)
clf = DecisionTreeClassifier(random_state=42)

# Mean accuracy over the 10 folds; n_jobs=-1 evaluates folds in parallel
fold_scores = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
print(fold_scores.mean())

In [None]:
# 10-fold cross-validation of the tree with hand-set hyperparameters
# (entropy criterion, depth capped at 3).
# Both the fold assignment and the tree itself are seeded; seeding only the
# KFold would still leave the mean accuracy free to vary between runs.
cv = KFold(n_splits=10, random_state=42, shuffle=True)
clf = DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=42)

# Mean accuracy over the 10 folds; n_jobs=-1 evaluates folds in parallel
fold_scores = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
print(fold_scores.mean())

# Hyperparameter tuning of DT using RandomizedSearchCV
In RandomizedSearchCV, not all hyperparameter values are tried. Instead, a fixed number of hyperparameter settings is sampled from the specified probability distributions. The following hyperparameters are chosen for tuning:

"max_depth":[3,None]
max_depth: The maximum depth of the tree. A higher maximum depth tends to cause overfitting, and a lower value tends to cause underfitting. So, we try both a depth of 3 and None (no depth limit).
"min_samples_leaf":randint(1,20),
min_samples_leaf: The minimum number of samples required to be at a leaf node. We let a random integer be chosen from 1 to 19 (the upper bound of randint(1,20) is exclusive).
"criterion":["gini","entropy"]
criterion: The function used to measure the quality of a split. We include both gini and entropy so that the search can choose the better one for this decision tree model.

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Search space for the randomized search: two candidate depths, a scipy
# integer distribution for the leaf size (randint's upper bound is exclusive,
# so this draws 1..19), and both split criteria.
param = {
    "max_depth": [3, None],
    "min_samples_leaf": randint(1, 20),
    "criterion": ["gini", "entropy"],
}

# Instantiate the decision tree classifier (seeded so each candidate is repeatable)
clf = DecisionTreeClassifier(random_state=42)

# Instantiate RandomizedSearchCV with 5-fold CV; random_state fixes which
# hyperparameter combinations get sampled, so the search is reproducible.
clf_cv = RandomizedSearchCV(clf, param, cv=5, random_state=42)

# Fit the search on the training split
clf_cv.fit(X_train, y_train)

# Print the best parameters and their mean cross-validated score
print("Best: %f using %s" % (clf_cv.best_score_, clf_cv.best_params_))

# Print mean/std test score for every sampled candidate.
# The loop variables deliberately avoid the names `mean` (imported from numpy
# at the top of the notebook) and `param` (the search space above), so that
# re-running this cell does not pass a single candidate dict as the search space.
means = clf_cv.cv_results_['mean_test_score']
stds = clf_cv.cv_results_['std_test_score']
params = clf_cv.cv_results_['params']
for mean_score, std_score, candidate in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean_score, std_score, candidate))

# Print the confusion matrix of the best refit estimator on the test split
y_pred = clf_cv.predict(X_test)
print(confusion_matrix(y_test, y_pred))


# Building the model using Bagging Classifier

In [None]:
from sklearn.ensemble import BaggingClassifier

# Bag 100 decision trees, each trained on a bootstrap sample of 80% of the
# training rows, and score the ensemble on the out-of-bag rows.
# The base tree is passed positionally on purpose: the keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed
# in 1.4), while the first positional slot works on both old and new releases.
bagging_model = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=100,
    max_samples=0.8,
    bootstrap=True,
    oob_score=True,
    random_state=0,
)
bagging_model.fit(X_train, y_train)

# Out-of-bag accuracy estimate of the fitted ensemble
print(bagging_model.oob_score_)
