### Section 7 -Decision Tree
Decisions trees takes each feature and makes a decision / split on a value that provides the most information gain(minimum entropy). Then it alternates to another feature and makes the same split decision. It continues to do so until no further information gain is available

![](./images/15.jpg "")
![](./images/16.jpg "")

When classifying a new data point, take the average values of all the data points in that decision section. 


In [1]:
import numpy as np   #Mathematics library
import matplotlib.pyplot as plt # for plotting
import pandas as pd  #manage datasets
import seaborn as sea
import scipy.stats as stats
import sklearn

In [2]:
df = pd.read_csv('ImputeMissing.csv')
df=df.drop('Unnamed: 0',axis=1)

In [3]:
###Convert 15 Categorical to Dummies
df=pd.get_dummies(df,columns=['Auction','Make','Model','Trim','Color','Transmission','WheelType','Nationality','Size','TopThreeAmericanName','PRIMEUNIT','AUCGUART','VNZIP1','VNST','IsOnlineSale'],drop_first=True)


In [4]:
# Importing the dataset
dataset = df
X = dataset.iloc[:, 2:].values
y = dataset.iloc[:, 1].values

In [5]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)


In [6]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### DIMENSIONALITY REDUCTION - FEATURE EXTRACTION

In [7]:
# Applying PCA
"""
from sklearn.decomposition import PCA
pca = PCA(n_components = 10)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
explained_variance = pca.explained_variance_ratio_
explained_variance
"""

'\nfrom sklearn.decomposition import PCA\npca = PCA(n_components = 10)\nX_train = pca.fit_transform(X_train)\nX_test = pca.transform(X_test)\nexplained_variance = pca.explained_variance_ratio_\nexplained_variance\n'

In [8]:
# Applying LDA 

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
lda = LDA(n_components = 10)
X_train = lda.fit_transform(X_train, y_train)
X_test = lda.transform(X_test)




In [9]:
# Applying Kernel PCA
"""
from sklearn.decomposition import KernelPCA
kpca = KernelPCA(n_components = 10, kernel = 'rbf')
X_train = kpca.fit_transform(X_train)
X_test = kpca.transform(X_test)
"""

"\nfrom sklearn.decomposition import KernelPCA\nkpca = KernelPCA(n_components = 10, kernel = 'rbf')\nX_train = kpca.fit_transform(X_train)\nX_test = kpca.transform(X_test)\n"

### Training the Decision Tree

In [10]:
# Fitting Decision Tree Classification to the Training set
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)


DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=0, splitter='best')

In [11]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)

### Testing Model Accuracy

In [12]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[14438,  1572],
       [ 1545,   691]])

In [13]:
# Accuracy
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.82916803682999018

In [14]:
# Precision 
from sklearn.metrics import precision_score
precision_score(y_test, y_pred)


0.30534688466637205

In [15]:
# Recall
from sklearn.metrics import recall_score
recall_score(y_test, y_pred)


0.3090339892665474

In [16]:
# F1 score
from sklearn.metrics import f1_score
f1_score(y_test,y_pred)


0.30717937319404304

In [17]:
# Cohen's kappa
from sklearn.metrics import cohen_kappa_score
cohen_kappa_score(y_test, y_pred)


0.20975581329073179

In [18]:
# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
accuracies.mean()


0.83205140443994474

In [19]:
accuracies.std()

0.0042268395983021591