### Decision Tree

#### Implementation in Python

In [None]:
#Necessary Imports 
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, classification_report 
import seaborn as sns
%matplotlib inline

import warnings
# Suppress every warning for the rest of the session so cell output stays short.
# NOTE(review): this also hides deprecation warnings from the libraries used
# below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Load the dataset; the path is relative to the current working directory.
data = pd.read_csv('wine.csv')

In [None]:
# Preview the first rows to confirm the file was parsed as expected.
data.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
data.shape

In [None]:
# Count missing (NaN) values per column; a non-zero count means that column needs cleaning.

data.isna().sum()

#### As we have a categorical column (`Alcohol_content`), we need to convert it into numeric data using an encoding method.

The values follow a natural order: Low, Medium and High. We need to assign codes that respect this order.

In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Categories are listed from lowest to highest so that the generated codes
# keep the Low < Medium < High ordering.
alcohol_levels = ['Low', 'Medium', 'High']
ord_encoder = OrdinalEncoder(categories=[alcohol_levels])

# The encoder expects 2-D input, hence the double brackets (a one-column frame).
alcohol_frame = data[['Alcohol_content']]
df1 = ord_encoder.fit_transform(alcohol_frame)
df1

### Override alcohol content column with code

In [None]:
# Replace the text labels with the ordinal codes produced by the encoder cell.
# NOTE: this mutates `data` in place. After it runs, the column no longer holds
# the 'Low'/'Medium'/'High' labels, so re-running the encoder cell is expected to
# fail until the CSV is reloaded.
data['Alcohol_content'] = df1

In [None]:
# Preview again to check that Alcohol_content now holds the numeric codes.
data.head()

## Plotting a Heatmap (Correlation Matrix)

- Let's try to see if we can reduce the features using different techniques
- Let's plot heatmap to visualize and find the coefficient of multicollinearity

In [None]:
# Absolute pairwise correlation of every column against every other column.
# The sign is dropped: only the strength matters when looking for redundant features.
df_corr = data.corr().abs()

fig, ax = plt.subplots(figsize=(14, 8))
sns.heatmap(df_corr, annot=True, annot_kws={'size': 10}, ax=ax)
plt.show()

According to the heatmap above, `Alcohol_content` and `alcohol` appear to be correlated, but we need more evidence.

### Make sure they are really correlated (it should follow some trend)

In [None]:
# Scatter the raw alcohol value against its encoded category to check for a trend.
fig, ax = plt.subplots()
ax.scatter(data['alcohol'], data['Alcohol_content'])
ax.set_xlabel('Alcohol')
ax.set_ylabel('Alcohol_content')
ax.set_title('Alcohol vs Alcohol_content')
plt.show()

We see a clear trend: as the alcohol level increases, the alcohol content category increases as well. The two columns carry the same information, so we can drop one of them.

In [None]:
# Target: the wine quality label.
y = data['quality']

# Features: every other column except Alcohol_content, which is dropped as
# redundant with `alcohol`.
x = data.drop(columns=['quality', 'Alcohol_content'])

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=41,
)

In [None]:
# Reusable helper: call it for every fitted model to report its accuracy.
def metric_score(clf, x_train, x_test, y_train, y_test, train = True):
    """Print the accuracy of a fitted classifier on the train or the test split.

    Parameters
    ----------
    clf : fitted estimator
        Any object exposing ``predict``.
    x_train, x_test : array-like
        Feature matrices of the training and test splits.
    y_train, y_test : array-like
        True labels of the training and test splits.
    train : bool, default True
        If truthy, report accuracy on the training split. Otherwise report
        accuracy plus a full classification report on the test split.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    if train:
        y_pred = clf.predict(x_train)

        print("\n===================Train Result============================")

        print(f"Accuracy Score:{accuracy_score(y_train,y_pred) * 100:.2f}%")

    else:
        # Plain `else` instead of `elif train == False`: a falsy non-bool value
        # (e.g. None) used to fall through both branches and print nothing.
        pred = clf.predict(x_test)

        print("\n===================Test Result============================")

        print(f"Accuracy Score:{accuracy_score(y_test, pred) * 100:.2f}%")

        print("\n \n Test Classification Report \n", classification_report(y_test, pred, digits = 2))
        

In [None]:
# Model initialization
# Baseline tree with default hyperparameters. random_state is fixed because the
# tree permutes features randomly at each split; without a seed the fitted tree
# (and therefore the reported scores) can change from run to run.
clf = DecisionTreeClassifier(random_state=41)
clf.fit(x_train, y_train)

In [None]:
# Report the model's score on the training split first, then on the test split.
for use_train_split in (True, False):
    metric_score(clf, x_train, x_test, y_train, y_test, train=use_train_split)

### Let's see what the tree looks like (this has nothing to do with the algorithm/accuracy). It's just for visualization purposes.

In [None]:
# Column names of the feature matrix, in the order the model was trained on.
feature_name = list(x.columns)
# Sorted ascending: scikit-learn orders classes this way, whereas unique()
# returns them in order of first appearance, which would mislabel the classes.
class_name = sorted(y_train.unique())
feature_name

In [None]:
import graphviz
from sklearn.tree import export_graphviz
from sklearn import tree
from IPython.display import Image  # package name is case-sensitive: IPython, not Ipython
import pydotplus

# Export the fitted tree as DOT source. out_file=None makes export_graphviz
# return the DOT text as a string instead of writing it to a file, and the
# keyword for the column labels is `feature_names` (plural).
dot_data = export_graphviz(
    clf,
    out_file=None,
    feature_names=feature_name,
    rounded=True,
    filled=True,
)

# Draw graph (rendering requires the Graphviz executables to be installed)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_png("myTree.png")

# Show graph
Image(graph.create_png())

### Model Confidence/Accuracy

#### Let's now try to tune some hyperparameters using the GridSearchCV algorithm.

GridSearchCV is a method used to tune our hyperparameters. We can pass different candidate values for each hyperparameter to the grid search, and it exhaustively generates every combination of the values passed. Using the cross-validation score, grid search returns the combination of hyperparameters for which the model performs best.

#### What are Hyper parameters?

   DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None, max_features=None,
                            max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None,
                            min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort=False,
                            random_state=None, splitter='best')


As we can see above, the decision tree classifier takes all of these parameters, which are also known as hyperparameters.


In [None]:
# Hyperparameter grid for the decision tree: five parameters, each with a small
# range of candidate values. The keys must match the DecisionTreeClassifier
# argument names exactly (note the plural "samples"), otherwise the grid search
# rejects them as invalid parameters.
grid_param = {
    'criterion': ['gini', 'entropy'],     # Function used to measure the quality of a split
    'max_depth': range(10, 15),           # The maximum depth of the tree
    'min_samples_leaf': range(2, 6),      # The minimum number of samples required to be at a leaf node
    'min_samples_split': range(3, 8),     # The minimum number of samples required to split an internal node
    'max_leaf_nodes': range(5, 10)        # Upper bound on the number of leaf nodes (None would mean unlimited)
}

In [None]:
# Exhaustive search over grid_param; every combination is scored with
# 5-fold cross-validation.
grid_search = GridSearchCV(
    clf,
    grid_param,
    cv=5,
    n_jobs=-1,  # use all the cores in your system, for performance improvement
)

In [None]:
# Run the search on the training split only, so the test split stays unseen.
grid_search.fit(x_train, y_train)

In [None]:
# Hyperparameter combination that achieved the best cross-validation score.
best_parameters = grid_search.best_params_
print(best_parameters)

In [None]:
# Initiate DecisionTreeClassifier with the tuned hyperparameters and train.
# The values come straight from the grid search rather than being typed in by
# hand, so the model always matches whatever the search actually selected.
# random_state is fixed to keep the fitted tree reproducible.
clf = DecisionTreeClassifier(random_state=41, **grid_search.best_params_)

# train the model
clf.fit(x_train, y_train)

In [None]:
# Report the tuned model's score on the training split first, then on the test split.
for use_train_split in (True, False):
    metric_score(clf, x_train, x_test, y_train, y_test, train=use_train_split)