## Decision Tree Case Study

### Wine Quality Data - White

In [1]:
import os
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score

In [2]:
os.chdir('D:/MLP_Session_26_JULY/DATASET')

data = pd.read_csv("winequality_white.csv")
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [3]:
data.quality.value_counts()

6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: quality, dtype: int64

In [4]:
data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0,4898.0
mean,6.854788,0.278241,0.334192,6.391415,0.045772,35.308085,138.360657,0.994027,3.188267,0.489847,10.514267,5.877909
std,0.843868,0.100795,0.12102,5.072058,0.021848,17.007137,42.498065,0.002991,0.151001,0.114126,1.230621,0.885639
min,3.8,0.08,0.0,0.6,0.009,2.0,9.0,0.98711,2.72,0.22,8.0,3.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,108.0,0.991723,3.09,0.41,9.5,5.0
50%,6.8,0.26,0.32,5.2,0.043,34.0,134.0,0.99374,3.18,0.47,10.4,6.0
75%,7.3,0.32,0.39,9.9,0.05,46.0,167.0,0.9961,3.28,0.55,11.4,6.0
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.82,1.08,14.2,9.0


In [5]:
data.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [6]:
X = data.drop(columns = 'quality')
y = data['quality']

In [7]:
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size = 0.30, random_state= 0)

In [8]:
clf = DecisionTreeClassifier()

clf.fit(x_train,y_train)

DecisionTreeClassifier()

In [9]:
X.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol'],
      dtype='object')

In [10]:
y_train.unique()

array([5, 6, 8, 4, 7, 3, 9], dtype=int64)

In [11]:
# Accuracy for the Training data

clf.score(x_train,y_train)

1.0

In [12]:
# Accuracy for the Test data

clf.score(x_test,y_test)

0.5625850340136055

In [13]:
# Feature Scaling using StandardScaler

scalar = StandardScaler()

x_scaled = scalar.fit_transform(X)

In [14]:
# Splitting the dataset into Train and Test 

x_train,x_test,y_train,y_test = train_test_split(x_scaled,y,test_size = 0.30, random_state= 0)

In [15]:
# Giving different choices for the hyperparameters

grid_param = {
    'criterion': ['gini', 'entropy'],
    'max_depth' : range(2,32,1),
    'min_samples_leaf' : range(1,10,1),
    'min_samples_split': range(2,10,1),
    'splitter' : ['best', 'random']
    
}

In [16]:
# Performing GridSearchCV

grid_search = GridSearchCV(estimator=clf,
                     param_grid=grid_param,
                     cv=5,
                    n_jobs =-1)

In [17]:
# Fitting the model using GridSearchCV

grid_search.fit(x_train,y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': range(2, 32),
                         'min_samples_leaf': range(1, 10),
                         'min_samples_split': range(2, 10),
                         'splitter': ['best', 'random']})

In [18]:
# Optimal hyperparameter combination using GridSearchCV

best_parameters = grid_search.best_params_
print(best_parameters)

{'criterion': 'gini', 'max_depth': 26, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best'}


In [19]:
# Best score from GridSearchCV

grid_search.best_score_

0.5802345129918495

In [21]:
# Calling DecisionTreeClassifier using the optimal hyperparameter combination 

clf = DecisionTreeClassifier(criterion = 'gini', max_depth =19, min_samples_leaf= 1, min_samples_split= 2, splitter ='best')


In [22]:
# Fitting the model using optimal hyperparameter combination

clf.fit(x_train,y_train)

DecisionTreeClassifier(max_depth=19)

In [23]:
# Checking the Accuracy for the Test data 

clf.score(x_test,y_test)

0.5639455782312925

In [24]:
# So we find a little increase in Accuracy from 0.5625 to 0.5639 after applying GridSearchCV