# Decision Tree Case Study
## Author : Ashish Kumar Patra

## 1. Import the Required Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

## 2. Load the Dataset and Check Details
### a. Load the Dataset

In [10]:
df = pd.read_csv('C:\\Users\\ASHISH\\Desktop\\DataSets\\adult_dataset.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


### b. Check Size of Data

In [5]:
df.shape

(32561, 15)

### c. Check Various Data Types in Data

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


### d. Check Statistical Information of Data

In [9]:
df.describe()

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


### e. Check Missing Values 

In [11]:
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

### f. There are some " ? " that can be seen in Data. Check it and get rid of it.

In [13]:
# Getting only the values of workclass is " ? " and assigning it to df1

df1 = df[df['workclass']=='?']
df1.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
14,51,?,172175,Doctorate,16,Never-married,?,Not-in-family,White,Male,0,2824,40,United-States,>50K
24,61,?,135285,HS-grad,9,Married-civ-spouse,?,Husband,White,Male,0,2603,32,United-States,<=50K
44,71,?,100820,HS-grad,9,Married-civ-spouse,?,Husband,White,Male,0,2489,15,United-States,<=50K


In [14]:
# Getting the Data which doesn't have " ? "

df = df[df['workclass']!='?']
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
5,34,Private,216864,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0,3770,45,United-States,<=50K
6,38,Private,150601,10th,6,Separated,Adm-clerical,Unmarried,White,Male,0,3770,40,United-States,<=50K


Lets see whether any other column contains '?', since '?' is a string, we can apply this check only on the categorical columns.

In [15]:
# Select all categorical variables
df_categorical = df.select_dtypes(include=['object'])

# Checking whether any other columns contain a '?'
df_categorical.apply(lambda x:x=='?',axis=0).sum()

workclass           0
education           0
marital.status      0
occupation          7
relationship        0
race                0
sex                 0
native.country    556
income              0
dtype: int64

Thus the columns occupation and native.country contain some '?'s. Lets get rid of them.

In [16]:
# Dropping the '?'s

df = df[df['occupation']!='?']
df = df[df['native.country']!='?']

Now we have a clean dataframe which is ready for model building.

In [17]:
df.shape

(30162, 15)

## 3. Apply Label Encoding to the New Clean DataFrame

In [19]:
from sklearn import preprocessing

# Select all categorical variables

df_categorical = df.select_dtypes(include=['object'])
df_categorical.head()

Unnamed: 0,workclass,education,marital.status,occupation,relationship,race,sex,native.country,income
1,Private,HS-grad,Widowed,Exec-managerial,Not-in-family,White,Female,United-States,<=50K
3,Private,7th-8th,Divorced,Machine-op-inspct,Unmarried,White,Female,United-States,<=50K
4,Private,Some-college,Separated,Prof-specialty,Own-child,White,Female,United-States,<=50K
5,Private,HS-grad,Divorced,Other-service,Unmarried,White,Female,United-States,<=50K
6,Private,10th,Separated,Adm-clerical,Unmarried,White,Male,United-States,<=50K


In [20]:
# Apply Label Encoder to df_categorical

le = preprocessing.LabelEncoder()
df_categorical = df_categorical.apply(le.fit_transform)
df_categorical.head()

Unnamed: 0,workclass,education,marital.status,occupation,relationship,race,sex,native.country,income
1,2,11,6,3,1,4,0,38,0
3,2,5,0,6,4,4,0,38,0
4,2,15,5,9,3,4,0,38,0
5,2,11,0,7,4,4,0,38,0
6,2,0,5,0,4,4,1,38,0


In [21]:
# Convert Target variable income to categorical

df_categorical['income'] = df_categorical['income'].astype('category')
df_categorical.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30162 entries, 1 to 32560
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   workclass       30162 non-null  int32   
 1   education       30162 non-null  int32   
 2   marital.status  30162 non-null  int32   
 3   occupation      30162 non-null  int32   
 4   relationship    30162 non-null  int32   
 5   race            30162 non-null  int32   
 6   sex             30162 non-null  int32   
 7   native.country  30162 non-null  int32   
 8   income          30162 non-null  category
dtypes: category(1), int32(8)
memory usage: 1.2 MB


Now all the categorical variables are suitabaly encoded. Lets build the model.

## 4. Splitting the Data into Train and Test (70/30)

In [22]:
# Putting feature variables to X
X = df_categorical.drop('income',axis=1)

# Putting response variables to y
y = df_categorical['income']

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=99)

X_train.head()

Unnamed: 0,workclass,education,marital.status,occupation,relationship,race,sex,native.country
24351,2,11,2,13,0,4,1,38
15626,1,11,4,7,1,4,1,38
4347,0,12,2,9,0,4,1,19
23972,5,9,4,0,1,4,0,38
26843,2,8,0,12,3,4,1,38


## 5. Build a Decision Tree Classifier & Print the Confusion Matrix 

In [25]:
# Import Decision Tree Classifier from sk-learn library

from sklearn.tree import DecisionTreeClassifier

# Fitting the Decision Tree with default Hyperparameters.
# Apart from max_depth which is 5 so that we can Plot and Read the Tree

dt_default = DecisionTreeClassifier(max_depth=5)

# Fiting on Train Data
dt_default.fit(X_train,y_train)

DecisionTreeClassifier(max_depth=5)

In [26]:
# lets check the Evaluation Metrics of our default Model
# Importing Classification Report and Confusion Matrix from sk-learn metrics

from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

# Making Predictions
y_pred_default = dt_default.predict(X_test)

# Printing Classification Report
print(classification_report(y_test,y_pred_default))

              precision    recall  f1-score   support

           0       0.84      0.93      0.88      6867
           1       0.67      0.46      0.55      2182

    accuracy                           0.82      9049
   macro avg       0.76      0.70      0.72      9049
weighted avg       0.80      0.82      0.80      9049



In [27]:
print(confusion_matrix(y_test,y_pred_default))
print(accuracy_score(y_test,y_pred_default))

[[6365  502]
 [1171 1011]]
0.8151176925627142


## 6. Use GridSearch to find the Best Parameters for Decision Tree Classifier

In [28]:
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

# Specify number of folds for k-fold CV
n_folds = 5

# Parameters to build Model on
parameters = {'max_depth':range(1,10), 'min_samples_split':range(1,8), 'ccp_alpha': [0,0.1,0.3,0.55]}

# Instantiate the Model
dtree = DecisionTreeClassifier(criterion='gini',random_state=100)

# Fit Tree on the Training Data
tree = GridSearchCV(dtree,parameters,cv=n_folds,scoring='accuracy')
tree.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=100),
             param_grid={'ccp_alpha': [0, 0.1, 0.3, 0.55],
                         'max_depth': range(1, 10),
                         'min_samples_split': range(1, 8)},
             scoring='accuracy')

In [29]:
# Scores of GridSearch CV

scores = tree.cv_results_
pd.DataFrame(scores).head(20)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_ccp_alpha,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.01378,0.006104,0.0,0.0,0,1,1,"{'ccp_alpha': 0, 'max_depth': 1, 'min_samples_...",,,,,,,,252
1,0.013788,0.003624,0.004714,0.001628,0,1,2,"{'ccp_alpha': 0, 'max_depth': 1, 'min_samples_...",0.74781,0.74781,0.747573,0.74775,0.74775,0.747738,8.7e-05,43
2,0.01004,0.000725,0.003275,0.00037,0,1,3,"{'ccp_alpha': 0, 'max_depth': 1, 'min_samples_...",0.74781,0.74781,0.747573,0.74775,0.74775,0.747738,8.7e-05,43
3,0.009659,0.00049,0.003505,0.000424,0,1,4,"{'ccp_alpha': 0, 'max_depth': 1, 'min_samples_...",0.74781,0.74781,0.747573,0.74775,0.74775,0.747738,8.7e-05,43
4,0.009845,0.000479,0.003642,0.000429,0,1,5,"{'ccp_alpha': 0, 'max_depth': 1, 'min_samples_...",0.74781,0.74781,0.747573,0.74775,0.74775,0.747738,8.7e-05,43
5,0.010448,0.001429,0.003206,0.000394,0,1,6,"{'ccp_alpha': 0, 'max_depth': 1, 'min_samples_...",0.74781,0.74781,0.747573,0.74775,0.74775,0.747738,8.7e-05,43
6,0.010083,0.000733,0.003805,0.000389,0,1,7,"{'ccp_alpha': 0, 'max_depth': 1, 'min_samples_...",0.74781,0.74781,0.747573,0.74775,0.74775,0.747738,8.7e-05,43
7,0.006374,0.000303,0.0,0.0,0,2,1,"{'ccp_alpha': 0, 'max_depth': 2, 'min_samples_...",,,,,,,,226
8,0.012466,0.000396,0.003882,0.000459,0,2,2,"{'ccp_alpha': 0, 'max_depth': 2, 'min_samples_...",0.750414,0.74781,0.743074,0.74775,0.745144,0.746838,0.002514,211
9,0.012332,0.000121,0.003897,0.000471,0,2,3,"{'ccp_alpha': 0, 'max_depth': 2, 'min_samples_...",0.750414,0.74781,0.743074,0.74775,0.745144,0.746838,0.002514,211


In [30]:
param = tree.best_params_
param

{'ccp_alpha': 0, 'max_depth': 8, 'min_samples_split': 7}

In [31]:
tree.best_score_

0.8177903863404852

## 7. Creating Model on Best Parameters

In [32]:
# Instantiating the Model

d_tree_model = DecisionTreeClassifier(criterion='gini',random_state =100,
                                     max_depth=10, min_samples_split=9)

In [34]:
# Fit on Train Data

d_tree_model.fit(X_train,y_train)

DecisionTreeClassifier(max_depth=10, min_samples_split=9, random_state=100)

In [35]:
# Predict on Test Data

d_tree_model.predict(X_test)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [36]:
# Printing Classification Report

print(classification_report(y_test,d_tree_model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.87      0.90      0.88      6867
           1       0.65      0.56      0.60      2182

    accuracy                           0.82      9049
   macro avg       0.76      0.73      0.74      9049
weighted avg       0.81      0.82      0.82      9049

