In [51]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

In [52]:
df = pd.read_csv("Social_Network_Ads.csv")

In [53]:
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [54]:
encoder = LabelEncoder()
df['Gender'] = encoder.fit_transform(df['Gender'])

In [55]:
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,1,19,19000,0
1,15810944,1,35,20000,0
2,15668575,0,26,43000,0
3,15603246,0,27,57000,0
4,15804002,1,19,76000,0


In [56]:
df['Purchased'].value_counts()

0    257
1    143
Name: Purchased, dtype: int64

In [57]:
X = df.iloc[:,1:4]
y = df['Purchased']

In [58]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=4)

In [59]:
dt = DecisionTreeClassifier(criterion='gini')

In [60]:
dt.fit(X_train,y_train)

DecisionTreeClassifier()

In [61]:
y_pred = dt.predict(X_test)

In [62]:
accuracy_score(y_test,y_pred)

0.85

# Hyperparameter Tuning

In [63]:
param_grid = {
    'max_depth':[2,4,6,8,10,None],
    'criterion':['entropy','gini'],
    'max_features':[0.25,0.5,0.75,1.0],
    'min_samples_split':[0.01,0.1,0.25,0.5,0.75,1.0],
    'min_samples_leaf':[0.01,0.1,0.15,0.25],
    'min_impurity_decrease':[0.0001,0.001,0.01,0.1]
}

In [64]:
reg = GridSearchCV(DecisionTreeClassifier(),param_grid=param_grid)

In [65]:
reg.fit(X_train,y_train)

GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['entropy', 'gini'],
                         'max_depth': [2, 4, 6, 8, 10, None],
                         'max_features': [0.25, 0.5, 0.75, 1.0],
                         'min_impurity_decrease': [0.0001, 0.001, 0.01, 0.1],
                         'min_samples_leaf': [0.01, 0.1, 0.15, 0.25],
                         'min_samples_split': [0.01, 0.1, 0.25, 0.5, 0.75,
                                               1.0]})

In [66]:
reg.best_score_

0.915625

In [67]:
reg.best_params_

{'criterion': 'entropy',
 'max_depth': 4,
 'max_features': 0.75,
 'min_impurity_decrease': 0.001,
 'min_samples_leaf': 0.1,
 'min_samples_split': 0.01}

In [68]:
dr = DecisionTreeClassifier(criterion = 'entropy',
 max_depth = 10,
 max_features = 0.75,
 min_impurity_decrease = 0.0001,
 min_samples_leaf = 0.1,
 min_samples_split = 0.1)

In [69]:
dr.fit(X_train,y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=10, max_features=0.75,
                       min_impurity_decrease=0.0001, min_samples_leaf=0.1,
                       min_samples_split=0.1)

In [70]:
accuracy_score(y_test,dr.predict(X_test))

0.925

In [71]:
print(classification_report(y_test,dr.predict(X_test)))

              precision    recall  f1-score   support

           0       1.00      0.90      0.95        59
           1       0.78      1.00      0.88        21

    accuracy                           0.93        80
   macro avg       0.89      0.95      0.91        80
weighted avg       0.94      0.93      0.93        80



# Feature Importance

In [72]:
for importance, name in sorted(zip(dt.feature_importances_, X_train.columns),reverse=True):
    print (name, importance)

Age 0.5290751723635373
EstimatedSalary 0.457589271710475
Gender 0.013335555925987666
