In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [36]:
data = pd.read_csv('Social_Network_Ads.csv')

In [38]:
print(data.shape)
data.head()

(400, 5)


Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [40]:
data['Gender'].replace({'Male': 0, 'Female': 1}, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Gender'].replace({'Male': 0, 'Female': 1}, inplace=True)
  data['Gender'].replace({'Male': 0, 'Female': 1}, inplace=True)


In [42]:
data.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,0,19,19000,0
1,15810944,0,35,20000,0
2,15668575,1,26,43000,0
3,15603246,1,27,57000,0
4,15804002,0,19,76000,0


In [63]:
X = data.iloc[:,:4]
Y = data.iloc[:,-1]

In [74]:
X[:4]

array([[-0.93657932, -1.02020406, -1.78179743, -1.49004624],
       [ 1.66838653, -1.02020406, -0.25358736, -1.46068138],
       [-0.32087714,  0.98019606, -1.11320552, -0.78528968],
       [-1.23369247,  0.98019606, -1.01769239, -0.37418169]])

In [67]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [76]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=1, test_size=0.2)

In [80]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

In [82]:
y_pred = clf.predict(X_test)

In [84]:
print(accuracy_score(y_test, y_pred))

0.85


# Hyperparameter Tuning

In [89]:
param_dist = {
    'criterion': ['gini','entropy','log_loss'],
    'max_depth' : [1,2,3,4,5,6,7, None]
}

In [91]:
grid = GridSearchCV(clf, param_dist, cv=10, n_jobs=-1)

In [93]:
grid.fit(X_train, y_train)

In [95]:
grid.best_estimator_

In [97]:
grid.best_score_

0.91875

In [101]:
grid.best_params_

{'criterion': 'gini', 'max_depth': 2}