In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split as split

#for selecting K best features
from sklearn.feature_selection import SelectKBest,chi2

#for selecting optimal hyperparameyters - hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV


from sklearn.tree import DecisionTreeClassifier as decision_tree
from sklearn.ensemble import RandomForestClassifier as random_forest
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier as xgb


import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

%matplotlib inline 

In [2]:
social = pd.read_csv('Social_Network_Ads.csv')
social.head(10)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,11021175,Male,42,121315,0
1,17782313,Male,53,67214,0
2,14603818,Female,59,111005,0
3,18188198,Female,39,53414,0
4,19357305,Male,39,114285,0
5,19524602,Male,39,107458,0
6,19455396,Female,20,85582,0
7,16879913,Female,59,148335,1
8,11838352,Male,48,80579,0
9,13118940,Female,42,114204,0


In [3]:
social.isnull().sum()

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

In [4]:
for i in range(len(social["Gender"])):
    if social.iloc[i,1] == 'Male':
        social.iloc[i,1] = 1
    else:
        social.iloc[i,1] = 0 

social.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,11021175,1,42,121315,0
1,17782313,1,53,67214,0
2,14603818,0,59,111005,0
3,18188198,0,39,53414,0
4,19357305,1,39,114285,0


In [5]:
x = social.iloc[:,1:4]
x["Gender"] = x["Gender"].astype(int) 
y = social.iloc[:,4]

In [6]:
x.describe()

Unnamed: 0,Gender,Age,EstimatedSalary
count,400.0,400.0,400.0
mean,0.49,39.885,99748.2275
std,0.500526,11.796596,28374.358336
min,0.0,20.0,50075.0
25%,0.0,30.0,76060.25
50%,0.0,40.0,99205.0
75%,1.0,51.0,122748.0
max,1.0,60.0,149553.0


In [7]:
# using scalar on age and estimatedSalary
x_new = x
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

scale_col = ['Age','EstimatedSalary']
x_new[scale_col] = scaler.fit_transform(x[scale_col]) 

x_new.head()

Unnamed: 0,Gender,Age,EstimatedSalary
0,1,0.179514,0.761032
1,1,1.113154,-1.148043
2,0,1.622412,0.39722
3,0,-0.075116,-1.635006
4,1,-0.075116,0.512962


In [8]:
x_new.describe()

Unnamed: 0,Gender,Age,EstimatedSalary
count,400.0,400.0,400.0
mean,0.49,1.731948e-16,2.231548e-16
std,0.500526,1.001252,1.001252
min,0.0,-1.687767,-1.75283
25%,0.0,-0.839003,-0.835883
50%,0.0,0.009760784,-0.01916899
75%,1.0,0.943401,0.8115981
max,1.0,1.707288,1.757472


In [9]:
X_train,X_test,y_train,y_test = split(x_new,y,test_size=0.2)

X_train = X_train.reset_index(drop = True)
y_train = y_train.reset_index(drop = True)

X_test = X_test.reset_index(drop = True)
y_test = y_test.reset_index(drop = True)

## Decision Tree

In [10]:
#hyper parameter search 
from scipy.stats import randint

param_dt = {"max_depth": [3, None],
              "max_features": randint(1, 9),
              "min_samples_leaf": randint(1, 9),
              "criterion": ["gini", "entropy"]}

tree = decision_tree()
tree_cv = RandomizedSearchCV(estimator = tree, param_distributions = param_dt, n_iter = 100, cv = 9, verbose=2, random_state=42, n_jobs = -1)
tree_cv = tree_cv.fit(X_train,y_train)

print(tree_cv.best_estimator_)

Fitting 9 folds for each of 100 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 717 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:    5.5s finished


DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=3, max_features=1, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')


In [19]:
tree = decision_tree(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=3, max_features=1, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

tree = tree.fit(X_train,y_train)
pred = tree.predict(X_test)

accuracy = accuracy_score(y_test,pred)
print(accuracy)

conf_mat = confusion_matrix(y_test,pred)
print(conf_mat)

0.7125
[[55  1]
 [22  2]]


## Random Forest

In [12]:
#hyperparameter search
from scipy.stats import randint

max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

param_rf = {'n_estimators': [int(x) for x in np.linspace(start = 100, stop = 2000, num = 10)],
               'max_features': ['auto', 'sqrt'],
               'max_depth': max_depth,
               'min_samples_split': randint(1,10),
               'min_samples_leaf': randint(1,5),
               'bootstrap': [True, False],
               'criterion':['gini','entropy']}

rf = random_forest()
rf_cv = RandomizedSearchCV(estimator = rf, param_distributions = param_rf, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_cv = rf_cv.fit(X_train,y_train)

print(rf_cv.best_estimator_)


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   41.0s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.4min finished


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=80, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=6,
                       min_weight_fraction_leaf=0.0, n_estimators=1366,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


In [21]:
rf = random_forest(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=80, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=6,
                       min_weight_fraction_leaf=0.0, n_estimators=1366,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

rf.fit(X_train,y_train)
pred = rf.predict(X_test)

accuracy = accuracy_score(y_test,pred)
print(accuracy)

conf_mat = confusion_matrix(y_test,pred)
print(conf_mat)

0.6375
[[43 13]
 [16  8]]


## Naive Bayes

In [14]:
nb = GaussianNB()
nb = nb.fit(X_train,y_train)

pred = nb.predict(X_test)

accuracy = accuracy_score(y_test,pred)
print(accuracy)

conf_mat = confusion_matrix(y_test,pred)
print(conf_mat)

0.7
[[56  0]
 [24  0]]


## Ada Boost

In [15]:
#hyperparameter search

param_ada = {'n_estimators' : [int(x) for x in np.linspace(start = 100, stop = 2000, num = 10)],
             'learning_rate' : [0.04,0.045,0.05,0.055,0.06]}

ada = AdaBoostClassifier()
ada_cv = RandomizedSearchCV(estimator = ada, param_distributions = param_ada, n_iter = 50, cv = 3, verbose=2, random_state=42, n_jobs = -1)
ada_cv = ada_cv.fit(X_train,y_train)

print(ada_cv.best_estimator_)


Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    8.2s


AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.04,
                   n_estimators=100, random_state=None)


[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   53.0s finished


In [23]:
ada = AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.06,
                   n_estimators=100, random_state=None)

ada = ada.fit(X_train,y_train)

pred = ada.predict(X_test)

accuracy = accuracy_score(y_test,pred)
print(accuracy)

conf_mat = confusion_matrix(y_test,pred)
print(conf_mat)

0.725
[[55  1]
 [21  3]]


## XG Boost

In [17]:
from scipy import stats
param_xgb = {'n_estimators': stats.randint(150, 1000),
              'learning_rate': stats.uniform(0.01, 0.6),
              'subsample': stats.uniform(0.3, 0.9),
              'max_depth': [3, 4, 5, 6, 7, 8, 9],
              'colsample_bytree': stats.uniform(0.5, 0.9),
              'min_child_weight': [1, 2, 3, 4]}

XGB = xgb()
XGB_cv = RandomizedSearchCV(estimator = XGB, param_distributions = param_xgb, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
XGB_cv = XGB_cv.fit(X_train,y_train)

print(XGB_cv.best_estimator_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    4.5s finished


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5228772140696857, gamma=0,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.07473485619598266, max_delta_step=0, max_depth=9,
              min_child_weight=4, missing=nan, monotone_constraints='()',
              n_estimators=892, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=0.7278332008639006,
              tree_method='exact', validate_parameters=1, verbosity=None)


In [24]:
XGB = xgb(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5228772140696857, gamma=0,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.07473485619598266, max_delta_step=0, max_depth=9,
              min_child_weight=4, missing=None, monotone_constraints='()',
              n_estimators=892, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=0.7278332008639006,
              tree_method='exact', validate_parameters=1, verbosity=None)

XGB.fit(X_train,y_train)
pred = XGB.predict(X_test)

accuracy = accuracy_score(y_test,pred)
print(accuracy)

conf_mat = confusion_matrix(y_test,pred)
print(conf_mat)

0.6
[[39 17]
 [15  9]]


In [25]:
import pickle as p
p.dump(ada,open('social.pickle','wb'))