## Importing the Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing the dataset

In [2]:
dataset= pd.read_csv('Social_Network_Ads.csv')
X= dataset.iloc[:,:-1].values
y= dataset.iloc[:,-1].values

## Splitting the dataset into the Training set,Validation set and Test set

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0) 
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size = 0.5, random_state = 0) 


## Feature Scaling

In [4]:
from sklearn.preprocessing import StandardScaler
sc= StandardScaler()
X_train= sc.fit_transform(X_train)
X_test= sc.transform(X_test)
X_val= sc.transform(X_val)

In [5]:
import joblib
import pandas as pd
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

## Finding the best parameters (hyperparameter tuning)

In [6]:
def print_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

In [7]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
parameters = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

cv = GridSearchCV(lr, parameters, cv=5)
cv.fit(X_train, y_train)

print_results(cv)

BEST PARAMS: {'C': 0.1}

0.828 (+/-0.127) for {'C': 0.001}
0.825 (+/-0.125) for {'C': 0.01}
0.831 (+/-0.123) for {'C': 0.1}
0.828 (+/-0.131) for {'C': 1}
0.828 (+/-0.131) for {'C': 10}
0.828 (+/-0.131) for {'C': 100}
0.828 (+/-0.131) for {'C': 1000}


In [8]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [9]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[32  0]
 [ 4  4]]


0.9

In [10]:
cv.best_estimator_

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

## Saving the model

In [11]:
joblib.dump(cv.best_estimator_, 'LR_model.pkl')

['LR_model.pkl']

# SVM

In [12]:
from sklearn.svm import SVC

svc = SVC()
parameters = {
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1, 10]
}

cv = GridSearchCV(svc, parameters, cv=5)
cv.fit(X_train, y_train)

print_results(cv)

BEST PARAMS: {'C': 10, 'kernel': 'rbf'}

0.816 (+/-0.105) for {'C': 0.1, 'kernel': 'linear'}
0.897 (+/-0.077) for {'C': 0.1, 'kernel': 'rbf'}
0.816 (+/-0.107) for {'C': 1, 'kernel': 'linear'}
0.894 (+/-0.081) for {'C': 1, 'kernel': 'rbf'}
0.825 (+/-0.125) for {'C': 10, 'kernel': 'linear'}
0.9 (+/-0.079) for {'C': 10, 'kernel': 'rbf'}


In [13]:
cv.best_estimator_

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [14]:
joblib.dump(cv.best_estimator_, 'SVM_model.pkl')

['SVM_model.pkl']

# KNN

In [15]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()

parameters = {
    'n_neighbors': np.arange(1,20)
}

cv = GridSearchCV(knn, parameters, cv=5)
cv.fit(X_train, y_train)

print_results(cv)

BEST PARAMS: {'n_neighbors': 9}

0.862 (+/-0.098) for {'n_neighbors': 1}
0.847 (+/-0.107) for {'n_neighbors': 2}
0.897 (+/-0.099) for {'n_neighbors': 3}
0.881 (+/-0.111) for {'n_neighbors': 4}
0.9 (+/-0.096) for {'n_neighbors': 5}
0.894 (+/-0.085) for {'n_neighbors': 6}
0.9 (+/-0.088) for {'n_neighbors': 7}
0.9 (+/-0.088) for {'n_neighbors': 8}
0.903 (+/-0.086) for {'n_neighbors': 9}
0.891 (+/-0.07) for {'n_neighbors': 10}
0.903 (+/-0.084) for {'n_neighbors': 11}
0.9 (+/-0.089) for {'n_neighbors': 12}
0.903 (+/-0.071) for {'n_neighbors': 13}
0.9 (+/-0.074) for {'n_neighbors': 14}
0.903 (+/-0.071) for {'n_neighbors': 15}
0.894 (+/-0.065) for {'n_neighbors': 16}
0.9 (+/-0.063) for {'n_neighbors': 17}
0.897 (+/-0.063) for {'n_neighbors': 18}
0.9 (+/-0.063) for {'n_neighbors': 19}


In [16]:
cv.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=9, p=2,
                     weights='uniform')

In [17]:
joblib.dump(cv.best_estimator_, 'KNN_model.pkl')

['KNN_model.pkl']

## Naive Bayes

In [18]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
parameters = {
    'var_smoothing': [1e-09, 1e-06, 1e-12],
}

cv = GridSearchCV(gnb, parameters, cv=5)
cv.fit(X_train, y_train)

print_results(cv)

BEST PARAMS: {'var_smoothing': 1e-09}

0.881 (+/-0.105) for {'var_smoothing': 1e-09}
0.881 (+/-0.105) for {'var_smoothing': 1e-06}
0.881 (+/-0.105) for {'var_smoothing': 1e-12}


In [19]:
cv.best_estimator_

GaussianNB(priors=None, var_smoothing=1e-09)

In [20]:
joblib.dump(cv.best_estimator_, 'GNB_model.pkl')

['GNB_model.pkl']

# Decision tree

In [21]:
from sklearn.tree import DecisionTreeClassifier

tree= DecisionTreeClassifier()

parameters = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 4, 8, 16, 32, None]
}

cv = GridSearchCV(tree, parameters, cv=5)
cv.fit(X_train, y_train)

print_results(cv)

BEST PARAMS: {'criterion': 'gini', 'max_depth': 2}

0.9 (+/-0.093) for {'criterion': 'gini', 'max_depth': 2}
0.853 (+/-0.043) for {'criterion': 'gini', 'max_depth': 4}
0.838 (+/-0.09) for {'criterion': 'gini', 'max_depth': 8}
0.838 (+/-0.071) for {'criterion': 'gini', 'max_depth': 16}
0.834 (+/-0.082) for {'criterion': 'gini', 'max_depth': 32}
0.828 (+/-0.085) for {'criterion': 'gini', 'max_depth': None}
0.9 (+/-0.093) for {'criterion': 'entropy', 'max_depth': 2}
0.866 (+/-0.057) for {'criterion': 'entropy', 'max_depth': 4}
0.841 (+/-0.086) for {'criterion': 'entropy', 'max_depth': 8}
0.844 (+/-0.088) for {'criterion': 'entropy', 'max_depth': 16}
0.844 (+/-0.088) for {'criterion': 'entropy', 'max_depth': 32}
0.841 (+/-0.066) for {'criterion': 'entropy', 'max_depth': None}


In [22]:
cv.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [23]:
joblib.dump(cv.best_estimator_, 'TREE_model.pkl')

['TREE_model.pkl']

# MLP

In [24]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier()
parameters = {
    'hidden_layer_sizes': [(10,), (50,), (100,)],
    'activation': ['relu', 'tanh', 'logistic'],
    'learning_rate': ['constant', 'invscaling', 'adaptive']
}

cv = GridSearchCV(mlp, parameters, cv=5)
cv.fit(X_train, y_train)

print_results(cv)









BEST PARAMS: {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'constant'}

0.853 (+/-0.139) for {'activation': 'relu', 'hidden_layer_sizes': (10,), 'learning_rate': 'constant'}
0.856 (+/-0.126) for {'activation': 'relu', 'hidden_layer_sizes': (10,), 'learning_rate': 'invscaling'}
0.794 (+/-0.052) for {'activation': 'relu', 'hidden_layer_sizes': (10,), 'learning_rate': 'adaptive'}
0.894 (+/-0.095) for {'activation': 'relu', 'hidden_layer_sizes': (50,), 'learning_rate': 'constant'}
0.884 (+/-0.115) for {'activation': 'relu', 'hidden_layer_sizes': (50,), 'learning_rate': 'invscaling'}
0.894 (+/-0.084) for {'activation': 'relu', 'hidden_layer_sizes': (50,), 'learning_rate': 'adaptive'}
0.897 (+/-0.082) for {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'constant'}
0.894 (+/-0.084) for {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'invscaling'}
0.894 (+/-0.084) for {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learn



In [25]:
cv.best_estimator_

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [26]:
joblib.dump(cv.best_estimator_, 'MLP_model.pkl')

['MLP_model.pkl']

## Random Forest

In [27]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
parameters = {
    'n_estimators': [5, 50, 250],
    'max_depth': [2, 4, 8, 16, 32, None]
}

cv = GridSearchCV(rf, parameters, cv=5)
cv.fit(X_train, y_train)
print_results(cv)

BEST PARAMS: {'max_depth': 4, 'n_estimators': 50}

0.853 (+/-0.079) for {'max_depth': 2, 'n_estimators': 5}
0.903 (+/-0.086) for {'max_depth': 2, 'n_estimators': 50}
0.906 (+/-0.085) for {'max_depth': 2, 'n_estimators': 250}
0.884 (+/-0.072) for {'max_depth': 4, 'n_estimators': 5}
0.909 (+/-0.086) for {'max_depth': 4, 'n_estimators': 50}
0.903 (+/-0.09) for {'max_depth': 4, 'n_estimators': 250}
0.888 (+/-0.07) for {'max_depth': 8, 'n_estimators': 5}
0.881 (+/-0.063) for {'max_depth': 8, 'n_estimators': 50}
0.888 (+/-0.059) for {'max_depth': 8, 'n_estimators': 250}
0.897 (+/-0.089) for {'max_depth': 16, 'n_estimators': 5}
0.894 (+/-0.099) for {'max_depth': 16, 'n_estimators': 50}
0.888 (+/-0.059) for {'max_depth': 16, 'n_estimators': 250}
0.881 (+/-0.077) for {'max_depth': 32, 'n_estimators': 5}
0.888 (+/-0.113) for {'max_depth': 32, 'n_estimators': 50}
0.894 (+/-0.081) for {'max_depth': 32, 'n_estimators': 250}
0.866 (+/-0.074) for {'max_depth': None, 'n_estimators': 5}
0.891 (+/-0.07)

In [28]:
cv.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=4, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [29]:
joblib.dump(cv.best_estimator_, 'RF_model.pkl')

['RF_model.pkl']

## Gradient boosting

In [30]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier()
parameters = {
    'n_estimators': [5, 50, 250, 500],
    'max_depth': [1, 3, 5, 7, 9],
    'learning_rate': [0.01, 0.1, 1, 10, 100]
}

cv = GridSearchCV(gb, parameters, cv=5)
cv.fit(X_train, y_train)

print_results(cv)

BEST PARAMS: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}

0.622 (+/-0.008) for {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 5}
0.691 (+/-0.031) for {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 50}
0.891 (+/-0.106) for {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 250}
0.891 (+/-0.106) for {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 500}
0.622 (+/-0.008) for {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 5}
0.881 (+/-0.101) for {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50}
0.897 (+/-0.082) for {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 250}
0.9 (+/-0.074) for {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500}
0.622 (+/-0.008) for {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 5}
0.866 (+/-0.05) for {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 50}
0.878 (+/-0.088) for {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 250}
0.875 (+/-0.064) for {'learning_rate'

In [31]:
cv.best_estimator_

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=50,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [32]:
joblib.dump(cv.best_estimator_, 'GB_model.pkl')

['GB_model.pkl']

## Evaluating the model on Validation Set

In [33]:
models = {}

for mdl in ['LR', 'KNN', 'GNB' ,'TREE', 'SVM', 'MLP', 'RF', 'GB']:
    models[mdl] = joblib.load('{}_model.pkl'.format(mdl))

In [34]:
models

{'LR': LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='warn', n_jobs=None, penalty='l2',
                    random_state=None, solver='warn', tol=0.0001, verbose=0,
                    warm_start=False),
 'KNN': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                      metric_params=None, n_jobs=None, n_neighbors=9, p=2,
                      weights='uniform'),
 'GNB': GaussianNB(priors=None, var_smoothing=1e-09),
 'TREE': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
                        max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort=False,
                        random_state=None, splitter='be

In [35]:
def evaluate_model(name, model, features, labels):
    start = time()
    pred = model.predict(features)
    end = time()
    accuracy = round(accuracy_score(labels, pred), 3)
    precision = round(precision_score(labels, pred,pos_label='2'), 3)
    recall = round(recall_score(labels, pred,pos_label='2'), 3)
    F1_Score= round(f1_score(labels, pred), 3)
    print('{} -- Accuracy: {} / Precision: {} / Recall: {} / F1_Score: {} / Latency: {}ms'.format(name,
                                                                                   accuracy,
                                                                                   precision,
                                                                                   recall,
                                                                                   F1_Score,      
                                                                                   round((end - start)*1000, 1)))

In [36]:
from time import time
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score


for name, mdl in models.items():
    evaluate_model(name, mdl, X_val, y_val)

ValueError: pos_label='2' is not a valid label: array([0, 1], dtype=int64)

## Testing the dataset on the best model

In [None]:
evaluate_model('KNN', models['KNN'], X_test, y_test)