In [1]:
###########importing libraries ##
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV , train_test_split
from tqdm import tqdm_notebook
import warnings
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import gc
warnings.filterwarnings('ignore')
%matplotlib inline

In [3]:
##########Loading the Data###############################
df = pd.read_csv('data.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55146 entries, 0 to 55145
Data columns (total 83 columns):
hour                                  55146 non-null int64
minutes                               55146 non-null int64
NUM_UNIQUE(hist.action)               55146 non-null int64
NUM_UNIQUE(hist.user_id)              55146 non-null int64
NUM_UNIQUE(hist.product)              55146 non-null int64
MODE(hist.user_id)                    55146 non-null int64
NUM_UNIQUE(hist.MONTH(DateTime))      55146 non-null int64
NUM_UNIQUE(hist.YEAR(DateTime))       55146 non-null int64
NUM_UNIQUE(hist.DAY(DateTime))        55146 non-null int64
NUM_UNIQUE(hist.WEEKDAY(DateTime))    55146 non-null int64
MODE(hist.MONTH(DateTime))            55146 non-null int64
MODE(hist.YEAR(DateTime))             55146 non-null int64
MODE(hist.DAY(DateTime))              55146 non-null int64
MODE(hist.WEEKDAY(DateTime))          55146 non-null int64
product_B                             55146 non-null int64
produc

In [9]:
#get header
cols=df.columns 
#get dimensions
print(df.shape)

(55146, 83)


In [10]:
print(df.dtypes) 
print('ColumnName| DataType| MissingValues')
for i in cols:
    print(i, '|', df[i].dtype,'|',df[i].isnull().any())

hour                        int64
minutes                     int64
NUM_UNIQUE(hist.action)     int64
NUM_UNIQUE(hist.user_id)    int64
NUM_UNIQUE(hist.product)    int64
                            ...  
MODE(hist.product)_H        int64
MODE(hist.product)_I        int64
MODE(hist.product)_J        int64
MODE(hist.action)_view      int64
is_click                    int64
Length: 83, dtype: object
ColumnName| DataType| MissingValues
hour | int64 | False
minutes | int64 | False
NUM_UNIQUE(hist.action) | int64 | False
NUM_UNIQUE(hist.user_id) | int64 | False
NUM_UNIQUE(hist.product) | int64 | False
MODE(hist.user_id) | int64 | False
NUM_UNIQUE(hist.MONTH(DateTime)) | int64 | False
NUM_UNIQUE(hist.YEAR(DateTime)) | int64 | False
NUM_UNIQUE(hist.DAY(DateTime)) | int64 | False
NUM_UNIQUE(hist.WEEKDAY(DateTime)) | int64 | False
MODE(hist.MONTH(DateTime)) | int64 | False
MODE(hist.YEAR(DateTime)) | int64 | False
MODE(hist.DAY(DateTime)) | int64 | False
MODE(hist.WEEKDAY(DateTime)) | int64 | Fa

In [11]:
from sklearn import preprocessing
from sklearn.ensemble import ExtraTreesClassifier

In [12]:
df_fs = df.copy(deep = True)

In [13]:
x_fs = df_fs.drop('is_click', 1)

In [14]:
y = df_fs['is_click'] # define label as nominal values
le = preprocessing.LabelEncoder()
le.fit(y)

LabelEncoder()

In [15]:
#encode nominal to numerical
y_encoded = le.transform(y) 

df_fs['is_click'] = y_encoded

y_fs = df_fs['is_click']

In [16]:
model = ExtraTreesClassifier()
model.fit(x_fs, y_fs)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=None, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [17]:
values=model.feature_importances_.tolist()
keys=x_fs.columns.tolist()
d = dict(zip(keys, values))
# sort pairs by values descending
s = [(k, d[k]) for k in sorted(d, key=d.get, reverse=True)]

In [18]:
print('\nSelected features by Wrapper model (classification):\n')
for k, v in s:
    print(k,'\t',v)


Selected features by Wrapper model (classification):

user_group_id_2.0 	 0.042712729186713386
age_level_3.0 	 0.035973824612533935
MODE(hist.user_id) 	 0.03488261010142997
campaign_id_359520 	 0.033727210777141234
age_level_4.0 	 0.031215308100149997
product_I 	 0.029089014351741365
minutes 	 0.028674324217029062
product_category_1_2 	 0.028243676322667065
hour 	 0.02797751866628529
city_development_index_3.0 	 0.027209619906709698
webpage_id_13787 	 0.025505532377725598
weekday_Wednesday 	 0.025352452480641254
MODE(hist.product)_H 	 0.02515952424780865
product_H 	 0.024755142365439374
campaign_id_405490 	 0.023073956419868816
product_category_1_5 	 0.02101926737420972
city_development_index_2.0 	 0.02049388626472003
product_C 	 0.020219560859486306
MODE(hist.product)_B 	 0.019835948572882723
age_level_5.0 	 0.019682710483198296
campaign_id_404347 	 0.01856376939302847
user_group_id_3.0 	 0.018389194332197788
weekday_Sunday 	 0.017440170705713402
age_level_2.0 	 0.01740929699120389
p

In [19]:
df_new = x_fs[['webpage_id_13787', 'weekday_Wednesday', 'MODE(hist.user_id)', 'weekday_Sunday', 'hour', 'minutes', 'product_category_1_2', 'age_level_3.0', 'product_H', 'product_I', 'age_level_2.0', 'user_group_id_4.0', 'weekday_Thursday', 'product_category_1_4', 'weekday_Monday', 'product_category_1_5', 'user_group_id_3.0', 'city_development_index_2.0', 'weekday_Tuesday', 'product_C', 'MODE(hist.product)_H', 'webpage_id_53587', 'campaign_id_118601', 'city_development_index_3.0', 'campaign_id_405490', 'campaign_id_360936', 'campaign_id_359520', 'city_development_index_4.0', 'var_1_1', 'product_category_1_3', 'MODE(hist.product)_B', 'product_D', 'age_level_5.0', 'webpage_id_51181', 'MODE(hist.DAY(DateTime))', 'campaign_id_404347', 'user_group_id_2.0', 'age_level_4.0', 'webpage_id_60305', 'campaign_id_396664', 'webpage_id_6970', 'webpage_id_45962', 'webpage_id_11085', 'webpage_id_28529', 'product_E']]

In [20]:
df_nb = pd.concat([df_new, y_fs], axis = 1)

In [21]:
df_nb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55146 entries, 0 to 55145
Data columns (total 46 columns):
webpage_id_13787              55146 non-null int64
weekday_Wednesday             55146 non-null int64
MODE(hist.user_id)            55146 non-null int64
weekday_Sunday                55146 non-null int64
hour                          55146 non-null int64
minutes                       55146 non-null int64
product_category_1_2          55146 non-null int64
age_level_3.0                 55146 non-null int64
product_H                     55146 non-null int64
product_I                     55146 non-null int64
age_level_2.0                 55146 non-null int64
user_group_id_4.0             55146 non-null int64
weekday_Thursday              55146 non-null int64
product_category_1_4          55146 non-null int64
weekday_Monday                55146 non-null int64
product_category_1_5          55146 non-null int64
user_group_id_3.0             55146 non-null int64
city_development_index_2

#Naive-Bayes Classification

In [22]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
y=df_nb['is_click']
x=df_nb.drop('is_click',axis=1)
clf = GaussianNB()
acc =cross_val_score(clf, x, y, cv=5, scoring='accuracy').mean()
print("Accuracy by N-fold Cross Validation:",acc)

Accuracy by N-fold Cross Validation: 0.5683101811888374


#Decision Trees

In [25]:
# Copying the Pre-processed data from the Naive Bayes
df_dt=df_nb.copy(deep=True)
display(HTML(df_dt.head(10).to_html()))

Unnamed: 0,webpage_id_13787,weekday_Wednesday,MODE(hist.user_id),weekday_Sunday,hour,minutes,product_category_1_2,age_level_3.0,product_H,product_I,age_level_2.0,user_group_id_4.0,weekday_Thursday,product_category_1_4,weekday_Monday,product_category_1_5,user_group_id_3.0,city_development_index_2.0,weekday_Tuesday,product_C,MODE(hist.product)_H,webpage_id_53587,campaign_id_118601,city_development_index_3.0,campaign_id_405490,campaign_id_360936,campaign_id_359520,city_development_index_4.0,var_1_1,product_category_1_3,MODE(hist.product)_B,product_D,age_level_5.0,webpage_id_51181,MODE(hist.DAY(DateTime)),campaign_id_404347,user_group_id_2.0,age_level_4.0,webpage_id_60305,campaign_id_396664,webpage_id_6970,webpage_id_45962,webpage_id_11085,webpage_id_28529,product_E,is_click
0,0,0,539151,0,13,22,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,1,1,0,0,0,6,0,0,0,1,0,0,0,0,0,0,0
1,0,0,541788,0,12,38,1,1,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,6,0,0,0,0,0,0,0,1,0,0,0
2,0,1,842864,0,19,3,0,1,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,6,0,0,0,0,0,0,0,1,0,0,0
3,1,0,1093941,1,17,3,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,5,0,1,0,0,0,0,0,0,0,0,0
4,0,0,1045160,1,12,34,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,5,0,1,0,0,0,1,0,0,0,0,1
5,1,0,359425,1,11,39,0,1,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0
6,0,0,20431,0,10,39,0,1,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0
7,0,1,206470,0,7,11,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,6,0,0,0,0,0,0,1,0,0,0,0
8,0,0,392133,0,8,50,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,6,0,1,0,0,0,0,0,0,1,0,0
9,0,0,281905,0,10,20,0,1,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,6,0,0,0,1,0,0,0,0,0,0,0


In [24]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
# by N-fold cross validation
y=df_dt['is_click']
x=df_dt.drop('is_click',axis=1)
acc=cross_val_score(clf, x, y, cv=5, scoring='accuracy').mean()
print("Tree Accuracy by N-fold Cross Validation:",acc)

Tree Accuracy by N-fold Cross Validation: 0.5683101811888374


#Random Forest

In [26]:
# Copying the Pre-processed data from the Naive Bayes
df_rf=df_nb.copy(deep=True)
display(HTML(df_rf.head(10).to_html()))

Unnamed: 0,webpage_id_13787,weekday_Wednesday,MODE(hist.user_id),weekday_Sunday,hour,minutes,product_category_1_2,age_level_3.0,product_H,product_I,age_level_2.0,user_group_id_4.0,weekday_Thursday,product_category_1_4,weekday_Monday,product_category_1_5,user_group_id_3.0,city_development_index_2.0,weekday_Tuesday,product_C,MODE(hist.product)_H,webpage_id_53587,campaign_id_118601,city_development_index_3.0,campaign_id_405490,campaign_id_360936,campaign_id_359520,city_development_index_4.0,var_1_1,product_category_1_3,MODE(hist.product)_B,product_D,age_level_5.0,webpage_id_51181,MODE(hist.DAY(DateTime)),campaign_id_404347,user_group_id_2.0,age_level_4.0,webpage_id_60305,campaign_id_396664,webpage_id_6970,webpage_id_45962,webpage_id_11085,webpage_id_28529,product_E,is_click
0,0,0,539151,0,13,22,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,1,1,0,0,0,6,0,0,0,1,0,0,0,0,0,0,0
1,0,0,541788,0,12,38,1,1,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,6,0,0,0,0,0,0,0,1,0,0,0
2,0,1,842864,0,19,3,0,1,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,6,0,0,0,0,0,0,0,1,0,0,0
3,1,0,1093941,1,17,3,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,5,0,1,0,0,0,0,0,0,0,0,0
4,0,0,1045160,1,12,34,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,5,0,1,0,0,0,1,0,0,0,0,1
5,1,0,359425,1,11,39,0,1,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0
6,0,0,20431,0,10,39,0,1,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0
7,0,1,206470,0,7,11,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,6,0,0,0,0,0,0,1,0,0,0,0
8,0,0,392133,0,8,50,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,6,0,1,0,0,0,0,0,0,1,0,0
9,0,0,281905,0,10,20,0,1,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,6,0,0,0,1,0,0,0,0,0,0,0


In [27]:
y=df_rf['is_click']
x=df_rf.drop('is_click',axis=1)
tree = DecisionTreeClassifier()
bag = BaggingClassifier(tree, n_estimators=100, max_samples=0.8, random_state=1)
acc=cross_val_score(bag, x, y, cv=5, scoring='accuracy').mean()
print("RandomForest Accuracy by N-fold Cross Validation:",acc)

RandomForest Accuracy by N-fold Cross Validation: 0.9539428524639005


In [None]:
#KNN Model

In [30]:
# Copying the Pre-processed data from the Naive Bayes
df_knn=df_nb.copy(deep=True)
display(HTML(df_knn.head(10).to_html()))

Unnamed: 0,webpage_id_13787,weekday_Wednesday,MODE(hist.user_id),weekday_Sunday,hour,minutes,product_category_1_2,age_level_3.0,product_H,product_I,age_level_2.0,user_group_id_4.0,weekday_Thursday,product_category_1_4,weekday_Monday,product_category_1_5,user_group_id_3.0,city_development_index_2.0,weekday_Tuesday,product_C,MODE(hist.product)_H,webpage_id_53587,campaign_id_118601,city_development_index_3.0,campaign_id_405490,campaign_id_360936,campaign_id_359520,city_development_index_4.0,var_1_1,product_category_1_3,MODE(hist.product)_B,product_D,age_level_5.0,webpage_id_51181,MODE(hist.DAY(DateTime)),campaign_id_404347,user_group_id_2.0,age_level_4.0,webpage_id_60305,campaign_id_396664,webpage_id_6970,webpage_id_45962,webpage_id_11085,webpage_id_28529,product_E,is_click
0,0,0,539151,0,13,22,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,1,1,0,0,0,6,0,0,0,1,0,0,0,0,0,0,0
1,0,0,541788,0,12,38,1,1,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,6,0,0,0,0,0,0,0,1,0,0,0
2,0,1,842864,0,19,3,0,1,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,6,0,0,0,0,0,0,0,1,0,0,0
3,1,0,1093941,1,17,3,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,5,0,1,0,0,0,0,0,0,0,0,0
4,0,0,1045160,1,12,34,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,5,0,1,0,0,0,1,0,0,0,0,1
5,1,0,359425,1,11,39,0,1,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0
6,0,0,20431,0,10,39,0,1,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0
7,0,1,206470,0,7,11,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,6,0,0,0,0,0,0,1,0,0,0,0
8,0,0,392133,0,8,50,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,6,0,1,0,0,0,0,0,0,1,0,0
9,0,0,281905,0,10,20,0,1,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,6,0,0,0,1,0,0,0,0,0,0,0


In [31]:
df_knn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55146 entries, 0 to 55145
Data columns (total 46 columns):
webpage_id_13787              55146 non-null int64
weekday_Wednesday             55146 non-null int64
MODE(hist.user_id)            55146 non-null int64
weekday_Sunday                55146 non-null int64
hour                          55146 non-null int64
minutes                       55146 non-null int64
product_category_1_2          55146 non-null int64
age_level_3.0                 55146 non-null int64
product_H                     55146 non-null int64
product_I                     55146 non-null int64
age_level_2.0                 55146 non-null int64
user_group_id_4.0             55146 non-null int64
weekday_Thursday              55146 non-null int64
product_category_1_4          55146 non-null int64
weekday_Monday                55146 non-null int64
product_category_1_5          55146 non-null int64
user_group_id_3.0             55146 non-null int64
city_development_index_2

In [33]:
# find numeric columns
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
cols_numeric = df_knn.select_dtypes(include=numerics).columns.tolist()
print('Selected numerical columns:\n',cols_numeric)    

# min-max normalization to scale [0, 1]
for col in cols_numeric:
    df_knn[col]=(df_knn[col]-df_knn[col].min())/(df_knn[col].max()-df_knn[col].min())

display(HTML(df_knn.head(10).to_html()))

Selected numerical columns:
 ['webpage_id_13787', 'weekday_Wednesday', 'MODE(hist.user_id)', 'weekday_Sunday', 'hour', 'minutes', 'product_category_1_2', 'age_level_3.0', 'product_H', 'product_I', 'age_level_2.0', 'user_group_id_4.0', 'weekday_Thursday', 'product_category_1_4', 'weekday_Monday', 'product_category_1_5', 'user_group_id_3.0', 'city_development_index_2.0', 'weekday_Tuesday', 'product_C', 'MODE(hist.product)_H', 'webpage_id_53587', 'campaign_id_118601', 'city_development_index_3.0', 'campaign_id_405490', 'campaign_id_360936', 'campaign_id_359520', 'city_development_index_4.0', 'var_1_1', 'product_category_1_3', 'MODE(hist.product)_B', 'product_D', 'age_level_5.0', 'webpage_id_51181', 'MODE(hist.DAY(DateTime))', 'campaign_id_404347', 'user_group_id_2.0', 'age_level_4.0', 'webpage_id_60305', 'campaign_id_396664', 'webpage_id_6970', 'webpage_id_45962', 'webpage_id_11085', 'webpage_id_28529', 'product_E', 'is_click']


Unnamed: 0,webpage_id_13787,weekday_Wednesday,MODE(hist.user_id),weekday_Sunday,hour,minutes,product_category_1_2,age_level_3.0,product_H,product_I,age_level_2.0,user_group_id_4.0,weekday_Thursday,product_category_1_4,weekday_Monday,product_category_1_5,user_group_id_3.0,city_development_index_2.0,weekday_Tuesday,product_C,MODE(hist.product)_H,webpage_id_53587,campaign_id_118601,city_development_index_3.0,campaign_id_405490,campaign_id_360936,campaign_id_359520,city_development_index_4.0,var_1_1,product_category_1_3,MODE(hist.product)_B,product_D,age_level_5.0,webpage_id_51181,MODE(hist.DAY(DateTime)),campaign_id_404347,user_group_id_2.0,age_level_4.0,webpage_id_60305,campaign_id_396664,webpage_id_6970,webpage_id_45962,webpage_id_11085,webpage_id_28529,product_E,is_click
0,0.0,0.0,0.472276,0.0,0.565217,0.372881,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.474586,0.0,0.521739,0.644068,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,1.0,0.738326,0.0,0.826087,0.050847,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1.0,0.0,0.958268,1.0,0.73913,0.050847,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.833333,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.915536,1.0,0.521739,0.576271,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.833333,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
5,1.0,0.0,0.314837,1.0,0.478261,0.661017,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.833333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.017881,0.0,0.434783,0.661017,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,1.0,0.18085,0.0,0.304348,0.186441,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.343489,0.0,0.347826,0.847458,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9,0.0,0.0,0.24693,0.0,0.434783,0.338983,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
y=df_knn['is_click']
x=df_knn.drop('is_click',axis=1)

In [36]:
##########Building KNN Model using N-fold cross evaluations ###############################################
from sklearn.model_selection import cross_val_score
from sklearn import neighbors
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
for k in range(1, 20, 2): 
    clf=neighbors.KNeighborsClassifier(k, weights='uniform')
    acc=cross_val_score(clf, x, y, cv=5, scoring='accuracy').mean()
    print('K =', k, ', Accuracy: ',acc)

K = 1 , Accuracy:  0.9169497374358387
K = 3 , Accuracy:  0.9330347050333595
K = 5 , Accuracy:  0.9330166154165891
K = 7 , Accuracy:  0.9304779065633969
K = 9 , Accuracy:  0.9279572281353555
K = 11 , Accuracy:  0.925726818835168
K = 13 , Accuracy:  0.9237139683344562
K = 15 , Accuracy:  0.9229704821256105
K = 17 , Accuracy:  0.921936858325844
K = 19 , Accuracy:  0.9204861670828166
