In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/avazu-ctr-prediction/sampleSubmission.gz
/kaggle/input/avazu-ctr-prediction/train.gz
/kaggle/input/avazu-ctr-prediction/test.gz


## Introduction
In online advertising, click-through rate (CTR) is a very important metric for evaluating ad performance. As a result, click prediction systems are essential and widely used for sponsored search and real-time bidding.

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import gzip
import random
import warnings
warnings.filterwarnings('ignore')

In [3]:
n = 40428967  #total number of records in the clickstream data 
sample_size = 3000000
skip_values = sorted(random.sample(range(1,n), n-sample_size))

with gzip.open('../input/avazu-ctr-prediction/train.gz') as f:
    train = pd.read_csv(f,skiprows = skip_values)
train['hour'] = pd.to_datetime(train['hour'],format = '%y%m%d%H')

As the dataset is huge, we are taking 3000000 samples for the training and testing. The 'hour' feature is changed to datetime.

In [4]:
train.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,10001868339616595934,0,2014-10-21,1005,1,e151e245,7e091613,f028772b,ecad2386,7801e8d9,...,1,0,17747,320,50,1974,2,39,100019,33
1,10004670021948955159,0,2014-10-21,1005,0,543a539e,c7ca3108,3e814130,ecad2386,7801e8d9,...,1,0,20366,320,50,2333,0,39,-1,157
2,10005334911727438633,0,2014-10-21,1010,1,85f751fd,c4e18dd6,50e219e0,ffc6ffd0,7801e8d9,...,4,0,21665,320,50,2493,3,35,-1,117
3,10006789981076459409,0,2014-10-21,1005,0,030440fe,08ba7db9,76b2941d,ecad2386,7801e8d9,...,1,0,20596,320,50,2161,0,35,-1,157
4,10011406079394798455,0,2014-10-21,1005,0,543a539e,c7ca3108,3e814130,ecad2386,7801e8d9,...,1,0,20362,320,50,2333,0,39,-1,157


## Exploratory Data Analysis

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000000 entries, 0 to 2999999
Data columns (total 24 columns):
 #   Column            Dtype         
---  ------            -----         
 0   id                uint64        
 1   click             int64         
 2   hour              datetime64[ns]
 3   C1                int64         
 4   banner_pos        int64         
 5   site_id           object        
 6   site_domain       object        
 7   site_category     object        
 8   app_id            object        
 9   app_domain        object        
 10  app_category      object        
 11  device_id         object        
 12  device_ip         object        
 13  device_model      object        
 14  device_type       int64         
 15  device_conn_type  int64         
 16  C14               int64         
 17  C15               int64         
 18  C16               int64         
 19  C17               int64         
 20  C18               int64         
 21  C19     

The dataset contains 24 columns with object and numerical features.

### Checking Null Values

In [6]:
train.isnull().sum()

id                  0
click               0
hour                0
C1                  0
banner_pos          0
site_id             0
site_domain         0
site_category       0
app_id              0
app_domain          0
app_category        0
device_id           0
device_ip           0
device_model        0
device_type         0
device_conn_type    0
C14                 0
C15                 0
C16                 0
C17                 0
C18                 0
C19                 0
C20                 0
C21                 0
dtype: int64

The dataset contains no null values. So there is no need of imputation or dropping the null values 

In [7]:
train.describe(include = 'all')

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
count,3000000.0,3000000.0,3000000,3000000.0,3000000.0,3000000,3000000,3000000,3000000,3000000,...,3000000.0,3000000.0,3000000.0,3000000.0,3000000.0,3000000.0,3000000.0,3000000.0,3000000.0,3000000.0
unique,,,240,,,3269,3890,24,4533,280,...,,,,,,,,,,
top,,,2014-10-22 09:00:00,,,85f751fd,c4e18dd6,50e219e0,ecad2386,7801e8d9,...,,,,,,,,,,
freq,,,33282,,,1082719,1122548,1226535,1917281,2021519,...,,,,,,,,,,
first,,,2014-10-21 00:00:00,,,,,,,,...,,,,,,,,,,
last,,,2014-10-30 23:00:00,,,,,,,,...,,,,,,,,,,
mean,9.223949e+18,0.1693163,,1004.968,0.2883013,,,,,,...,1.015477,0.3308113,18840.16,318.8885,60.09051,2112.438,1.431823,227.2713,53214.13,83.36979
std,5.327451e+18,0.3750311,,1.094143,0.5066193,,,,,,...,0.5274422,0.8540812,4962.511,21.31097,47.22028,609.691,1.326066,351.0099,49957.0,70.277
min,1398959000000.0,0.0,,1001.0,0.0,,,,,,...,0.0,0.0,375.0,120.0,20.0,112.0,0.0,33.0,-1.0,1.0
25%,4.608505e+18,0.0,,1005.0,0.0,,,,,,...,1.0,0.0,16920.0,320.0,50.0,1863.0,0.0,35.0,-1.0,23.0


#### id

In [None]:
train.drop('id',axis=1,inplace = True)

### Click

In [None]:
sns.countplot(x='click',data=train)
plt.show()
print(train.click.value_counts()/len(train))

#### Banner Position

In [None]:
train.groupby(['banner_pos', 'click']).size().unstack().plot(kind='barh')
train.groupby(['banner_pos','click']).size().unstack().iloc[2:,:].plot(kind='barh')

#### Site Features

In [None]:
site_features = ['site_id', 'site_domain', 'site_category']
train[site_features].describe()

In [8]:
X = train.drop('click',axis=1)
y = train.click

The 'click' column is dropped and labelled as X and y is a Series containing the values for click.


## Splitting the Data

The data is now split into train and test sets. The training set consists of 70% of the total data and remaining is the test set.

In [9]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3,random_state = 42)

Here, the numerical columns and categorical columns are taken seperately for scaling and Hashing.

In [10]:
num_cols = X.select_dtypes(include = ['int','float']).columns.tolist()
categorical_cols = X.select_dtypes(include = ['object']).columns.tolist()
print(num_cols)
print(categorical_cols)

['C1', 'banner_pos', 'device_type', 'device_conn_type', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']
['site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category', 'device_id', 'device_ip', 'device_model']


## Hashing


A hash function is a function that maps a set of objects to a set of integers. When using a hash function, this mapping is performed which takes a key of arbitrary length as input and outputs an integer in a specific range. The purposes of the hashing is to minimize memory consumption by the features.

In [11]:
for col in categorical_cols:
	X_train[col] = X_train[col].apply(lambda x: hash(x))
    
for col in categorical_cols:
    X_test[col] = X_test[col].apply(lambda x:hash(x))

## Scaling the Numerical Values

StandardScaler is used to scale the values and make mean=0 and standard deviation = 1. MinMaxScaler can also be used.

In [12]:
from sklearn.preprocessing import StandardScaler
std = StandardScaler()
X_train[num_cols] = std.fit_transform(X_train[num_cols])
X_test[num_cols] = std.transform(X_test[num_cols])

To make the model give good accuracy, we can also create new features from exisiting features and can drop the old features.

In [13]:
X_train['user_info'] = X_train.device_ip + X_train.device_model + X_train.device_id
X_train = X_train.drop(['device_id','device_ip','device_model','id','hour'],axis=1)
    
X_train['devtry'] = X_train.device_type + X_train.banner_pos + X_train.device_conn_type
X_train = X_train.drop(['banner_pos','device_conn_type','device_type'],axis=1)

X_test['user_info'] = X_test.device_ip + X_test.device_model + X_test.device_id
X_test = X_test.drop(['device_id','device_ip','device_model','id','hour'],axis=1)
    
X_test['devtry'] = X_test.device_type + X_test.banner_pos + X_test.device_conn_type
X_test = X_test.drop(['banner_pos','device_conn_type','device_type'],axis=1)

## Model - Decision Tree Classifier

In [14]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth = 10)
tree.fit(X_train,y_train)
print('Train Score:',tree.score(X_train,y_train))
print('Test Score:',tree.score(X_test,y_test))

Train Score: 0.8345685714285714
Test Score: 0.8330722222222222


Here the train scores and test scores are 0.835 and 0.833, respectively. As the scores are nearly equal, there is no problem with overfitting. This score can be further increased with hyperparameter tuning. 
But Accuracy score is not used for evaluating classification problems. We should use the confusion matrix and ROC-AUC score.

In [15]:
from sklearn.metrics import roc_curve,confusion_matrix,precision_score,recall_score,roc_auc_score
y_score = tree.predict_proba(X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_score[:, 1])
roc_auc_score = roc_auc_score(y_test,y_score[:,1])
print(roc_auc_score)

0.7139549159293653


Here, AUC-ROC Score is 0.714, which is less than the average ROC-AUC Score. By the confusion matrix, we can get a clear idea regarding prediction.

In [18]:
matrix = confusion_matrix(y_test,y_pred)
tn, fp, fn, tp = matrix.ravel()
print(matrix)

[[740592   6284]
 [143951   9173]]


The false negatives are huge. False Negatives depicts that, in reality, the customer has clicked on the ad. But our classifier is predicting as not click. This might impact the business decisions a lot, as this model predicts less CTR than real.

Generally, False positives are dealt with Recall. Recall is the total true positives divided by (True Positives + False Negatives). So, more the recall, less the false positives.

In [20]:
y_pred = tree.predict(X_test)
precision = precision_score(y_test, y_pred, average = 'weighted')
recall = recall_score(y_test, y_pred, average = 'weighted')
print("Precision: %s, Recall: %s" %(precision, recall))

Precision: 0.7957787425699582, Recall: 0.8330722222222222


## Hyperparameter Tuning

Here, we will start with maximum depth of the tree then continue with cross validation score.

In [21]:
for max_depth_val in [2, 3, 5, 10, 15, 20]:
    clf = DecisionTreeClassifier(max_depth = max_depth_val)
    print("Evaluating tree with max_depth = %s" %(max_depth_val))
    y_pred = tree.fit(X_train,y_train).predict(X_test) 
    print("Confusion matrix: ")
    print(confusion_matrix(y_test, y_pred))
    prec = precision_score(y_test, y_pred, average = 'weighted')
    recall = recall_score(y_test, y_pred, average = 'weighted')
    print("Precision: %s, Recall: %s" %(prec, recall))

Evaluating tree with max_depth = 2
Confusion matrix: 
[[740595   6281]
 [143952   9172]]
Precision: 0.7957935407939766, Recall: 0.8330744444444445
Evaluating tree with max_depth = 3
Confusion matrix: 
[[740592   6284]
 [143949   9175]]
Precision: 0.795789262284574, Recall: 0.8330744444444445
Evaluating tree with max_depth = 5
Confusion matrix: 
[[740593   6283]
 [143953   9171]]
Precision: 0.7957749061507491, Recall: 0.8330711111111111
Evaluating tree with max_depth = 10
Confusion matrix: 
[[740596   6280]
 [143949   9175]]
Precision: 0.7958160076304498, Recall: 0.8330788888888889
Evaluating tree with max_depth = 15
Confusion matrix: 
[[740593   6283]
 [143948   9176]]
Precision: 0.7958012062113731, Recall: 0.8330766666666667
Evaluating tree with max_depth = 20
Confusion matrix: 
[[740591   6285]
 [143949   9175]]
Precision: 0.795782578060327, Recall: 0.8330733333333333


So, the maximum depth with 10, is the best option as it is having good recall score than other values.

In [22]:
from sklearn.model_selection import KFold,cross_val_score
for max_depth_val in [3, 5, 10]:
    k_fold = KFold(n_splits = 4)
    clf = DecisionTreeClassifier(max_depth = max_depth_val)
    print("Evaluating Decision Tree for max_depth = %s" %(max_depth_val))
    y_pred = tree.fit(X_train, y_train).predict(X_test) 
  
    cv_precision = cross_val_score(clf, X_train, y_train, cv = k_fold, scoring = 'recall_weighted')
    precision = recall_score(y_test, y_pred, average = 'weighted')
    print("Cross validation Recall: %s" %(cv_precision.mean()))
    print("Test Recall: %s" %(precision.mean()))

Evaluating Decision Tree for max_depth = 3
Cross validation Recall: 0.8310357142857143
Test Recall: 0.8330722222222222
Evaluating Decision Tree for max_depth = 5
Cross validation Recall: 0.8321076190476191
Test Recall: 0.8330711111111111
Evaluating Decision Tree for max_depth = 10
Cross validation Recall: 0.8340657142857144
Test Recall: 0.8330744444444445


## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'bootstrap': [True],
    'max_depth': [10], 
    'n_estimators':[2,5,10,20,50],
    'min_samples_split': [2, 3, 4],
    'max_features' : ['log2']
}

rf = RandomForestClassifier(random_state=42)

rf_search = RandomizedSearchCV(estimator = rf, param_distributions=param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2, n_iter = 10)

model = rf_search.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


In [None]:
rf_search.best_params_

So, we got the best parameters for the Random Forest Classifier.

In [None]:
clf_rf = RandomForestClassifier(n_estimators= 5,
 min_samples_split= 2,
 max_features= 'log2',
 max_depth= 10,
 bootstrap= True,
 random_state=42)
clf_rf.fit(X_train,y_train)

In [None]:
y_test_pred = clf_rf.predict(X_test)

print(confusion_matrix(y_test, y_test_pred))
prec = precision_score(y_test, y_test_pred, average = 'weighted')
recall = recall_score(y_test, y_test_pred, average = 'weighted')
print("Precision: %s, Recall: %s" %(prec, recall))

In [None]:
rf_prob = clf_rf.predict_proba(X_test)
rf_fpr,rf_tpr,_ = roc_curve(y_test,rf_prob[:,1])

## XGBoost Classifier

In [None]:
import xgboost as xgb
from sklearn.metrics import roc_curve,auc,confusion_matrix,precision_score,recall_score,roc_auc_score
params = {
    "objective": "binary:logistic",
    "booster" : "gbtree",
    "eval_metric": "logloss",
    "eta":0.1,
    "max_depth": 8,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "silent": 1,
}
xgclf=xgb.XGBClassifier(**params)
xgclf.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='logloss',
        verbose=False)
xgpred=xgclf.predict_proba(X_test)

In [None]:
from sklearn import metrics 
#y_score = xgclf.predict_proba(x_test)
#fpr, tpr, thresholds = roc_curve(y_test, y_score[:, 1])
roc_auc_score = metrics.roc_auc_score(y_test,xgpred[:,1])
print(roc_auc_score)
print(xgclf.score(X_test,y_test))
xgb_fpr,xgb_tpr,_ = roc_curve(y_test,xgpred[:,1])

In [None]:
from sklearn.model_selection import KFold,cross_val_score
# Set up k-fold
k_fold = KFold(n_splits = 5)

# Evaluate precision and recall for each fold
precision = cross_val_score(
  xgclf, X_train, y_train, cv = k_fold, scoring = 'precision_weighted')
recall = cross_val_score(
  xgclf, X_train, y_train, cv = k_fold, scoring = 'recall_weighted')
print("Precision scores: %s" %(precision.mean())) 
print("Recall scores: %s" %(recall.mean()))
print(k_fold)

In [None]:
random_probs = [0 for i in range(len(y_test))]
p_fpr, p_tpr, _ = roc_curve(y_test, random_probs, pos_label=1)

## ROC Curve

In [None]:
plt.style.use('seaborn')
plt.plot(fpr,tpr,linestyle = '--',color = 'green',label='Decision Tree')
plt.plot(rf_fpr,rf_tpr,linestyle = '--',color = 'yellow',label='Random Forest')
plt.plot(xgb_fpr,xgb_tpr,linestyle = '--',color = 'orange',label='XGBoost')
plt.plot(p_fpr,p_tpr,linestyle='--',color = 'blue')
plt.legend()
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('AUC-ROC Curve')

So, out of the 3 classifiers, XGBoost is giving the best AUC-ROC Score.