Benchmarking Notebook
============

In [1]:
import warnings
warnings.simplefilter(action='ignore')

import pandas as pd
import numpy as np
import os
import time
import random
#import resource

from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model

import statsmodels.api as sm
import statsmodels.formula.api as smf


# For Visualization
import matplotlib.pyplot as plt
#displays better in jupyter notebooks
%matplotlib inline

**File descriptions**

    train - Training set. 10 days of click-through data, ordered chronologically. Non-clicks and clicks are subsampled according to different strategies.
    test - Test set. 1 day of ads for testing your model predictions. 
    sampleSubmission.csv - Sample submission file in the correct format, corresponds to the All-0.5 Benchmark.

**Data fields**

    id: ad identifier
    click: 0/1 for non-click/click
    hour: format is YYMMDDHH, so 14091123 means 23:00 on Sept. 11, 2014 UTC.
    C1 -- anonymized categorical variable
    banner_pos
    site_id
    site_domain
    site_category
    app_id
    app_domain
    app_category
    device_id
    device_ip
    device_model
    device_type
    device_conn_type
    C14-C21 -- anonymized categorical variables


**Data Source(s):**

https://www.kaggle.com/c/avazu-ctr-prediction/data

https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#avazu


In [2]:
# Load both CSVs from the current working directory. os.path.join picks the
# right separator for the host OS (the previous '\\' only worked on Windows).
# NOTE(review): `id` is inferred as float64 (see train.info() below), which cannot
# hold the ~20-digit ids exactly; pass dtype={'id': str} if ids are needed downstream.
train=pd.read_csv(os.path.join(os.getcwd(),'train.csv'))
test=pd.read_csv(os.path.join(os.getcwd(),'test.csv'))

In [3]:
# Preview the first five rows of the training data.
train.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1.000009e+18,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,2,15706,320,50,1722,0,35,-1,79
1,1.000017e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
2,1.000037e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
3,1.000064e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15706,320,50,1722,0,35,100084,79
4,1.000068e+19,0,14102100,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,...,1,0,18993,320,50,2161,0,35,-1,157


In [4]:
# Row count, column dtypes and memory usage of the training data.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40428967 entries, 0 to 40428966
Data columns (total 24 columns):
id                  float64
click               int64
hour                int64
C1                  int64
banner_pos          int64
site_id             object
site_domain         object
site_category       object
app_id              object
app_domain          object
app_category        object
device_id           object
device_ip           object
device_model        object
device_type         int64
device_conn_type    int64
C14                 int64
C15                 int64
C16                 int64
C17                 int64
C18                 int64
C19                 int64
C20                 int64
C21                 int64
dtypes: float64(1), int64(14), object(9)
memory usage: 7.2+ GB


In [5]:
# Preview the first five rows of the test data (same columns, without `click`).
test.head()

Unnamed: 0,id,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1.000017e+19,14103100,1005,0,235ba823,f6ebf28e,f028772b,ecad2386,7801e8d9,07d7df22,...,1,0,8330,320,50,761,3,175,100075,23
1,1.000018e+19,14103100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,1,0,22676,320,50,2616,0,35,100083,51
2,1.000055e+19,14103100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,1,0,22676,320,50,2616,0,35,100083,51
3,1.000109e+19,14103100,1005,0,85f751fd,c4e18dd6,50e219e0,51cedd4e,aefc06bd,0f2161f8,...,1,0,18648,320,50,1092,3,809,100156,61
4,1.000138e+19,14103100,1005,0,85f751fd,c4e18dd6,50e219e0,9c13b419,2347f47a,f95efa07,...,1,0,23160,320,50,2667,0,47,-1,221


In [6]:
# Row count, column dtypes and memory usage of the test data.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4577464 entries, 0 to 4577463
Data columns (total 23 columns):
id                  float64
hour                int64
C1                  int64
banner_pos          int64
site_id             object
site_domain         object
site_category       object
app_id              object
app_domain          object
app_category        object
device_id           object
device_ip           object
device_model        object
device_type         int64
device_conn_type    int64
C14                 int64
C15                 int64
C16                 int64
C17                 int64
C18                 int64
C19                 int64
C20                 int64
C21                 int64
dtypes: float64(1), int64(13), object(9)
memory usage: 803.2+ MB


Site **85f751fd** has the greatest number of observations

In [7]:
# Five most frequent site_id values, largest count first.
train['site_id'].value_counts(ascending=False).head()

85f751fd    14596137
1fbe01fe     6486150
e151e245     2637747
d9750ee7      963745
5b08c53b      913325
Name: site_id, dtype: int64

App **ecad2386** has the greatest number of observations

In [8]:
# Five most frequent app_id values, largest count first.
train['app_id'].value_counts(ascending=False).head()

ecad2386    25832830
92f5800b     1555283
e2fcccd2     1129016
febd1138      759098
9c13b419      757812
Name: app_id, dtype: int64

_________________________

Data Preprocessing
=====

Narrowing focus to a particular site & app
------

In [9]:
#Conversion list from Hex to Integers for modeling
convList=['site_id',
          'site_domain',
          'site_category',
          'app_id',
          'app_domain',
          'app_category',
          'device_id',
          'device_ip',
          'device_model',
         ]

#Conversion function
def convHexInt(dataframe, columns=None):
    """Convert hexadecimal string columns of ``dataframe`` to integers.

    The frame is modified in place and the same object is returned, so the
    call can be used either for its side effect or for its return value.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose selected columns hold hexadecimal strings (e.g. '85f751fd').
    columns : iterable of str, optional
        Names of the columns to convert. Defaults to the module-level
        ``convList`` when omitted.

    Returns
    -------
    pandas.DataFrame
        The ``dataframe`` that was passed in, with the selected columns
        replaced by their base-16 integer values.

    Raises
    ------
    KeyError
        If one of the requested columns is not present in ``dataframe``.
    ValueError, TypeError
        Propagated from ``int(value, 16)`` when a value is not a hex string
        (this includes columns that were already converted).
    """
    if columns is None:
        columns = convList
    for col in columns:
        # Parse each distinct code once and map it back onto the column,
        # instead of calling int(x, 16) separately for every row.
        codes = {hex_str: int(hex_str, 16) for hex_str in dataframe[col].unique()}
        dataframe[col] = dataframe[col].map(codes)
    return dataframe

Restricting data to *site:* **85f751fd**

**Train**

In [10]:
# Training rows served on site 85f751fd (ids still in their hex form).
Train_85f751fd = train.loc[train['site_id'].eq('85f751fd')]
Train_85f751fd.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
14,1.000252e+19,0,14102100,1005,0,85f751fd,c4e18dd6,50e219e0,98fed791,d9b5648e,...,1,0,20984,320,50,2371,0,551,-1,46
19,1.000448e+19,0,14102100,1005,0,85f751fd,c4e18dd6,50e219e0,66a5f0f3,d9b5648e,...,1,0,21234,320,50,2434,3,163,100088,61
25,1.000533e+19,0,14102100,1010,1,85f751fd,c4e18dd6,50e219e0,ffc6ffd0,7801e8d9,...,4,0,21665,320,50,2493,3,35,-1,117
27,1.000561e+19,1,14102100,1005,0,85f751fd,c4e18dd6,50e219e0,54c5d545,2347f47a,...,1,0,21611,320,50,2480,3,297,100111,61
30,1.000619e+19,0,14102100,1005,0,85f751fd,c4e18dd6,50e219e0,685d1c4c,2347f47a,...,1,3,15708,320,50,1722,0,35,-1,79


Convert Hex to Integers

In [11]:
# Re-select the site's training rows from `train` and convert their hex columns to integers.
Train_85f751fd = convHexInt(train.loc[train['site_id'].eq('85f751fd')])
Train_85f751fd.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
14,1.000252e+19,0,14102100,1005,0,2247578109,3303116246,1356995040,2566838161,3652543630,...,1,0,20984,320,50,2371,0,551,-1,46
19,1.000448e+19,0,14102100,1005,0,2247578109,3303116246,1356995040,1722151155,3652543630,...,1,0,21234,320,50,2434,3,163,100088,61
25,1.000533e+19,0,14102100,1010,1,2247578109,3303116246,1356995040,4291231696,2013391065,...,4,0,21665,320,50,2493,3,35,-1,117
27,1.000561e+19,1,14102100,1005,0,2247578109,3303116246,1356995040,1422251333,591918202,...,1,0,21611,320,50,2480,3,297,100111,61
30,1.000619e+19,0,14102100,1005,0,2247578109,3303116246,1356995040,1750932556,591918202,...,1,3,15708,320,50,1722,0,35,-1,79


**Test**

In [12]:
# Test rows served on site 85f751fd (ids still in their hex form).
Test_85f751fd = test.loc[test['site_id'].eq('85f751fd')]
Test_85f751fd.head()

Unnamed: 0,id,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
3,1.000109e+19,14103100,1005,0,85f751fd,c4e18dd6,50e219e0,51cedd4e,aefc06bd,0f2161f8,...,1,0,18648,320,50,1092,3,809,100156,61
4,1.000138e+19,14103100,1005,0,85f751fd,c4e18dd6,50e219e0,9c13b419,2347f47a,f95efa07,...,1,0,23160,320,50,2667,0,47,-1,221
7,1.000198e+19,14103100,1005,0,85f751fd,c4e18dd6,50e219e0,388d9bfb,2347f47a,cef3e649,...,1,3,23214,300,250,2675,3,939,100058,100
11,1.000225e+19,14103100,1005,0,85f751fd,c4e18dd6,50e219e0,2d869bee,d9b5648e,0f2161f8,...,1,0,23866,320,50,2736,0,33,100170,246
12,1.000227e+19,14103100,1010,7,85f751fd,c4e18dd6,50e219e0,643e0f88,7801e8d9,0f2161f8,...,4,2,21791,320,480,2513,3,1059,-1,68


Convert Hex to Integers

In [13]:
# Re-select the site's test rows from `test` and convert their hex columns to integers.
Test_85f751fd = convHexInt(test.loc[test['site_id'].eq('85f751fd')])
Test_85f751fd.head()

Unnamed: 0,id,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
3,1.000109e+19,14103100,1005,0,2247578109,3303116246,1356995040,1372511566,2935752381,253846008,...,1,0,18648,320,50,1092,3,809,100156,61
4,1.000138e+19,14103100,1005,0,2247578109,3303116246,1356995040,2618536985,591918202,4183751175,...,1,0,23160,320,50,2667,0,47,-1,221
7,1.000198e+19,14103100,1005,0,2247578109,3303116246,1356995040,948804603,591918202,3472090697,...,1,3,23214,300,250,2675,3,939,100058,100
11,1.000225e+19,14103100,1005,0,2247578109,3303116246,1356995040,763796462,3652543630,253846008,...,1,0,23866,320,50,2736,0,33,100170,246
12,1.000227e+19,14103100,1010,7,2247578109,3303116246,1356995040,1681788808,2013391065,253846008,...,4,2,21791,320,480,2513,3,1059,-1,68


Restricting data to *app:* **ecad2386**

**Train**

In [14]:
# Training rows belonging to app ecad2386 (ids still in their hex form).
Train_ecad2386 = train.loc[train['app_id'].eq('ecad2386')]
Train_ecad2386.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1.000009e+18,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,2,15706,320,50,1722,0,35,-1,79
1,1.000017e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
2,1.000037e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
3,1.000064e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15706,320,50,1722,0,35,100084,79
4,1.000068e+19,0,14102100,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,...,1,0,18993,320,50,2161,0,35,-1,157


Convert Hex to Integers

In [15]:
# Re-select the app's training rows from `train` and convert their hex columns to integers.
Train_ecad2386 = convHexInt(train.loc[train['app_id'].eq('ecad2386')])
Train_ecad2386.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1.000009e+18,0,14102100,1005,0,532546046,4085536615,680550077,3970769798,2013391065,...,1,2,15706,320,50,1722,0,35,-1,79
1,1.000017e+19,0,14102100,1005,0,532546046,4085536615,680550077,3970769798,2013391065,...,1,0,15704,320,50,1722,0,35,100084,79
2,1.000037e+19,0,14102100,1005,0,532546046,4085536615,680550077,3970769798,2013391065,...,1,0,15704,320,50,1722,0,35,100084,79
3,1.000064e+19,0,14102100,1005,0,532546046,4085536615,680550077,3970769798,2013391065,...,1,0,15706,320,50,1722,0,35,100084,79
4,1.000068e+19,0,14102100,1005,1,4270638152,2439430497,90831144,3970769798,2013391065,...,1,0,18993,320,50,2161,0,35,-1,157


**Test**

In [16]:
# Test rows belonging to app ecad2386 (ids still in their hex form).
Test_ecad2386 = test.loc[test['app_id'].eq('ecad2386')]
Test_ecad2386.head()

Unnamed: 0,id,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1.000017e+19,14103100,1005,0,235ba823,f6ebf28e,f028772b,ecad2386,7801e8d9,07d7df22,...,1,0,8330,320,50,761,3,175,100075,23
1,1.000018e+19,14103100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,1,0,22676,320,50,2616,0,35,100083,51
2,1.000055e+19,14103100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,1,0,22676,320,50,2616,0,35,100083,51
5,1.000152e+19,14103100,1005,1,57fe1b20,5b626596,f028772b,ecad2386,7801e8d9,07d7df22,...,1,0,6563,320,50,572,2,39,-1,32
6,1.000191e+19,14103100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,...,1,0,22813,320,50,2647,2,39,100148,23


Convert Hex to Integers

In [17]:
# Re-select the app's test rows from `test` and convert their hex columns to integers.
Test_ecad2386 = convHexInt(test.loc[test['app_id'].eq('ecad2386')])
Test_ecad2386.head()

Unnamed: 0,id,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1.000017e+19,14103100,1005,0,593209379,4142658190,4029183787,3970769798,2013391065,131587874,...,1,0,8330,320,50,761,3,175,100075,23
1,1.000018e+19,14103100,1005,0,532546046,4085536615,680550077,3970769798,2013391065,131587874,...,1,0,22676,320,50,2616,0,35,100083,51
2,1.000055e+19,14103100,1005,0,532546046,4085536615,680550077,3970769798,2013391065,131587874,...,1,0,22676,320,50,2616,0,35,100083,51
5,1.000152e+19,14103100,1005,1,1476270880,1533175190,4029183787,3970769798,2013391065,131587874,...,1,0,6563,320,50,572,2,39,-1,32
6,1.000191e+19,14103100,1005,0,532546046,4085536615,680550077,3970769798,2013391065,131587874,...,1,0,22813,320,50,2647,2,39,100148,23


In [18]:
# Rebind both names to empty lists -- presumably to let the full raw frames be
# released now that only the per-site / per-app subsets are used.
train, test = [], []

______________

**Logistic Regression (Batch Gradient Descent)**
================

**Outcome**

*The outcome we are trying to predict*

In [19]:
# One-column frame holding the click outcome for the site subset
# (copied so it is independent of Train_85f751fd).
y = Train_85f751fd[['click']].copy()
y.head()

Unnamed: 0,click
14,0
19,0
25,0
27,1
30,0


**Features**

*The variables we are going to use to predict the outcome*

*No explicit intercept column is kept in `X`: the `click` column is dropped together with `site_id` and `site_domain`.*

In [20]:
# Feature matrix: every column after `id`, minus the outcome (`click`) and the
# two site columns (site_id is constant after the site filter; site_domain is
# presumably constant as well -- verify).
# NOTE: X carries no explicit intercept column. The earlier `X['click']=1`
# "intercept" was dropped again on the very next statement, so it never reached
# the models. LogisticRegression() fits its own intercept by default; sm.Logit
# does not add one -- use sm.add_constant(X) there if a constant is wanted.
X = Train_85f751fd.iloc[:, 1:].drop(['click', 'site_id', 'site_domain'], axis=1)
X.head()

Unnamed: 0,hour,C1,banner_pos,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
14,14102100,1005,0,1356995040,2566838161,3652543630,253846008,2845778250,1844193174,2866043649,1,0,20984,320,50,2371,0,551,-1,46
19,14102100,1005,0,1356995040,1722151155,3652543630,3472090697,2845778250,4200640363,3031538839,1,0,21234,320,50,2434,3,163,100088,61
25,14102100,1010,1,1356995040,4291231696,2013391065,253846008,4213425475,1770589311,2683280808,4,0,21665,320,50,2493,3,35,-1,117
27,14102100,1005,0,1356995040,1422251333,591918202,253846008,2599974008,707525769,3975219919,1,0,21611,320,50,2480,3,297,100111,61
30,14102100,1005,0,1356995040,1750932556,591918202,2381127546,1788097940,2315340987,2176066856,1,3,15708,320,50,1722,0,35,-1,79


Turn Pandas Columns to Numpy Arrays

In [21]:
# Keep the feature names for later reporting, then replace both frames
# with plain NumPy arrays for the model fits.
colNames = X.columns.tolist()
X, y = np.array(X), np.array(y)
#theta = np.array(np.zeros(n).reshape(n,1))

Predictions of Site and App
--------

**Sci-Kit Learn Predicting Site**

In [22]:
# Build a default scikit-learn logistic regression and fit it in one step;
# y is flattened with ravel() so the labels are passed as a 1-D vector.
model = LogisticRegression().fit(X, y.ravel())

In [23]:
# check the accuracy on the training set
# LogisticRegression.score returns the mean accuracy (share of rows classified
# correctly), not an R-squared, so the label says so; y is flattened to match fit().
# NOTE(review): the recorded score equals 1 - y.mean() from the next cell, i.e.
# the score of always predicting "no click" -- confirm the model is informative.
print('The accuracy of the model (mean accuracy):',model.score(X, y.ravel()))

The accuracy of the model (R-Square): 0.8811735598261375


In [24]:
# What fraction of rows was clicked? `click` is 0/1, so the mean is the
# click-through rate as a proportion (multiply by 100 for a percentage).
y.mean()

0.11882644017386244

In [25]:
def modelExp(X):
    """Apply the logistic (sigmoid) function 1 / (1 + e^-X) element-wise to X."""
    denominator = 1 + np.exp(-X)
    return 1 / denominator
#Check=modelExp(X * model.coef_ + model.intercept_).ravel()

In [26]:
#def modelExp(X):
#return 1 / (1 + np.exp(-X))

#def SciKitLogRegress(X,y):
#    model = linear_model.LogisticRegression(max_iter=100,tol=0.0001,fit_intercept=False,n_jobs=1,C=100000)
#    model.fit(X, y.ravel())
#    Accuracy=model.score(X, y)
#    print('The accuracy of the model (R-Square):',Accuracy)
#    coeff_df = pd.DataFrame(data=list(zip(X.columns, model.coef_[0].T)),columns=['Coefficient','Estimate'])
#    X=np.array(X)
#    modelExpCoeff=modelExp(X * model.coef_ + model.intercept_)#.ravel()
#    Y_mean=y.mean()
#    print('Target average:',Y_mean)
#    return coeff_df#,modelExpCoeff

In [27]:
# Pair each feature name with its fitted scikit-learn coefficient
# (model.coef_[0] is the single row of coefficients for the binary outcome).
name_estimate_pairs = zip(colNames, model.coef_[0])
coeff_df = pd.DataFrame(list(name_estimate_pairs), columns=['Coefficient', 'Estimate'])
coeff_df

Unnamed: 0,Coefficient,Estimate
0,hour,-1.485494e-11
1,C1,-1.065513e-15
2,banner_pos,7.710397999999999e-19
3,site_category,-1.42929e-09
4,app_id,1.679375e-11
5,app_domain,-1.232816e-10
6,app_category,8.622304e-11
7,device_id,-2.394132e-11
8,device_ip,1.510368e-12
9,device_model,1.997202e-11


In [28]:
#time_start = time.clock()

In [29]:
#%%time
#Output,OutputExp=SciKitLogRegress(X,y)
#https://stackoverflow.com/questions/11886862/calculating-computational-time-and-memory-for-a-code-in-python
#resource.getrusage(resource.RUSAGE_SELF).ru_maxrss

In [30]:
#time_elapsed = (time.clock() - time_start)
#time_elapsed

In [31]:
#Output

In [32]:
#modelExpCoeff

In [34]:
# Fit the same outcome/features with statsmodels' Logit and print its regression report.
logit_mod = sm.Logit(endog=y, exog=X).fit()
print(logit_mod.summary())

Optimization terminated successfully.
         Current function value: 0.349447
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:             14596137
Model:                          Logit   Df Residuals:                 14596122
Method:                           MLE   Df Model:                           14
Date:                Fri, 19 Oct 2018   Pseudo R-squ.:                 0.04151
Time:                        10:21:07   Log-Likelihood:            -5.1006e+06
converged:                       True   LL-Null:                   -5.3215e+06
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1             0.0001   2.82e-06     49.689      0.000       0.000       0.000
x2            -0.2900      0.

In [35]:
# Exponentiate the fitted logit coefficients so they read as odds ratios
# (multiplicative change in the odds of a click per unit of each feature).
print(np.exp(logit_mod.params))

[1.00014023 0.7482992  1.23026432 0.99999876 1.         1.
 1.         1.         1.         1.         1.34095037 0.74045201
 0.99995541 0.99933629 1.0020005  1.00013635 0.92601491 0.99991145
 0.99999575 0.99751822]
