Using lightgbm and simple date features, this script scores ~ 0.674 on Public LB.

### Load libraries and data

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
train = pd.read_csv("train.csv")

In [3]:
test = pd.read_csv("test.csv")

In [4]:
train['click'].value_counts()

0    11700596
1      437214
Name: click, dtype: int64

In [5]:
# check missing values per column
train.isnull().sum(axis=0)/train.shape[0]

ID             0.000000
datetime       0.000000
siteid         0.099896
offerid        0.000000
category       0.000000
merchant       0.000000
countrycode    0.000000
browserid      0.050118
devid          0.149969
click          0.000000
dtype: float64

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12137810 entries, 0 to 12137809
Data columns (total 10 columns):
ID             object
datetime       object
siteid         float64
offerid        int64
category       int64
merchant       int64
countrycode    object
browserid      object
devid          object
click          int64
dtypes: float64(1), int64(4), object(5)
memory usage: 926.0+ MB


### Clean Data and Create Features

In [5]:
train['siteid'].fillna(-999, inplace=True)
test['siteid'].fillna(-999, inplace=True)

train['browserid'].fillna("None", inplace=True)
test['browserid'].fillna("None", inplace=True)

train['devid'].fillna("None", inplace=True)
test['devid'].fillna("None", inplace=True)

In [6]:
# set datatime
train['datetime'] = pd.to_datetime(train['datetime'])
test['datetime'] = pd.to_datetime(test['datetime'])

In [7]:
# create datetime variable
train['tweekday'] = train['datetime'].dt.weekday
train['thour'] = train['datetime'].dt.hour
train['tminute'] = train['datetime'].dt.minute

test['tweekday'] = test['datetime'].dt.weekday
test['thour'] = test['datetime'].dt.hour
test['tminute'] = test['datetime'].dt.minute

In [8]:
cols = ['siteid','offerid','category','merchant']

for x in cols:
    train[x] = train[x].astype('object')
    test[x] = test[x].astype('object')

In [9]:
cat_cols = cols + ['countrycode','browserid','devid']

In [10]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12137810 entries, 0 to 12137809
Data columns (total 13 columns):
ID             object
datetime       datetime64[ns]
siteid         object
offerid        object
category       object
merchant       object
countrycode    object
browserid      object
devid          object
click          int64
tweekday       int64
thour          int64
tminute        int64
dtypes: datetime64[ns](1), int64(4), object(8)
memory usage: 1.2+ GB


In [17]:
train.head()

Unnamed: 0,ID,datetime,siteid,offerid,category,merchant,countrycode,browserid,devid,click,tweekday,thour,tminute
0,IDsrk7SoW,2017-01-14 09:42:09,4709700.0,887235,17714,20301556,e,Firefox,Desktop,0,5,9,42
1,IDmMSxHur,2017-01-18 17:50:53,5189470.0,178235,21407,9434818,b,Mozilla Firefox,Desktop,0,2,17,50
2,IDVLNN0Ut,2017-01-11 12:46:49,98480.0,518539,25085,2050923,a,Edge,Desktop,0,2,12,46
3,ID32T6wwQ,2017-01-17 10:18:43,8896400.0,390352,40339,72089744,c,Firefox,Mobile,0,1,10,18
4,IDqUShzMg,2017-01-14 16:02:33,5635120.0,472937,12052,39507200,d,Mozilla Firefox,Desktop,0,5,16,2


In [11]:
for col in cat_cols:
    lbl = LabelEncoder()
    lbl.fit(list(train[col].values) + list(test[col].values))
    train[col] = lbl.transform(list(train[col].values))
    test[col] = lbl.transform(list(test[col].values))

In [12]:
train.head()

Unnamed: 0,ID,datetime,siteid,offerid,category,merchant,countrycode,browserid,devid,click,tweekday,thour,tminute
0,IDsrk7SoW,2017-01-14 09:42:09,128865,784773,48,127,4,2,2,0,5,9,42
1,IDmMSxHur,2017-01-18 17:50:53,142053,157563,59,65,1,8,0,0,2,17,50
2,IDVLNN0Ut,2017-01-11 12:46:49,2618,458279,69,15,0,1,2,0,2,12,46
3,ID32T6wwQ,2017-01-17 10:18:43,243406,345067,117,507,2,2,1,0,1,10,18
4,IDqUShzMg,2017-01-14 16:02:33,154278,417948,36,276,3,8,0,0,5,16,2


### Model Training

In [13]:
cols_to_use = list(set(train.columns) - set(['ID','datetime','click']))

In [14]:
X_train, X_test, y_train, y_test = train_test_split(train[cols_to_use], train['click'], test_size = 0.5)

In [15]:
dtrain = lgb.Dataset(X_train, y_train)
dval = lgb.Dataset(X_test, y_test)

In [16]:
params = {
    
    'num_leaves' : 256,
    'learning_rate':0.03,
    'metric':'auc',
    'objective':'binary',
    'early_stopping_round': 40,
    'max_depth':10,
    'bagging_fraction':0.5,
    'feature_fraction':0.6,
    'bagging_seed':2017,
    'feature_fraction_seed':2017,
    'verbose' : 1
    
    
}

In [17]:
clf = lgb.train(params, dtrain,num_boost_round=800,valid_sets=dval,verbose_eval=20)

[20]	valid_0's auc: 0.969995
[40]	valid_0's auc: 0.969577
[60]	valid_0's auc: 0.970684
[80]	valid_0's auc: 0.970768
[100]	valid_0's auc: 0.971204
[120]	valid_0's auc: 0.971787
[140]	valid_0's auc: 0.972002
[160]	valid_0's auc: 0.972202
[180]	valid_0's auc: 0.972495
[200]	valid_0's auc: 0.972652
[220]	valid_0's auc: 0.972978
[240]	valid_0's auc: 0.973192
[260]	valid_0's auc: 0.973348
[280]	valid_0's auc: 0.973563
[300]	valid_0's auc: 0.973712
[320]	valid_0's auc: 0.973802
[340]	valid_0's auc: 0.973995
[360]	valid_0's auc: 0.974153
[380]	valid_0's auc: 0.974361
[400]	valid_0's auc: 0.974589
[420]	valid_0's auc: 0.974856
[440]	valid_0's auc: 0.975051
[460]	valid_0's auc: 0.975213
[480]	valid_0's auc: 0.975367
[500]	valid_0's auc: 0.975522
[520]	valid_0's auc: 0.97564
[540]	valid_0's auc: 0.975811
[560]	valid_0's auc: 0.975945
[580]	valid_0's auc: 0.976049
[600]	valid_0's auc: 0.976146
[620]	valid_0's auc: 0.976226
[640]	valid_0's auc: 0.976324
[660]	valid_0's auc: 0.976416
[680]	valid_0's

In [18]:
preds = clf.predict(test[cols_to_use])

In [19]:

sub = pd.DataFrame({'ID':test['ID'], 'click':preds})
sub.to_csv('lgb_pyst.csv', index=False)

In [6]:
train[train['click']==1].groupby('browserid').count()['click']

browserid
Chrome                12547
Edge                   8474
Firefox               16649
Google Chrome        145319
IE                    12444
Internet Explorer      8197
InternetExplorer     187169
Mozilla                8043
Mozilla Firefox        8292
Opera                  4158
Safari                 4208
Name: click, dtype: int64

In [7]:
train['browserid'].value_counts()

Edge                 3456150
Firefox              3347105
Mozilla              1120649
Mozilla Firefox      1008609
InternetExplorer      743821
Google Chrome         700571
IE                    346042
Chrome                345432
Internet Explorer     230818
Opera                 115329
Safari                114957
Name: browserid, dtype: int64

In [8]:
train[train['click']==1].groupby('devid').count()['click']

devid
Desktop    193024
Mobile     159396
Tablet      18761
Name: click, dtype: int64

In [9]:
train['devid'].value_counts()

Mobile     4035596
Tablet     3403479
Desktop    2878436
Name: devid, dtype: int64