In [1]:
import os

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn import preprocessing, metrics
# Lift pandas' display truncation so frames are rendered without eliding
# columns or rows.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Location of the training sample. Set the CLICK_DATA_PATH environment variable
# to point at your own copy; the fallback is the original local download path,
# so existing runs keep working unchanged.
CLICK_DATA_PATH = os.environ.get(
    'CLICK_DATA_PATH', '/Users/aryamonani/Downloads/train_sample.csv'
)

# Parse click_time while reading so it can be used as timestamp data
# (the .dt accessors are needed for the time-based features below).
click_data = pd.read_csv(CLICK_DATA_PATH, parse_dates=['click_time'])

In [3]:
# Summary statistics for every column; include='all' also covers the
# non-numeric (timestamp) columns rather than only the numeric ones.
click_data.describe(include='all')

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
count,2300561.0,2300561.0,2300561.0,2300561.0,2300561.0,2300561,456846,2300561.0
unique,,,,,,251267,182057,
top,,,,,,2017-11-09 04:31:15,2017-11-08 14:16:29,
freq,,,,,,29,14,
first,,,,,,2017-11-06 15:13:23,,
last,,,,,,2017-11-09 16:00:00,,
mean,105605.2,15.50743,23.48307,22.97843,256.2431,,,0.1985803
std,83393.42,21.08502,250.1862,54.83919,129.1065,,,0.3989313
min,1.0,0.0,0.0,0.0,0.0,,,0.0
25%,43837.0,3.0,1.0,13.0,137.0,,,0.0


In [4]:
# Preview the first 10 rows of the raw click data.
click_data.head(10)

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,89489,3,1,13,379,2017-11-06 15:13:23,,0
1,204158,35,1,13,21,2017-11-06 15:41:07,2017-11-07 08:17:19,1
2,3437,6,1,13,459,2017-11-06 15:42:32,,0
3,167543,3,1,13,379,2017-11-06 15:56:17,,0
4,147509,3,1,13,379,2017-11-06 15:57:01,,0
5,71421,15,1,13,153,2017-11-06 16:00:00,,0
6,76953,14,1,13,379,2017-11-06 16:00:01,,0
7,187909,2,1,25,477,2017-11-06 16:00:01,,0
8,116779,1,1,8,150,2017-11-06 16:00:01,,0
9,47857,3,1,15,205,2017-11-06 16:00:01,,0


CREATING FEATURES FROM TIMESTAMP DATA SO IT CAN BE USED FOR PREDICTIONS

In [5]:
# Break the parsed click timestamp into day / hour / minute / second columns.
# assign() returns a new frame, so click_data itself is left untouched.
click_ts = click_data['click_time'].dt
clicks = click_data.assign(
    day=click_ts.day.astype('uint8'),
    hour=click_ts.hour.astype('uint8'),
    minute=click_ts.minute.astype('uint8'),
    second=click_ts.second.astype('uint8'),
)
clicks.head(3)

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,day,hour,minute,second
0,89489,3,1,13,379,2017-11-06 15:13:23,,0,6,15,13,23
1,204158,35,1,13,21,2017-11-06 15:41:07,2017-11-07 08:17:19,1,6,15,41,7
2,3437,6,1,13,459,2017-11-06 15:42:32,,0,6,15,42,32


USING A LABEL ENCODER TO HANDLE THE CATEGORICAL COLUMNS LISTED IN cat_features. EACH ENCODED COLUMN IS STORED UNDER THE ORIGINAL COLUMN NAME WITH A '_labels' SUFFIX SO WE CAN DIFFERENTIATE BETWEEN THE NEWLY MADE FEATURES AND THE EXISTING ONES.

In [6]:
# Integer-encode each categorical column into a new '<column>_labels' column.
cat_features = ['ip', 'app', 'device', 'os', 'channel']

# Keep one fitted encoder per column. Refitting a single shared LabelEncoder
# overwrites its classes_ on every iteration, so only the last column's mapping
# would survive and the other columns could not be inverse-transformed or
# applied to new data later.
label_encoders = {}
for feature in cat_features:
    label_encoder = preprocessing.LabelEncoder()
    clicks[feature + '_labels'] = label_encoder.fit_transform(clicks[feature])
    label_encoders[feature] = label_encoder
# After the loop, label_encoder still refers to the encoder fitted on the last
# column ('channel'), matching what the name held before this change.

In [7]:
# Preview the first 10 rows of clicks, including any derived columns added so far.
clicks.head(10)

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,day,hour,minute,second,ip_labels,app_labels,device_labels,os_labels,channel_labels
0,89489,3,1,13,379,2017-11-06 15:13:23,,0,6,15,13,23,27226,3,1,13,120
1,204158,35,1,13,21,2017-11-06 15:41:07,2017-11-07 08:17:19,1,6,15,41,7,110007,35,1,13,10
2,3437,6,1,13,459,2017-11-06 15:42:32,,0,6,15,42,32,1047,6,1,13,157
3,167543,3,1,13,379,2017-11-06 15:56:17,,0,6,15,56,17,76270,3,1,13,120
4,147509,3,1,13,379,2017-11-06 15:57:01,,0,6,15,57,1,57862,3,1,13,120
5,71421,15,1,13,153,2017-11-06 16:00:00,,0,6,16,0,0,21781,15,1,13,43
6,76953,14,1,13,379,2017-11-06 16:00:01,,0,6,16,0,1,23409,14,1,13,120
7,187909,2,1,25,477,2017-11-06 16:00:01,,0,6,16,0,1,94979,2,1,25,166
8,116779,1,1,8,150,2017-11-06 16:00:01,,0,6,16,0,1,35644,1,1,8,42
9,47857,3,1,15,205,2017-11-06 16:00:01,,0,6,16,0,1,14521,3,1,15,54


Here we'll create training, validation, and test splits. First, the clicks DataFrame is sorted in order of increasing click time. The first 80% of the rows form the training set, the next 10% the validation set, and the last 10% the test set.

In [9]:
# Columns fed to the model: the timestamp parts plus the label-encoded categoricals.
features_cols = ['day', 'hour', 'minute', 'second', 'ip_labels',
                 'app_labels', 'device_labels', 'os_labels', 'channel_labels']
valid_fraction = 0.1

# Order rows chronologically so the validation and test rows come after the
# training rows in time.
click_srt = clicks.sort_values('click_time')
valid_rows = int(len(click_srt) * valid_fraction)

# Use explicit positive cut points instead of negative slices: when
# valid_rows == 0 (very small inputs) `[:-0]` is empty and `[-0:]` is the whole
# frame, which would silently leave train empty and put every row in test.
test_start = len(click_srt) - valid_rows
valid_start = test_start - valid_rows
train = click_srt.iloc[:valid_start]
valid = click_srt.iloc[valid_start:test_start]
test = click_srt.iloc[test_start:]

# The three splits must partition the sorted frame exactly.
assert len(train) + len(valid) + len(test) == len(click_srt)

LightGBM 

In [15]:
# Wrap each split in a LightGBM Dataset, with is_attributed as the label.
dtrain = lgb.Dataset(train[features_cols], label=train['is_attributed'])
dvalid = lgb.Dataset(valid[features_cols], label=valid['is_attributed'])
# NOTE(review): dtest is not used by the training call below; it is kept
# because other cells may reference it.
dtest = lgb.Dataset(test[features_cols], label=test['is_attributed'])

# Binary classifier evaluated with AUC on the validation split.
param = {'num_leaves': 64, 'objective': 'binary', 'metric': 'auc'}
num_round = 150

# Stop once validation AUC has not improved for 10 rounds. The callback form is
# used because the `early_stopping_rounds` keyword of lgb.train() is deprecated
# and was removed in LightGBM 4.x, where passing it raises a TypeError.
bst = lgb.train(
    param,
    dtrain,
    num_round,
    valid_sets=[dvalid],
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)

[1]	valid_0's auc: 0.948979
Training until validation scores don't improve for 10 rounds
[2]	valid_0's auc: 0.949235
[3]	valid_0's auc: 0.950126
[4]	valid_0's auc: 0.950072
[5]	valid_0's auc: 0.950536
[6]	valid_0's auc: 0.950943
[7]	valid_0's auc: 0.951453
[8]	valid_0's auc: 0.951518
[9]	valid_0's auc: 0.952385
[10]	valid_0's auc: 0.952434
[11]	valid_0's auc: 0.952465
[12]	valid_0's auc: 0.952638
[13]	valid_0's auc: 0.95266
[14]	valid_0's auc: 0.952766
[15]	valid_0's auc: 0.953203
[16]	valid_0's auc: 0.953503
[17]	valid_0's auc: 0.953793
[18]	valid_0's auc: 0.953966
[19]	valid_0's auc: 0.954184
[20]	valid_0's auc: 0.9543
[21]	valid_0's auc: 0.954305
[22]	valid_0's auc: 0.954536
[23]	valid_0's auc: 0.954748
[24]	valid_0's auc: 0.955142
[25]	valid_0's auc: 0.955493
[26]	valid_0's auc: 0.955611
[27]	valid_0's auc: 0.955708
[28]	valid_0's auc: 0.955795
[29]	valid_0's auc: 0.956172
[30]	valid_0's auc: 0.95623
[31]	valid_0's auc: 0.956477
[32]	valid_0's auc: 0.956606
[33]	valid_0's auc: 0.95

In [19]:
# Score the held-out test split with the trained booster and report ROC AUC.
test_features = test[features_cols]
ypred = bst.predict(test_features)
score = metrics.roc_auc_score(y_true=test['is_attributed'], y_score=ypred)
print(f"test score: {score}")

test score: 0.972368083424102
