In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import sys

In [17]:
dirpath = os.getcwd()
dirpath

'C:\\Users\\agupta\\Downloads\\Analytics vidhya\\WNS_hackathon'

In [133]:
input_folder = 'inputs/'

print('Reading input data')
df_item = pd.read_csv(os.path.join(input_folder, 'item_data.csv'))
df_log = pd.read_csv(os.path.join(input_folder, 'view_log.csv'))
df_train = pd.read_csv(os.path.join(input_folder, 'train.csv'))
df_test = pd.read_csv(os.path.join(input_folder, 'test.csv'))


Reading input data


In [134]:
key_cols = ['user_id', 'impression_id']


In [135]:
print('Converting dates')
# conversion for dates
df_log['server_time'] = pd.to_datetime(df_log['server_time'])
df_train['impression_time'] = pd.to_datetime(df_train['impression_time'])
df_test['impression_time'] = pd.to_datetime(df_test['impression_time'])


Converting dates


In [136]:
# get weekday and hour
df_train['weekday'] = df_train['impression_time'].dt.weekday
df_train['hour'] = df_train['impression_time'].dt.hour
df_test['weekday'] = df_test['impression_time'].dt.weekday
df_test['hour'] = df_test['impression_time'].dt.hour


In [57]:
df_train.head()

Unnamed: 0,impression_id,impression_time,user_id,app_code,os_version,is_4G,is_click,weekday,hour
0,c4ca4238a0b923820dcc509a6f75849b,2018-11-15 00:00:00,87862,422,old,0,0,3,0
1,45c48cce2e2d7fbdea1afc51c7c6ad26,2018-11-15 00:01:00,63410,467,latest,1,1,3,0
2,70efdf2ec9b086079795c442636b55fb,2018-11-15 00:02:00,71748,259,intermediate,1,0,3,0
3,8e296a067a37563370ded05f5a3bf3ec,2018-11-15 00:02:00,69209,244,latest,1,0,3,0
4,182be0c5cdcd5072bb1864cdee4d3d6e,2018-11-15 00:02:00,62873,473,latest,0,0,3,0


In [58]:
print('Creating historic CTRs')
# create historic ctr
col_list = ['weekday', 'hour', 'app_code', 'is_4G', 'os_version', ['weekday', 'hour']]
for col in col_list:
    col_name = col
    if (isinstance(col,list)):
        col_name = '_'.join(col)
    df_temp = df_train.groupby(col)\
                      .apply(lambda x: pd.Series([sum(x['is_click'])/len(x)], index=['click_rate']))\
                      .reset_index().rename(columns={'click_rate':col_name+'_ctr'})
    df_train = pd.merge(left=df_train, right=df_temp, how='left', on=col)
    df_test = pd.merge(left=df_test, right=df_temp, how='left', on=col)


Creating historic CTRs


In [59]:
df_train.head()

Unnamed: 0,impression_id,impression_time,user_id,app_code,os_version,is_4G,is_click,weekday,hour,weekday_ctr,hour_ctr,app_code_ctr,is_4G_ctr,os_version_ctr,weekday_hour_ctr
0,c4ca4238a0b923820dcc509a6f75849b,2018-11-15 00:00:00,87862,422,old,0,0,3,0,0.045868,0.045731,0.146835,0.046258,0.04929,0.047569
1,45c48cce2e2d7fbdea1afc51c7c6ad26,2018-11-15 00:01:00,63410,467,latest,1,1,3,0,0.045868,0.045731,0.061611,0.044752,0.041651,0.047569
2,70efdf2ec9b086079795c442636b55fb,2018-11-15 00:02:00,71748,259,intermediate,1,0,3,0,0.045868,0.045731,0.049822,0.044752,0.051762,0.047569
3,8e296a067a37563370ded05f5a3bf3ec,2018-11-15 00:02:00,69209,244,latest,1,0,3,0,0.045868,0.045731,0.123369,0.044752,0.041651,0.047569
4,182be0c5cdcd5072bb1864cdee4d3d6e,2018-11-15 00:02:00,62873,473,latest,0,0,3,0,0.045868,0.045731,0.050202,0.046258,0.041651,0.047569


In [60]:
print('Appending train and test data')
# concatenate train and test files to create a unified file
# is_click will be null for test data
df = pd.concat([df_train, df_test], axis=0, sort=False)
del df_train, df_test

model_base_columns = key_cols + ['impression_time', 'is_click'] + \
                      ['_'.join(x)+'_ctr' if isinstance(x, list) else x+'_ctr' for x in col_list]
df_model = df.loc[:, model_base_columns]

Appending train and test data


In [61]:
print(df_model.shape)
df_model.head()

(328284, 10)


Unnamed: 0,user_id,impression_id,impression_time,is_click,weekday_ctr,hour_ctr,app_code_ctr,is_4G_ctr,os_version_ctr,weekday_hour_ctr
0,87862,c4ca4238a0b923820dcc509a6f75849b,2018-11-15 00:00:00,0.0,0.045868,0.045731,0.146835,0.046258,0.04929,0.047569
1,63410,45c48cce2e2d7fbdea1afc51c7c6ad26,2018-11-15 00:01:00,1.0,0.045868,0.045731,0.061611,0.044752,0.041651,0.047569
2,71748,70efdf2ec9b086079795c442636b55fb,2018-11-15 00:02:00,0.0,0.045868,0.045731,0.049822,0.044752,0.051762,0.047569
3,69209,8e296a067a37563370ded05f5a3bf3ec,2018-11-15 00:02:00,0.0,0.045868,0.045731,0.123369,0.044752,0.041651,0.047569
4,62873,182be0c5cdcd5072bb1864cdee4d3d6e,2018-11-15 00:02:00,0.0,0.045868,0.045731,0.050202,0.046258,0.041651,0.047569


In [62]:
# feature_days_list = [1, 3]
feature_days_list = [1, 3, 7, 14, 30]
min_lag_days, max_lag_days = 1, 7

df = pd.merge(left=df, right=df_log, how='left', on=['user_id'])
df = pd.merge(left=df, right=df_item, how='left', on=['item_id'])
df = df.loc[(df['impression_time']-df['server_time']).dt.days >= min_lag_days, :]

del df_log


In [64]:
print(df.shape)
df.head()

(12087938, 24)


Unnamed: 0,impression_id,impression_time,user_id,app_code,os_version,is_4G,is_click,weekday,hour,weekday_ctr,...,weekday_hour_ctr,server_time,device_type,session_id,item_id,item_price,category_1,category_2,category_3,product_type
0,c4ca4238a0b923820dcc509a6f75849b,2018-11-15 00:00:00,87862,422,old,0,0.0,3,0,0.045868,...,0.047569,2018-10-19 18:11:00,android,190710,43886,2350.0,11.0,35.0,20.0,5622.0
1,45c48cce2e2d7fbdea1afc51c7c6ad26,2018-11-15 00:01:00,63410,467,latest,1,1.0,3,0,0.045868,...,0.047569,2018-10-21 06:13:00,android,356854,43209,3421.0,4.0,74.0,292.0,577.0
2,45c48cce2e2d7fbdea1afc51c7c6ad26,2018-11-15 00:01:00,63410,467,latest,1,1.0,3,0,0.045868,...,0.047569,2018-10-21 06:13:00,android,356854,66370,14166.0,4.0,74.0,292.0,3399.0
3,45c48cce2e2d7fbdea1afc51c7c6ad26,2018-11-15 00:01:00,63410,467,latest,1,1.0,3,0,0.045868,...,0.047569,2018-10-21 17:01:00,android,345289,43209,3421.0,4.0,74.0,292.0,577.0
4,45c48cce2e2d7fbdea1afc51c7c6ad26,2018-11-15 00:01:00,63410,467,latest,1,1.0,3,0,0.045868,...,0.047569,2018-10-22 09:54:00,android,406658,43209,3421.0,4.0,74.0,292.0,577.0


In [194]:
lag_days = 0    
print('Workin on lag_days {:d}'.format(lag_days))
# filter relevant data
df = df.loc[(df['impression_time']-df['server_time']).dt.days >= lag_days, :]
df_model = df_model.loc[:, model_base_columns]

print('Creating visits in last n days')
# count of site visits in last n days
for i in feature_days_list:
    print(lag_days, i)
    df_temp = df.loc[(df['impression_time']-df['server_time']).dt.days <= lag_days+i, :]\
               .groupby(key_cols).count()['item_id']\
               .reset_index()\
               .rename(columns={'item_id':'cnt_visit_l{:d}d'.format(i)})
    df_model = pd.merge(left=df_model, right=df_temp, how='left', on=key_cols)


print('Creating visits in category in last n days')
# count of site visits in different categories in last n days
for col in ['category_1']:
    for cat in list(map(int, df_item[col].unique().tolist())):
        for i in feature_days_list:
            print(lag_days, col, cat, i)
            df_temp = df.loc[((df['impression_time']-df['server_time']).dt.days <= lag_days+i) &\
                             (df[col] == cat), :]\
                        .groupby(key_cols).count()['item_id']\
                        .reset_index()\
                        .rename(columns={'item_id':'cnt_visit_{:s}_{:d}_l{:d}d'.format(col, cat, i)})
            df_model = pd.merge(left=df_model, right=df_temp, how='left', on=key_cols)


print('Creating unique item visits in last n days')
for i in feature_days_list:
    print(lag_days, i)
    df_temp = df.loc[((df['impression_time']-df['server_time']).dt.days <= lag_days+i), :]\
                .drop_duplicates(key_cols+['item_id'])
    # count of unique items
    df_temp2 = df_temp.groupby(key_cols).count()['item_id']\
                .reset_index()\
                .rename(columns={'item_id':'cnt_uniq_item_l{:d}d'.format(i)})
    df_model = pd.merge(left=df_model, right=df_temp2, how='left', on=key_cols)

    # avg price of unique items
    df_temp2 = df_temp.groupby(key_cols).sum()['item_price']\
                .reset_index()\
                .rename(columns={'item_price':'avg_price_uniq_item_l{:d}d'.format(i)})
    df_model = pd.merge(left=df_model, right=df_temp2, how='left', on=key_cols)
    del df_temp2
    df_model['avg_price_uniq_item_l{:d}d'.format(i)] = df_model['avg_price_uniq_item_l{:d}d'.format(i)]\
                                                    /df_model['cnt_uniq_item_l{:d}d'.format(i)]


# count of unique categories visited
print('Creating count of unique categories')
for col in ['category_1', 'category_2', 'category_3', 'product_type']:
    for i in feature_days_list:
        print(lag_days, col, i)
        df_temp = df.loc[((df['impression_time']-df['server_time']).dt.days <= lag_days+i), :]\
                .drop_duplicates(key_cols+[col])\
                .groupby(key_cols).count()[col]\
                .reset_index()\
                .rename(columns={col:'cnt_uniq_{:s}_l{:d}d'.format(col, i)})
    df_model = pd.merge(left=df_model, right=df_temp, how='left', on=key_cols)


print('Creating cnt days since last visit')
df_temp = df.loc[:, :]\
            .sort_values(key_cols+['server_time'], ascending=[1,1,0])\
            .drop_duplicates(key_cols)
df_temp['cnt_days_since_last_visit'] = (df_temp['impression_time']-df_temp['server_time']).dt.total_seconds()/(3600*12)
df_model = pd.merge(left=df_model, right=df_temp[key_cols+['cnt_days_since_last_visit']], how='left', on=key_cols)


# Creating count of unique session_id
print('Creating count of unique session_id')
for i in feature_days_list:
    print(lag_days, i)
    df_temp = df.loc[((df['impression_time']-df['server_time']).dt.days <= lag_days+i), :]\
                .drop_duplicates(key_cols+['session_id'])
    # count of unique session_id 
    df_temp2 = df_temp.groupby(key_cols).count()['session_id']\
                .reset_index()\
                .rename(columns={'session_id':'cnt_session_id_l{:d}d'.format(i)})
    df_model = pd.merge(left=df_model, right=df_temp2, how='left', on=key_cols)

# Creating average time per session
print('Creating average time per session_id')
for i in feature_days_list:
    print(lag_days, i)
    df_temp = df.loc[((df['impression_time']-df['server_time']).dt.days <= lag_days+i), :]\
                .sort_values(key_cols + ['session_id', 'server_time'], ascending = [1,1,1,1])\
                .drop_duplicates(key_cols+['session_id'])
    df_temp2 = df.loc[((df['impression_time']-df['server_time']).dt.days <= lag_days+i), :]\
                .sort_values(key_cols + ['session_id', 'server_time'], ascending = [1,1,1,0])\
                .drop_duplicates(key_cols+['session_id'])
    df_temp = pd.merge(left = df_temp, right = df_temp2, on = key_cols+['session_id'], suffixes = ('_min', '_max'))
    df_temp['session_length'] = (df_temp['server_time_max'] - df_temp['server_time_min']).dt.total_seconds()/60
    df_temp_g = df_temp.groupby(key_cols)
    df_temp = df_temp_g.sum()['session_length'].reset_index()          
    df_temp2 = df_temp_g.count()['session_id'].reset_index()
    df_temp = pd.merge(left=df_temp, right=df_temp2, how='inner', on=key_cols)
    df_temp['avg_session_time'] = df_temp['session_length']/df_temp['session_id']
    df_model = pd.merge(left=df_model, right=df_temp[key_cols+['avg_session_time']], how='left', on=key_cols)



del df_temp
print('Saving file')
df_model.to_csv(os.path.join(input_folder, 'df_model_lag_days_session_{:d}.csv'.format(lag_days)), index=False)


Workin on lag_days 0
Creating visits in last n days
0 1
0 3
0 7
0 14
0 30
Creating visits in category in last n days
0 category_1 11 1
0 category_1 11 3
0 category_1 11 7
0 category_1 11 14
0 category_1 11 30
0 category_1 12 1
0 category_1 12 3
0 category_1 12 7
0 category_1 12 14
0 category_1 12 30
0 category_1 17 1
0 category_1 17 3
0 category_1 17 7
0 category_1 17 14
0 category_1 17 30
0 category_1 13 1
0 category_1 13 3
0 category_1 13 7
0 category_1 13 14
0 category_1 13 30
0 category_1 9 1
0 category_1 9 3
0 category_1 9 7
0 category_1 9 14
0 category_1 9 30
0 category_1 16 1
0 category_1 16 3
0 category_1 16 7
0 category_1 16 14
0 category_1 16 30
0 category_1 7 1
0 category_1 7 3
0 category_1 7 7
0 category_1 7 14
0 category_1 7 30
0 category_1 10 1
0 category_1 10 3
0 category_1 10 7
0 category_1 10 14
0 category_1 10 30
0 category_1 4 1
0 category_1 4 3
0 category_1 4 7
0 category_1 4 14
0 category_1 4 30
0 category_1 1 1
0 category_1 1 3
0 category_1 1 7
0 category_1 1 14
0

In [233]:
df_model_v2 = df_model.copy()

In [234]:
# Create no of ads show in last n days
print('Creating no of ads shown in last n days')
df_temp = df.drop_duplicates(key_cols)[['user_id', 'impression_time', 'impression_id']]
df_temp = pd.merge(left=df_temp, right=df_temp, how='inner', on='user_id', suffixes=('_x', '_y'))
for i in feature_days_list[::-1]:
    print(i)
    df_temp = df_temp.loc[(df_temp['impression_time_x'] - df_temp['impression_time_y']).dt.days <= lag_days+i, :]\
                     .rename(columns={'impression_id_x':'impression_id'})
    print(df_temp.shape)
    df_temp2 = df_temp.groupby(key_cols).count()['impression_time_x']\
                      .reset_index().rename(columns={'impression_time_x':'cnt_ads_l{:d}d'.format(i)})
    df_model_v2 = pd.merge(left=df_model_v2, right=df_temp2[key_cols+['cnt_ads_l{:d}d'.format(i)]].copy(), 
                           how='left', on=key_cols)


Creating no of ads shown in last n days
30
(4246637, 5)
14
(3759247, 5)
7
(3252047, 5)
3
(2903966, 5)
1
(2653217, 5)


In [237]:
print(df_model_v2.shape)
(df_model_v2['cnt_ads_l30d'] == df_model_v2['cnt_ads_l1d']).sum()

(328284, 126)


133315

In [238]:
df_model = df_model_v2.copy()
del df_model_v2
print(df_model.shape)
df_model.head()

(328284, 126)


Unnamed: 0,user_id,impression_id,impression_time,is_click,weekday_ctr,hour_ctr,app_code_ctr,is_4G_ctr,os_version_ctr,weekday_hour_ctr,...,cnt_session_id_l3d,cnt_session_id_l7d,cnt_session_id_l14d,cnt_session_id_l30d,avg_session_time,cnt_ads_l30d,cnt_ads_l14d,cnt_ads_l7d,cnt_ads_l3d,cnt_ads_l1d
0,87862,c4ca4238a0b923820dcc509a6f75849b,2018-11-15 00:00:00,0.0,0.045868,0.045731,0.146835,0.046258,0.04929,0.047569,...,,,,1.0,0.0,3.0,3.0,3.0,3.0,3.0
1,63410,45c48cce2e2d7fbdea1afc51c7c6ad26,2018-11-15 00:01:00,1.0,0.045868,0.045731,0.061611,0.044752,0.041651,0.047569,...,,,,5.0,0.6,61.0,61.0,61.0,61.0,61.0
2,71748,70efdf2ec9b086079795c442636b55fb,2018-11-15 00:02:00,0.0,0.045868,0.045731,0.049822,0.044752,0.051762,0.047569,...,,,,1.0,0.0,13.0,13.0,13.0,13.0,13.0
3,69209,8e296a067a37563370ded05f5a3bf3ec,2018-11-15 00:02:00,0.0,0.045868,0.045731,0.123369,0.044752,0.041651,0.047569,...,2.0,2.0,2.0,2.0,1.0,4.0,4.0,4.0,4.0,4.0
4,62873,182be0c5cdcd5072bb1864cdee4d3d6e,2018-11-15 00:02:00,0.0,0.045868,0.045731,0.050202,0.046258,0.041651,0.047569,...,1.0,3.0,3.0,5.0,1.4,16.0,16.0,16.0,16.0,16.0


In [196]:
# del df_model['avg_session_time_x']
# del df_model['avg_session_time_y']


In [188]:
user_eda = df_model.loc[df_model['user_id'] == 63410,:]

In [189]:
print(user_eda.shape)
user_eda.sort_values('impression_time')

(61, 121)


Unnamed: 0,user_id,impression_id,impression_time,is_click,weekday_ctr,hour_ctr,app_code_ctr,is_4G_ctr,os_version_ctr,weekday_hour_ctr,...,cnt_uniq_category_2_l30d,cnt_uniq_category_3_l30d,cnt_uniq_product_type_l30d,cnt_days_since_last_visit,cnt_session_id_l1d,cnt_session_id_l3d,cnt_session_id_l7d,cnt_session_id_l14d,cnt_session_id_l30d,avg_session_time
1,63410,45c48cce2e2d7fbdea1afc51c7c6ad26,2018-11-15 00:01:00,1.0,0.045868,0.045731,0.061611,0.044752,0.041651,0.047569,...,2.0,2.0,3.0,5987.6,,,,,5.0,0.6
19776,63410,f0e52b27a7a5d6a1a87373dffa53dbe5,2018-11-15 00:56:00,0.0,0.045868,0.045731,0.061611,0.044752,0.041651,0.047569,...,2.0,2.0,3.0,5998.6,,,,,5.0,0.6
4635,63410,6b18886bc278247582704943f5c66eb9,2018-11-16 16:31:00,0.0,0.044730,0.042994,0.061611,0.044752,0.041651,0.041467,...,2.0,2.0,3.0,6473.6,,,,,5.0,0.6
6088,63410,cbd7fb1bba4de3486f76147a75f3ec89,2018-11-16 18:14:00,0.0,0.044730,0.044512,0.061611,0.044752,0.041651,0.048454,...,2.0,2.0,3.0,6494.2,,,,,5.0,0.6
29329,63410,339e2f61ba171de04b12646bb30b329b,2018-11-17 00:02:00,0.0,0.048106,0.045731,0.061611,0.044752,0.041651,0.047955,...,2.0,2.0,3.0,6563.8,,,,,5.0,0.6
12204,63410,9fd93cfddc3568482b5eb057efe06f16,2018-11-17 01:54:00,0.0,0.048106,0.048101,0.061611,0.044752,0.041651,0.050062,...,2.0,2.0,3.0,6586.2,,,,,5.0,0.6
13964,63410,5c7368d5679563a902701bf8b46575fc,2018-11-18 07:48:00,0.0,0.047977,0.039409,0.057551,0.044752,0.041651,0.051220,...,2.0,2.0,3.0,6945.0,,,,,5.0,0.6
8851,63410,0b6cf75f58aca50278bdcfc9c46576f4,2018-11-18 08:09:00,0.0,0.047977,0.043200,0.061611,0.044752,0.041651,0.046178,...,2.0,2.0,3.0,6949.2,,,,,5.0,0.6
14779,63410,c86027cb65669a6bec290bd22c969f2d,2018-11-19 02:04:00,0.0,0.046698,0.046141,0.061611,0.044752,0.041651,0.039432,...,2.0,2.0,3.0,7164.2,,,,,5.0,0.6
41995,63410,0b9dcc39ddecd3359566b59d3c9a6b9c,2018-11-19 02:19:00,0.0,0.046698,0.046141,0.057551,0.044752,0.041651,0.039432,...,2.0,2.0,3.0,7167.2,,,,,5.0,0.6


In [183]:
df_log.loc[df_log['user_id'] == 63410,:].sort_values('server_time')

Unnamed: 0,server_time,device_type,session_id,user_id,item_id
284024,2018-10-21 06:13:00,android,356854,63410,43209
284028,2018-10-21 06:13:00,android,356854,63410,66370
309046,2018-10-21 17:01:00,android,345289,63410,43209
339224,2018-10-22 09:54:00,android,406658,63410,43209
449354,2018-10-24 11:14:00,android,15568,63410,71877
486317,2018-10-25 05:00:00,android,343558,63410,43209
486325,2018-10-25 05:03:00,android,343558,63410,43209
2115995,2018-11-24 05:29:00,android,983607,63410,97274
3028698,2018-12-10 19:48:00,android,135003,63410,109074
3028895,2018-12-10 19:51:00,android,135003,63410,58018


In [178]:
# (df_log.loc[df_log['user_id'] == 63410,:].max() - df_log.loc[df_log['user_id'] == 63410,:].min()).dt.total_seconds()

# ('2018-12-10 20:03:00' - '2018-10-21 06:13:00')

In [140]:
df_train.loc[df_train['user_id'] == 63410,:]

Unnamed: 0,impression_id,impression_time,user_id,app_code,os_version,is_4G,is_click,weekday,hour
1,45c48cce2e2d7fbdea1afc51c7c6ad26,2018-11-15 00:01:00,63410,467,latest,1,1,3,0
4635,6b18886bc278247582704943f5c66eb9,2018-11-16 16:31:00,63410,467,latest,1,0,4,16
6088,cbd7fb1bba4de3486f76147a75f3ec89,2018-11-16 18:14:00,63410,467,latest,1,0,4,18
8851,0b6cf75f58aca50278bdcfc9c46576f4,2018-11-18 08:09:00,63410,467,latest,1,0,6,8
12204,9fd93cfddc3568482b5eb057efe06f16,2018-11-17 01:54:00,63410,467,latest,1,0,5,1
13964,5c7368d5679563a902701bf8b46575fc,2018-11-18 07:48:00,63410,283,latest,1,0,6,7
14779,c86027cb65669a6bec290bd22c969f2d,2018-11-19 02:04:00,63410,467,latest,1,0,0,2
19776,f0e52b27a7a5d6a1a87373dffa53dbe5,2018-11-15 00:56:00,63410,467,latest,1,0,3,0
21732,9a9fdbd56eb7c442646ab01c6bb3a70f,2018-11-20 09:08:00,63410,467,latest,1,0,1,9
26203,5e1982541fb01c50b6509e8ad3b5221c,2018-11-21 00:25:00,63410,467,latest,1,0,2,0


In [193]:
df_test.loc[df_test['user_id'] == 63410,:].sort_values('impression_time')

Unnamed: 0,user_id,impression_id,impression_time,is_click,weekday_ctr,hour_ctr,app_code_ctr,is_4G_ctr,os_version_ctr,weekday_hour_ctr,...,cnt_uniq_category_2_l30d,cnt_uniq_category_3_l30d,cnt_uniq_product_type_l30d,cnt_days_since_last_visit,cnt_session_id_l1d,cnt_session_id_l3d,cnt_session_id_l7d,cnt_session_id_l14d,cnt_session_id_l30d,avg_session_time
256101,63410,6f059407b46bbb889765d131a22ed9b1,2018-12-12 06:51:00,,0.044517,0.039703,0.061611,0.044752,0.041651,0.040404,...,4.0,4.0,4.0,417.6,1.0,1.0,1.0,1.0,2.0,7.5
271038,63410,e2a48d513077cac0c7e253f4b0ed3732,2018-12-14 06:01:00,,0.04473,0.039703,0.057551,0.044752,0.041651,0.025271,...,4.0,4.0,4.0,983.6,0.0,1.0,1.0,1.0,2.0,7.5
249445,63410,4accedb339f288c2d6d0e371eadbd005,2018-12-14 06:12:00,,0.04473,0.039703,0.061611,0.044752,0.041651,0.025271,...,4.0,4.0,4.0,985.8,0.0,1.0,1.0,1.0,2.0,7.5
265538,63410,b40f4afc450f22e64f8887f67551c571,2018-12-15 00:06:00,,0.048106,0.045731,0.061611,0.044752,0.041651,0.047955,...,4.0,4.0,4.0,1200.6,0.0,0.0,1.0,1.0,2.0,7.5
281456,63410,94d45db5228872804c5a09395e4013c2,2018-12-15 00:13:00,,0.048106,0.045731,0.057551,0.044752,0.041651,0.047955,...,4.0,4.0,4.0,1202.0,0.0,0.0,1.0,1.0,2.0,7.5
287546,63410,9c0ce50a4e8dc1c685beeac5359c0c39,2018-12-16 00:04:00,,0.047977,0.045731,0.061611,0.044752,0.041651,0.042228,...,4.0,4.0,4.0,1488.2,0.0,0.0,1.0,1.0,2.0,7.5
296606,63410,97cbac6884e629707c374ef5db2784d9,2018-12-16 00:08:00,,0.047977,0.045731,0.061611,0.044752,0.041651,0.042228,...,4.0,4.0,4.0,1489.0,0.0,0.0,1.0,1.0,2.0,7.5
313463,63410,376b799547eb6cdaede52738096ba662,2018-12-17 06:24:00,,0.046698,0.039703,0.061611,0.044752,0.041651,0.041872,...,4.0,4.0,4.0,1852.2,0.0,0.0,1.0,1.0,2.0,7.5
302518,63410,a6ba50c21c5abdaeac32f131dfdc5075,2018-12-17 07:06:00,,0.046698,0.039409,0.061611,0.044752,0.041651,0.02819,...,4.0,4.0,4.0,1860.6,0.0,0.0,1.0,1.0,2.0,7.5


In [150]:
df_model.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
user_id,328284.0,46444.357724,26811.701121,0.000000,23158.000000,46591.500000,69699.000000,92586.000000
is_click,237609.0,0.045714,0.208864,0.000000,0.000000,0.000000,0.000000,1.000000
weekday_ctr,328284.0,0.045787,0.001795,0.043007,0.044517,0.045868,0.047977,0.048106
hour_ctr,328284.0,0.045643,0.003656,0.037903,0.044055,0.045731,0.048101,0.058692
app_code_ctr,328229.0,0.045295,0.033409,0.000000,0.014265,0.044444,0.068259,1.000000
is_4G_ctr,328284.0,0.045715,0.000723,0.044752,0.044752,0.046258,0.046258,0.046258
os_version_ctr,328284.0,0.045726,0.004513,0.041651,0.041651,0.041651,0.049290,0.051762
weekday_hour_ctr,328284.0,0.045740,0.006877,0.019608,0.041244,0.045517,0.049920,0.108974
cnt_visit_l1d,61940.0,4.242686,5.829274,1.000000,1.000000,2.000000,5.000000,150.000000
cnt_visit_l3d,128135.0,6.146002,8.825318,1.000000,1.000000,3.000000,7.000000,194.000000


In [270]:
df_train = df_model.loc[(df_model.impression_time < "2018-12-07")  & (df_model.is_click.notnull()),:]
df_valid = df_model.loc[(df_model.impression_time >= "2018-12-07")  & (df_model.is_click.notnull()),:]
df_test = df_model.loc[df_model.is_click.isnull(),:]

In [271]:
df_train.shape, df_valid.shape, df_test.shape

((197093, 126), (40516, 126), (90675, 126))

In [241]:
input_cols = [x for x in df_model.columns.tolist() if x not in ['user_id',
'impression_id',
'impression_time',
'is_click',
'cnt_visit_category_1_5_l1d',
'cnt_visit_category_1_5_l3d',
'cnt_visit_category_1_5_l7d',
'cnt_visit_category_1_5_l14d',
'cnt_visit_category_1_5_l30d',
'cnt_visit_category_1_2_l1d',
'cnt_visit_category_1_2_l3d',
'cnt_visit_category_1_2_l7d',
'cnt_visit_category_1_2_l14d',
'cnt_visit_category_1_2_l30d']]

In [242]:
input_cols

['weekday_ctr',
 'hour_ctr',
 'app_code_ctr',
 'is_4G_ctr',
 'os_version_ctr',
 'weekday_hour_ctr',
 'cnt_visit_l1d',
 'cnt_visit_l3d',
 'cnt_visit_l7d',
 'cnt_visit_l14d',
 'cnt_visit_l30d',
 'cnt_visit_category_1_11_l1d',
 'cnt_visit_category_1_11_l3d',
 'cnt_visit_category_1_11_l7d',
 'cnt_visit_category_1_11_l14d',
 'cnt_visit_category_1_11_l30d',
 'cnt_visit_category_1_12_l1d',
 'cnt_visit_category_1_12_l3d',
 'cnt_visit_category_1_12_l7d',
 'cnt_visit_category_1_12_l14d',
 'cnt_visit_category_1_12_l30d',
 'cnt_visit_category_1_17_l1d',
 'cnt_visit_category_1_17_l3d',
 'cnt_visit_category_1_17_l7d',
 'cnt_visit_category_1_17_l14d',
 'cnt_visit_category_1_17_l30d',
 'cnt_visit_category_1_13_l1d',
 'cnt_visit_category_1_13_l3d',
 'cnt_visit_category_1_13_l7d',
 'cnt_visit_category_1_13_l14d',
 'cnt_visit_category_1_13_l30d',
 'cnt_visit_category_1_9_l1d',
 'cnt_visit_category_1_9_l3d',
 'cnt_visit_category_1_9_l7d',
 'cnt_visit_category_1_9_l14d',
 'cnt_visit_category_1_9_l30d',
 'c

In [243]:
df_train.loc[:,input_cols] = df_train[input_cols].fillna(0)
df_valid.loc[:,input_cols] =  df_valid[input_cols].fillna(0)
df_test.loc[:,input_cols] = df_test[input_cols].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [244]:
df_train.head()

Unnamed: 0,user_id,impression_id,impression_time,is_click,weekday_ctr,hour_ctr,app_code_ctr,is_4G_ctr,os_version_ctr,weekday_hour_ctr,...,cnt_session_id_l3d,cnt_session_id_l7d,cnt_session_id_l14d,cnt_session_id_l30d,avg_session_time,cnt_ads_l30d,cnt_ads_l14d,cnt_ads_l7d,cnt_ads_l3d,cnt_ads_l1d
0,87862,c4ca4238a0b923820dcc509a6f75849b,2018-11-15 00:00:00,0.0,0.045868,0.045731,0.146835,0.046258,0.04929,0.047569,...,0.0,0.0,0.0,1.0,0.0,3.0,3.0,3.0,3.0,3.0
1,63410,45c48cce2e2d7fbdea1afc51c7c6ad26,2018-11-15 00:01:00,1.0,0.045868,0.045731,0.061611,0.044752,0.041651,0.047569,...,0.0,0.0,0.0,5.0,0.6,61.0,61.0,61.0,61.0,61.0
2,71748,70efdf2ec9b086079795c442636b55fb,2018-11-15 00:02:00,0.0,0.045868,0.045731,0.049822,0.044752,0.051762,0.047569,...,0.0,0.0,0.0,1.0,0.0,13.0,13.0,13.0,13.0,13.0
3,69209,8e296a067a37563370ded05f5a3bf3ec,2018-11-15 00:02:00,0.0,0.045868,0.045731,0.123369,0.044752,0.041651,0.047569,...,2.0,2.0,2.0,2.0,1.0,4.0,4.0,4.0,4.0,4.0
4,62873,182be0c5cdcd5072bb1864cdee4d3d6e,2018-11-15 00:02:00,0.0,0.045868,0.045731,0.050202,0.046258,0.041651,0.047569,...,1.0,3.0,3.0,5.0,1.4,16.0,16.0,16.0,16.0,16.0


In [274]:
x_train, y_train = df_train[input_cols].values, df_train['is_click'].values
x_valid, y_valid = df_valid[input_cols].values, df_valid['is_click'].values
x_test, y_test = df_test[input_cols].values, df_test['is_click'].values

In [258]:
import sklearn
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [275]:
model = XGBClassifier()
xgb_clf = model

In [276]:
xgb_clf.fit(x_train, y_train, early_stopping_rounds=10,
            eval_metric="logloss", eval_set=[(x_valid, y_valid)])

[0]	validation_0-logloss:0.614218
Will train until validation_0-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.549826
[2]	validation_0-logloss:0.496209
[3]	validation_0-logloss:0.451317
[4]	validation_0-logloss:0.413191
[5]	validation_0-logloss:0.380823
[6]	validation_0-logloss:0.353113
[7]	validation_0-logloss:0.329125
[8]	validation_0-logloss:0.308529
[9]	validation_0-logloss:0.290697
[10]	validation_0-logloss:0.275272
[11]	validation_0-logloss:0.261836
[12]	validation_0-logloss:0.250219
[13]	validation_0-logloss:0.240076
[14]	validation_0-logloss:0.231243
[15]	validation_0-logloss:0.223564
[16]	validation_0-logloss:0.216936
[17]	validation_0-logloss:0.211134
[18]	validation_0-logloss:0.206106
[19]	validation_0-logloss:0.201728
[20]	validation_0-logloss:0.19794
[21]	validation_0-logloss:0.194665
[22]	validation_0-logloss:0.191824
[23]	validation_0-logloss:0.189386
[24]	validation_0-logloss:0.187245
[25]	validation_0-logloss:0.185441
[26]	validation_0-logloss:0.18387

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [277]:
pd.DataFrame({'feature':input_cols, 'importance':xgb_clf.feature_importances_}).sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
0,app_code_ctr,0.270584
4,cnt_uniq_item_l14d,0.087444
2,cnt_uniq_item_l7d,0.085407
3,cnt_session_id_l7d,0.070569
1,cnt_visit_l7d,0.070058
5,cnt_visit_l3d,0.054157
15,cnt_ads_l3d,0.053166
9,cnt_session_id_l3d,0.050896
10,cnt_visit_category_1_1_l3d,0.038395
8,avg_price_uniq_item_l7d,0.035804


In [278]:
xgb_clf.fit(x_train, y_train, early_stopping_rounds=10,
            eval_metric="auc", eval_set=[(x_valid, y_valid)])

[0]	validation_0-auc:0.71461
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.710491
[2]	validation_0-auc:0.716407
[3]	validation_0-auc:0.71603
[4]	validation_0-auc:0.717078
[5]	validation_0-auc:0.716623
[6]	validation_0-auc:0.716804
[7]	validation_0-auc:0.717563
[8]	validation_0-auc:0.718286
[9]	validation_0-auc:0.718279
[10]	validation_0-auc:0.71816
[11]	validation_0-auc:0.718071
[12]	validation_0-auc:0.717779
[13]	validation_0-auc:0.717267
[14]	validation_0-auc:0.71648
[15]	validation_0-auc:0.716256
[16]	validation_0-auc:0.716016
[17]	validation_0-auc:0.71647
[18]	validation_0-auc:0.715592
Stopping. Best iteration:
[8]	validation_0-auc:0.718286



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [279]:
y_pred = xgb_clf.predict_proba(x_valid)[:,1]
y_pred_bin = xgb_clf.predict(x_valid)
predictions_bin = [round(value) for value in y_pred_bin]
predictions = y_pred

In [280]:
from sklearn import metrics

print(metrics.accuracy_score(y_valid, predictions_bin))
print(metrics.confusion_matrix(y_valid, predictions_bin))
print(metrics.roc_auc_score(y_valid, predictions))

0.95327771744496
[[38623     0]
 [ 1893     0]]
0.7182863923093432


In [250]:
y_test_pred = xgb_clf.predict_proba(x_test)[:,1]

In [251]:
df_submission = pd.DataFrame()
df_submission['impression_id'] = df_test.impression_id
df_submission['is_click'] = y_test_pred

In [252]:
df_submission.to_csv(dirpath + '/Submissions/submission_xgboost_last_ads_click.csv', index = False)

In [272]:
input_cols_imp = ['app_code_ctr',
'cnt_visit_l7d',
'cnt_uniq_item_l7d',
'cnt_session_id_l7d',
'cnt_uniq_item_l14d',
'cnt_visit_l3d',
'avg_price_uniq_item_l3d',
'cnt_visit_category_1_7_l7d',
'avg_price_uniq_item_l7d',
'cnt_session_id_l3d',
'cnt_visit_category_1_1_l3d',
'cnt_visit_category_1_11_l3d',
'cnt_visit_category_1_11_l30d',
'cnt_session_id_l14d',
'cnt_visit_category_1_1_l1d',
'cnt_ads_l3d',
'cnt_visit_category_1_15_l14d']

In [273]:
input_cols = input_cols_imp

# random forest

In [167]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_valid = sc.fit_transform(x_valid)
x_test = sc.transform(x_test)

In [190]:
from sklearn.ensemble import RandomForestClassifier

regressor = RandomForestClassifier(n_estimators=20, random_state=0)
regressor.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [191]:
y_pred = xgb_clf.predict_proba(x_valid)[:,1]
y_pred_bin = xgb_clf.predict(x_valid)
predictions_bin = [round(value) for value in y_pred_bin]
predictions = y_pred

In [175]:
# predictions_bin

In [192]:
from sklearn import metrics

print(metrics.accuracy_score(y_valid, predictions_bin))
print(metrics.confusion_matrix(y_valid, predictions_bin))
print(metrics.roc_auc_score(y_valid, predictions))

0.9523891795833744
[[38574    49]
 [ 1880    13]]
0.679055288392724


# Logistic model

In [95]:
from sklearn.linear_model import LogisticRegression
lr_classify = LogisticRegression(class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=500, n_jobs=1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [117]:
x_train

array([[0.04586778, 0.04573099, 0.14683544, ..., 0.        , 1.        ,
        0.        ],
       [0.04586778, 0.04573099, 0.06161137, ..., 0.        , 5.        ,
        0.6       ],
       [0.04586778, 0.04573099, 0.04982206, ..., 0.        , 1.        ,
        0.        ],
       ...,
       [0.04586778, 0.04700262, 0.02684564, ..., 0.        , 0.        ,
        0.        ],
       [0.04586778, 0.04700262, 0.0080766 , ..., 2.        , 4.        ,
        0.75      ],
       [0.04586778, 0.04700262, 0.0080766 , ..., 1.        , 1.        ,
        0.        ]])

In [118]:
lr_classify.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=500,
          multi_class='warn', n_jobs=1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [119]:
y_pred_log = lr_classify.predict_proba(x_valid)[:,1]
y_pred_bin_log = lr_classify.predict(x_valid)
predictions_bin_log = [round(value) for value in y_pred_bin]
predictions_log = y_pred_log

In [120]:
print(metrics.accuracy_score(y_valid, predictions_bin_log))
print(metrics.confusion_matrix(y_valid, predictions_bin_log))
print(metrics.roc_auc_score(y_valid, predictions_log))

0.9533270806594926
[[38623     0]
 [ 1891     2]]
0.7169568469578445


In [122]:
ensemble_predictions = (predictions+predictions_log)/2

In [123]:
#print(metrics.accuracy_score(y_valid, predictions_bin_log))
#print(metrics.confusion_matrix(y_valid, predictions_bin_log))
print(metrics.roc_auc_score(y_valid, ensemble_predictions))

0.7186205379021193


In [124]:
y_test_pred_log = lr_classify.predict_proba(x_test)[:,1]

In [125]:
y_test_pred_ensemble = (y_test_pred+y_test_pred_log)/2

In [126]:
df_submission = pd.DataFrame()
df_submission['impression_id'] = df_test.impression_id
df_submission['is_click'] = y_test_pred_ensemble

In [127]:
df_submission.to_csv(dirpath + '/Submissions/submission_ensemble_viewlog_session_ids.csv', index = False)

# Light GBM

In [265]:
import lightgbm as lgb

# display(train_data.head())
# train = train_data
# X_train = train.loc[:, ['user_id', 'is_4G', 'day',  'app_code', 'os_version_num']]# train.columns != ['is_click', 'impression_time']]
# y_target = train.is_click.values
# #create lightgbm dataset

msk = np.random.rand(len(df_train)) < 0.8
lgb_train = lgb.Dataset(df_train[input_cols], df_train['is_click'])
lgb_eval = lgb.Dataset(df_valid[input_cols], df_valid['is_click'], reference=lgb_train)

# specify your configurations as a dict
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': { 'auc'},
    'num_leaves': 31, # defauly leaves(31) amount for each tree
    'learning_rate': 0.08,
    'feature_fraction': 0.7, # will select 70% features before training each tree
    'bagging_fraction': 0.3, #feature_fraction, but this will random select part of data
    'bagging_freq': 5, #  perform bagging at every 5 iteration
    'verbose': 1
}

print('Start training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=4000,
                valid_sets=lgb_eval,
                early_stopping_rounds=1500)



Start training...
[1]	valid_0's auc: 0.559333
Training until validation scores don't improve for 1500 rounds.
[2]	valid_0's auc: 0.700025
[3]	valid_0's auc: 0.708161
[4]	valid_0's auc: 0.709234
[5]	valid_0's auc: 0.711
[6]	valid_0's auc: 0.712418
[7]	valid_0's auc: 0.713773
[8]	valid_0's auc: 0.714567
[9]	valid_0's auc: 0.715194
[10]	valid_0's auc: 0.715547
[11]	valid_0's auc: 0.716554
[12]	valid_0's auc: 0.717508
[13]	valid_0's auc: 0.717519
[14]	valid_0's auc: 0.717936
[15]	valid_0's auc: 0.718711
[16]	valid_0's auc: 0.719044
[17]	valid_0's auc: 0.719015
[18]	valid_0's auc: 0.718595
[19]	valid_0's auc: 0.718076
[20]	valid_0's auc: 0.718211
[21]	valid_0's auc: 0.717957
[22]	valid_0's auc: 0.717951
[23]	valid_0's auc: 0.717825
[24]	valid_0's auc: 0.718326
[25]	valid_0's auc: 0.717911
[26]	valid_0's auc: 0.717995
[27]	valid_0's auc: 0.718044
[28]	valid_0's auc: 0.718944
[29]	valid_0's auc: 0.718628
[30]	valid_0's auc: 0.718545
[31]	valid_0's auc: 0.718247
[32]	valid_0's auc: 0.718295
[3

[278]	valid_0's auc: 0.700552
[279]	valid_0's auc: 0.70026
[280]	valid_0's auc: 0.700413
[281]	valid_0's auc: 0.700511
[282]	valid_0's auc: 0.700418
[283]	valid_0's auc: 0.700419
[284]	valid_0's auc: 0.700416
[285]	valid_0's auc: 0.700485
[286]	valid_0's auc: 0.700355
[287]	valid_0's auc: 0.700553
[288]	valid_0's auc: 0.700742
[289]	valid_0's auc: 0.700618
[290]	valid_0's auc: 0.700948
[291]	valid_0's auc: 0.700713
[292]	valid_0's auc: 0.700531
[293]	valid_0's auc: 0.700337
[294]	valid_0's auc: 0.700301
[295]	valid_0's auc: 0.700093
[296]	valid_0's auc: 0.699745
[297]	valid_0's auc: 0.699792
[298]	valid_0's auc: 0.699975
[299]	valid_0's auc: 0.700263
[300]	valid_0's auc: 0.700135
[301]	valid_0's auc: 0.69989
[302]	valid_0's auc: 0.699828
[303]	valid_0's auc: 0.699665
[304]	valid_0's auc: 0.699945
[305]	valid_0's auc: 0.699869
[306]	valid_0's auc: 0.699768
[307]	valid_0's auc: 0.699496
[308]	valid_0's auc: 0.6996
[309]	valid_0's auc: 0.699466
[310]	valid_0's auc: 0.699532
[311]	valid_0'

[557]	valid_0's auc: 0.691853
[558]	valid_0's auc: 0.692084
[559]	valid_0's auc: 0.692454
[560]	valid_0's auc: 0.692352
[561]	valid_0's auc: 0.692363
[562]	valid_0's auc: 0.69202
[563]	valid_0's auc: 0.69193
[564]	valid_0's auc: 0.691667
[565]	valid_0's auc: 0.691706
[566]	valid_0's auc: 0.691733
[567]	valid_0's auc: 0.691534
[568]	valid_0's auc: 0.691718
[569]	valid_0's auc: 0.691584
[570]	valid_0's auc: 0.691375
[571]	valid_0's auc: 0.691566
[572]	valid_0's auc: 0.691173
[573]	valid_0's auc: 0.691163
[574]	valid_0's auc: 0.691146
[575]	valid_0's auc: 0.690984
[576]	valid_0's auc: 0.690913
[577]	valid_0's auc: 0.690843
[578]	valid_0's auc: 0.690831
[579]	valid_0's auc: 0.690815
[580]	valid_0's auc: 0.690728
[581]	valid_0's auc: 0.690688
[582]	valid_0's auc: 0.690269
[583]	valid_0's auc: 0.69013
[584]	valid_0's auc: 0.690234
[585]	valid_0's auc: 0.690092
[586]	valid_0's auc: 0.689893
[587]	valid_0's auc: 0.689769
[588]	valid_0's auc: 0.689875
[589]	valid_0's auc: 0.68995
[590]	valid_0'

[833]	valid_0's auc: 0.683433
[834]	valid_0's auc: 0.683231
[835]	valid_0's auc: 0.683173
[836]	valid_0's auc: 0.683214
[837]	valid_0's auc: 0.683373
[838]	valid_0's auc: 0.68335
[839]	valid_0's auc: 0.683588
[840]	valid_0's auc: 0.683522
[841]	valid_0's auc: 0.683471
[842]	valid_0's auc: 0.68361
[843]	valid_0's auc: 0.683609
[844]	valid_0's auc: 0.683691
[845]	valid_0's auc: 0.683945
[846]	valid_0's auc: 0.683718
[847]	valid_0's auc: 0.683563
[848]	valid_0's auc: 0.68362
[849]	valid_0's auc: 0.683777
[850]	valid_0's auc: 0.683871
[851]	valid_0's auc: 0.683673
[852]	valid_0's auc: 0.683632
[853]	valid_0's auc: 0.683771
[854]	valid_0's auc: 0.68364
[855]	valid_0's auc: 0.683443
[856]	valid_0's auc: 0.683455
[857]	valid_0's auc: 0.683574
[858]	valid_0's auc: 0.683672
[859]	valid_0's auc: 0.683599
[860]	valid_0's auc: 0.683481
[861]	valid_0's auc: 0.683434
[862]	valid_0's auc: 0.683434
[863]	valid_0's auc: 0.683126
[864]	valid_0's auc: 0.683254
[865]	valid_0's auc: 0.68318
[866]	valid_0's

[1105]	valid_0's auc: 0.677832
[1106]	valid_0's auc: 0.677865
[1107]	valid_0's auc: 0.677721
[1108]	valid_0's auc: 0.677758
[1109]	valid_0's auc: 0.677644
[1110]	valid_0's auc: 0.677423
[1111]	valid_0's auc: 0.677157
[1112]	valid_0's auc: 0.677431
[1113]	valid_0's auc: 0.677413
[1114]	valid_0's auc: 0.677405
[1115]	valid_0's auc: 0.67713
[1116]	valid_0's auc: 0.676942
[1117]	valid_0's auc: 0.676834
[1118]	valid_0's auc: 0.676758
[1119]	valid_0's auc: 0.676717
[1120]	valid_0's auc: 0.676432
[1121]	valid_0's auc: 0.676335
[1122]	valid_0's auc: 0.676397
[1123]	valid_0's auc: 0.676537
[1124]	valid_0's auc: 0.676632
[1125]	valid_0's auc: 0.676697
[1126]	valid_0's auc: 0.676744
[1127]	valid_0's auc: 0.676522
[1128]	valid_0's auc: 0.676469
[1129]	valid_0's auc: 0.67637
[1130]	valid_0's auc: 0.676305
[1131]	valid_0's auc: 0.676407
[1132]	valid_0's auc: 0.676318
[1133]	valid_0's auc: 0.676283
[1134]	valid_0's auc: 0.676328
[1135]	valid_0's auc: 0.67628
[1136]	valid_0's auc: 0.676054
[1137]	vali

[1371]	valid_0's auc: 0.67363
[1372]	valid_0's auc: 0.673671
[1373]	valid_0's auc: 0.673527
[1374]	valid_0's auc: 0.673398
[1375]	valid_0's auc: 0.673563
[1376]	valid_0's auc: 0.673408
[1377]	valid_0's auc: 0.67351
[1378]	valid_0's auc: 0.673764
[1379]	valid_0's auc: 0.674047
[1380]	valid_0's auc: 0.674066
[1381]	valid_0's auc: 0.67408
[1382]	valid_0's auc: 0.674097
[1383]	valid_0's auc: 0.6741
[1384]	valid_0's auc: 0.673943
[1385]	valid_0's auc: 0.673983
[1386]	valid_0's auc: 0.673796
[1387]	valid_0's auc: 0.673594
[1388]	valid_0's auc: 0.673416
[1389]	valid_0's auc: 0.673158
[1390]	valid_0's auc: 0.673036
[1391]	valid_0's auc: 0.673021
[1392]	valid_0's auc: 0.672972
[1393]	valid_0's auc: 0.672945
[1394]	valid_0's auc: 0.672803
[1395]	valid_0's auc: 0.672787
[1396]	valid_0's auc: 0.672842
[1397]	valid_0's auc: 0.672418
[1398]	valid_0's auc: 0.672327
[1399]	valid_0's auc: 0.672089
[1400]	valid_0's auc: 0.672024
[1401]	valid_0's auc: 0.671776
[1402]	valid_0's auc: 0.67153
[1403]	valid_0

In [264]:
!pip install lightgbm

Collecting lightgbm
  Downloading https://files.pythonhosted.org/packages/00/37/a392e669a83fef72b916009c438a924d2a9d70bc8aea62662b207105ed98/lightgbm-2.2.3-py2.py3-none-win_amd64.whl (515kB)
Installing collected packages: lightgbm
Successfully installed lightgbm-2.2.3
