In [1]:
import numpy as np
from numpy import logical_and as _and
import pandas as pd
import datetime as dt
import lightgbm as lgb

In [2]:
%%time
dates = [['event_datetime_m'], ['date_time'], ['create_datetime']]
X_train = pd.read_csv('data/champ_10_datas/impressions.csv', 
                          parse_dates=dates[0], dayfirst=True, infer_datetime_format=True)
X_test = pd.read_csv('data/champ_10_datas/test.csv', 
                   parse_dates=dates[0], dayfirst=True, infer_datetime_format=True)
show_data = pd.read_csv('data/champ_10_datas/show_data.csv').drop_duplicates('id_show')
show_rating = pd.read_csv('data/champ_10_datas/show_rating.csv', 
                          parse_dates=dates[1], dayfirst=True, infer_datetime_format=True)
client_data = pd.read_csv('data/champ_10_datas/client_data.csv', 
                          parse_dates=dates[2], dayfirst=True, 
                          infer_datetime_format=True)

CPU times: user 14.5 s, sys: 523 ms, total: 15 s
Wall time: 18.6 s


#### Отрицательные значения даты

In [3]:
%%time
client_data.drop('create_datetime', axis=1, inplace=True)
neg_idx = client_data['age'] < 0
client_data.loc[neg_idx, 'age'] = -client_data.loc[neg_idx, 'age']

show_data.drop(['child_genre_id', 'parent_genre_id'], axis=1, inplace=True)
show_data.drop('IdBuilding', axis=1, inplace=True)

CPU times: user 53.9 ms, sys: 60 µs, total: 53.9 ms
Wall time: 80.8 ms


#### Сгенерируем ctr

In [4]:
def ctr_generate(train, test, groupby):
    val_1 = train.loc[train['event_datetime_m'] < dt.datetime(2017, 2, 1)]
    val_2 = train.loc[_and(train['event_datetime_m'] >= dt.datetime(2017, 2, 1), 
                          train['event_datetime_m'] < dt.datetime(2017, 3, 1))]
    val_3 = train.loc[train['event_datetime_m'] >= dt.datetime(2017, 3, 1)]
    
    val_12 = val_1.append(val_2)
    val_23 = val_2.append(val_3)
    val_13 = val_1.append(val_3)
    
    ctr12 = val_12.groupby([groupby])[['is_clicked']].mean()
    ctr12.columns = ['ctr_' + groupby]
    ctr12[groupby] = ctr12.index
    val_3 = val_3.merge(ctr12, on=[groupby], how='left')
    
    ctr23 = val_23.groupby([groupby])[['is_clicked']].mean()
    ctr23.columns = ['ctr_' + groupby]
    ctr23[groupby] = ctr23.index
    val_1 = val_1.merge(ctr23, on=[groupby], how='left')
    
    ctr13 = val_13.groupby([groupby])[['is_clicked']].mean()
    ctr13.columns = ['ctr_' + groupby]
    ctr13[groupby] = ctr13.index
    val_2 = val_2.merge(ctr13, on=[groupby], how='left')
    
    ctr12.columns = ['ctr12_' + groupby, groupby]
    ctr23.columns = ['ctr23_' + groupby, groupby]
    ctr13.columns = ['ctr13_' + groupby, groupby]
    
    test = test.merge(ctr12, on=[groupby], how='left')
    test = test.merge(ctr23, on=[groupby], how='left')
    test = test.merge(ctr13, on=[groupby], how='left')
    
    del ctr12
    del ctr13
    del ctr23
    
    test['ctr12_' + groupby].fillna(0, inplace=True)
    test['ctr23_' + groupby].fillna(0, inplace=True)
    test['ctr13_' + groupby].fillna(0, inplace=True)
    
    test['ctr_' + groupby] = (test['ctr12_' + groupby] + 8 * test['ctr23_' + groupby] + 4 * test['ctr13_' + groupby]) / 13
    test.drop('ctr12_' + groupby, inplace=True, axis=1)
    test.drop('ctr23_' + groupby, inplace=True, axis=1)
    test.drop('ctr13_' + groupby, inplace=True, axis=1)
    
    train = val_1.append(val_2).append(val_3)
    del val_1
    del val_2
    del val_3
    
    return train, test

In [5]:
%%time
X_train, X_test = ctr_generate(X_train, X_test, 'id_show')
X_train, X_test = ctr_generate(X_train, X_test, 'id_user')

show_data['organizer_id'] = show_data['organizer_id'].fillna(0).astype('int64')
X_train = X_train.merge(show_data, on='id_show', how='left')
X_test = X_test.merge(show_data, on='id_show', how='left')
del show_data

X_train, X_test = ctr_generate(X_train, X_test, 'organizer_id')

show_rating = show_rating.groupby('id_show').mean().reset_index()
X_train = X_train.merge(show_rating, how='left', on='id_show')
X_test = X_test.merge(show_rating, how='left', on='id_show')
del show_rating

X_train = X_train.merge(client_data, how='left', on='id_user')
X_test = X_test.merge(client_data, how='left', on='id_user')
del client_data

CPU times: user 8.69 s, sys: 1.51 s, total: 10.2 s
Wall time: 10.2 s


In [6]:
%%time
X_train['ones'] = np.ones(len(X_train), dtype=np.int32)

X_test['ones'] = np.ones(len(X_test), dtype=np.int32)

show_dt_count = X_train.groupby(['id_show', 'event_datetime_m'])[['ones']].sum()
show_dt_count.columns = ['show_dt_count']
show_dt_count.reset_index(inplace=True)
X_train = X_train.merge(show_dt_count, on=['id_show', 'event_datetime_m'], how='left')

show_dt_count = X_test.groupby(['id_show', 'event_datetime_m'])[['ones']].sum()
show_dt_count.columns = ['show_dt_count']
show_dt_count.reset_index(inplace=True)
X_test = X_test.merge(show_dt_count, on=['id_show', 'event_datetime_m'], how='left')
del show_dt_count


user_dt_count = X_train.groupby(['id_user', 'event_datetime_m'])[['ones']].sum()
user_dt_count.columns = ['user_dt_count']
user_dt_count.reset_index(inplace=True)
X_train = X_train.merge(user_dt_count, on=['id_user', 'event_datetime_m'], how='left')

user_dt_count = X_test.groupby(['id_user', 'event_datetime_m'])[['ones']].sum()
user_dt_count.columns = ['user_dt_count']
user_dt_count.reset_index(inplace=True)
X_test = X_test.merge(user_dt_count, on=['id_user', 'event_datetime_m'], how='left')
del user_dt_count


show_user_dt_count = X_train.groupby(['id_user', 'id_show', 'event_datetime_m'])[['ones']].sum()
show_user_dt_count.columns = ['show_user_dt_count']
show_user_dt_count.reset_index(inplace=True)
X_train = X_train.merge(show_user_dt_count, on=['id_user', 'id_show', 'event_datetime_m'], how='left')

show_user_dt_count = X_test.groupby(['id_user', 'id_show', 'event_datetime_m'])[['ones']].sum()
show_user_dt_count.columns = ['show_user_dt_count']
show_user_dt_count.reset_index(inplace=True)
X_test = X_test.merge(show_user_dt_count, on=['id_user', 'id_show', 'event_datetime_m'], how='left')
del show_user_dt_count


X_train.drop('ones', inplace=True, axis=1)
X_test.drop('ones', inplace=True, axis=1)

CPU times: user 13 s, sys: 818 ms, total: 13.8 s
Wall time: 13.9 s


In [7]:
%%time

rank_i = 'rank' + str(1)
X_train.loc[X_train['rank'] == 1, rank_i] = 1
X_train.loc[X_train['rank'] != 1, rank_i] = 0
X_train[rank_i] = X_train[rank_i].astype('int8')
    
X_test.loc[X_test['rank'] == 1, rank_i] = 1
X_test.loc[X_test['rank'] != 1, rank_i] = 0
X_test[rank_i] = X_test[rank_i].astype('int8')
    
show_dt_count = X_train.groupby(['id_show', 'event_datetime_m'])[[rank_i]].sum()
show_dt_count.columns = [rank_i + '_show_dt_count']
show_dt_count.reset_index(inplace=True)
X_train = X_train.merge(show_dt_count, on=['id_show', 'event_datetime_m'], how='left')
    
show_dt_count = X_test.groupby(['id_show', 'event_datetime_m'])[[rank_i]].sum()
show_dt_count.columns = [rank_i + '_show_dt_count']
show_dt_count.reset_index(inplace=True)
X_test = X_test.merge(show_dt_count, on=['id_show', 'event_datetime_m'], how='left')
del show_dt_count
    
    
user_dt_count = X_train.groupby(['id_user', 'event_datetime_m'])[[rank_i]].sum()
user_dt_count.columns = [rank_i + '_user_dt_count']
user_dt_count.reset_index(inplace=True)
X_train = X_train.merge(user_dt_count, on=['id_user', 'event_datetime_m'], how='left')
    
user_dt_count = X_test.groupby(['id_user', 'event_datetime_m'])[[rank_i]].sum()
user_dt_count.columns = [rank_i + '_user_dt_count']
user_dt_count.reset_index(inplace=True)
X_test = X_test.merge(user_dt_count, on=['id_user', 'event_datetime_m'], how='left')
del user_dt_count


show_user_dt_count = X_train.groupby(['id_user', 'id_show', 'event_datetime_m'])[[rank_i]].sum()
show_user_dt_count.columns = [rank_i + '_show_user_dt_count']
show_user_dt_count.reset_index(inplace=True)
X_train = X_train.merge(show_user_dt_count, on=['id_user', 'id_show', 'event_datetime_m'], how='left')

show_user_dt_count = X_test.groupby(['id_user', 'id_show', 'event_datetime_m'])[[rank_i]].sum()
show_user_dt_count.columns = [rank_i + '_show_user_dt_count']
show_user_dt_count.reset_index(inplace=True)
X_test = X_test.merge(show_user_dt_count, on=['id_user', 'id_show', 'event_datetime_m'], how='left')
del show_user_dt_count
    
X_train.drop(rank_i, inplace=True, axis=1)
X_test.drop(rank_i, inplace=True, axis=1)

CPU times: user 12.2 s, sys: 1.4 s, total: 13.6 s
Wall time: 13.6 s


In [8]:
%%time
all_data = X_train.append(X_test)
all_data.index = np.arange(len(all_data))
del X_train
del X_test
all_data['ones'] = np.ones(len(all_data), dtype=np.int32)

tmp_sum = all_data.sort_values('event_datetime_m', 
                               ascending=True).groupby(['id_show', 'event_datetime_m'])[['ones']].cumsum()
tmp_sum.columns = ['sum_open_show_event_datetime']
all_data.loc[tmp_sum.index, 'sum_open_show_event_datetime'] = tmp_sum['sum_open_show_event_datetime'].astype('int32')

tmp_sum = all_data.sort_values('event_datetime_m', 
                               ascending=False).groupby(['id_show', 'event_datetime_m'])[['ones']].cumsum()
tmp_sum.columns = ['sum_open_show_event_datetime_2']
all_data.loc[tmp_sum.index, 'sum_open_show_event_datetime_2'] = tmp_sum['sum_open_show_event_datetime_2'].astype('int32')

tmp_sum = all_data.sort_values('event_datetime_m', 
                               ascending=True).groupby(['id_user', 'event_datetime_m'])[['ones']].cumsum()
tmp_sum.columns = ['sum_open_user_event_datetime']
all_data.loc[tmp_sum.index, 'sum_open_user_event_datetime'] = tmp_sum['sum_open_user_event_datetime'].astype('int32')

tmp_sum = all_data.sort_values('event_datetime_m', 
                               ascending=False).groupby(['id_user', 'event_datetime_m'])[['ones']].cumsum()
tmp_sum.columns = ['sum_open_user_event_datetime_2']
all_data.loc[tmp_sum.index, 'sum_open_user_event_datetime_2'] = tmp_sum['sum_open_user_event_datetime_2'].astype('int32')

tmp_sum = all_data.sort_values('event_datetime_m', 
                               ascending=True).groupby(['id_user', 'id_show', 'event_datetime_m'])[['ones']].cumsum()
tmp_sum.columns = ['sum_open_user_show_event']
all_data.loc[tmp_sum.index, 'sum_open_user_show_event'] = tmp_sum['sum_open_user_show_event'].astype('int32')

tmp_sum = all_data.sort_values('event_datetime_m', 
                               ascending=False).groupby(['id_user', 'id_show', 'event_datetime_m'])[['ones']].cumsum()
tmp_sum.columns = ['sum_open_user_show_event_2']
all_data.loc[tmp_sum.index, 'sum_open_user_show_event_2'] = tmp_sum['sum_open_user_show_event_2'].astype('int32')
all_data.drop('ones', inplace=True, axis=1)

CPU times: user 26.3 s, sys: 7.22 s, total: 33.5 s
Wall time: 33.6 s


In [9]:
%%time
rank_i = 'rank' + str(1)
all_data.loc[all_data['rank'] == 1, rank_i] = 1
all_data.loc[all_data['rank'] != 1, rank_i] = 0
all_data[rank_i] = all_data[rank_i].astype('int32')
    
user_dt = all_data.sort_values('event_datetime_m', 
                                   ascending=True).groupby(['id_user', 'event_datetime_m'])[[rank_i]].cumsum()
user_dt.columns = [rank_i + '_user_dt']
all_data.loc[user_dt.index, rank_i + '_user_dt'] = user_dt.astype('int32')
    
user_dt = all_data.sort_values('event_datetime_m', 
                                   ascending=False).groupby(['id_user', 'event_datetime_m'])[[rank_i]].cumsum()
user_dt.columns = [rank_i + '_user_dt_2']
all_data.loc[user_dt.index, rank_i + '_user_dt_2'] = user_dt.astype('int32')
del user_dt


show_dt = all_data.sort_values('event_datetime_m', 
                                   ascending=True).groupby(['id_show', 'event_datetime_m'])[[rank_i]].cumsum()
show_dt.columns = [rank_i + '_show_dt']
all_data.loc[show_dt.index, rank_i + '_show_dt'] = show_dt.astype('int32')
    
show_dt = all_data.sort_values('event_datetime_m', 
                                   ascending=False).groupby(['id_show', 'event_datetime_m'])[[rank_i]].cumsum()
show_dt.columns = [rank_i + '_show_dt_2']
all_data.loc[show_dt.index, rank_i + '_show_dt_2'] = show_dt.astype('int32')
del show_dt
    

user_show_dt = all_data.sort_values('event_datetime_m', 
                                   ascending=True).groupby(['id_user', 
                                                            'id_show', 
                                                            'event_datetime_m'])[[rank_i]].cumsum()
user_show_dt.columns = [rank_i + '_user_show_dt']
all_data.loc[user_show_dt.index, rank_i + '_user_show_dt'] = user_show_dt.astype('int32')
    
user_show_dt = all_data.sort_values('event_datetime_m', 
                                   ascending=False).groupby(['id_user', 
                                                             'id_show', 
                                                             'event_datetime_m'])[[rank_i]].cumsum()
user_show_dt.columns = [rank_i + '_user_show_dt_2']
all_data.loc[user_show_dt.index, rank_i + '_user_show_dt_2'] = user_show_dt.astype('int32')
del user_show_dt

CPU times: user 27.2 s, sys: 7.94 s, total: 35.2 s
Wall time: 35.3 s


In [10]:
%%time
test_mask = all_data['id'].notnull()
X_test = all_data.loc[test_mask]
X_train = all_data.loc[~test_mask]
X_train.drop('id', inplace=True, axis=1)
X_test.drop('is_clicked', inplace=True, axis=1)

CPU times: user 907 ms, sys: 614 ms, total: 1.52 s
Wall time: 18.8 s


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [11]:
%%time
y_train = X_train['is_clicked'].values.astype('int32')
X_train.drop('is_clicked', inplace=True, axis=1)

CPU times: user 237 ms, sys: 103 ms, total: 340 ms
Wall time: 358 ms


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [12]:
%%time
X_train['sex'] = X_train['sex'].apply(lambda x: x == 'female')
X_test['sex'] = X_test['sex'].apply(lambda x: x == 'female')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


CPU times: user 1.23 s, sys: 0 ns, total: 1.23 s
Wall time: 1.29 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [13]:
%%time
y_val = y_train[X_train['event_datetime_m'] > dt.datetime(2017, 3, 10)]
y_train = y_train[X_train['event_datetime_m'] <= dt.datetime(2017, 3, 10)]
X_val = X_train.loc[X_train['event_datetime_m'] > dt.datetime(2017, 3, 10)]
X_train = X_train.loc[X_train['event_datetime_m'] <= dt.datetime(2017, 3, 10)]

CPU times: user 365 ms, sys: 55.8 ms, total: 421 ms
Wall time: 633 ms


In [14]:
%%time
X_train['event_datetime_m'] = X_train['event_datetime_m'].apply(lambda x: x.day)
X_val['event_datetime_m'] = X_val['event_datetime_m'].apply(lambda x: x.day)
X_test['event_datetime_m'] = X_test['event_datetime_m'].apply(lambda x: x.day)

CPU times: user 19.5 s, sys: 239 ms, total: 19.7 s
Wall time: 19.9 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [15]:
%%time
X_train.index = np.arange(len(X_train))
X_val.index = np.arange(len(X_val))
X_test.index = np.arange(len(X_test))

id_test = X_test['id'].values.astype('int')
X_test.drop('id', axis=1, inplace=True)

CPU times: user 109 ms, sys: 8.23 ms, total: 118 ms
Wall time: 117 ms


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [16]:
X_train.columns

Index(['age', 'age_category', 'ctr_id_show', 'ctr_id_user', 'ctr_organizer_id',
       'duration', 'event_datetime_m', 'id_show', 'id_user', 'organizer_id',
       'rank', 'rank1_show_dt_count', 'rank1_show_user_dt_count',
       'rank1_user_dt_count', 'rating', 'rating_count', 'review_count', 'sex',
       'show_dt_count', 'show_maxprice', 'show_meanprice', 'show_minprice',
       'show_stdprice', 'show_user_dt_count', 'user_dt_count',
       'sum_open_show_event_datetime', 'sum_open_show_event_datetime_2',
       'sum_open_user_event_datetime', 'sum_open_user_event_datetime_2',
       'sum_open_user_show_event', 'sum_open_user_show_event_2', 'rank1',
       'rank1_user_dt', 'rank1_user_dt_2', 'rank1_show_dt', 'rank1_show_dt_2',
       'rank1_user_show_dt', 'rank1_user_show_dt_2'],
      dtype='object')

In [17]:
use_cols = ['age', 'age_category', 'ctr_id_show', 'ctr_id_user', 'ctr_organizer_id',
       'duration', 'rank', 'rank1_show_dt_count', 'rank1_show_user_dt_count',
       'rank1_user_dt_count', 'rating',
       'rating_count', 'review_count', 'sex', 'show_dt_count', 'show_maxprice',
       'show_meanprice', 'show_minprice', 'show_stdprice',
       'show_user_dt_count', 'user_dt_count',
       'sum_open_show_event_datetime',
       'sum_open_show_event_datetime_2', 'sum_open_user_event_datetime',
       'sum_open_user_event_datetime_2', 'sum_open_user_show_event',
       'sum_open_user_show_event_2', 'rank1_user_dt', 'rank1_user_dt_2', 'rank1_show_dt', 'rank1_show_dt_2',
       'rank1_user_show_dt', 'rank1_user_show_dt_2']

In [18]:
dtrain = lgb.Dataset(X_train[use_cols], label=y_train, silent=True)
dvalid = dtrain.create_valid(X_val[use_cols], label=y_val, silent=True)

In [19]:
%%time
params = {'max_depth':5, 
         'learning_rate':0.05, 
         'silent':1,
         'objective':'binary',
         'bagging_freq':1,
         'bagging_seed':517,
         'bagging_fraction':0.8,
         'reg_lambda':3.0, 
         'reg_alpha':1.0,
         'seed':0,
         'metric':'binary_logloss'
        }
n_estimators = 1000
model = lgb.train(params, dtrain, valid_sets=[dtrain, dvalid],
            early_stopping_rounds=25, num_boost_round=n_estimators, verbose_eval=25)

Training until validation scores don't improve for 25 rounds.
[25]	training's binary_logloss: 0.212629	valid_1's binary_logloss: 0.214642
[50]	training's binary_logloss: 0.11997	valid_1's binary_logloss: 0.122301
[75]	training's binary_logloss: 0.0983833	valid_1's binary_logloss: 0.100895
[100]	training's binary_logloss: 0.0931387	valid_1's binary_logloss: 0.0956932
[125]	training's binary_logloss: 0.0917728	valid_1's binary_logloss: 0.0943862
[150]	training's binary_logloss: 0.0911145	valid_1's binary_logloss: 0.0938335
[175]	training's binary_logloss: 0.0905999	valid_1's binary_logloss: 0.0934033
[200]	training's binary_logloss: 0.0902563	valid_1's binary_logloss: 0.0931211
[225]	training's binary_logloss: 0.0899696	valid_1's binary_logloss: 0.0928578
[250]	training's binary_logloss: 0.0896353	valid_1's binary_logloss: 0.0925961
[275]	training's binary_logloss: 0.0894202	valid_1's binary_logloss: 0.0924868
[300]	training's binary_logloss: 0.0891855	valid_1's binary_logloss: 0.0923366

In [20]:
X_train = pd.concat([X_train[use_cols], X_val[use_cols]])
del X_val
y_train = np.concatenate((y_train, y_val))
del y_val
dtrain = lgb.Dataset(X_train, label=y_train)

#### Итоговая модель

In [21]:
%%time
params = {'max_depth':5, 
         'learning_rate':0.05, 
         'silent':1,
         'objective':'binary',
         'bagging_freq':1,
         'bagging_seed':517,
         'bagging_fraction':0.8,
         'reg_lambda':3.0, 
         'reg_alpha':1.0,
         'seed':0,
         'metric':'binary_logloss'
        }
model = lgb.train(params, dtrain, num_boost_round=579)

CPU times: user 16min 53s, sys: 1min 36s, total: 18min 29s
Wall time: 6min 31s


In [22]:
%%time
y_pred = model.predict(X_test[use_cols])
ans = np.zeros(len(id_test))
ans[id_test] = y_pred
pd.DataFrame({'_ID_': np.arange(len(id_test)), '_VAL_': ans}).to_csv('submits/final_submit.csv', index=False)

CPU times: user 59.2 s, sys: 784 ms, total: 59.9 s
Wall time: 26.7 s
