In [3]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from utils import RepeatedStratifiedGroupKFold
import lightgbm as lgb

import pandas_profiling
from tqdm import tqdm_notebook
import numpy as np
%matplotlib inline

In [4]:
# Load the raw train and test tables from the local input/ directory.
train_csv_path = 'input/onetwotrip_challenge_train.csv'
test_csv_path = 'input/onetwotrip_challenge_test.csv'
df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

In [5]:
# Number of training rows; a later cell subtracts it again from the test
# order ids to recover their original values.
ltr = df_train.shape[0]

# Order both tables by user and then by field4.
sort_keys = ['userid', 'field4']
df_train = df_train.sort_values(by=sort_keys)
df_test = df_test.sort_values(by=sort_keys)

# Shift the test order ids by the number of training rows.
# NOTE: df_test is rebound above, so running this cell a second time without
# reloading the CSVs applies the shift twice.
df_test['orderid'] = df_test['orderid'] + ltr

In [6]:
# Stack train on top of test into a single frame with a fresh 0..n-1 index.
# The column sets of the two tables are not aligned (this is what triggered the
# pandas "sort by default" FutureWarning), so `sort` is passed explicitly:
# columns keep their order of appearance and the result no longer depends on
# which pandas default is in effect. Rows that come from a table lacking a
# column get NaN in that column.
data = pd.concat([df_train, df_test], axis=0, sort=False).reset_index(drop=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [7]:
# Replace the raw user ids with dense integer codes.
data['userid'] = LabelEncoder().fit_transform(data['userid'])

# --- order-id dynamics within each user -------------------------------------
# Difference / relative change of orderid versus the same user's previous row
# (NaN on each user's first row).
data['order_diff'] = data['orderid'] - data.groupby('userid')['orderid'].shift()
data['order_pct_change'] = data.groupby('userid')['orderid'].pct_change()
# The same difference, looked up from the user's *next* row.
data['order_diff_shift'] = data.groupby('userid')['order_diff'].shift(-1)
# Running sum of the differences within each user.
data['diff_order_cumsum'] = data.groupby('userid')['order_diff'].cumsum()

# --- how many rows each user has (train and test rows combined) -------------
data['num_orders'] = data.groupby('userid')['userid'].transform('count')
data['num_orders_bin'] = (data['num_orders'] > 1).astype(int)

# --- shares relative to field15 ----------------------------------------------
# NOTE(review): the feature names suggest field24 / field28 / field9 are
# adult / child / baby counts and field15 the total - confirm against the data
# description. A zero in field15 yields inf/NaN here.
data['adult_pct'] = data['field24'] / data['field15']
data['child_pct'] = data['field28'] / data['field15']
data['baby_pct'] = data['field9'] / data['field15']

# --- field4 / field0 derived features ----------------------------------------
# 1 where field4 equals 1, otherwise 0.
data['is_first'] = (data['field4'] == 1).astype(int)
# Running sum of field0 within each user.
data['first_order_diff'] = data.groupby('userid')['field0'].cumsum()
# Within each user, replace zeros in field0 by the preceding value.
# `transform` (rather than `apply`) is used because the callable returns one
# value per input row: transform always hands back a Series aligned to
# data.index, so the column assignment cannot be affected by group keys being
# added to the result index.
data['field0_adj'] = data.groupby('userid')['field0'].transform(
    lambda x: x.replace(0, method='ffill')
)

In [8]:
# Columns that the per-fold target encoding iterates over.
categorical_cols = {'field2', 'field3', 'field5', 'field7', 'field8', 'field9', 'field10', 'field18', 'field19',
                    'field20', 'field21', 'field24', 'field27', 'field28', 'field29', 'indicator_goal21',
                    'indicator_goal22', 'indicator_goal23', 'indicator_goal24', 'indicator_goal25'}
# Every column present so far except the targets and the order id.
useful_cols = list(set(data.columns) - {'goal1', 'goal21', 'goal22', 'goal23', 'goal24', 'goal25', 'orderid'})

# Rows per user; identical for every column, so computed once outside the loop.
user_sizes = data.groupby('userid').size()

for col in tqdm_notebook(useful_cols):
    # Statistics of the column grouped by its own values.
    by_value = data.groupby(col)[col]
    data['vc_' + col] = by_value.transform("count")   # frequency of the value
    data['mean_' + col] = by_value.transform("mean")
    data['std_' + col] = by_value.transform("std")

    # Values of the same column on the user's neighbouring rows:
    # prev*/next* look 1, 2 and 3 rows back / ahead within the user.
    by_user = data.groupby('userid')[col]
    for step, tag in ((1, ''), (2, '2'), (3, '3')):
        data['prev' + tag + '_' + col] = by_user.shift(step)
        data['next' + tag + '_' + col] = by_user.shift(-step)

    # Number of distinct values the user has in this column, and that number
    # divided by the user's row count. dropna=False counts NaN as one distinct
    # value, matching Series.unique(); the built-in replaces a Python lambda
    # that was executed once per user.
    n_unique = by_user.nunique(dropna=False)
    data['cnt_userid_' + col] = data['userid'].map(n_unique)
    data['ratio_userid_' + col] = data['userid'].map(n_unique / user_sizes)

HBox(children=(IntProgress(value=0, max=48), HTML(value='')))




In [9]:
def process_column(data, df_train, enc_col, col):
    """Add mean/std encodings of ``enc_col`` per category of ``col`` to ``data``.

    The statistics are estimated on ``df_train`` only and then looked up for
    every row of ``data`` through that row's value in ``col``. Two columns are
    written in place: ``mean_{enc_col}_{col}`` and ``std_{enc_col}_{col}``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that receives the new columns (modified in place).
    df_train : pandas.DataFrame
        Rows used to estimate the statistics; must contain ``col`` and
        ``enc_col``.
    enc_col : str
        Column whose values are averaged.
    col : str
        Column whose values define the groups.

    Notes
    -----
    Rows of ``data`` whose ``col`` value does not occur in ``df_train`` get
    NaN. Rows that belong to ``df_train`` contribute their own ``enc_col``
    value to their group's statistic.
    """
    # One row per category, with the two statistics as columns.
    stats = df_train.groupby(col)[enc_col].agg(['mean', 'std'])
    # Map by category value so rows outside df_train (validation / test rows)
    # are filled too; assigning a transform() result would align on
    # df_train's index and leave all other rows NaN.
    data[f'mean_{enc_col}_{col}'] = data[col].map(stats['mean'])
    data[f'std_{enc_col}_{col}'] = data[col].map(stats['std'])

def feature_engineering(data, df_train):
    """Run ``process_column`` for every (target, categorical column) pair.

    Each of the six goal columns is combined with every entry of the
    module-level ``categorical_cols``; one notebook progress bar is shown per
    goal column. ``data`` is handed to ``process_column`` together with
    ``df_train`` and is returned afterwards.
    """
    target_cols = ('goal1', 'goal21', 'goal22', 'goal23', 'goal24', 'goal25')
    for enc_col in target_cols:
        progress = tqdm_notebook(categorical_cols)
        for cat_col in progress:
            process_column(data, df_train, enc_col, cat_col)
    return data

In [10]:
# Rows with a goal1 value form the training set; rows where goal1 is missing
# are the rows to predict.
has_goal1 = data['goal1'].notna().values
train_idx = data.index[has_goal1]
test_idx = data.index[~has_goal1]

In [12]:
# LightGBM settings: binary objective evaluated with AUC, row and feature
# subsampling of 0.8, small learning rate combined with early stopping below.
param_lgb = {
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'boost': 'gbdt',
    'feature_fraction': 0.8,
    'learning_rate': 0.01,
    'metric':'auc',
    'num_leaves': 31,
    'num_threads': 8,
    'objective': 'binary',
}

excluded_cols = {'goal1', 'goal21', 'goal22', 'goal23', 'goal24', 'goal25', 'orderid'}
# NOTE(review): the feature list is frozen here, before feature_engineering()
# runs inside the loop. On a fresh kernel the columns that function adds are
# therefore not part of train_cols; on a re-run of this cell they are, because
# `data` keeps them. Decide which behaviour is intended.
train_cols = list(set(data.columns) - excluded_cols)


kf = RepeatedStratifiedGroupKFold(n_splits=10)


pred = pd.DataFrame()   # one column of test predictions per fold
score = []              # best validation AUC per fold

# Folds are stratified on goal1 and grouped by user.
labelled = data.loc[train_idx, :]
splits = kf.split(labelled,
                  labelled['goal1'].astype('int'),
                  groups=labelled['userid'])

for i, (train_index, test_index) in enumerate(splits):
    # The splitter is assumed to follow the scikit-learn convention of yielding
    # positions within the frame it was given - TODO confirm in utils. Translate
    # them into index labels of `data` so the selection stays correct even if
    # the labelled rows are not exactly rows 0..n-1 of `data`.
    fit_rows = train_idx[train_index]
    val_rows = train_idx[test_index]

    # Recompute the target encodings from this fold's fitting rows only.
    data = feature_engineering(data, data.loc[fit_rows])

    X_train, y_train = data.loc[fit_rows, train_cols], data.loc[fit_rows, 'goal1']
    X_test, y_test = data.loc[val_rows, train_cols], data.loc[val_rows, 'goal1']
    tr = lgb.Dataset(np.array(X_train), np.array(y_train))
    te = lgb.Dataset(np.array(X_test), np.array(y_test), reference=tr)

    # Early-stopping patience scales inversely with the learning rate.
    bst = lgb.train(param_lgb, tr, num_boost_round=10000,
            valid_sets=te, early_stopping_rounds=int(5 / param_lgb['learning_rate']), verbose_eval=100)
    score.append(bst.best_score['valid_0']['auc'])

    # Predict the unlabeled rows with the early-stopped number of trees, stated
    # explicitly instead of relying on the library default.
    pred[str(i)] = bst.predict(data.loc[test_idx, train_cols],
                               num_iteration=bst.best_iteration)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=0, max=113902), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

Training until validation scores don't improve for 500 rounds.
[100]	valid_0's auc: 0.687967
[200]	valid_0's auc: 0.69604
[300]	valid_0's auc: 0.69875
[400]	valid_0's auc: 0.700795
[500]	valid_0's auc: 0.701601
[600]	valid_0's auc: 0.701391
[700]	valid_0's auc: 0.702992
[800]	valid_0's auc: 0.703749
[900]	valid_0's auc: 0.704844
[1000]	valid_0's auc: 0.705937
[1100]	valid_0's auc: 0.706367
[1200]	valid_0's auc: 0.706235
[1300]	valid_0's auc: 0.706145
[1400]	valid_0's auc: 0.706482
[1500]	valid_0's auc: 0.705845
[1600]	valid_0's auc: 0.705829
Early stopping, best iteration is:
[1173]	valid_0's auc: 0.706598


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

Training until validation scores don't improve for 500 rounds.
[100]	valid_0's auc: 0.690233
[200]	valid_0's auc: 0.699967
[300]	valid_0's auc: 0.70048
[400]	valid_0's auc: 0.699095
[500]	valid_0's auc: 0.698423
[600]	valid_0's auc: 0.698497
[700]	valid_0's auc: 0.696989
Early stopping, best iteration is:
[272]	valid_0's auc: 0.70069


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

Training until validation scores don't improve for 500 rounds.
[100]	valid_0's auc: 0.703994
[200]	valid_0's auc: 0.710086
[300]	valid_0's auc: 0.712125
[400]	valid_0's auc: 0.713773
[500]	valid_0's auc: 0.714358
[600]	valid_0's auc: 0.715729
[700]	valid_0's auc: 0.716412
[800]	valid_0's auc: 0.717493
[900]	valid_0's auc: 0.717521
[1000]	valid_0's auc: 0.717662
[1100]	valid_0's auc: 0.717912
[1200]	valid_0's auc: 0.718223
[1300]	valid_0's auc: 0.71826
[1400]	valid_0's auc: 0.718447
[1500]	valid_0's auc: 0.718378
[1600]	valid_0's auc: 0.717774
[1700]	valid_0's auc: 0.71712
[1800]	valid_0's auc: 0.716901
Early stopping, best iteration is:
[1326]	valid_0's auc: 0.7186


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

Training until validation scores don't improve for 500 rounds.
[100]	valid_0's auc: 0.697409
[200]	valid_0's auc: 0.706525
[300]	valid_0's auc: 0.709027
[400]	valid_0's auc: 0.710903
[500]	valid_0's auc: 0.712162
[600]	valid_0's auc: 0.713171
[700]	valid_0's auc: 0.712746
[800]	valid_0's auc: 0.712924
[900]	valid_0's auc: 0.713275
[1000]	valid_0's auc: 0.713371
[1100]	valid_0's auc: 0.713139
[1200]	valid_0's auc: 0.712549
[1300]	valid_0's auc: 0.713023
[1400]	valid_0's auc: 0.713083
Early stopping, best iteration is:
[973]	valid_0's auc: 0.713674


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

Training until validation scores don't improve for 500 rounds.
[100]	valid_0's auc: 0.688148
[200]	valid_0's auc: 0.697315
[300]	valid_0's auc: 0.70196
[400]	valid_0's auc: 0.704476
[500]	valid_0's auc: 0.705137
[600]	valid_0's auc: 0.705469
[700]	valid_0's auc: 0.705486
[800]	valid_0's auc: 0.705855
[900]	valid_0's auc: 0.705812
[1000]	valid_0's auc: 0.705167
[1100]	valid_0's auc: 0.704863
[1200]	valid_0's auc: 0.703845
[1300]	valid_0's auc: 0.703367
Early stopping, best iteration is:
[834]	valid_0's auc: 0.706117


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

Training until validation scores don't improve for 500 rounds.
[100]	valid_0's auc: 0.675566
[200]	valid_0's auc: 0.687704
[300]	valid_0's auc: 0.693089
[400]	valid_0's auc: 0.694851
[500]	valid_0's auc: 0.696441
[600]	valid_0's auc: 0.696507
[700]	valid_0's auc: 0.695761
[800]	valid_0's auc: 0.694919
[900]	valid_0's auc: 0.694947
[1000]	valid_0's auc: 0.693958
Early stopping, best iteration is:
[583]	valid_0's auc: 0.696911


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

Training until validation scores don't improve for 500 rounds.
[100]	valid_0's auc: 0.717497
[200]	valid_0's auc: 0.72445
[300]	valid_0's auc: 0.727934
[400]	valid_0's auc: 0.729517
[500]	valid_0's auc: 0.729821
[600]	valid_0's auc: 0.729381
[700]	valid_0's auc: 0.728372
[800]	valid_0's auc: 0.727332
[900]	valid_0's auc: 0.725952
Early stopping, best iteration is:
[462]	valid_0's auc: 0.730604


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

Training until validation scores don't improve for 500 rounds.
[100]	valid_0's auc: 0.706145
[200]	valid_0's auc: 0.710628
[300]	valid_0's auc: 0.716204
[400]	valid_0's auc: 0.718566
[500]	valid_0's auc: 0.71952
[600]	valid_0's auc: 0.719223
[700]	valid_0's auc: 0.718889
[800]	valid_0's auc: 0.719814
[900]	valid_0's auc: 0.719683
[1000]	valid_0's auc: 0.718432
[1100]	valid_0's auc: 0.717155
[1200]	valid_0's auc: 0.71738
[1300]	valid_0's auc: 0.717111
Early stopping, best iteration is:
[849]	valid_0's auc: 0.72009


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

Training until validation scores don't improve for 500 rounds.
[100]	valid_0's auc: 0.680962
[200]	valid_0's auc: 0.686925
[300]	valid_0's auc: 0.691706
[400]	valid_0's auc: 0.695281
[500]	valid_0's auc: 0.697603
[600]	valid_0's auc: 0.697397
[700]	valid_0's auc: 0.696971
[800]	valid_0's auc: 0.696609
[900]	valid_0's auc: 0.696829
[1000]	valid_0's auc: 0.696594
[1100]	valid_0's auc: 0.695579
Early stopping, best iteration is:
[621]	valid_0's auc: 0.697618


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

Training until validation scores don't improve for 500 rounds.
[100]	valid_0's auc: 0.698496
[200]	valid_0's auc: 0.704722
[300]	valid_0's auc: 0.70897
[400]	valid_0's auc: 0.711173
[500]	valid_0's auc: 0.712323
[600]	valid_0's auc: 0.711667
[700]	valid_0's auc: 0.711298
[800]	valid_0's auc: 0.711315
[900]	valid_0's auc: 0.710469
[1000]	valid_0's auc: 0.708414
Early stopping, best iteration is:
[528]	valid_0's auc: 0.712588



In [13]:
# Append this run's per-fold validation AUCs as one new row of val_scores.csv.
# The row is labelled 0..len(score)-1, so it follows the number of folds that
# were actually run.
new_row = pd.DataFrame([score], columns=range(len(score)))
try:
    scores_df = pd.read_csv('val_scores.csv')
    # read_csv yields string column labels; convert so they line up with new_row.
    scores_df.columns = scores_df.columns.astype('int')
    scores_df = pd.concat([scores_df, new_row], ignore_index=True, sort=False)
except FileNotFoundError:
    # First run: no history yet, start the log with this row.
    scores_df = new_row
scores_df.to_csv('val_scores.csv', index=False)

In [17]:
# Label each prediction row with its order id, minus the ltr offset, and put
# the rows in ascending order of that id.
# NOTE: pred is rebound to the sorted frame, so running this cell a second time
# would attach the unsorted ids to the already sorted rows.
restored_order_ids = data.loc[test_idx, 'orderid'] - ltr
pred.index = restored_order_ids
pred = pred.sort_index()

In [19]:
ans = pd.DataFrame(index=pred.index)

# Submission 1: unweighted average of the per-fold predictions.
ans['proba'] = pred.mean(axis=1)
ans.to_csv('mean.csv')

# Submission 2: average weighted by each fold's validation AUC (weights sum
# to 1). `score` is a plain list, so it is converted to an array before
# dividing; dividing the list directly only works when its elements happen to
# be NumPy scalars.
cv_weights = np.asarray(score, dtype=float) / np.sum(score)
# One weight per prediction column, applied positionally.
ans['proba'] = pred.mul(cv_weights, axis=1).sum(axis=1)
ans.to_csv('cv_weights.csv')