In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from utils import RepeatedStratifiedGroupKFold
import lightgbm as lgb

from tqdm import tqdm_notebook
import numpy as np
%matplotlib inline

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Raw competition files; paths are relative to the notebook's working directory.
df_train, df_test = map(
    pd.read_csv,
    ('input/onetwotrip_challenge_train.csv', 'input/onetwotrip_challenge_test.csv'),
)

In [4]:
# Number of training rows; reused later to undo the orderid offset applied below.
ltr = df_train.shape[0]

# Order both frames by user, then by field4 within each user.
sort_keys = ['userid', 'field4']
df_train = df_train.sort_values(by=sort_keys)
df_test = df_test.sort_values(by=sort_keys)

# Shift test order ids by the train size (re-running this cell shifts them again).
df_test['orderid'] = df_test['orderid'] + ltr

In [None]:
# Stack train on top of test with a fresh 0..n-1 index, so train rows come first.
data = pd.concat([df_train, df_test], axis=0, ignore_index=True)

# Replace raw user ids by dense integer codes fitted on train and test together.
userid_encoder = LabelEncoder()
data['userid'] = userid_encoder.fit_transform(data['userid'])

In [None]:
# --- Per-user order-id dynamics -------------------------------------------
# NOTE(review): shift/cumsum follow the current row order of `data` (train rows
# sorted by userid/field4, followed by test rows sorted the same way) — confirm
# this is the intended within-user ordering for users present in both sets.
data['order_diff'] = data['orderid'] - data.groupby('userid')['orderid'].shift()
data['order_pct_change'] = data.groupby('userid')['orderid'].pct_change()
data['order_diff_shift'] = data.groupby('userid')['order_diff'].shift(-1)
data['diff_order_cumsum'] = data.groupby('userid')['order_diff'].cumsum()

# --- Per-user order counts ------------------------------------------------
data['num_orders'] = data.groupby('userid')['userid'].transform('count')
data['num_orders_bin'] = np.where(data['num_orders'] > 1, 1, 0)

# --- Ratios of field24 / field28 / field9 to field15 ----------------------
data['adult_pct'] = data['field24'] / data['field15']
data['child_pct'] = data['field28'] / data['field15']
data['baby_pct'] = data['field9'] / data['field15']

# --- field4 / field0 derived features -------------------------------------
data['is_first'] = np.where(data['field4'] == 1, 1, 0)
data['first_order_diff'] = data.groupby('userid')['field0'].cumsum()

# Replace zeros in field0 by the user's previous non-zero value; zeros with no
# earlier non-zero value stay 0.  This avoids `Series.replace(0, method='ffill')`
# inside `groupby.apply`: the `method` argument is deprecated in recent pandas,
# and `apply` may return a result indexed by (userid, row) that cannot be
# assigned back as a column.  The mask + groupby-ffill form keeps the original
# row index.  Results match the old expression whenever field0 has no missing
# values (a zero following a NaN now takes the last non-zero value, not NaN).
field0 = data['field0']
field0_is_zero = field0.eq(0)
prev_nonzero_field0 = field0.mask(field0_is_zero).groupby(data['userid']).ffill()
data['field0_adj'] = field0.where(~field0_is_zero, prev_nonzero_field0.fillna(0))

In [None]:
# Columns treated as categorical by the fold-wise target-encoding step.
categorical_cols = {'field2', 'field3', 'field5', 'field7', 'field8', 'field9', 'field10', 'field18', 'field19', 
                    'field20', 'field21', 'field24', 'field27', 'field28', 'field29', 'indicator_goal21', 
                    'indicator_goal22', 'indicator_goal23', 'indicator_goal24', 'indicator_goal25'}

# Every non-target column gets derived features.  The list follows the frame's
# own column order (a set difference would give a hash-dependent order and
# therefore a different feature layout on every kernel start).
non_feature_cols = {'goal1', 'goal21', 'goal22', 'goal23', 'goal24', 'goal25', 'orderid'}
useful_cols = [col for col in data.columns if col not in non_feature_cols]

# Loop-invariant per-user helpers.
user_ids = data['userid']
by_user = data.groupby('userid')
orders_per_user = user_ids.map(by_user.size())

# New columns are collected here and attached with a single concat instead of
# hundreds of individual inserts, which fragment the frame.
derived = {}
for col in tqdm_notebook(useful_cols):
    by_value = data.groupby(col)[col]
    derived['vc_' + col] = by_value.transform("count")
    # NOTE(review): a column's mean within groups of its own value is the value
    # itself, and its std is 0 (NaN for singleton groups); kept for backward
    # compatibility of the feature set.
    derived['mean_' + col] = by_value.transform("mean")
    derived['std_' + col] = by_value.transform("std")

    # Values of the same column in the user's previous / next 1..3 rows.
    user_values = by_user[col]
    for lag in (1, 2, 3):
        suffix = '' if lag == 1 else str(lag)
        derived[f'prev{suffix}_{col}'] = user_values.shift(lag)
        derived[f'next{suffix}_{col}'] = user_values.shift(-lag)

    # Distinct values per user (NaN counts as a value, as with `unique().size`),
    # both as a raw count and relative to the user's number of rows.
    n_unique = user_ids.map(user_values.nunique(dropna=False))
    derived['cnt_userid_' + col] = n_unique
    derived['ratio_userid_' + col] = n_unique / orders_per_user

data = pd.concat([data, pd.DataFrame(derived, index=data.index)], axis=1)


In [None]:
def process_column(data, df_train, enc_col, col):
    """Add target-encoding features of ``enc_col`` for each category of ``col``.

    The per-category mean and standard deviation of ``enc_col`` are estimated
    on ``df_train`` only and then mapped onto *every* row of ``data`` through
    that row's ``col`` value.  Rows outside ``df_train`` (validation fold, test
    set) therefore receive the statistics learned from the training rows
    instead of NaN.  Categories absent from ``df_train`` (and missing ``col``
    values) yield NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that receives the columns ``mean_{enc_col}_{col}`` and
        ``std_{enc_col}_{col}``; modified in place.
    df_train : pandas.DataFrame
        Rows used to estimate the statistics; must contain ``col`` and
        ``enc_col``.
    enc_col : str
        Name of the column being encoded (a target).
    col : str
        Name of the grouping (categorical) column.

    Returns
    -------
    None
    """
    stats = df_train.groupby(col)[enc_col].agg(['mean', 'std'])
    data[f'mean_{enc_col}_{col}'] = data[col].map(stats['mean'])
    data[f'std_{enc_col}_{col}'] = data[col].map(stats['std'])

def feature_engineering(data, df_train):
    """Run ``process_column`` for every (target, categorical column) pair.

    Targets are ``goal1`` and ``goal21``..``goal25``; the categorical columns
    come from the notebook-level ``categorical_cols``.  One progress bar is
    shown per target.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame passed to ``process_column`` as the destination of new columns.
    df_train : pandas.DataFrame
        Rows passed to ``process_column`` as the source of the statistics.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    targets = ('goal1', 'goal21', 'goal22', 'goal23', 'goal24', 'goal25')
    for target in targets:
        progress = tqdm_notebook(categorical_cols)
        for cat_col in progress:
            process_column(data, df_train, target, cat_col)
    return data

In [None]:
# Rows with a known goal1 are used for training; rows where it is missing are scored.
goal1_missing = data['goal1'].isna().values
train_idx = data.index[~goal1_missing]
test_idx = data.index[goal1_missing]

In [None]:
# LightGBM hyper-parameters shared by all per-goal binary classifiers.
param_lgb = {
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'boost': 'gbdt',
    'feature_fraction': 0.8,
    'learning_rate': 0.01,
    'metric':'auc',
    'num_leaves': 31,
    'num_threads': 8,
    'objective': 'binary',
}

# Targets and the order id are never used as model inputs.
excluded_cols = {'goal1', 'goal21', 'goal22', 'goal23', 'goal24', 'goal25', 'orderid'}
# Follow the frame's column order: a set difference gives a hash-dependent
# feature order, which changes LightGBM's feature sampling between kernels.
train_cols = [col for col in data.columns if col not in excluded_cols]
# NOTE(review): train_cols is fixed here, before feature_engineering() runs in
# the loop below, so on a fresh kernel the columns it adds are not model
# inputs — confirm whether that is intended.

# NOTE(review): no random seed is passed to the splitter; check whether
# RepeatedStratifiedGroupKFold accepts one to make the folds reproducible.
kf = RepeatedStratifiedGroupKFold(n_splits=10)
ans = pd.DataFrame(index=df_test['orderid'] - ltr)

# A list rather than a set, so the submission columns always come out in the
# same order.
goals = ['goal21', 'goal22', 'goal23', 'goal24', 'goal25']

# Stop when the validation AUC has not improved for 5 / learning_rate rounds.
early_stopping_rounds = int(5 / param_lgb['learning_rate'])

for goal in goals:
    pred = pd.DataFrame()
    score = []

    folds = kf.split(data.loc[train_idx, :],
                     data.loc[train_idx, goal].astype('int'),
                     groups=data.loc[train_idx, 'userid'])
    for i, (train_pos, valid_pos) in enumerate(folds):
        # Assumes kf.split yields positions within the labelled subset (the
        # scikit-learn convention); translate them to index labels of `data`
        # instead of relying on labelled rows occupying labels 0..n-1.
        fold_train_idx = train_idx[train_pos]
        fold_valid_idx = train_idx[valid_pos]

        # Recompute the fold-dependent encodings from the training fold only.
        data = feature_engineering(data, data.loc[fold_train_idx])

        X_train, y_train = data.loc[fold_train_idx, train_cols], data.loc[fold_train_idx, goal]
        X_test, y_test = data.loc[fold_valid_idx, train_cols], data.loc[fold_valid_idx, goal]
        tr = lgb.Dataset(np.array(X_train), np.array(y_train))
        te = lgb.Dataset(np.array(X_test), np.array(y_test), reference=tr)
        # NOTE(review): LightGBM >= 4.0 replaces the early_stopping_rounds /
        # verbose_eval arguments with callbacks; this call targets older releases.
        bst = lgb.train(param_lgb, tr, num_boost_round=10000,
                        valid_sets=te, early_stopping_rounds=early_stopping_rounds,
                        verbose_eval=100)
        score.append(bst.best_score['valid_0']['auc'])

        # One column of test-set predictions per fold model.
        pred[str(i)] = bst.predict(data.loc[test_idx, train_cols])

    # Report the cross-validated quality that was previously collected but never shown.
    print(f'{goal}: mean validation AUC = {np.mean(score):.5f} over {len(score)} folds')

    # Average the fold models and key the result by the original test order id.
    pred.index = data.loc[test_idx, 'orderid'] - ltr
    ans[goal] = pred.mean(axis=1)

In [None]:
# Write the averaged predictions, ordered by order id, as the submission file.
submission_path = 'task2_.csv'
ans = ans.sort_index(axis=0)
ans.to_csv(submission_path)