# Solution

- We concatenated data for the same Ids.
- We added 30 Random Fourier Features to our data.
- Then we run LightGBM. We useed optuna for hyperparameter tuning.

In [44]:
import pandas as pd
import numpy as np

import optuna
from lightgbm import LGBMClassifier

from sklearn.model_selection import StratifiedKFold
from optuna.integration import LightGBMPruningCallback
from sklearn.metrics import roc_auc_score

from functools import reduce
from sklearn.kernel_approximation import RBFSampler

In [45]:
def data_pipe(data):
    weeks = []

    for week in data.Week.unique():
        weeks.append(data[data.Week == week]\
                     .add_prefix(f'w{week}_')\
                     .rename(columns={f'w{week}_Id': 'Id'}))

    merge = lambda l,r : l.merge(r, on='Id', how='left')
    return reduce(merge, weeks)

In [46]:
#df = data_pipe(pd.read_csv('train.csv')).drop(columns='Id')

sub = data_pipe(pd.read_csv('test.csv'))
sub_id = sub.Id
sub = sub.drop(columns='Id')

In [47]:
df = data_pipe(pd.read_csv('train.csv')).drop(columns='Id')
train_x = df.drop(columns=['w0_target', 'w1_target', 'w2_target', 'w3_target'])
train_y = df.w0_target

In [48]:
sampler = RBFSampler(n_components=30)
sampler.fit(train_x.fillna(0), train_y)
rff_train = pd.DataFrame(sampler.transform(train_x.fillna(0)))
sub_rff = pd.DataFrame(sampler.transform(sub.fillna(0)))

In [49]:
y = df.w0_target
X = df.drop(columns=['w0_target', 'w1_target', 'w2_target', 'w3_target'])
X = pd.DataFrame(np.hstack([X.values, rff_train]))
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [50]:
sub = pd.DataFrame(np.hstack([sub.values, sub_rff]))

In [56]:
def objective(trial, X, y):
    param_grid = {
        "n_estimators": trial.suggest_categorical("n_estimators", [5000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000, step=100),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 0.2, 0.9, step=0.1
        ),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 0.2, 0.9, step=0.1
        ),
    }
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    cv_scores = np.empty(5)

    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model = LGBMClassifier(objective="binary",
                               random_state=42,
                               silent=True,
                               n_jobs=8,
                               class_weight='balanced',
                               **param_grid
                              )

        model.fit(X_train, y_train,
                  eval_set=[(X_test, y_test)],
                  eval_metric="auc",
                  callbacks=[
                 LightGBMPruningCallback(trial, "auc")
            ],
                 )

        preds = model.predict_proba(X_test)
        cv_scores[idx] = roc_auc_score(y_test, preds[:, 1])

        return np.mean(cv_scores)

In [None]:
study = optuna.create_study(direction="maximize", study_name="lgbm")

study.optimize(lambda trial: objective(trial, train_x, train_y), n_trials=20)

[32m[I 2022-01-25 00:14:16,317][0m A new study created in memory with name: lgbm[0m


[1]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[2]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[3]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[5]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[6]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[7]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[8]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[9]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[10]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[11]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[12]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[13]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[14]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[15]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[16]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[17]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[18]	v

[32m[I 2022-01-25 00:14:21,761][0m Trial 0 finished with value: 0.1 and parameters: {'n_estimators': 5000, 'learning_rate': 0.0331985927702527, 'num_leaves': 2300, 'max_depth': 4, 'min_data_in_leaf': 5200, 'lambda_l1': 0, 'lambda_l2': 45, 'min_gain_to_split': 4.2701387811604, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001}. Best is trial 0 with value: 0.1.[0m



[4915]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4916]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4917]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4918]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4919]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4920]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4921]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4922]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4923]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4924]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4925]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4926]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4927]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4928]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4929]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4930]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4931]	valid_0's auc: 0

[32m[I 2022-01-25 00:14:31,309][0m Trial 1 finished with value: 0.19098423209232973 and parameters: {'n_estimators': 5000, 'learning_rate': 0.07146593845851541, 'num_leaves': 180, 'max_depth': 7, 'min_data_in_leaf': 1200, 'lambda_l1': 40, 'lambda_l2': 10, 'min_gain_to_split': 12.494193221833743, 'bagging_fraction': 0.6000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.7}. Best is trial 1 with value: 0.19098423209232973.[0m


[4943]	valid_0's auc: 0.954921	valid_0's binary_logloss: 0.297094
[4944]	valid_0's auc: 0.954921	valid_0's binary_logloss: 0.297094
[4945]	valid_0's auc: 0.954921	valid_0's binary_logloss: 0.297094
[4946]	valid_0's auc: 0.954921	valid_0's binary_logloss: 0.297094
[4947]	valid_0's auc: 0.954921	valid_0's binary_logloss: 0.297094
[4948]	valid_0's auc: 0.954921	valid_0's binary_logloss: 0.297094
[4949]	valid_0's auc: 0.954921	valid_0's binary_logloss: 0.297094
[4950]	valid_0's auc: 0.954921	valid_0's binary_logloss: 0.297094
[4951]	valid_0's auc: 0.954921	valid_0's binary_logloss: 0.297094
[4952]	valid_0's auc: 0.954921	valid_0's binary_logloss: 0.297094
[4953]	valid_0's auc: 0.954921	valid_0's binary_logloss: 0.297094
[4954]	valid_0's auc: 0.954921	valid_0's binary_logloss: 0.297094
[4955]	valid_0's auc: 0.954921	valid_0's binary_logloss: 0.297094
[4956]	valid_0's auc: 0.954921	valid_0's binary_logloss: 0.297094
[4957]	valid_0's auc: 0.954921	valid_0's binary_logloss: 0.297094
[4958]	val

[32m[I 2022-01-25 00:14:35,502][0m Trial 2 finished with value: 0.1 and parameters: {'n_estimators': 5000, 'learning_rate': 0.061078888701916814, 'num_leaves': 2840, 'max_depth': 5, 'min_data_in_leaf': 3300, 'lambda_l1': 25, 'lambda_l2': 25, 'min_gain_to_split': 5.462624273145374, 'bagging_fraction': 0.2, 'bagging_freq': 1, 'feature_fraction': 0.7}. Best is trial 1 with value: 0.19098423209232973.[0m


[4898]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4899]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4900]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4901]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4902]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4903]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4904]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4905]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4906]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4907]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4908]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4909]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4910]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4911]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4912]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4913]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4914]	valid_0's auc: 0.

[32m[I 2022-01-25 00:15:27,311][0m Trial 3 finished with value: 0.1 and parameters: {'n_estimators': 5000, 'learning_rate': 0.26479567239399887, 'num_leaves': 60, 'max_depth': 8, 'min_data_in_leaf': 4900, 'lambda_l1': 80, 'lambda_l2': 10, 'min_gain_to_split': 6.342232850835301, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.7}. Best is trial 1 with value: 0.19098423209232973.[0m


[1]	valid_0's auc: 0.825806	valid_0's binary_logloss: 0.61937
[2]	valid_0's auc: 0.866334	valid_0's binary_logloss: 0.570276
[3]	valid_0's auc: 0.884422	valid_0's binary_logloss: 0.537455
[4]	valid_0's auc: 0.891246	valid_0's binary_logloss: 0.508916
[5]	valid_0's auc: 0.90401	valid_0's binary_logloss: 0.481753
[6]	valid_0's auc: 0.911869	valid_0's binary_logloss: 0.462001
[7]	valid_0's auc: 0.912063	valid_0's binary_logloss: 0.448986
[8]	valid_0's auc: 0.917088	valid_0's binary_logloss: 0.42998
[9]	valid_0's auc: 0.921558	valid_0's binary_logloss: 0.416047
[10]	valid_0's auc: 0.92511	valid_0's binary_logloss: 0.40453
[11]	valid_0's auc: 0.926084	valid_0's binary_logloss: 0.396213
[12]	valid_0's auc: 0.931244	valid_0's binary_logloss: 0.385069
[13]	valid_0's auc: 0.934555	valid_0's binary_logloss: 0.372594
[14]	valid_0's auc: 0.93935	valid_0's binary_logloss: 0.361305
[15]	valid_0's auc: 0.941839	valid_0's binary_logloss: 0.354146
[16]	valid_0's auc: 0.943133	valid_0's binary_logloss: 

[32m[I 2022-01-25 00:16:18,863][0m Trial 4 finished with value: 0.19319561363342844 and parameters: {'n_estimators': 5000, 'learning_rate': 0.27022325317930074, 'num_leaves': 240, 'max_depth': 6, 'min_data_in_leaf': 900, 'lambda_l1': 10, 'lambda_l2': 95, 'min_gain_to_split': 7.989675760827905, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.2}. Best is trial 4 with value: 0.19319561363342844.[0m


[1]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[2]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[3]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[4]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[5]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[6]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[7]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[8]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[9]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[10]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[11]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[12]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[13]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[14]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[15]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[16]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[17]	valid_0's auc: 0.5	valid_0's binary_logloss: 0.693147
[18]	v

[32m[I 2022-01-25 00:17:26,718][0m Trial 5 finished with value: 0.1 and parameters: {'n_estimators': 5000, 'learning_rate': 0.0770260032531524, 'num_leaves': 2760, 'max_depth': 10, 'min_data_in_leaf': 9400, 'lambda_l1': 35, 'lambda_l2': 10, 'min_gain_to_split': 2.5928419597670587, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001}. Best is trial 4 with value: 0.19319561363342844.[0m


[1]	valid_0's auc: 0.797497	valid_0's binary_logloss: 0.679009
[2]	valid_0's auc: 0.856005	valid_0's binary_logloss: 0.665907
[3]	valid_0's auc: 0.87806	valid_0's binary_logloss: 0.654594
[4]	valid_0's auc: 0.887975	valid_0's binary_logloss: 0.641885
[5]	valid_0's auc: 0.889451	valid_0's binary_logloss: 0.63104
[6]	valid_0's auc: 0.8989	valid_0's binary_logloss: 0.622899
[7]	valid_0's auc: 0.897708	valid_0's binary_logloss: 0.614227
[8]	valid_0's auc: 0.903025	valid_0's binary_logloss: 0.605628
[9]	valid_0's auc: 0.906405	valid_0's binary_logloss: 0.596792
[10]	valid_0's auc: 0.90985	valid_0's binary_logloss: 0.587189
[11]	valid_0's auc: 0.914459	valid_0's binary_logloss: 0.578145
[12]	valid_0's auc: 0.916946	valid_0's binary_logloss: 0.571914
[13]	valid_0's auc: 0.915566	valid_0's binary_logloss: 0.565149
[14]	valid_0's auc: 0.915385	valid_0's binary_logloss: 0.556784
[15]	valid_0's auc: 0.915647	valid_0's binary_logloss: 0.549168
[16]	valid_0's auc: 0.915189	valid_0's binary_logloss:

[32m[I 2022-01-25 00:17:46,188][0m Trial 6 finished with value: 0.19212015544832278 and parameters: {'n_estimators': 5000, 'learning_rate': 0.044074506412716964, 'num_leaves': 980, 'max_depth': 5, 'min_data_in_leaf': 400, 'lambda_l1': 30, 'lambda_l2': 15, 'min_gain_to_split': 7.739678175178268, 'bagging_fraction': 0.4, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001}. Best is trial 4 with value: 0.19319561363342844.[0m



[4975]	valid_0's auc: 0.960495	valid_0's binary_logloss: 0.281165
[4976]	valid_0's auc: 0.960495	valid_0's binary_logloss: 0.281165
[4977]	valid_0's auc: 0.960495	valid_0's binary_logloss: 0.281165
[4978]	valid_0's auc: 0.960495	valid_0's binary_logloss: 0.281165
[4979]	valid_0's auc: 0.960495	valid_0's binary_logloss: 0.281165
[4980]	valid_0's auc: 0.960495	valid_0's binary_logloss: 0.281165
[4981]	valid_0's auc: 0.960495	valid_0's binary_logloss: 0.281165
[4982]	valid_0's auc: 0.960495	valid_0's binary_logloss: 0.281165
[4983]	valid_0's auc: 0.960495	valid_0's binary_logloss: 0.281165
[4984]	valid_0's auc: 0.960495	valid_0's binary_logloss: 0.281165
[4985]	valid_0's auc: 0.960495	valid_0's binary_logloss: 0.281165
[4986]	valid_0's auc: 0.960495	valid_0's binary_logloss: 0.281165
[4987]	valid_0's auc: 0.960495	valid_0's binary_logloss: 0.281165
[4988]	valid_0's auc: 0.960495	valid_0's binary_logloss: 0.281165
[4989]	valid_0's auc: 0.960495	valid_0's binary_logloss: 0.281165
[4990]	va

[32m[I 2022-01-25 00:19:01,774][0m Trial 7 finished with value: 0.1 and parameters: {'n_estimators': 5000, 'learning_rate': 0.2625372929573469, 'num_leaves': 300, 'max_depth': 5, 'min_data_in_leaf': 10000, 'lambda_l1': 50, 'lambda_l2': 0, 'min_gain_to_split': 3.9112676184945987, 'bagging_fraction': 0.30000000000000004, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001}. Best is trial 4 with value: 0.19319561363342844.[0m


In [32]:
clf = LGBMClassifier(objective="binary",
                               random_state=42,
                               silent=True,
                               n_jobs=8,
                               class_weight='balanced',
                               **study.best_params
                              )

clf.fit(X, y)



LGBMClassifier(bagging_fraction=0.6000000000000001, bagging_freq=1,
               class_weight='balanced', feature_fraction=0.8, lambda_l1=55,
               lambda_l2=40, learning_rate=0.2680163741968798, max_depth=9,
               min_data_in_leaf=800, min_gain_to_split=0.9026135572126254,
               n_estimators=5000, n_jobs=8, num_leaves=420, objective='binary',
               random_state=42)

In [None]:
pred = clf.predict(sub)

In [36]:
sub['Predicted'] = pred
sub['Id'] = sub_id

In [40]:
sub[['Id', 'Predicted']].to_csv('rff.csv', index=False)

In [41]:
sub['Predicted'].value_counts()

0.0    2494
1.0     813
Name: Predicted, dtype: int64