In [1]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas / LightGBM warnings that may point at
# real problems (e.g. chained assignment, deprecated arguments) — confirm this is intended.
warnings.filterwarnings("ignore")

### Imports

In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from datetime import datetime, timedelta
from tqdm import tqdm
tqdm.pandas(desc="datetime_bar")

import lightgbm as lgb
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, cross_val_predict, cross_val_score
from category_encoders import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

import seaborn as sns
plt.style.use("bmh")
plt.rcParams['font.family'] = 'DejaVu Sans'

In [3]:
import sys
# Make modules stored in the local `functions/` directory importable.
sys.path.append('functions/')
# NOTE(review): wildcard import — presumably this is where
# `calc_smooth_statistics_features` (used further down) comes from; confirm and
# consider importing the needed names explicitly.
from smooth_statistic import *

### Datasets

In [4]:
# Load the prepared train / test feature tables (train first, then test).
train, test = (
    pd.read_pickle(pkl_path)
    for pkl_path in ('./train_best.pkl', './test_best.pkl')
)

### Cols

In [5]:
# Identifier and label columns; excluded from the model feature set.
key_cols = ['app_id', 'target']

# Categorical columns handed to the smooth-statistics step; the raw columns
# themselves are also excluded from the model feature set.
cat_cols_for_smooth = [
    'currency_mode',
    'card_type_mode',
    'operation_type_mode',
    'operation_type_group_mode',
    'mcc_mode',
    'city_mode',
    'mcc_category_mode',
    'day_of_week_mode',
    'hour_mode',
    'last_day_mcc',
]

# Add LR scores

###### Mcc

In [6]:
# Build a TF-IDF + logistic-regression score from the `mcc` column and attach
# it to train/test as `score_mcc_lr`.
# (`mcc` is fed to TfidfVectorizer, so it is presumably a text sequence of MCC
# codes per application — confirm against the table builder.)

# .copy() gives an independent frame, so the score column added below is not
# written onto a selection of the pickled table (chained assignment).
train_lr = pd.read_pickle('./tables_for_lr/train_ft_mcc.pkl')
train_lr = train_lr[['app_id', 'mcc', 'flag']].copy()

test_lr = pd.read_pickle('./tables_for_lr/test_ft_mcc.pkl')
test_lr = test_lr[['app_id', 'mcc']].copy()

# The vocabulary is learned on train only and then applied to test.
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(train_lr['mcc'])
X_test_tfidf = tfidf.transform(test_lr['mcc'])

# Train rows receive out-of-fold probabilities, so the downstream model never
# sees a score produced by a model that was fitted on the same row.
model = LogisticRegression(C=5)
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
lr_cv = cross_val_predict(estimator=model, X=X_tfidf, y=train_lr['flag'].values,
                          cv=kf, n_jobs=15, method='predict_proba')[:, 1]

# Test rows are scored by a model refitted on the full training set.
model.fit(X_tfidf, train_lr['flag'].values)
lr_test = model.predict_proba(X_test_tfidf)[:, 1]

train_lr['score_mcc_lr'] = lr_cv
train_lr = train_lr[['app_id', 'score_mcc_lr']]
test_lr['score_mcc_lr'] = lr_test
test_lr = test_lr[['app_id', 'score_mcc_lr']]

# Left joins keep every row of train/test; rows without a match get NaN.
train = train.merge(train_lr, how='left', on=['app_id'])
test = test.merge(test_lr, how='left', on=['app_id'])

# Add fasttext scores

###### Mcc

In [7]:
# Attach the precomputed MCC scores stored under ./tables_ft.
# Inner joins: rows whose app_id is missing from the score file are dropped.
ft_cv = pd.read_csv('./tables_ft/ft_cv_mcc.csv')
train = train.merge(ft_cv, on='app_id', how='inner')

ft_test = pd.read_csv('./tables_ft/ft_test_mcc.csv')
test = test.merge(ft_test, on='app_id', how='inner')

###### Card_type

In [8]:
# Attach the precomputed card-type scores stored under ./tables_ft.
# Inner joins: rows whose app_id is missing from the score file are dropped.
ft_cv = pd.read_csv('./tables_ft/ft_cv_card_type.csv')
train = train.merge(ft_cv, on='app_id', how='inner')

ft_test = pd.read_csv('./tables_ft/ft_test_card_type.csv')
test = test.merge(ft_test, on='app_id', how='inner')

# Smooth statistics

In [9]:
# 10-fold stratified splitter with a fixed seed, passed to the smooth-statistics helper.
kf = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

# Adds features for the columns in `cat_cols_for_smooth` against the 'target' label.
# NOTE(review): the helper is defined outside this notebook; presumably it uses
# `kf` to compute the train-side statistics out-of-fold — confirm.
train, test = calc_smooth_statistics_features(
    train, test, cat_cols_for_smooth, 'target', kf=kf
)

100%|██████████| 10/10 [04:09<00:00, 24.98s/it]


# Categories

In [10]:
# Give test exactly the columns of train, in the same order
# (raises KeyError if test lacks any of them).
test = test.loc[:, train.columns]

In [11]:
from category_encoders import OneHotEncoder

# One-hot encode `product`. The encoder is fitted on train only and the same
# fitted encoder is then applied to both tables.
ohe = OneHotEncoder(cols=['product'], use_cat_names=True).fit(train)
train = ohe.transform(train)
test = ohe.transform(test)

# Add smooth

In [14]:
# Precomputed smooth-feature tables; their own `target` column is dropped so it
# does not collide with the one already present in train/test on merge.
train_smooth = pd.read_pickle('./train_smooth_.pkl').drop(columns='target')
test_smooth = pd.read_pickle('./test_smooth_.pkl').drop(columns='target')

In [16]:
# Inner joins on app_id: rows absent from the smooth tables are dropped.
train = train.merge(train_smooth, on='app_id', how='inner')
test = test.merge(test_smooth, on='app_id', how='inner')

_____

# Model

In [14]:
# Feature set: every train column except the id/label columns and the raw
# categorical columns listed in `cat_cols_for_smooth`.
train_cols = train.columns.difference(key_cols + cat_cols_for_smooth)

# Collects one prediction column per seed. .copy() makes it an independent
# frame, so adding columns later is not a chained assignment on a slice of `test`.
df_seed = test[['app_id']].copy()

# Built once and reused by every training run.
train_data = lgb.Dataset(train[train_cols], label=train['target'])

In [15]:
%%time
# Hyper-parameters shared by all runs; only the seed differs between runs.
base_params = {
    # task / runtime
    'metric': 'auc',
    'objective': 'binary',
    'learning_rate': 0.03,
    'boosting_type': 'gbdt',
    'n_jobs': 15,
    'verbose': -1,
    # tree shape and regularisation
    'num_leaves': 16,
    'min_child_samples': 120,
    'max_depth': 5,
    'min_split_gain': 0.0,
    'reg_alpha': 0.03,
    'reg_lambda': 0.03,
    # row / column subsampling
    'feature_fraction': 0.7,
    'bagging_freq': 2,
    'bagging_fraction': 0.85,
}

# Train 25 models on the full training set, one per seed, and store each
# model's test predictions in its own `model_<seed>` column of df_seed.
for i in tqdm(range(25)):
    params = dict(base_params, seed=i)
    lgbm = lgb.train(
        params,
        train_data,
        num_boost_round=2650,
        valid_sets=[train_data],
        verbose_eval=3000,
    )
    df_seed['model_' + str(i)] = lgbm.predict(test[train_cols])

100%|██████████| 25/25 [6:08:39<00:00, 884.77s/it]   

CPU times: user 2d 3h 8min 24s, sys: 11min 53s, total: 2d 3h 20min 17s
Wall time: 6h 8min 39s





###### Submission

In [19]:
# Average the per-seed predictions into a single `flag` score per app_id.
# .copy() makes predictions_table an independent frame, so adding `flag` is not
# a chained assignment on a slice of `test`.
predictions_table = test[['app_id']].copy()
# Row-wise mean over the 25 seed columns; the assignment aligns on the row
# index, which df_seed and predictions_table both take from `test`.
predictions_table['flag'] = df_seed[['model_' + str(i) for i in range(25)]].mean(axis=1)

# Only the app_id column of the sample submission is used.
submission = pd.read_csv('./tables/alfabattle2_alpha_sample.csv')
submission = submission[['app_id']]
# Inner join: sample rows without a prediction are dropped.
submission = submission.merge(predictions_table, how='inner', on=['app_id'])
submission = submission[['app_id', 'flag']]

_____

# Blend with RNN

In [None]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), evaluated without overflow.

    Parameters
    ----------
    x : scalar, numpy array or pandas Series
        Input value(s).

    Returns
    -------
    Same kind of object as produced by applying numpy ufuncs to `x`,
    with values in [0, 1].
    """
    # 1 / (1 + e^-x) == exp(-log(1 + e^-x)) == exp(-logaddexp(0, -x)).
    # logaddexp never forms e^-x directly, so large negative inputs do not
    # overflow to inf (and do not raise a RuntimeWarning) as np.exp(-x) would.
    return np.exp(-np.logaddexp(0, -x))

Use the predictions of the advanced baseline from epochs 3, 4 and 5

In [None]:
# Per-epoch RNN outputs; each file's `score` column is renamed to carry its epoch number.
ep3 = pd.read_csv('./tables/master/nn_submissions/1_epoch_3_val_0.795.pt.csv').rename(columns={'score': 'score3'})
ep4 = pd.read_csv('./tables/master/nn_submissions/1_epoch_4_val_0.790.pt.csv').rename(columns={'score': 'score4'})
ep5 = pd.read_csv('./tables/master/nn_submissions/1_epoch_5_val_0.792.pt.csv').rename(columns={'score': 'score5'})

# Join the three epochs on app_id (default inner join).
ep = ep3.merge(ep4, on='app_id').merge(ep5, on='app_id')

# Average the three scores first, then apply the sigmoid once.
# NOTE(review): this presumes the files hold raw (pre-sigmoid) scores — confirm.
raw_mean = (ep['score3'] + ep['score4'] + ep['score5']) / 3
ep['flag_nn'] = sigmoid(raw_mean)

In [None]:
# Bring the RNN score next to the LightGBM score (default inner join on app_id).
# Note: this cell overwrites `submission`, so re-running it without rebuilding
# `submission` first would merge `ep` a second time.
submission = submission.merge(ep, on='app_id')

# Fixed-weight blend: 0.6 for the LightGBM average, 0.4 for the RNN average.
gbm_weight, nn_weight = 0.6, 0.4
submission['flag'] = gbm_weight * submission['flag'] + nn_weight * submission['flag_nn']

# Write only the id and the blended score.
submission[['app_id', 'flag']].to_csv('./submission.csv', index=False)

_____

_____

_____

_____

_____

_____

_____

_____

_____

_____

_____

_____

_____

_____

______

______

______

______

______

______

______

______

______

______

______

______

______

______

______

______

______

______

______

______

______

______