# Addressing outliers in the Kaggle ELO challenge

This kernel is inspired by https://www.kaggle.com/waitingli/combining-your-model-with-a-model-without-outlier

In [1]:
import numpy as np
import pandas as pd
import time
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


## Training the model without outliers

In [3]:
%%time
train_df = pd.read_csv('../train_clean.csv')
test_df = pd.read_csv('../test_clean.csv')

CPU times: user 6.27 s, sys: 370 ms, total: 6.64 s
Wall time: 6.64 s


In [5]:
# Preview the first rows of the training table.
# NOTE(review): the output shows an 'Unnamed: 0' column whose values match the
# row position, which looks like a row index saved into the CSV - confirm
# whether it should be treated as a real column.
train_df.head()

Unnamed: 0,first_active_month,card_id,feature_3,target,month,year,elapsed_time,feature_1_1,feature_1_2,feature_1_3,...,new_month_lag_min,new_month_lag_max,new_category_2_3.0_mean,new_category_2_2.0_mean,new_purchase_date_ptp,new_purchase_date_min,new_purchase_date_max,new_category_3_B_mean,new_city_id_nunique,outliers
0,2017-06-01,C_ID_92a2005557,1,-0.820283,6,2017,245,0,0,0,...,1.0,2.0,0.0,0.0,4742309.0,1520259000.0,1525001000.0,0.0,3.0,0
1,2017-01-01,C_ID_3d0044924f,0,0.392913,1,2017,396,0,0,0,...,1.0,2.0,0.0,0.0,4887632.0,1517505000.0,1522393000.0,1.0,1.0,0
2,2016-08-01,C_ID_d639edf6cd,0,0.688056,8,2016,549,0,1,0,...,2.0,2.0,0.0,0.0,0.0,1524937000.0,1524937000.0,0.0,1.0,0
3,2017-09-01,C_ID_186d6a6901,0,0.142495,9,2017,153,0,0,0,...,1.0,2.0,0.0,0.0,3625505.0,1520424000.0,1524049000.0,0.857143,2.0,0
4,2017-11-01,C_ID_cdbd2c0db2,0,-0.159749,11,2017,92,1,0,0,...,1.0,2.0,0.194444,0.0,4949682.0,1519992000.0,1524941000.0,0.944444,5.0,0


In [7]:
# Keep only the rows not flagged as outliers; the model in this section is
# trained without them.
train_df = train_df[train_df['outliers'] == 0]

# Split the label off so it cannot end up among the model inputs.
target = train_df['target']
del train_df['target']

# Columns that must not be used as model inputs:
#   - card_id, first_active_month: identifier / raw date string columns
#   - outliers: the flag used for the filter above (0 for every remaining row)
#   - Unnamed: 0: shows up in train_df.head() with values equal to the row
#     position, i.e. a leftover CSV index rather than a genuine feature
non_feature_cols = {'card_id', 'first_active_month', 'outliers', 'Unnamed: 0'}
features = [c for c in train_df.columns if c not in non_feature_cols]

# Every column whose name contains 'feature_' (feature_3, feature_1_1, ...).
categorical_feats = [c for c in features if 'feature_' in c]

In [8]:
# LightGBM training parameters for the regression model.
param = {
    "objective": "regression",   # plain regression objective
    "num_leaves": 31,            # maximum number of leaves per tree
    "min_data_in_leaf": 25,      # minimum number of samples required in a leaf
    "max_depth": 7,              # cap on tree depth
    "learning_rate": 0.01,       # shrinkage applied to each boosting step
    "lambda_l1": 0.13,           # L1 regularisation strength
    "boosting": "gbdt",          # standard gradient-boosted decision trees
    "feature_fraction": 0.85,    # fraction of features sampled for each tree
    "bagging_freq": 8,           # re-draw the row sample every 8 iterations
    "bagging_fraction": 0.9,     # fraction of rows kept in each bagging sample
    "metric": "rmse",            # evaluation metric reported on the valid sets
    "verbosity": -1,             # silence LightGBM's own logging
    "random_state": 2333,        # seed for reproducible runs
}

In [None]:
# 5-fold cross-validated LightGBM regression.
#
# Produces:
#   oof                   - out-of-fold prediction for every training row
#   predictions           - test-set prediction averaged over the folds
#   feature_importance_df - per-fold feature importances (Feature, importance, fold)
#
# The split is stratified on the `outliers` column; after the `outliers == 0`
# filter applied earlier that label is constant, so the stratification has
# nothing to balance and the folds are effectively a shuffled split.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2333)
oof = np.zeros(len(train_df))
predictions = np.zeros(len(test_df))
fold_importances = []  # one frame per fold, concatenated once after the loop
num_round = 10000      # upper bound on boosting rounds; early stopping may end sooner

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df, train_df['outliers'].values)):
    print("fold {}".format(fold_))

    # Slice each feature matrix once and reuse it for both the Dataset and predict().
    trn_features = train_df.iloc[trn_idx][features]
    val_features = train_df.iloc[val_idx][features]

    # categorical_feats is not passed to the Datasets.
    trn_data = lgb.Dataset(trn_features, label=target.iloc[trn_idx])
    val_data = lgb.Dataset(val_features, label=target.iloc[val_idx])

    # NOTE(review): `verbose_eval` / `early_stopping_rounds` are no longer accepted
    # by lgb.train() from LightGBM 4.0 on; with a newer install use
    # callbacks=[lgb.early_stopping(200), lgb.log_evaluation(100)] instead.
    clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data],
                    verbose_eval=100, early_stopping_rounds=200)

    # Predict with the best iteration found by early stopping.
    oof[val_idx] = clf.predict(val_features, num_iteration=clf.best_iteration)

    fold_importance_df = pd.DataFrame({
        "Feature": features,
        "importance": clf.feature_importance(),
        "fold": fold_ + 1,
    })
    fold_importances.append(fold_importance_df)

    # Each fold contributes 1/n_splits of the final test prediction.
    predictions += clf.predict(test_df[features], num_iteration=clf.best_iteration) / folds.n_splits

# Single concat instead of growing a DataFrame inside the loop.
feature_importance_df = pd.concat(fold_importances, axis=0)

# RMSE of the out-of-fold predictions (arguments in y_true, y_pred order).
print("CV score: {:<8.5f}".format(mean_squared_error(target, oof) ** 0.5))

fold 0
