# Addressing outliers in the Kaggle ELO chanllege

This kernel is inspired by https://www.kaggle.com/waitingli/combining-your-model-with-a-model-without-outlier

In [1]:
import numpy as np
import pandas as pd
import time
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error,log_loss

import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

import warnings
warnings.filterwarnings("ignore")


This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [None]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

## Training the model without outliers

In [2]:
%%time
train_df = pd.read_csv('../train_v4_agg.csv')
test_df = pd.read_csv('../test_v4_agg.csv')

#train_df = reduce_mem_usage(train_df)
#test_df = reduce_mem_usage(test_df)
#del train_df['Unnamed: 0']
#del test_df['Unnamed: 0']

CPU times: user 10.2 s, sys: 632 ms, total: 10.9 s
Wall time: 10.9 s


In [3]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target,elasped_time,hist_hist_transactions_count,hist_category_1_sum,...,city_id_purchase_amount_min,city_id_purchase_amount_max,city_id_purchase_amount_std,category_1_installments_mean,category_1_installments_min,category_1_installments_max,category_1_installments_std,authorized_flag_mean,year,month
0,0,2017-06-01,C_ID_92a2005557,5,2,1,-0.820283,245,13.0,0.0,...,-0.606593,-0.296112,0.155803,0.0,0.0,0.0,,0.95,2017,6
1,1,2017-01-01,C_ID_3d0044924f,4,1,0,0.392913,396,11.0,2.0,...,-0.725956,-0.725956,,1.0,1.0,1.0,,0.968571,2017,1
2,2,2016-08-01,C_ID_d639edf6cd,2,2,0,0.688056,549,2.0,0.0,...,-0.700326,-0.700326,,0.0,0.0,0.0,,0.953488,2016,8
3,3,2017-09-01,C_ID_186d6a6901,4,3,0,0.142495,153,,,...,-0.665244,-0.66291,0.00165,0.833333,0.666667,1.0,0.235702,1.0,2017,9
4,4,2017-11-01,C_ID_cdbd2c0db2,1,3,0,-0.159749,92,5.0,3.0,...,-0.671174,-0.326762,0.150674,1.220588,0.941176,1.5,0.395148,0.962406,2017,11


In [4]:
train_df['outliers'] = 0
train_df.loc[train_df['target']<-30,'outliers']=1
train_df['outliers'].value_counts()

0    199710
1      2207
Name: outliers, dtype: int64

In [5]:
train_df = train_df[train_df['outliers'] == 0]
target = train_df['target']
#del train_df['target']
to_drop = ['target','Unnamed: 0']
train_df.drop(labels=to_drop ,axis = 1,inplace = True)
test_df.drop('Unnamed: 0',axis = 1,inplace = True)

In [6]:
train_df.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,elasped_time,hist_hist_transactions_count,hist_category_1_sum,hist_category_1_mean,hist_category_2_1.0_mean,...,city_id_purchase_amount_max,city_id_purchase_amount_std,category_1_installments_mean,category_1_installments_min,category_1_installments_max,category_1_installments_std,authorized_flag_mean,year,month,outliers
0,2017-06-01,C_ID_92a2005557,5,2,1,245,13.0,0.0,0.0,1.0,...,-0.296112,0.155803,0.0,0.0,0.0,,0.95,2017,6,0
1,2017-01-01,C_ID_3d0044924f,4,1,0,396,11.0,2.0,0.181818,0.818182,...,-0.725956,,1.0,1.0,1.0,,0.968571,2017,1,0
2,2016-08-01,C_ID_d639edf6cd,2,2,0,549,2.0,0.0,0.0,0.0,...,-0.700326,,0.0,0.0,0.0,,0.953488,2016,8,0
3,2017-09-01,C_ID_186d6a6901,4,3,0,153,,,,,...,-0.66291,0.00165,0.833333,0.666667,1.0,0.235702,1.0,2017,9,0
4,2017-11-01,C_ID_cdbd2c0db2,1,3,0,92,5.0,3.0,0.6,0.0,...,-0.326762,0.150674,1.220588,0.941176,1.5,0.395148,0.962406,2017,11,0


In [7]:
features = [c for c in train_df.columns if c not in ['card_id', 'first_active_month','outliers']]
#categorical_feats = [c for c in features if 'feature_' in c]
categorical_feats = ['feature_1','feature_2', 'feature_3','year','month']

In [8]:
features

['feature_1',
 'feature_2',
 'feature_3',
 'elasped_time',
 'hist_hist_transactions_count',
 'hist_category_1_sum',
 'hist_category_1_mean',
 'hist_category_2_1.0_mean',
 'hist_category_2_2.0_mean',
 'hist_category_2_3.0_mean',
 'hist_category_2_4.0_mean',
 'hist_category_2_5.0_mean',
 'hist_category_3_A_mean',
 'hist_category_3_B_mean',
 'hist_category_3_C_mean',
 'hist_merchant_id_nunique',
 'hist_merchant_category_id_nunique',
 'hist_state_id_nunique',
 'hist_city_id_nunique',
 'hist_subsector_id_nunique',
 'hist_year_nunique',
 'hist_month_nunique',
 'hist_weekofyear_nunique',
 'hist_weekend_sum',
 'hist_weekend_mean',
 'hist_weekday_sum',
 'hist_weekday_mean',
 'hist_purchase_amount_sum',
 'hist_purchase_amount_median',
 'hist_purchase_amount_max',
 'hist_purchase_amount_min',
 'hist_purchase_amount_std',
 'hist_installments_sum',
 'hist_installments_median',
 'hist_installments_max',
 'hist_installments_min',
 'hist_installments_std',
 'hist_purchase_date_ptp',
 'hist_month_lag_m

In [None]:
param = {'objective':'regression',
         'num_leaves': 31,
         'min_data_in_leaf': 25,
         'max_depth': 7,
         'learning_rate': 0.01,
         'lambda_l1':0.13,
         "boosting": "gbdt",
         "feature_fraction":0.85,
         'bagging_freq':8,
         "bagging_fraction": 0.9 ,
         "metric": 'rmse',
         "verbosity": -1}

In [9]:
param = {'num_leaves': 111,
         'min_data_in_leaf': 149, 
         'objective':'regression',
         'max_depth': 9,
         'learning_rate': 0.005,
         "boosting": "gbdt",
         "feature_fraction": 0.7522,
         "bagging_freq": 1,
         "bagging_fraction": 0.7083 ,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.2634,
         "random_state": 133,
         "verbosity": -1}

In [10]:
folds = KFold(n_splits=5, shuffle=True, random_state=133)
oof = np.zeros(len(train_df))
predictions = np.zeros(len(test_df))
feature_importance_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df.values,target.values)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(train_df.iloc[trn_idx][features], label=target.iloc[trn_idx],categorical_feature=categorical_feats)
    val_data = lgb.Dataset(train_df.iloc[val_idx][features], label=target.iloc[val_idx],categorical_feature=categorical_feats)

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval= 100, early_stopping_rounds = 200)
    oof[val_idx] = clf.predict(train_df.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions += clf.predict(test_df[features], num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.5f}".format(mean_squared_error(oof, target)**0.5))

fold 0
Training until validation scores don't improve for 200 rounds.
[100]	training's rmse: 1.62871	valid_1's rmse: 1.63699
[200]	training's rmse: 1.58557	valid_1's rmse: 1.59981
[300]	training's rmse: 1.56126	valid_1's rmse: 1.58164
[400]	training's rmse: 1.5456	valid_1's rmse: 1.57194
[500]	training's rmse: 1.53401	valid_1's rmse: 1.56617
[600]	training's rmse: 1.52468	valid_1's rmse: 1.56251
[700]	training's rmse: 1.51661	valid_1's rmse: 1.56004
[800]	training's rmse: 1.50988	valid_1's rmse: 1.55836
[900]	training's rmse: 1.5038	valid_1's rmse: 1.55709
[1000]	training's rmse: 1.49837	valid_1's rmse: 1.55614
[1100]	training's rmse: 1.49311	valid_1's rmse: 1.5554
[1200]	training's rmse: 1.48855	valid_1's rmse: 1.55485
[1300]	training's rmse: 1.48437	valid_1's rmse: 1.55438
[1400]	training's rmse: 1.48017	valid_1's rmse: 1.55409
[1500]	training's rmse: 1.47662	valid_1's rmse: 1.55392
[1600]	training's rmse: 1.47267	valid_1's rmse: 1.5537
[1700]	training's rmse: 1.46882	valid_1's rmse:

In [11]:
model_without_outliers = pd.DataFrame({"card_id":test_df["card_id"].values})
model_without_outliers["target"] = predictions

## Training model for outliers classification

In [12]:
train_df = pd.read_csv('../train_v4_agg.csv')
test_df = pd.read_csv('../test_v4_agg.csv')

In [14]:
train_df['outliers'] = 0
train_df.loc[train_df['target']<-30,'outliers']=1

In [None]:
target = train_df['outliers']
del train_df['outliers']
del train_df['target']

In [19]:
train_df.drop('Unnamed: 0' ,axis = 1,inplace = True)
test_df.drop('Unnamed: 0',axis = 1,inplace = True)

In [20]:
features = [c for c in train_df.columns if c not in ['card_id', 'first_active_month']]
#categorical_feats = [c for c in features if 'feature_' in c]
categorical_feats = ['feature_1','feature_2', 'feature_3','year','month']

In [21]:
features

['feature_1',
 'feature_2',
 'feature_3',
 'elasped_time',
 'hist_hist_transactions_count',
 'hist_category_1_sum',
 'hist_category_1_mean',
 'hist_category_2_1.0_mean',
 'hist_category_2_2.0_mean',
 'hist_category_2_3.0_mean',
 'hist_category_2_4.0_mean',
 'hist_category_2_5.0_mean',
 'hist_category_3_A_mean',
 'hist_category_3_B_mean',
 'hist_category_3_C_mean',
 'hist_merchant_id_nunique',
 'hist_merchant_category_id_nunique',
 'hist_state_id_nunique',
 'hist_city_id_nunique',
 'hist_subsector_id_nunique',
 'hist_year_nunique',
 'hist_month_nunique',
 'hist_weekofyear_nunique',
 'hist_weekend_sum',
 'hist_weekend_mean',
 'hist_weekday_sum',
 'hist_weekday_mean',
 'hist_purchase_amount_sum',
 'hist_purchase_amount_median',
 'hist_purchase_amount_max',
 'hist_purchase_amount_min',
 'hist_purchase_amount_std',
 'hist_installments_sum',
 'hist_installments_median',
 'hist_installments_max',
 'hist_installments_min',
 'hist_installments_std',
 'hist_purchase_date_ptp',
 'hist_month_lag_m

In [22]:
param = {'num_leaves': 31,
         'min_data_in_leaf': 30, 
         'objective':'binary',
         'max_depth': 6,
         'learning_rate': 0.01,
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'binary_logloss',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "random_state": 2333}

In [24]:
folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof = np.zeros(len(train_df))
predictions = np.zeros(len(test_df))
feature_importance_df = pd.DataFrame()

start = time.time()


for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df.values, target.values)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train_df.iloc[trn_idx][features], label=target.iloc[trn_idx], categorical_feature=categorical_feats)
    val_data = lgb.Dataset(train_df.iloc[val_idx][features], label=target.iloc[val_idx], categorical_feature=categorical_feats)

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 200)
    oof[val_idx] = clf.predict(train_df.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions += clf.predict(test_df[features], num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.5f}".format(log_loss(target, oof)))

fold n°0
Training until validation scores don't improve for 200 rounds.
[100]	training's binary_logloss: 0.0431461	valid_1's binary_logloss: 0.0471357
[200]	training's binary_logloss: 0.0393119	valid_1's binary_logloss: 0.0451603
[300]	training's binary_logloss: 0.0369321	valid_1's binary_logloss: 0.0445454
[400]	training's binary_logloss: 0.035152	valid_1's binary_logloss: 0.0443119
[500]	training's binary_logloss: 0.0335519	valid_1's binary_logloss: 0.044214
[600]	training's binary_logloss: 0.0320973	valid_1's binary_logloss: 0.0441767
[700]	training's binary_logloss: 0.0306716	valid_1's binary_logloss: 0.0441624
[800]	training's binary_logloss: 0.0294403	valid_1's binary_logloss: 0.044192
Early stopping, best iteration is:
[674]	training's binary_logloss: 0.0310365	valid_1's binary_logloss: 0.0441595
fold n°1
Training until validation scores don't improve for 200 rounds.
[100]	training's binary_logloss: 0.0435314	valid_1's binary_logloss: 0.0457959
[200]	training's binary_logloss: 0

In [25]:
df_outlier_prob = pd.DataFrame({"card_id":test_df["card_id"].values})
df_outlier_prob["target"] = predictions
df_outlier_prob.head()

Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,0.1218
1,C_ID_130fd0cbdd,0.002262
2,C_ID_b709037bc5,0.012658
3,C_ID_d27d835a9f,0.000173
4,C_ID_2b5e3df5c2,0.002207


## Combine for submission

In [26]:
# In case missing some predictable outlier, we choose top 25000 with highest outliers likelyhood.
outlier_id = pd.DataFrame(df_outlier_prob.sort_values(by='target',ascending = False).head(25000)['card_id'])

In [27]:
best_submission = pd.read_csv('../3.695.csv')

In [28]:
most_likely_liers = best_submission.merge(outlier_id,how='right')
most_likely_liers.head()

Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,-2.502326
1,C_ID_b709037bc5,-0.959266
2,C_ID_f7cada36d3,0.411326
3,C_ID_6d8dba8475,-0.893964
4,C_ID_7f1041e8e1,-4.872942


In [None]:
#%%time
#for card_id in most_likely_liers['card_id']:
#    model_without_outliers.loc[model_without_outliers['card_id']==card_id,'target']\
#    = most_likely_liers.loc[most_likely_liers['card_id']==card_id,'target'].values

In [29]:
ix1 = model_without_outliers["card_id"].isin(outlier_id["card_id"].values)
ix2 = best_submission["card_id"].isin(outlier_id["card_id"].values)
model_without_outliers.loc[ix1, "target"] = best_submission[ix2]["target"]

In [30]:
model_without_outliers.to_csv("../submit_v4_combining.csv", index=False)