In [1]:
import re
import pickle
from IPython.display import display

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer

from pipelines import Pipeline
from utils import (
    camelcase_to_underscore, save_model, load_data, split_columns_by_types, kfold_with_respect_to_groups,
    reduce_mem_usage,
)

In [2]:
# Load the working dataframe through the project's `load_data` helper.
# Judging by the log printed under this cell, the helper also compresses the
# frame's memory footprint after loading.
# NOTE(review): presumably 'train'/'input' select the dataset and its folder and
# sample_size=10000 caps the number of rows — confirm against utils.load_data.
df = load_data('train', 'input', sample_size=10000)
# Bare last expression so the notebook renders the first rows as a rich table.
df.head()

Loading ...
Compressing ...
Memory usage of dataframe is 2.21 MB
Memory usage after optimization is: 0.64 MB
Decreased by 71.1%


Unnamed: 0,id,group_id,match_id,assists,boosts,damage_dealt,dbn_os,headshot_kills,heals,kill_place,...,revives,ride_distance,road_kills,swim_distance,team_kills,vehicle_destroys,walk_distance,weapons_acquired,win_points,win_place_perc
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,...,0,0.0,0,0.0,0,0,244.75,1,1466,0.444336
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.5,0,0,0,57,...,0,0.004501,0,11.039062,0,0,1434.0,5,0,0.640137
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,47,...,0,0.0,0,0.0,0,0,161.75,2,0,0.775391
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.90625,0,0,0,75,...,0,0.0,0,0.0,0,0,202.75,3,0,0.166748
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,45,...,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


In [3]:
# Group the dataframe's column names by role. The notebook later reads the
# 'id', 'numeric', 'categorical' and 'target' entries of this mapping.
columns = split_columns_by_types(df)
# Preview the first rows of the identifier columns and of the target.
display(*(df[columns[role]].head() for role in ('id', 'target')))

Unnamed: 0,id,group_id,match_id
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6


0    0.444336
1    0.640137
2    0.775391
3    0.166748
4    0.187500
Name: win_place_perc, dtype: float16

Drop rows with a missing (NaN) target

In [4]:
# Remove, in place, every row whose target column `win_place_perc` is missing:
# build the boolean null-mask, turn it into index labels, and drop those labels.
df.drop(labels=df.index[df['win_place_perc'].isnull()], axis=0, inplace=True)

Train

In [5]:
%%time
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error

splits = kfold_with_respect_to_groups(df, n_splits=5)
log = []
for train_index, valid_index in splits:
    step = dict()
    pipeline = Pipeline(
        id_columns=columns['id'], 
        numerical_columns=columns['numeric'],
        categorical_columns=columns['categorical'],
        target_column=columns['target'],
    )
    x_train = pipeline.fit_transform(df.loc[train_index, :])
    y_train = df.loc[train_index, columns['target']]
    y_train.fillna(0, inplace=True) 
    x_valid = pipeline.transform(df.loc[valid_index, :])
    y_valid = df.loc[valid_index, columns['target']]
    
    print('Fitting ...')
    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_eval = lgb.Dataset(x_valid, y_valid, reference=lgb_train)
    lgbm_params = dict(
        objective='regression',
        metric='mae',
        n_jobs=-1,
        verbose=2,
        learning_rate=0.1,
        n_estimators=2000,
    )
    model = lgb.train(
        lgbm_params, 
        lgb_train, 
        valid_sets=lgb_eval, 
        early_stopping_rounds=20,
    )
    step['train_score'] = mean_absolute_error(y_train, model.predict(x_train))
#     del x_train, y_train
    
    step['valid_score'] = mean_absolute_error(y_valid, model.predict(x_valid))
    step['model'] = model
    step['pipeline'] = pipeline
    step['train_index'] = train_index
    step['valid_index'] = valid_index
    try:
        save_model(step)
    except Exception:
        print("Warning: Couldn't save the model")
    print(step['train_score'], step['valid_score'])
    log.append(step)
#     break

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you won't need to install the gcc compiler anymore.
Instead of that, you'll need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


Transforming ...
FeatureGenerator ...
Preprocessor ...
Transforming ...
FeatureGenerator ...
Preprocessor ...
Fitting ...
[1]	valid_0's l1: 0.241396
Training until validation scores don't improve for 20 rounds.


  return self.partial_fit(X, y)


[2]	valid_0's l1: 0.222247
[3]	valid_0's l1: 0.204782
[4]	valid_0's l1: 0.189829
[5]	valid_0's l1: 0.175982
[6]	valid_0's l1: 0.164141
[7]	valid_0's l1: 0.153593
[8]	valid_0's l1: 0.144287
[9]	valid_0's l1: 0.135912
[10]	valid_0's l1: 0.1283
[11]	valid_0's l1: 0.121136
[12]	valid_0's l1: 0.115239
[13]	valid_0's l1: 0.109946
[14]	valid_0's l1: 0.105781
[15]	valid_0's l1: 0.101604
[16]	valid_0's l1: 0.0987851
[17]	valid_0's l1: 0.0961331
[18]	valid_0's l1: 0.0940454
[19]	valid_0's l1: 0.0915877
[20]	valid_0's l1: 0.0898738
[21]	valid_0's l1: 0.0884584
[22]	valid_0's l1: 0.0868911
[23]	valid_0's l1: 0.0853341
[24]	valid_0's l1: 0.0839067
[25]	valid_0's l1: 0.0827586
[26]	valid_0's l1: 0.0817461
[27]	valid_0's l1: 0.0807885
[28]	valid_0's l1: 0.0799816
[29]	valid_0's l1: 0.0792972
[30]	valid_0's l1: 0.0787532
[31]	valid_0's l1: 0.0781366
[32]	valid_0's l1: 0.077555
[33]	valid_0's l1: 0.0772503
[34]	valid_0's l1: 0.0772713
[35]	valid_0's l1: 0.0771092
[36]	valid_0's l1: 0.0770116
[37]	valid

  return self.partial_fit(X, y)


[1]	valid_0's l1: 0.243674
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's l1: 0.222299
[3]	valid_0's l1: 0.203227
[4]	valid_0's l1: 0.186733
[5]	valid_0's l1: 0.171843
[6]	valid_0's l1: 0.159589
[7]	valid_0's l1: 0.147955
[8]	valid_0's l1: 0.137492
[9]	valid_0's l1: 0.128345
[10]	valid_0's l1: 0.120581
[11]	valid_0's l1: 0.114379
[12]	valid_0's l1: 0.10876
[13]	valid_0's l1: 0.103746
[14]	valid_0's l1: 0.0993262
[15]	valid_0's l1: 0.095461
[16]	valid_0's l1: 0.0923784
[17]	valid_0's l1: 0.0897641
[18]	valid_0's l1: 0.0872269
[19]	valid_0's l1: 0.0851378
[20]	valid_0's l1: 0.0835827
[21]	valid_0's l1: 0.0821621
[22]	valid_0's l1: 0.0808169
[23]	valid_0's l1: 0.0798271
[24]	valid_0's l1: 0.0789068
[25]	valid_0's l1: 0.0780944
[26]	valid_0's l1: 0.077588
[27]	valid_0's l1: 0.0771273
[28]	valid_0's l1: 0.0768372
[29]	valid_0's l1: 0.076211
[30]	valid_0's l1: 0.075769
[31]	valid_0's l1: 0.0753999
[32]	valid_0's l1: 0.0751064
[33]	valid_0's l1: 0.0749516
[34]	val

  return self.partial_fit(X, y)


[1]	valid_0's l1: 0.240402
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's l1: 0.218682
[3]	valid_0's l1: 0.199955
[4]	valid_0's l1: 0.183292
[5]	valid_0's l1: 0.168741
[6]	valid_0's l1: 0.155619
[7]	valid_0's l1: 0.144129
[8]	valid_0's l1: 0.13421
[9]	valid_0's l1: 0.125037
[10]	valid_0's l1: 0.117342
[11]	valid_0's l1: 0.110732
[12]	valid_0's l1: 0.105327
[13]	valid_0's l1: 0.100381
[14]	valid_0's l1: 0.0963837
[15]	valid_0's l1: 0.0926817
[16]	valid_0's l1: 0.0899014
[17]	valid_0's l1: 0.0875017
[18]	valid_0's l1: 0.085296
[19]	valid_0's l1: 0.083248
[20]	valid_0's l1: 0.0816282
[21]	valid_0's l1: 0.0804204
[22]	valid_0's l1: 0.0792719
[23]	valid_0's l1: 0.0783247
[24]	valid_0's l1: 0.0776084
[25]	valid_0's l1: 0.0767072
[26]	valid_0's l1: 0.0759851
[27]	valid_0's l1: 0.0754491
[28]	valid_0's l1: 0.0749791
[29]	valid_0's l1: 0.0745144
[30]	valid_0's l1: 0.074089
[31]	valid_0's l1: 0.0737679
[32]	valid_0's l1: 0.0733393
[33]	valid_0's l1: 0.0729523
[34]	va

  return self.partial_fit(X, y)


[1]	valid_0's l1: 0.243726
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's l1: 0.222259
[3]	valid_0's l1: 0.202915
[4]	valid_0's l1: 0.186229
[5]	valid_0's l1: 0.170961
[6]	valid_0's l1: 0.158132
[7]	valid_0's l1: 0.146801
[8]	valid_0's l1: 0.136862
[9]	valid_0's l1: 0.128159
[10]	valid_0's l1: 0.121094
[11]	valid_0's l1: 0.114561
[12]	valid_0's l1: 0.109178
[13]	valid_0's l1: 0.104352
[14]	valid_0's l1: 0.100072
[15]	valid_0's l1: 0.0963371
[16]	valid_0's l1: 0.0932962
[17]	valid_0's l1: 0.0904281
[18]	valid_0's l1: 0.0879722
[19]	valid_0's l1: 0.0861662
[20]	valid_0's l1: 0.0843718
[21]	valid_0's l1: 0.0828826
[22]	valid_0's l1: 0.0815868
[23]	valid_0's l1: 0.080514
[24]	valid_0's l1: 0.0794505
[25]	valid_0's l1: 0.0787346
[26]	valid_0's l1: 0.0778927
[27]	valid_0's l1: 0.0772752
[28]	valid_0's l1: 0.076614
[29]	valid_0's l1: 0.0760422
[30]	valid_0's l1: 0.0755428
[31]	valid_0's l1: 0.0750169
[32]	valid_0's l1: 0.0745555
[33]	valid_0's l1: 0.0741598
[34]	v

  return self.partial_fit(X, y)


[1]	valid_0's l1: 0.247395
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's l1: 0.225753
[3]	valid_0's l1: 0.206658
[4]	valid_0's l1: 0.189355
[5]	valid_0's l1: 0.173943
[6]	valid_0's l1: 0.161129
[7]	valid_0's l1: 0.149362
[8]	valid_0's l1: 0.139408
[9]	valid_0's l1: 0.130866
[10]	valid_0's l1: 0.122906
[11]	valid_0's l1: 0.115797
[12]	valid_0's l1: 0.109898
[13]	valid_0's l1: 0.105249
[14]	valid_0's l1: 0.100558
[15]	valid_0's l1: 0.0965997
[16]	valid_0's l1: 0.0932522
[17]	valid_0's l1: 0.0906997
[18]	valid_0's l1: 0.0883086
[19]	valid_0's l1: 0.0866291
[20]	valid_0's l1: 0.0847027
[21]	valid_0's l1: 0.0832024
[22]	valid_0's l1: 0.0819308
[23]	valid_0's l1: 0.0807544
[24]	valid_0's l1: 0.0798854
[25]	valid_0's l1: 0.0791341
[26]	valid_0's l1: 0.0785654
[27]	valid_0's l1: 0.0778867
[28]	valid_0's l1: 0.07722
[29]	valid_0's l1: 0.0768465
[30]	valid_0's l1: 0.0764407
[31]	valid_0's l1: 0.0762234
[32]	valid_0's l1: 0.0760498
[33]	valid_0's l1: 0.0757478
[34]	v