In [1]:
import gc
import re
import pickle
from IPython.display import display

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer

from pipelines import Pipeline
from utils import (
    camelcase_to_underscore, save_model, load_data, split_columns_by_types, kfold_with_respect_to_groups,
    reduce_mem_usage, Timer
)

# memory_profiler provides the %%memit cell magic used by the training cell below.
%load_ext memory_profiler

In [2]:
# Load the training data with sample_size=10000 and preview the first rows.
# NOTE(review): 'train' / 'input' are presumably the dataset name and the data
# directory understood by utils.load_data — confirm against that helper.
train_sample_rows = 10000
df = load_data('train', 'input', sample_size=train_sample_rows)
df.head()

Memory usage of dataframe is 2.21 MB
Memory usage after optimization is: 0.64 MB
Decreased by 71.1%


Unnamed: 0,id,group_id,match_id,assists,boosts,damage_dealt,dbn_os,headshot_kills,heals,kill_place,...,revives,ride_distance,road_kills,swim_distance,team_kills,vehicle_destroys,walk_distance,weapons_acquired,win_points,win_place_perc
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,...,0,0.0,0,0.0,0,0,244.75,1,1466,0.444336
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.5,0,0,0,57,...,0,0.004501,0,11.039062,0,0,1434.0,5,0,0.640137
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,47,...,0,0.0,0,0.0,0,0,161.75,2,0,0.775391
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.90625,0,0,0,75,...,0,0.0,0,0.0,0,0,202.75,3,0,0.166748
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,45,...,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


In [3]:
# Group the frame's columns by role; the 'id' and 'target' entries are used
# here to preview the identifier columns and the target column.
columns = split_columns_by_types(df)

id_preview = df[columns['id']].head()
target_preview = df[columns['target']].head()
display(id_preview, target_preview)

Unnamed: 0,id,group_id,match_id
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6


0    0.444336
1    0.640137
2    0.775391
3    0.166748
4    0.187500
Name: win_place_perc, dtype: float16

Drop rows whose target (`win_place_perc`) is NaN

In [4]:
# Keep only the rows that have a target value.
# dropna(subset=...) filters by row position, whereas the previous
# mask -> .index -> drop(...) detour removed rows by index *label* and would
# also discard valid rows sharing a label with a NaN-target row.
# Re-binding `df` (instead of inplace=True) avoids mutating a frame that an
# earlier cell already displayed; the surviving rows keep their index labels.
df = df.dropna(subset=['win_place_perc'])

Train

In [5]:
%%time
%%memit
from lightgbm import LGBMModel
from sklearn.metrics import mean_absolute_error

from assess import assess


model_params = dict(
    objective='regression',
    metric='mae',
#     n_estimators=20000,
    n_estimators=2000,
    num_leaves=31,
    learning_rate=0.05,
    bagging_fraction=0.7,
    bagging_seed=0,
    num_threads=4,
    colsample_bytree=0.7
)

assessment_log = assess(
    LGBMModel(**model_params), 
    df, 
    columns,
    metrics=mean_absolute_error,
    n_splits=1,
    early_stopping_rounds=200,
    verbose=1,
)

# df_assessment = pd.DataFrame(assessment_log)
# df_assessment[['train_score', 'valid_score']].plot()

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you won't need to install the gcc compiler anymore.
Instead of that, you'll need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.



---------------------------
features.SimpleFeatureGenerator.transform 0:00:00.030029
features.GroupAggregatedFeatureGenerator.transform 0:00:00.234780
features.FeatureGenerator.transform 0:00:00.370718
preprocessing.Preprocessor.fit_transform 0:00:00.055989
pipelines.Pipeline.fit_transform: 0:00:00.485833
features.SimpleFeatureGenerator.transform 0:00:00.009285
features.GroupAggregatedFeatureGenerator.transform 0:00:00.068977
features.FeatureGenerator.transform 0:00:00.147432
preprocessing.Preprocessor.transform 0:00:00.005254
pipelines.Pipeline.transform: 0:00:00.201230
Data Preparation: 0:00:00.750307
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[225]	valid_0's l1: 0.0615967
Fitting: 0:00:04.187153
Saving: 0:00:00.235373
0.0489957882644485 0.06159670711616151
---------------------------

Erasing cache ...
peak memory: 206.88 MiB, increment: 57.30 MiB
CPU times: user 8.86 s, sys: 2.61 s, total: 11.5 s
Wall time: 5.54 s


In [7]:
# Pick the assessment entry flagged as best (the last one if several are
# flagged; IndexError if none is).
best_entries = [entry for entry in assessment_log if entry['best']]
best_model = best_entries[-1]

# Load the full test set, push it through the fitted pipeline and predict.
df_test = load_data('test', 'input')
pipeline, model = best_model['pipeline'], best_model['model']
x_test = pipeline.transform(df_test)
pred_test = model.predict(x_test)

# Drop the references to the large intermediates; only pred_test is needed later.
del df_test
del x_test

Memory usage of dataframe is 413.18 MB
Memory usage after optimization is: 121.74 MB
Decreased by 70.5%
features.SimpleFeatureGenerator.transform 0:00:01.868474
features.GroupAggregatedFeatureGenerator.transform 0:00:43.444118
features.FeatureGenerator.transform 0:00:50.933524
preprocessing.Preprocessor.transform 0:01:06.470119
pipelines.Pipeline.transform: 0:01:57.943525


In [21]:
from utils import postprocessing

# Raw submission: the 'sub' frame with the model predictions written into
# its winPlacePerc column.
# NOTE(review): `normilize_names` is spelled as the load_data keyword is used
# here; presumably False keeps the original camelCase column names — confirm.
df_sub = load_data('sub', 'input', normilize_names=False)
df_sub['winPlacePerc'] = pred_test

# Adjusted submission built by the project's postprocessing helper.
df_sub_adjusted = postprocessing(pred_test, 'input')

# Write both variants without the index column.
for frame, path in (
    (df_sub, 'submission.csv'),
    (df_sub_adjusted, 'submission_adjusted.csv'),
):
    frame.to_csv(path, index=False)

# Correlation matrix between raw and adjusted predictions.
raw_vs_adjusted = np.corrcoef(df_sub['winPlacePerc'], df_sub_adjusted['winPlacePerc'])
print(raw_vs_adjusted)

Memory usage of dataframe is 29.51 MB
Memory usage after optimization is: 16.60 MB
Decreased by 43.7%
Memory usage of dataframe is 29.51 MB
Memory usage after optimization is: 16.60 MB
Decreased by 43.7%
Memory usage of dataframe is 413.18 MB
Memory usage after optimization is: 121.74 MB
Decreased by 70.5%
[[1.         0.98046236]
 [0.98046236 1.        ]]


In [38]:
import os

# Scratch directory, created relative to the notebook's working directory.
scratch_dir = 'asdf'

# exist_ok=True keeps the cell re-runnable: os.mkdir raised FileExistsError
# as soon as the directory existed from a previous run.
os.makedirs(scratch_dir, exist_ok=True)

# Explicit chmod because the mode passed at creation time is subject to the umask.
# NOTE(review): 0o777 makes the directory world-writable; the mode is kept
# as before — confirm it is really required.
os.chmod(scratch_dir, 0o777)