In [3]:
import re
import pickle
from IPython.display import display

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer

from pipelines import Pipeline
from utils import camelcase_to_underscore, save_model

In [15]:
df = pd.read_csv('input/train_V2.csv')
# df = pd.read_csv('input/train_V2.csv', nrows=100000)
df.columns = [camelcase_to_underscore(col) for col in df.columns]
# display(df.head(), df.shape, list(df.columns))

drop nan

In [16]:
df.drop(df[df['win_place_perc'].isnull()].index, inplace=True)

column types

In [17]:
id_features = ['id', 'group_id', 'match_id']
categorical_features = ['match_type', ]
target_feature = 'win_place_perc'
base_features = [col for col in df.columns if col not in id_features + categorical_features + [target_feature]] 

Train

Стоит заметить, что на лидерборде этот подход дал 0.049, а на валидации 0.062, возможно дело в том, что не стоит раскидывать игроков одного пати в матче в разные сплиты.

In [18]:
%%time
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

kfold = KFold(n_splits = 2)
kfold.get_n_splits(df)
log = []
for train_index, valid_index in kfold.split(df):
    step = dict()
    pipeline = Pipeline(
        id_columns=id_features, 
        numerical_columns=base_features,
        categorical_columns=categorical_features,
        target_column=target_feature,
    )
    x_train = pipeline.fit_transform(df.loc[train_index, :])
    y_train = df.loc[train_index, target_feature]
    y_train.fillna(0, inplace=True) 
    x_valid = pipeline.transform(df.loc[valid_index, :])
    y_valid = df.loc[valid_index, target_feature]
    
    print('Fitting ...')
    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_eval = lgb.Dataset(x_valid, y_valid, reference=lgb_train)
    lgbm_params = dict(
        objective='regression',
        metric='mae',
        n_jobs=-1,
        verbose=2,
        learning_rate=0.1,
        n_estimators=2000,
    )
    model = lgb.train(
        lgbm_params, 
        lgb_train, 
        valid_sets=lgb_eval, 
        early_stopping_rounds=20,
    )
#     model.fit(x_train, y_train)
    step['train_score'] = mean_absolute_error(y_train, model.predict(x_train))
#     del x_train, y_train
    
    step['valid_score'] = mean_absolute_error(y_valid, model.predict(x_valid))
    step['model'] = model
    step['pipeline'] = pipeline
    step['train_index'] = train_index
    step['valid_index'] = valid_index
    try:
        save_model(step)
    except Exception:
        print("Warning: Couldn't save the model")
    print(step['train_score'], step['valid_score'])
    log.append(step)
    break

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Transforming ...
FeatureGenerator ...
Preprocessor ...


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]


Transforming ...
FeatureGenerator ...
Preprocessor ...
Fitting ...




[1]	valid_0's l1: 0.246634
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's l1: 0.228223
[3]	valid_0's l1: 0.212114
[4]	valid_0's l1: 0.196795
[5]	valid_0's l1: 0.183793
[6]	valid_0's l1: 0.172232
[7]	valid_0's l1: 0.161213
[8]	valid_0's l1: 0.151427
[9]	valid_0's l1: 0.143032
[10]	valid_0's l1: 0.135116
[11]	valid_0's l1: 0.128153
[12]	valid_0's l1: 0.121624
[13]	valid_0's l1: 0.116468
[14]	valid_0's l1: 0.111306
[15]	valid_0's l1: 0.10656
[16]	valid_0's l1: 0.102766
[17]	valid_0's l1: 0.0988835
[18]	valid_0's l1: 0.0958594
[19]	valid_0's l1: 0.0931965
[20]	valid_0's l1: 0.0909657
[21]	valid_0's l1: 0.0890002
[22]	valid_0's l1: 0.0874118
[23]	valid_0's l1: 0.0859108
[24]	valid_0's l1: 0.0843731
[25]	valid_0's l1: 0.0831124
[26]	valid_0's l1: 0.0817853
[27]	valid_0's l1: 0.0807551
[28]	valid_0's l1: 0.0798906
[29]	valid_0's l1: 0.0790306
[30]	valid_0's l1: 0.0784385
[31]	valid_0's l1: 0.077812
[32]	valid_0's l1: 0.0769843
[33]	valid_0's l1: 0.0763544
[34]	val

Adjustment

In [19]:
pred = model.predict(x_valid)

Emulate submission for validation data

In [51]:
df_valid = df.loc[valid_index, :]
df_sub = df_valid.loc[:, ['id', target_feature]].copy()
df_sub['winPlacePerc'] = pred

Когда челики играют в пати, то все они в итоге получают последнего выжевшено (максимум по пати), вроде как adjusted_perc это и учитывает.

Если в матче max_place < 100, то на это также нужно делать корректировку.

In [52]:
df_sub = df_sub.merge(df_valid[["id", "match_id", "group_id", "max_place", "num_groups"]], on="id", how="left")

# Sort, rank, and assign adjusted ratio
df_sub_group = df_sub.groupby(["match_id", "group_id"]).first().reset_index()
df_sub_group["rank"] = df_sub_group.groupby(["match_id"])["win_place_perc"].rank()
df_sub_group = df_sub_group.merge(
    df_sub_group.groupby("match_id")["rank"].max().to_frame("max_rank").reset_index(), 
    on="match_id", how="left")
df_sub_group["adjusted_perc"] = (df_sub_group["rank"] - 1) / (df_sub_group["num_groups"] - 1)

df_sub = df_sub.merge(df_sub_group[["adjusted_perc", "match_id", "group_id"]], on=["match_id", "group_id"], how="left")
df_sub["win_place_perc"] = df_sub["adjusted_perc"]

# Deal with edge cases
df_sub.loc[df_sub['max_place'] == 0, "win_place_perc"] = 0
df_sub.loc[df_sub['max_place'] == 1, "win_place_perc"] = 1

# Align with maxPlace
# Credit: https://www.kaggle.com/anycode/simple-nn-baseline-4
subset = df_sub.loc[df_sub['max_place'] > 1]
gap = 1.0 / (subset['max_place'].values - 1)
new_perc = np.around(subset['win_place_perc'].values / gap) * gap
df_sub.loc[df_sub['max_place'] > 1, "win_place_perc"] = new_perc

# Edge case
df_sub.loc[(df_sub['max_place'] > 1) & (df_sub['num_groups'] == 1), "win_place_perc"] = 0
assert df_sub["win_place_perc"].isnull().sum() == 0
