In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which files are available under ../input (the CSVs read later in this notebook).
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['train_V2.csv', 'test_V2.csv', 'sample_submission_V2.csv']


utils.py

In [2]:
import re
import bz2
import pickle
from datetime import datetime


def camelcase_to_underscore(name):
    """Convert a camelCase / PascalCase identifier to lower snake_case.

    Used to normalise the competition's column names, e.g.
    ``winPlacePerc`` -> ``win_place_perc`` and ``DBNOs`` -> ``dbn_os``.

    :param name: identifier to convert
    :return: the lower-cased, underscore-separated identifier
    """
    # Pass 1: put an underscore in front of every capitalised word
    # (an upper-case letter followed by one or more lower-case letters).
    with_word_breaks = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    # Pass 2: split the remaining lower/digit -> upper boundaries.
    with_all_breaks = re.sub('([a-z0-9])([A-Z])', r'\1_\2', with_word_breaks)
    return with_all_breaks.lower()


def save_model(pipeline_model_index_score):
    """Pickle a training-step bundle to ``models/`` as a bz2-compressed file.

    The file name encodes the validation score (5 decimals) and the current
    local time: ``models/valid_score_<score>__<dd.mm.YYYY-HH.MM.SS>.pkl.bz2``.

    :param pipeline_model_index_score: dict describing one training step; it
        must contain a numeric ``'valid_score'`` entry and be picklable
    :return: path of the file that was written
    :raises KeyError: if ``'valid_score'`` is missing
    """
    # Local import: this helper cell only imports re, bz2, pickle and datetime.
    import os

    current_datetime = datetime.now().strftime('%d.%m.%Y-%H.%M.%S')
    str_valid_score = '{0:.5f}'.format(pipeline_model_index_score['valid_score'])
    name = f'valid_score_{str_valid_score}__{current_datetime}'

    # The target directory does not exist on a fresh kernel; without this the
    # open() below fails and the model is never persisted.
    os.makedirs('models', exist_ok=True)
    path = f'models/{name}.pkl.bz2'
    with bz2.BZ2File(path, 'w') as fout:
        pickle.dump(pipeline_model_index_score, fout)
    return path


def load_model(path):
    """Read back an object written as a bz2-compressed pickle.

    :param path: path to a ``.pkl.bz2`` file
    :return: the unpickled object
    """
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # that were produced by a trusted run of this notebook.
    fin = bz2.BZ2File(path, 'r')
    try:
        loaded = pickle.load(fin)
    finally:
        fin.close()
    return loaded


def predict_from_file(df, path):
    """Load a saved training bundle and predict for ``df``.

    The bundle is read with ``load_model`` and must expose a fitted
    transformer under ``'pipeline'`` and an estimator under ``'model'``.

    :param df: raw dataframe to run through the saved pipeline
    :param path: path to the saved bundle
    :return: whatever the saved model's ``predict`` returns for the
        transformed data
    """
    bundle = load_model(path)
    features = bundle['pipeline'].transform(df)
    return bundle['model'].predict(features)


features.py

In [3]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin


class FeatureGenerator(BaseEstimator, TransformerMixin):
    """
    Builds the full feature frame by chaining the generators below and
    appending their output to the input columns.

    Hierarchy:
        - SimpleFeatureGenerator
        - GroupAggregatedFeatureGenerator
    """
    def __init__(self, numerical_columns, id_columns=None, target_column=None, categorical_columns=None):
        self.created_features = None
        self.id_columns = id_columns
        self.target_column = target_column
        self.categorical_columns = categorical_columns
        self.numerical_columns = numerical_columns

    def fit_transform(self, df, y=None, **fit_params):
        """Same as ``transform``: the generator learns nothing from the data."""
        return self.transform(df)

    def transform(self, df):
        """Return ``df`` with the hand-written and group-aggregated features appended.

        :param df: input frame; must contain ``numerical_columns`` plus the
            columns the two sub-generators read
        :return: a new frame with the original columns followed by the
            generated ones
        """
        print('FeatureGenerator ...')
        # Hand Written Features
        simple_feature_generator = SimpleFeatureGenerator()
        df_features = pd.concat([df, simple_feature_generator.fit_transform(df)], axis=1)

        # 1-st level: aggregate raw numerical columns and the hand-written
        # features per group. list() keeps this working when the column
        # collections are passed as tuples or pandas Index objects.
        features = list(self.numerical_columns) + list(simple_feature_generator.get_feature_names())
        df_features = pd.concat([
            df_features,
            GroupAggregatedFeatureGenerator(features).fit_transform(df_features),
        ], axis=1)

        if self.created_features is None:
            # Created features are the columns that were NOT in the input.
            original_columns = set(df.columns)
            self.created_features = [col for col in df_features.columns if col not in original_columns]
        return df_features

    def fit(self, x, y=None, **fit_params):
        """No-op, present for the scikit-learn transformer interface."""
        return self

    def get_feature_names(self):
        """Names of the generated columns (``None`` until the first ``transform``)."""
        return self.created_features


class SimpleFeatureGenerator(BaseEstimator, TransformerMixin):
    """
    Row-level hand-written features.

    Based on https://www.kaggle.com/deffro/eda-is-fun

    The only state is ``created_features``, which caches the output column
    names of the first ``transform`` call.
    """
    def __init__(self):
        self.created_features = None

    def fit_transform(self, df, y=None, **fit_params):
        """Same as ``transform``: nothing is learned from the data."""
        return self.transform(df)

    def transform(self, df):
        """Compute the hand-written features for every row of ``df``.

        Reads ``match_id``, ``kills``, ``damage_dealt``, ``heals``, ``boosts``,
        ``walk_distance``, ``ride_distance``, ``swim_distance`` and
        ``num_groups``.

        :param df: input frame
        :return: a new frame, indexed like ``df``, holding only the generated
            columns
        """
        df_features = pd.DataFrame(index=df.index)

        # Number of rows (players) that share the same match.
        players_joined = df.groupby('match_id')['match_id'].transform('count')
        # Scale-up factor for matches with fewer than 100 players.
        match_size_factor = (100 - players_joined) / 100 + 1
        # +1 keeps the per-distance ratios finite for players who never walked.
        walk_plus_one = df['walk_distance'] + 1
        heals_and_boosts = df['heals'] + df['boosts']

        df_features['players_joined'] = players_joined
        df_features['total_distance'] = df['walk_distance'] + df['ride_distance'] + df['swim_distance']
        df_features['kills_norm'] = df['kills'] * match_size_factor
        df_features['damage_dealt_norm'] = df['damage_dealt'] * match_size_factor
        df_features['heals_and_boosts'] = heals_and_boosts
        # fillna is applied to the computed Series and then assigned: calling
        # `df_features[col].fillna(0, inplace=True)` on a column selection
        # does not modify the frame under pandas Copy-on-Write.
        df_features['boosts_per_walk_distance'] = (df['boosts'] / walk_plus_one).fillna(0)
        df_features['heals_per_walk_distance'] = (df['heals'] / walk_plus_one).fillna(0)
        df_features['heals_and_boosts_per_walk_distance'] = (heals_and_boosts / walk_plus_one).fillna(0)
        df_features['kills_per_walk_distance'] = (df['kills'] / walk_plus_one).fillna(0)

        # Bucket by number of groups: > 50 -> 1, 26..50 -> 2, otherwise 4.
        num_groups = df['num_groups']
        team = pd.Series(4, index=df.index)
        team.loc[num_groups > 25] = 2
        team.loc[num_groups > 50] = 1
        df_features['team'] = team

        if self.created_features is None:
            self.created_features = list(df_features.columns)
        return df_features

    def fit(self, x, y=None, **fit_params):
        """No-op, present for the scikit-learn transformer interface."""
        return self

    def get_feature_names(self):
        """Names of the generated columns (``None`` until the first ``transform``)."""
        return self.created_features


class GroupAggregatedFeatureGenerator(BaseEstimator, TransformerMixin):
    """
    Per-group (``match_id``, ``group_id``) mean / max / min of the given
    columns, broadcast back to one row per input row.

    Based on https://www.kaggle.com/anycode/simple-nn-baseline-4
    """
    def __init__(self, features):
        self.created_features = None
        self.features = features

    def fit_transform(self, df, y=None, **fit_params):
        """Same as ``transform``: nothing is learned from the data."""
        return self.transform(df)

    def transform(self, df):
        """Aggregate ``self.features`` per group and align the result with ``df``.

        :param df: input frame containing ``match_id``, ``group_id`` and every
            column listed in ``self.features``
        :return: a frame indexed like ``df`` with columns named
            ``<agg>_group_<feature>`` for ``agg`` in mean, max, min
        """
        group_keys = ['match_id', 'group_id']
        aggregated_frames = []
        # Aggregate by Group
        for agg_type in ('mean', 'max', 'min'):
            df_aggregated = df.groupby(group_keys, as_index=False)[self.features].agg(agg_type)
            df_aggregated = self.restore_row_order(df, df_aggregated, on=group_keys)
            agg_column_names = {col: f'{agg_type}_group_{col}' for col in self.features}
            df_aggregated = df_aggregated.rename(columns=agg_column_names)
            # The merge keys are only needed for the join. Leaving them in
            # would emit match_id/group_id once per aggregate and create
            # duplicate column names after the concat below.
            aggregated_frames.append(df_aggregated.drop(group_keys, axis=1))
            del df_aggregated
            # NOTE: ranking the aggregates per match was tried here and
            # disabled because groupby(...).rank() drops match_id, which made
            # concatenating the ranks back unsafe.
        df_features = pd.concat(aggregated_frames, axis=1)

        current_features = list(df_features.columns)
        if self.created_features is None:
            self.created_features = current_features
        elif self.created_features != current_features:
            # Report columns produced on the first call that are missing now.
            print('Lost features')
            for col in self.created_features:
                if col not in current_features:
                    print(col)
        return df_features

    def fit(self, x, y=None, **fit_params):
        """No-op, present for the scikit-learn transformer interface."""
        return self

    def get_feature_names(self):
        """Names of the generated columns (``None`` until the first ``transform``)."""
        return self.created_features

    def restore_row_order(self, df, df_aggregated, on):
        """
        Broadcast per-key aggregates back to the rows of ``df``.

        Column-wise stacking (FeatureUnion / ``pd.concat(axis=1)``) just puts
        columns side by side, so the result must line up with the index of the
        original dataframe.

        :param df: original frame that defines the row order and index
        :param df_aggregated: one row per key, containing the ``on`` column(s)
        :param on: key column name or list of key column names
        :return: frame with the ``on`` column(s) and the aggregated columns,
            one row per row of ``df``, in the same order and with the same
            index as ``df``
        """
        on_columns = list(on) if isinstance(on, list) else [on]
        # A left merge keeps the left row order, and many_to_one guarantees
        # exactly one output row per input row, so the original index can be
        # re-attached positionally. This does not depend on the index being
        # unnamed or sorted.
        df_features = df[on_columns].reset_index(drop=True).merge(
            df_aggregated,
            how='left',
            on=on_columns,
            validate='many_to_one',
        )
        df_features.index = df.index
        return df_features


preprocessing.py

In [4]:
# Whitelist of columns kept by the Preprocessor's feature-selection step.
SELECTED_FEATURES = [
    # raw per-player columns
    'damage_dealt',
    'dbn_os',
    'kill_place',
    'kills',
    'longest_kill',
    'match_duration',
    'max_place',
    'num_groups',
    'walk_distance',
    # hand-written per-player features
    'kills_norm',
    'damage_dealt_norm',
    'kills_per_walk_distance',
    # group means
    'mean_group_boosts',
    'mean_group_damage_dealt',
    'mean_group_dbn_os',
    'mean_group_kill_place',
    'mean_group_kills',
    'mean_group_kill_streaks',
    'mean_group_longest_kill',
    'mean_group_match_duration',
    'mean_group_max_place',
    'mean_group_num_groups',
    'mean_group_walk_distance',
    'mean_group_total_distance',
    'mean_group_kills_norm',
    'mean_group_kills_per_walk_distance',
    # group maxima
    'max_group_damage_dealt',
    'max_group_dbn_os',
    'max_group_kill_place',
    'max_group_kill_streaks',
    'max_group_longest_kill',
    'max_group_match_duration',
    'max_group_max_place',
    'max_group_num_groups',
    'max_group_walk_distance',
    'max_group_kills_norm',
    'max_group_damage_dealt_norm',
    'max_group_kills_per_walk_distance',
    # group minima
    'min_group_dbn_os',
    'min_group_kill_place',
    'min_group_kills',
    'min_group_kill_streaks',
    'min_group_longest_kill',
    'min_group_match_duration',
    'min_group_max_place',
    'min_group_num_groups',
    'min_group_walk_distance',
    'min_group_kills_norm',
    'min_group_damage_dealt_norm',
    'min_group_kills_per_walk_distance',
]

In [5]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler


class Preprocessor(BaseEstimator, TransformerMixin):
    """
    Turns a feature frame into the model matrix: drops id / target /
    categorical columns, fills missing values with 0, keeps only the columns
    listed in ``SELECTED_FEATURES`` and min-max scales them.

    The scaler and the column list are learned in ``fit`` / ``fit_transform``
    and re-used unchanged by ``transform``.
    """
    def __init__(self, numerical_columns, id_columns=None, target_column=None, categorical_columns=None):
        self.features = None
        self.id_columns = id_columns
        self.target_column = target_column
        self.categorical_columns = categorical_columns
        self.numerical_columns = numerical_columns

        self.imputer = None
        self.scaler = None

    def _select_features(self, df):
        """Drop excluded columns, fill NaNs with 0, keep selected columns, cast to float64."""
        # The constructor defaults are None, so each collection is optional.
        excluded = set(self.id_columns or [])
        excluded.update(self.categorical_columns or [])
        if self.target_column is not None:
            excluded.add(self.target_column)
        # A positional boolean mask also behaves correctly when the incoming
        # frame carries duplicate column names.
        keep = [col not in excluded and col in SELECTED_FEATURES for col in df.columns]
        return df.loc[:, keep].fillna(0).astype(np.float64)

    def fit_transform(self, df, y=None, **fit_params):
        """Learn the column list and scaling from ``df`` and return the scaled frame.

        :param df: feature frame of the training data
        :return: DataFrame of scaled features with a fresh 0..n-1 index
        """
        print('Preprocessor ...')
        x = self._select_features(df)
        self.features = x.columns
        # Normalize
        self.scaler = MinMaxScaler()
        return pd.DataFrame(self.scaler.fit_transform(x), columns=list(self.features))

    def transform(self, df):
        """Apply the column selection and scaling learned during fitting.

        :param df: feature frame (validation / test data)
        :return: DataFrame of scaled features with a fresh 0..n-1 index and
            the same columns, in the same order, as at fit time
        :raises sklearn.exceptions.NotFittedError: if called before fitting
        :raises ValueError: if a column seen at fit time is missing from ``df``
        """
        print('Preprocessor ...')
        if self.scaler is None or self.features is None:
            # sklearn's NotFittedError subclasses both ValueError and
            # AttributeError, so code that caught the former AttributeError
            # keeps working.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError('Preprocessor is not fitted yet; call fit_transform first.')

        x = self._select_features(df)
        fitted_columns = list(self.features)
        missing = [col for col in fitted_columns if col not in x.columns]
        if missing:
            raise ValueError(f'Columns seen during fit are missing: {missing}')
        # Align to the fit-time column order so values and labels match.
        x = x[fitted_columns]
        # Normalize with the statistics learned on the training data. Calling
        # fit_transform here would refit the scaler on validation/test data
        # and scale it differently from the training set.
        return pd.DataFrame(self.scaler.transform(x), columns=fitted_columns)

    def fit(self, x, y=None, **fit_params):
        """Learn the column list and scaling from ``x``; returns ``self``."""
        self.fit_transform(x, y, **fit_params)
        return self

    def get_feature_names(self):
        """Columns of the model matrix (``None`` until fitted)."""
        return self.features


pipeline.py

In [6]:
# from sklearn.base import BaseEstimator, TransformerMixin

# from features import FeatureGenerator
# from preprocessing import Preprocessor


class NotFittedError(Exception):
    """Raised when a transformer is asked to transform before it has been fitted."""


class Pipeline(BaseEstimator, TransformerMixin):
    """
    End-to-end data preparation: ``FeatureGenerator`` followed by
    ``Preprocessor``. Both steps are created and fitted in ``fit`` /
    ``fit_transform`` and re-used by ``transform``.
    """
    def __init__(self, numerical_columns, id_columns=None, target_column=None, categorical_columns=None):
        self.created_features = None
        self.id_columns = id_columns
        self.target_column = target_column
        self.categorical_columns = categorical_columns
        self.numerical_columns = numerical_columns

        self.feature_generator = None
        self.preprocessor = None

    def fit_transform(self, df, y=None, **fit_params):
        """Fit both steps on ``df`` and return the resulting model matrix.

        :param df: raw training frame
        :return: the frame returned by ``Preprocessor.fit_transform``
        """
        print('Transforming ...')
        column_roles = dict(
            id_columns=self.id_columns,
            numerical_columns=self.numerical_columns,
            categorical_columns=self.categorical_columns,
            target_column=self.target_column,
        )
        feature_generator = FeatureGenerator(**column_roles)
        df_features = feature_generator.fit_transform(df)

        preprocessor = Preprocessor(**column_roles)
        x = preprocessor.fit_transform(df_features)

        # Publish the fitted steps only after both succeeded, so a failed fit
        # never leaves the pipeline half-initialised.
        self.feature_generator = feature_generator
        self.preprocessor = preprocessor
        # Record the output columns so get_feature_names() reports them.
        self.created_features = list(x.columns)
        return x

    def transform(self, df):
        """Run the already-fitted steps on ``df``.

        :param df: raw frame (validation / test data)
        :return: the frame returned by ``Preprocessor.transform``
        :raises NotFittedError: if the pipeline has not been fitted
        """
        print('Transforming ...')
        if self.feature_generator is None:
            raise NotFittedError(f'feature_generator = {self.feature_generator}')
        if self.preprocessor is None:
            raise NotFittedError(f'preprocessor = {self.preprocessor}')

        df_features = self.feature_generator.transform(df)
        x = self.preprocessor.transform(df_features)
        return x

    def fit(self, x, y=None, **fit_params):
        """Fit both steps on ``x`` and return ``self``."""
        self.fit_transform(x, y, **fit_params)
        return self

    def get_feature_names(self):
        """Columns of the model matrix (``None`` until fitted)."""
        return self.created_features



In [7]:
import re
import pickle
from IPython.display import display

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer

KernelsFeaturesWithSelection.ipynb

### Read Data

In [8]:
# For quick iteration, read a sample instead:
# df = pd.read_csv('../input/train_V2.csv', nrows=100000)
df = pd.read_csv('../input/train_V2.csv')
# Normalise the camelCase competition headers to snake_case.
df.columns = df.columns.map(camelcase_to_underscore)
display(df.head(), df.shape, df.columns.tolist())

Unnamed: 0,id,group_id,match_id,assists,boosts,damage_dealt,dbn_os,headshot_kills,heals,kill_place,kill_points,kills,kill_streaks,longest_kill,match_duration,match_type,max_place,num_groups,rank_points,revives,ride_distance,road_kills,swim_distance,team_kills,vehicle_destroys,walk_distance,weapons_acquired,win_points,win_place_perc
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,1241,0,0,0.0,1306,squad-fpp,28,26,-1,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,0,0,0,0.0,1777,squad-fpp,26,25,1484,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,47,0,0,0,0.0,1318,duo,50,47,1491,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.9,0,0,0,75,0,0,0,0.0,1436,squad-fpp,31,30,1408,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,45,0,1,1,58.53,1424,solo-fpp,97,95,1560,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


(4446966, 29)

['id',
 'group_id',
 'match_id',
 'assists',
 'boosts',
 'damage_dealt',
 'dbn_os',
 'headshot_kills',
 'heals',
 'kill_place',
 'kill_points',
 'kills',
 'kill_streaks',
 'longest_kill',
 'match_duration',
 'match_type',
 'max_place',
 'num_groups',
 'rank_points',
 'revives',
 'ride_distance',
 'road_kills',
 'swim_distance',
 'team_kills',
 'vehicle_destroys',
 'walk_distance',
 'weapons_acquired',
 'win_points',
 'win_place_perc']

### Drop NaN Target

In [9]:
# Rows without a target cannot be used for training or validation.
# Note that this leaves gaps in the index, so later cells must select rows
# by position rather than by label.
df.dropna(subset=['win_place_perc'], inplace=True)

### Select Numerical Features

In [10]:
id_features = ['id', 'group_id', 'match_id']
categorical_features = ['match_type']
target_feature = 'win_place_perc'

# Base features are all remaining columns: not an id, not categorical, not the target.
non_feature_columns = set(id_features) | set(categorical_features) | {target_feature}
base_features = [col for col in df.columns if col not in non_feature_columns]

### Train 1

In [12]:
%%time
import lightgbm as lgb
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

kfold = KFold(n_splits = 2)
kfold.get_n_splits(df)
log = []
for train_index, valid_index in kfold.split(df):
    step = dict()
    pipeline = Pipeline(
        id_columns=id_features, 
        numerical_columns=base_features,
        categorical_columns=categorical_features,
        target_column=target_feature,
    )
    x_train = pipeline.fit_transform(df.loc[train_index, :])
    y_train = df.loc[train_index, target_feature]
    y_train.fillna(0, inplace=True) 
    x_valid = pipeline.transform(df.loc[valid_index, :])
    y_valid = df.loc[valid_index, target_feature]
    
    print('Fitting ...')
    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_eval = lgb.Dataset(x_valid, y_valid, reference=lgb_train)
    lgbm_params = dict(
        objective='regression',
        metric='mae',
        n_jobs=-1,
        verbose=2,
        learning_rate=0.1,
        n_estimators=2000,
    )
    model = lgb.train(
        lgbm_params, 
        lgb_train, 
        valid_sets=lgb_eval, 
        early_stopping_rounds=20,
    )
#     model.fit(x_train, y_train)
    step['train_score'] = mean_absolute_error(y_train, model.predict(x_train))
#     del x_train, y_train
    
    step['valid_score'] = mean_absolute_error(y_valid, model.predict(x_valid))
    step['model'] = model
    step['pipeline'] = pipeline
    step['train_index'] = train_index
    step['valid_index'] = valid_index
    try:
        save_model(step)
    except Exception:
        print("Warning: Couldn't save the model")
    print(step['train_score'], step['valid_score'])
    log.append(step)
    break

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Transforming ...
FeatureGenerator ...
Preprocessor ...


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]


Transforming ...
FeatureGenerator ...
Preprocessor ...
Fitting ...




[1]	valid_0's l1: 0.472828
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's l1: 0.472828
[3]	valid_0's l1: 0.472828
[4]	valid_0's l1: 0.472828
[5]	valid_0's l1: 0.472828
[6]	valid_0's l1: 0.472828
[7]	valid_0's l1: 0.472828
[8]	valid_0's l1: 0.472828
[9]	valid_0's l1: 0.472828
[10]	valid_0's l1: 0.472828
[11]	valid_0's l1: 0.472828
[12]	valid_0's l1: 0.472828
[13]	valid_0's l1: 0.472828
[14]	valid_0's l1: 0.472828
[15]	valid_0's l1: 0.472828
[16]	valid_0's l1: 0.472828
[17]	valid_0's l1: 0.472828
[18]	valid_0's l1: 0.472828
[19]	valid_0's l1: 0.472828
[20]	valid_0's l1: 0.472828
[21]	valid_0's l1: 0.472828
Early stopping, best iteration is:
[1]	valid_0's l1: 0.472828


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [38]:
# lgb_train = lgb.Dataset(x_train, y_train)
# lgb_eval = lgb.Dataset(x_valid, y_valid, reference=lgb_train)
# lgbm_params = dict(
#     objective='regression',
#     metric='mae',
#     n_jobs=-1,
#     verbose=2,
#     learning_rate=0.1,
#     n_estimators=2000,
# )
# model = lgb.train(
#         lgbm_params, 
#         lgb_train, 
#         valid_sets=lgb_eval, 
#         early_stopping_rounds=20,
#     )



[1]	valid_0's l1: 0.246634
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's l1: 0.228223
[3]	valid_0's l1: 0.212114
[4]	valid_0's l1: 0.196795
[5]	valid_0's l1: 0.183793
[6]	valid_0's l1: 0.172232
[7]	valid_0's l1: 0.161213
[8]	valid_0's l1: 0.151427
[9]	valid_0's l1: 0.143032
[10]	valid_0's l1: 0.135116
[11]	valid_0's l1: 0.128153
[12]	valid_0's l1: 0.121624
[13]	valid_0's l1: 0.116468
[14]	valid_0's l1: 0.111306
[15]	valid_0's l1: 0.10656
[16]	valid_0's l1: 0.102766
[17]	valid_0's l1: 0.0988835
[18]	valid_0's l1: 0.0958594
[19]	valid_0's l1: 0.0931965
[20]	valid_0's l1: 0.0909657
[21]	valid_0's l1: 0.0890002
[22]	valid_0's l1: 0.0874118
[23]	valid_0's l1: 0.0859108
[24]	valid_0's l1: 0.0843731
[25]	valid_0's l1: 0.0831124
[26]	valid_0's l1: 0.0817853
[27]	valid_0's l1: 0.0807551
[28]	valid_0's l1: 0.0798906
[29]	valid_0's l1: 0.0790306
[30]	valid_0's l1: 0.0784385
[31]	valid_0's l1: 0.077812
[32]	valid_0's l1: 0.0769843
[33]	valid_0's l1: 0.0763544
[34]	val

[277]	valid_0's l1: 0.0635096
[278]	valid_0's l1: 0.063507
[279]	valid_0's l1: 0.0635007
[280]	valid_0's l1: 0.0634964
[281]	valid_0's l1: 0.0634852
[282]	valid_0's l1: 0.0634626
[283]	valid_0's l1: 0.063457
[284]	valid_0's l1: 0.0634538
[285]	valid_0's l1: 0.0634407
[286]	valid_0's l1: 0.0634404
[287]	valid_0's l1: 0.0634303
[288]	valid_0's l1: 0.0634273
[289]	valid_0's l1: 0.0634189
[290]	valid_0's l1: 0.0634187
[291]	valid_0's l1: 0.0634216
[292]	valid_0's l1: 0.0634172
[293]	valid_0's l1: 0.0634101
[294]	valid_0's l1: 0.0633905
[295]	valid_0's l1: 0.0633892
[296]	valid_0's l1: 0.0633706
[297]	valid_0's l1: 0.0633057
[298]	valid_0's l1: 0.0632576
[299]	valid_0's l1: 0.0632535
[300]	valid_0's l1: 0.0632501
[301]	valid_0's l1: 0.0632484
[302]	valid_0's l1: 0.0632356
[303]	valid_0's l1: 0.0632295
[304]	valid_0's l1: 0.0632167
[305]	valid_0's l1: 0.0632188
[306]	valid_0's l1: 0.0632059
[307]	valid_0's l1: 0.0631845
[308]	valid_0's l1: 0.0631836
[309]	valid_0's l1: 0.063177
[310]	valid_0

### Submission

In [46]:
df_test = pd.read_csv('../input/test_V2.csv')
# Same header normalisation as for the training data.
df_test.columns = df_test.columns.map(camelcase_to_underscore)

In [40]:
# Apply the pipeline that was fitted in the training cell to the test set.
# NOTE(review): relies on the `pipeline` variable left behind by the training loop.
x_test = pipeline.transform(df_test)

Transforming ...
FeatureGenerator ...
Preprocessor ...


In [41]:
# Raw per-row predictions from the `model` trained in the training cell;
# they are written as-is and also post-processed per group further below.
pred = model.predict(x_test)

In [50]:
# Baseline submission: the sample file with the raw predictions filled in.
df_sub = pd.read_csv('../input/sample_submission_V2.csv').assign(winPlacePerc=pred)
df_sub.to_csv('lgb_submission.csv', index=False)

In [57]:
# Restore some columns
df_sub = pd.read_csv('../input/sample_submission_V2.csv')
df_sub['winPlacePerc'] = pred
df_sub['id'] = df_sub['Id']
df_sub = df_sub.merge(df_test[["id", "match_id", "group_id", "max_place", "num_groups"]], on="id", how="left")

# Sort, rank, and assign adjusted ratio
df_sub_group = df_sub.groupby(["match_id", "group_id"]).first().reset_index()
df_sub_group["rank"] = df_sub_group.groupby(["match_id"])["winPlacePerc"].rank()
df_sub_group = df_sub_group.merge(
    df_sub_group.groupby("match_id")["rank"].max().to_frame("max_rank").reset_index(), 
    on="match_id", how="left")
df_sub_group["adjusted_perc"] = (df_sub_group["rank"] - 1) / (df_sub_group["num_groups"] - 1)

df_sub = df_sub.merge(df_sub_group[["adjusted_perc", "match_id", "group_id"]], on=["match_id", "group_id"], how="left")
df_sub["winPlacePerc"] = df_sub["adjusted_perc"]

# Deal with edge cases
df_sub.loc[df_sub['max_place'] == 0, "winPlacePerc"] = 0
df_sub.loc[df_sub['max_place'] == 1, "winPlacePerc"] = 1

# Align with maxPlace
# Credit: https://www.kaggle.com/anycode/simple-nn-baseline-4
subset = df_sub.loc[df_sub['max_place'] > 1]
gap = 1.0 / (subset['max_place'].values - 1)
new_perc = np.around(subset['winPlacePerc'].values / gap) * gap
df_sub.loc[df_sub['max_place'] > 1, "winPlacePerc"] = new_perc

# Edge case
df_sub.loc[(df_sub['max_place'] > 1) & (df_sub['num_groups'] == 1), "winPlacePerc"] = 0
assert df_sub["winPlacePerc"].isnull().sum() == 0

df_sub[["Id", "winPlacePerc"]].to_csv("submission_adjusted.csv", index=False)