In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import gc
import os
import itertools

sns.set_style('darkgrid')
sns.set_palette('bone')

#pd.options.display.float_format = '{:.5g}'.format
pd.options.display.float_format = '{:,.3f}'.format

print(os.listdir("./data"))

['.DS_Store', 'test_V2.csv', 'train_V2.csv', '.ipynb_checkpoints']


In [2]:
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import minmax_scale
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [3]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                #if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                #    df[col] = df[col].astype(np.float16)
                #el
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        #else:
            #df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB --> {:.2f} MB (Decreased by {:.1f}%)'.format(
        start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [4]:
def load_data():
    train = pd.read_csv('./data/train_V2.csv')
    train = reduce_mem_usage(train)
    test = pd.read_csv('./data/test_V2.csv')
    test = reduce_mem_usage(test)
    # dropna
    null_cnt = train.isnull().sum().sort_values()
    print('null count:', null_cnt[null_cnt > 0])
    train.dropna(inplace=True)
    return train ,test
#train,test = load_data()

In [5]:
def toTapleList(list1,list2):
    return list(itertools.product(list1,list2))

In [6]:
def print_divide_line():
    print('-' * 50)

In [7]:
def fillInf(df, val):
    numcols = df.select_dtypes(include='number').columns
    cols = numcols[numcols != 'winPlacePerc']
    df[df == np.Inf] = np.NaN
    df[df == np.NINF] = np.NaN
    for c in cols: 
        df[c].fillna(val, inplace=True)

## DataAnalysis

In [8]:
def eda_id(X):
    print_divide_line()
    print('eda Id, groupId, matchId')
    
    # id groupId matchId
    for c in ['Id','groupId','matchId']:
        print(f'unique [{c}] count:', X[c].nunique())
        
    print_divide_line()
    gc.collect()

In [9]:
def eda_matchtype(X):
    print_divide_line()
    print('eda matchType')
    
    # 3 game type , solo(1), duo(2), squad(4)
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    X.groupby('matchId')['matchType'].first().value_counts().plot.bar(ax=ax[0])
    
    '''
    solo  <-- solo,solo-fpp,normal-solo,normal-solo-fpp
    duo   <-- duo,duo-fpp,normal-duo,normal-duo-fpp,crashfpp,crashtpp
    squad <-- squad,squad-fpp,normal-squad,normal-squad-fpp,flarefpp,flaretpp
    '''
    mapper = lambda x: 'solo' if ('solo' in x) else 'duo' if ('duo' in x) or ('crash' in x) else 'squad'
    X['matchType'] = X['matchType'].apply(mapper)
    X.groupby('matchId')['matchType'].first().value_counts().plot.bar(ax=ax[1])
    print_divide_line()
    
    gc.collect()

In [10]:
def eda_maxPlace_groups(X):
    print_divide_line()
    print('eda maxPlace')
    
    for q in ['numGroups == maxPlace','numGroups > maxPlace','numGroups < maxPlace']:
        print(q, ':', len(X.query(q)))
        
    print_divide_line()
    print('eda numGroups matchType maxPlace')
    
    # describe
    cols = ['numGroups','maxPlace']
    # 第一个组是 
    desc1 = X.groupby('matchType')[cols].describe()[toTapleList(cols,['min','mean','max'])]
    # groups in match
    group = X.groupby(['matchType','matchId','groupId','numGroups','maxPlace']).count().groupby(['matchType','matchId']).size().to_frame('groups in match')
    desc2 = group.groupby('matchType').describe()[toTapleList(['groups in match'],['min','mean','max'])]

    display(pd.concat([desc1, desc2], axis=1))
    
    print_divide_line()
    gc.collect()

In [11]:
def eda_players(X):
    print_divide_line()
    print('eda players')
    match = X.groupby(['matchType','matchId']).size().to_frame('players in match') # 每场比赛中玩家的人数
    group = X.groupby(['matchType','matchId','groupId']).size().to_frame('players in group') # 每场队伍的人数
    display(pd.concat([match.groupby('matchType').describe()[toTapleList(['players in match'],['min','mean','max'])], 
           group.groupby('matchType').describe()[toTapleList(['players in group'],['min','mean','max'])]], axis=1))
    print('players in match, group 100 players join the same server,  so in the case of duos the max teams are 50 and in the case of squads the max teams are 25.')
    print_divide_line()
    
    print(group['players in group'].nlargest(5))
    del match,group
    
    print_divide_line()  
          
    subset = X[X['matchId']=='41a634f62f86b7']
    sub_grp = subset[subset['groupId']=='128b07271aa012']

    print('matchId==\'41a634f62f86b7\' & groupId==\'128b07271aa012\'')
    print('-'*50)
    print('players:',len(subset)) # 该比赛中玩家的数量
    print('groups:',subset['groupId'].nunique()) # 该比赛中队伍的数量
    print('numGroups:',subset['numGroups'].unique()) # 该比赛中对应的 numGroups
    print('maxPlace:',subset['maxPlace'].unique()) # 该比赛中对应的 maxPlace
    print('-'*50)
    print('max-group players:',len(sub_grp)) # group players
    print('max-group winPlacePerc:',sub_grp['winPlacePerc'].unique()) # group winPlacePerc
    print('-'*50)
    print('winPlacePerc:',subset['winPlacePerc'].sort_values().unique()) # match winPlacePerc
          
    print_divide_line()
    
    group = X.groupby(['matchId','groupId','matchType'])['Id'].count().to_frame('players').reset_index()
    group.loc[group['players'] > 4, 'players'] = '5+'
    group['players'] = group['players'].astype(str)

    fig, ax = plt.subplots(1, 3, figsize=(16, 4))
    for mt, ax in zip(['solo','duo','squad'], ax.ravel()):
        ax.set_xlabel(mt)
        group[group['matchType'] == mt]['players'].value_counts().sort_index().plot.bar(ax=ax)
        gc.collect()
        
    gc.collect()

In [12]:
def eda_matchduration(X):
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    # there are two types of maps?
    X['matchDuration'].hist(bins=50, ax=ax[0])
    X.query('matchDuration >= 1400 & matchDuration <= 1800')['matchDuration'].hist(bins=50, ax=ax[1])
    
    display(X[X['matchDuration'] == X['matchDuration'].min()].head())
    display(X[X['matchDuration'] == X['matchDuration'].max()].head())
    display(' If there is same match Duration : ',(X.groupby('matchId')['matchDuration'].nunique() > 1).any())
    
    gc.collect()

In [13]:
def eda_healsboosts(X):
    print_divide_line()
    print('eda heals and boosts : ')
    
    fig, ax = plt.subplots(2, 2, figsize=(16, 12))
    cols = ['boosts','heals']
    for col, ax in zip(cols, ax):
        sub = X[['winPlacePerc',col]].copy()
        mv = (sub[col].max() // 5) + 1
        sub[col] = pd.cut(sub[col], [5 * x for x in range(0 , mv)], right = False)
        sub.groupby(col).mean()['winPlacePerc'].plot.bar(ax=ax[0])
        X[col].hist(bins=20, ax=ax[1])
        ax[0].set_title(col)
        ax[1].set_title(col)
    
    print_divide_line()
    
    gc.collect()

In [14]:
def eda_revives(X):
    print_divide_line()
    print('eda revives : ')
    print('solo player has revives:', 'solo' in X.query('revives > 0')['matchType'].unique())
    
    fig, ax = plt.subplots(1 , 2 , figsize = (16 , 4))
    col = 'revives'
    sub = X.loc[~X['matchType'].str.contains('solo') , ['winPlacePerc' , col]].copy()
    sub[col] = pd.cut(sub[col], [5 * x for x in range(0 , 8)], right = False)
    sub.groupby(col).mean()['winPlacePerc'].plot.bar(ax=ax[0])
    X[col].hist(bins=20, ax=ax[1])
    
    for a in ax:
        a.set_title(col)

    print_divide_line()
    
    gc.collect()

In [15]:
def eda_killPlace(X):
    print_divide_line()
    print('eda killPlace')
    display(X.groupby(['matchType'])['killPlace'].describe()[['min','max','mean']])

    plt.figure(figsize = (8 , 4))
    col = 'killPlace'
    sub = X[['winPlacePerc' , col]].copy()
    sub[col] = pd.cut(sub[col], [10 * x for x in range(0 , 11)] , right = False)
    sub.groupby(col).mean()['winPlacePerc'].plot.bar()
    
    print('killPlace is a sorted ranking of kills and winPlacePerc in each match')
    subMatch = X[X['matchId'] == X['matchId'].min()].sort_values(['winPlacePerc' , 'killPlace'])
    cols = ['groupId' , 'kills' , 'winPlacePerc' , 'killPlace']
    
    display(subMatch[cols])
    print('kill place maybe is the most important feature')
    print_divide_line()
    
    gc.collect()

In [16]:
def eda_kills(X):
    print_divide_line()
    print('eda kills : ')
    fig, ax = plt.subplots(1, 2, figsize=(16, 4))

    col = 'kills'
    sub = X[['winPlacePerc' , col]].copy()
    sub[col] = pd.cut(sub[col], [5 * x for x in range(0 , 20)], right=False)
    sub.groupby(col).mean()['winPlacePerc'].plot.bar(ax=ax[0])
    X[X['kills'] < 20][col].hist(bins=20, ax=ax[1])
    print_divide_line()
    
    print('kills eda by matchType')
    sub = X['matchType'].str.contains('solo')
    gpmt = pd.concat([
            X.loc[sub].groupby('matchId')['kills'].sum().describe()# solo的总击杀数
            , X.loc[~sub].groupby('matchId')['kills'].sum().describe() # 非solo的总击杀数
        ]
            , keys=['solo','team'], axis=1).T
    display(gpmt)

    gc.collect()

In [17]:
def eda_killStreaks_dbnos(X):
    
    print_divide_line()
    print('eda killStreaks and dbnos : ')
    
    fig, ax = plt.subplots(2, 2, figsize=(16, 12))

    cols = ['killStreaks' , 'DBNOs']
    for col, ax in zip(cols , ax):
        sub = X[['winPlacePerc' , col]].copy()
        sub[col] = pd.cut(sub[col], 6)
        sub.groupby(col).mean()['winPlacePerc'].plot.bar(ax=ax[0])
        X[col].hist(bins=20, ax=ax[1])
        for a in ax :
            a.set_title(col)

    print_divide_line()
    
    gc.collect()

In [18]:
def eda_roadheadshotteamKills(X):
    print_divide_line()
    print('eda roadkills headshotkills teamkills')
    
    fig, ax = plt.subplots(3, 2, figsize=(16, 16))
    cols = ['headshotKills' , 'roadKills' , 'teamKills']
    for col, ax in zip(cols , ax):
        sub = X[['winPlacePerc' , col]].copy()
        sub.loc[sub[col] >= 5 , col] = '5+'  
        sub[col] = sub[col].astype(str)
        sub.groupby(col).mean()['winPlacePerc'].plot.bar(ax = ax[0])
        X[col].hist(bins=20 , ax=ax[1])
        for a in ax:
            a.set_title(col)

    print_divide_line()
    
    gc.collect()

In [19]:
def eda_assists(X):
    fig , ax = plt.subplots(1, 2, figsize=(16, 4))
    col = 'assists'
    sub = X[['winPlacePerc' , col]].copy()
    sub.loc[sub[col] >= 5 , col] = '5+'
    sub[col] = sub[col].astype(str)
    sub.groupby(col).mean()['winPlacePerc'].plot.bar(ax = ax[0])
    X[col].hist(bins=20, ax=ax[1])
    for a in ax:
        a.set_title(col)
    
    df = pd.concat([
        X[X['matchType'] == 'solo'].describe()['assists'] ,
        X[X['matchType'] != 'solo'].describe()['assists']
    ] , keys=['solo','team'] , axis = 1).T

    display(df)
    
    gc.collect()

In [20]:
def eda_longestkill(X):
    print_divide_line()
    print('eda longestkills : ')
    
    fig, ax = plt.subplots(1, 2, figsize=(16, 4))
    col = 'longestKill'
    sub = X[['winPlacePerc' , col]].copy()
    sub[col] = pd.cut(sub[col], 6)
    sub.groupby(col).mean()['winPlacePerc'].plot.bar(ax=ax[0])
    X[col].hist(bins=20, ax=ax[1])
    
    print_divide_line()
    gc.collect()

In [21]:
def eda_damagedealt(X):
    print_divide_line()
    print('eda damagedealt : ')
    fig, ax = plt.subplots(1, 2, figsize=(16, 4))

    col = 'damageDealt'
    sub = X[['winPlacePerc',col]].copy()
    sub[col] = pd.cut(sub[col], 6)
    sub.groupby(col).mean()['winPlacePerc'].plot.bar(ax=ax[0])
    X[col].hist(bins=20, ax=ax[1])
    print_divide_line()

    cheater = X.query('damageDealt == 0 & (kills > 0 | DBNOs > 0)')[
    ['damageDealt','kills','DBNOs','headshotKills','roadKills','teamKills']].head(20)
    print(' cheater display damage 0 with kills > 0 DBNOS > 0')
    display(cheater)
    gc.collect()

In [22]:
def eda_distance(X):
    print_divide_line()
    print('eda distance : ')

    print_divide_line()
    print('eda zombie : ')
    # zombie ?
    sub = X.query('walkDistance == 0 & kills == 0 & weaponsAcquired == 0 & \'solo\' in matchType')
    print('count:', len(sub), ' winPlacePerc:', round(sub['winPlacePerc'].mean(),3))
    print_divide_line()
    
    print_divide_line()
    print('eda cheater : ')
    # cheater ?
    sq = 'kills > 3 & (headshotKills / kills) >= 0.8'
    sub = X.query(sq)
    print(sq, '\n count:', len(sub), ' winPlacePerc:', round(sub['winPlacePerc'].mean(),3))
    print_divide_line()
    
    # 分析 distance 与winperc的关系
    sub = X[['walkDistance','rideDistance','swimDistance','winPlacePerc']].copy()
    walk = X['walkDistance']
    sub['walkDistanceBin'] = pd.cut(walk, 
                                    [0, 0.001, walk.quantile(.25), walk.quantile(.5), walk.quantile(.75), 99999])
    # 开车和没开车，游泳和没游泳
    sub['rideDistanceBin'] = (X['rideDistance'] > 0).astype(int)
    sub['swimDistanceBin'] = (X['swimDistance'] > 0).astype(int)

    fig, ax = plt.subplots(1, 3, figsize=(16, 3), sharey=True)
    sub.groupby('walkDistanceBin').mean()['winPlacePerc'].plot.bar(ax=ax[0])
    sub.groupby('rideDistanceBin').mean()['winPlacePerc'].plot.bar(ax=ax[1])
    sub.groupby('swimDistanceBin').mean()['winPlacePerc'].plot.bar(ax=ax[2])
    ax[0].set_title('walk distance bin')
    ax[1].set_title('ride distance bin')
    ax[2].set_title('swim distance bin')

    del sub, walk

    # distance 本身的分布
    fig, ax = plt.subplots(3, 2 , figsize=(16, 12))
    cols = ['walkDistance', 'rideDistance', 'swimDistance']
    for col, ax in zip(cols, ax):
        sub = X[['winPlacePerc' , col]].copy()
        sub[col] = pd.cut(sub[col], 6)
        sub.groupby(col).mean()['winPlacePerc'].plot.bar(ax=ax[0])
        X[col].hist(bins=20, ax=ax[1])
        for a in ax:
            a.set_title(col)
            
    gc.collect()

In [23]:
def eda_points(X):
    print_divide_line()
    print('eda points : ')
    fig, ax = plt.subplots(1, 3, figsize = (16, 4), sharey = True)
    cols = ['killPoints' , 'rankPoints' , 'winPoints']
    for col, a in zip(cols, ax.ravel()): 
        X.plot.scatter(x = col, y = 'winPlacePerc', ax=a)
    
    # rankPoint: being deprecated
    # killPoints,winPoints: If there is a value other than -1 in rankPoints, then any 0 should be treated as a “None”.
    sign = lambda x: 'p<=0' if x <= 0 else 'p>0'
    df = pd.concat([
        pd.crosstab(X['rankPoints'].apply(sign) , X['winPoints'].apply(sign) , margins=False),
        pd.crosstab(X['rankPoints'].apply(sign) , X['killPoints'].apply(sign) , margins=False)
    ], keys=['winPoints','killPoints'], axis=1)
    display(df)
    
    gc.collect()

In [24]:
def eda_winPlacePerc(X):
    print_divide_line()
    # win place describe
    print('eda win place perc : ')
    display(X['winPlacePerc'].describe())
    print_divide_line()
    
    # 吃鸡和反向吃鸡的
    print_divide_line()
    print('winPlacePerc 1 and 0 : ')
    df = pd.concat([X[X['winPlacePerc'] == 1].head(5),
           X[X['winPlacePerc'] == 0].head(5)],
          keys=['winPlacePerc_1', 'winPlacePerc_0'])
    display(df)
    print_divide_line()
    
    print_divide_line()
    print('match count:', X['matchId'].nunique()) # 比赛 数目
    # not contains 1st place
    maxPlacePerc = X.groupby('matchId')['winPlacePerc'].max()
    print('match [not contains 1st place]:', len(maxPlacePerc[maxPlacePerc != 1])) # 每场比赛中不吃鸡的人
    del maxPlacePerc

    # edge case
    sub = X[(X['maxPlace'] > 1) & (X['numGroups'] == 1)]
    print('match [maxPlace>1 & numGroups==1]:' , len(sub.groupby('matchId')))
    print(' - unique winPlacePerc:', sub['winPlacePerc'].unique())
    print_divide_line()
    
    gc.collect()

In [25]:
def eda_matchsummary(X):
    print_divide_line()
    print('eda matchsummary : ')
    cols = [
        'kills','teamKills','DBNOs','revives','assists','boosts','heals'
        ,'damageDealt','walkDistance','rideDistance','swimDistance','weaponsAcquired'
    ]

    aggs = ['count','min','mean','max']
    # summary of solo-match
    grp = X.loc[X['matchType'].str.contains('solo')].groupby('matchId')
    grpSolo = grp[cols].sum()
    # summary of team-match
    grp = X.loc[~X['matchType'].str.contains('solo')].groupby('matchId')
    grpTeam = grp[cols].sum()
    soloMatchCount = X.loc[X['matchType'].str.contains('solo')]['matchId'].nunique()
    teamMatchCount = X.loc[~X['matchType'].str.contains('solo')]['matchId'].nunique()
    print('matchCount : ' , X['matchId'].nunique())
    print('solo matchCount : ', soloMatchCount)
    print('team matchCount : ', teamMatchCount)
    print('total match Count = ', soloMatchCount+teamMatchCount)

    df = pd.concat([grpSolo.describe().T[aggs], grpTeam.describe().T[aggs]], keys=['solo', 'team'], axis=1)
    display(df)
    
    gc.collect()

In [26]:
def eda_groupSummary(X):
    print_divide_line()
    
    cols = [
        'kills','teamKills','DBNOs','revives','assists','boosts','heals','damageDealt',
        'walkDistance','rideDistance','swimDistance','weaponsAcquired']
    cols.extend(['killPlace','winPlacePerc'])
    group = X.groupby(['matchId','groupId'])[cols]

    fig, ax = plt.subplots(3, 1, figsize=(8, 24), sharey=True)
    for df, ax, title in zip([group.mean(), group.min(), group.max()], ax.ravel(),['mean','min','max']):
        sns.heatmap(df.corr(), annot=True, linewidths=.6
                    , fmt='.2f', vmax=1, vmin=-1, center=0, cmap='Blues', ax=ax)
        ax.set_title(title)
        del df
    
    print_divide_line()
    
    gc.collect()

In [27]:
def eda_matchStats(X):
    
    def printMatchStats(matchIds):
        for mid in matchIds:
            subMatch = X[X['matchId'] == mid]
            print('matchType:', subMatch['matchType'].values[0])

            grp1st = subMatch[subMatch['winPlacePerc'] == 1]
            grpOther = subMatch[subMatch['winPlacePerc'] != 1]
            print('players'.ljust(10)
                  , ' total:{:>3}  1st:{:>3}  other:{:>3}'.format(len(subMatch), len(grp1st), len(grpOther)))
            for c in ['kills','teamKills','roadKills','DBNOs','revives','assists']:
                print(c.ljust(10)
                      , ' total:{:>3}  1st:{:>3}  other:{:>3}'.format(subMatch[c].sum(), grp1st[c].sum(), grpOther[c].sum()))
            print('-' * 30)
            
    sampleMid = X['matchId'].unique()[0:5]
    printMatchStats(sampleMid)
    
    match = X.groupby(['matchId'])['Id'].count()
    fullplayer = match[match == 100].reset_index()
    sampleMid = fullplayer['matchId'][0:5]
    printMatchStats(sampleMid)
    
    gc.collect()

In [28]:
def data_eda():
    eda_fucs = [
        eda_id
        , eda_matchtype
        , eda_maxPlace_groups
        , eda_players #
        , eda_matchduration #
        , eda_healsboosts
        , eda_revives #
        , eda_killPlace # most important
        , eda_kills #
        , eda_killStreaks_dbnos #
        , eda_roadheadshotteamKills #
        , eda_assists #
        , eda_longestkill #
        , eda_damagedealt #
        , eda_distance #
        , eda_points #
        , eda_winPlacePerc
        , eda_matchsummary
        , eda_groupSummary
        , eda_matchStats
    ]
    
    train , _ = load_data()
    for edafuc in eda_fucs:
        print(edafuc.__name__)
        edafuc(train)

#data_eda()

# Feature Engineering

In [29]:
#data = [train]
gc.collect()

140

In [30]:
def fe_new_rankpct(data):
    # 给出kill walk 排名所占比赛的百分位
    for X in data:
        match = X.groupby('matchId') # 以比赛分组
        X['killsPerc'] = match['kills'].rank(pct=True).values # 以比赛为基准的击杀排名
        X['killPlacePerc'] = match['killPlace'].rank(pct=True).values # 以比赛为基准的击杀排名排名
        X['walkDistancePerc'] = match['walkDistance'].rank(pct=True).values # 一比赛为基准的走路距离排名
        #all_data['damageDealtPerc'] = match['damageDealt'].rank(pct=True).values
        X['walkPerc_killsPerc'] = X['walkDistancePerc'] / X['killsPerc'] # 表示走路的排名超过击杀的排名
        
    return data

In [31]:
def fe_new_distance(data):
    # 将三个距离综合到一起。
    for X in data:
        X['_totalDistance'] = X['rideDistance'] + X['walkDistance'] + X['swimDistance']
        
    return data

In [32]:
def fe_new_user(data):
    for X in data:
        '''
        solo  <-- solo,solo-fpp,normal-solo,normal-solo-fpp
        duo   <-- duo,duo-fpp,normal-duo,normal-duo-fpp,crashfpp,crashtpp
        squad <-- squad,squad-fpp,normal-squad,normal-squad-fpp,flarefpp,flaretpp
        '''
        # 将比赛类型以大类作为基准分类
        mapper = lambda x: 'solo' if ('solo' in x) else 'duo' if ('duo' in x) or ('crash' in x) else 'squad'
        X['matchType'] = X['matchType'].apply(mapper) # matchType分大类
        
        X['_healthItems'] = X['heals'] + X['boosts'] # 使用的提升类和治疗类用品总数
        X['_headshotKillRate'] = X['headshotKills'] / X['kills'] # 爆头率
        X['_killPlaceOverMaxPlace'] = X['killPlace'] / X['maxPlace'] # 击杀排名比最大排名低的, which is noob
        X['_killsOverWalkDistance'] = X['kills'] / X['walkDistance'] # 击杀数量超过走路距离????
        #all_data['_killsOverDistance'] = all_data['kills'] / all_data['_totalDistance']
        #all_data['_walkDistancePerSec'] = all_data['walkDistance'] / all_data['matchDuration']

        fillInf(X, 0)
        
    return data

In [33]:
def fe_drop(data):
    for X in data:
        
        X.drop(['boosts','heals','killStreaks','DBNOs'], axis=1, inplace=True)
        X.drop(['headshotKills','roadKills','vehicleDestroys'], axis=1, inplace=True)
        X.drop(['rideDistance','swimDistance','matchDuration'], axis=1, inplace=True)
        X.drop(['rankPoints','killPoints','winPoints'], axis=1, inplace=True)
        
    return data

In [34]:
def fe_group(data):
    for i,X in enumerate(data):
        # ---------- 确定聚合目标 
        # 比赛聚合
        match = X.groupby(['matchId'])
        # 小组聚合
        group = X.groupby(['matchId','groupId','matchType'])
        # target feature (max, min) 求最大最小值的特征列
        agg_col = []
        exclude_agg_col = ['Id' , 'matchId','groupId','matchType','maxPlace','numGroups','winPlacePerc']
        for c in X.columns:
            if c not in exclude_agg_col:
                agg_col.append(c)
        # target feature (sum) 求和的特征列
        sum_col = ['kills','killPlace','damageDealt','walkDistance','_healthItems']
        
        # aggregate columns , sum columns
        print_divide_line()
        print('聚合的特征 : ' , agg_col)
        print('求和的特征 : ' , sum_col)
        print_divide_line()
        
        # ---------- 比赛的求和，最大值，平均值， 组级别的求和, 比赛未求最小值
        ''' 
        match sum, match max, match mean, group sum
        '''
        match_data = pd.concat([
            match.size().to_frame('m.players'), # 比赛的玩家人数
            match[sum_col].sum().rename(columns = lambda s: 'm.sum.' + s), # 比赛 和
            match[sum_col].max().rename(columns = lambda s: 'm.max.' + s), # 比赛 最大值
            match[sum_col].mean().rename(columns = lambda s: 'm.mean.' + s) # 比赛 平均值
        ], axis=1).reset_index()
        match_data = pd.merge(
            match_data, 
            group[sum_col].sum().rename(columns = lambda s: 'g.sum.' + s).reset_index() # 小队 和
        )
        match_data = reduce_mem_usage(match_data)
        
        print_divide_line()
        print('进行以比赛为分组对象的 求和 求最大值 求平均值， 以小队为分组对象的 求和')
        print('聚合后的特征 : ' , match_data.columns)
        print('聚合后的数据规模 : ' , match_data.shape)
        display(match_data.head(8))
        print_divide_line()
        
        # ---------- 以比赛级别对kill killPlace排序
        ''' 
        ranking of kills and killPlace in each match
        没太看懂这是在干吗？
        '''
        #minKills = all_data.sort_values(['matchId','groupId','kills','killPlace']).groupby(
        #    ['matchId','groupId','kills']).first().reset_index().copy()
        #for n in np.arange(4):
        #    c = 'kills_' + str(n) + '_Place'
        #    nKills = (minKills['kills'] == n)
        #    minKills.loc[nKills, c] = minKills[nKills].groupby(['matchId'])['killPlace'].rank().values
        #    match_data = pd.merge(match_data, minKills[nKills][['matchId','groupId',c]], how='left')
        #    #match_data[c].fillna(0, inplace=True)
        #match_data = reduce_mem_usage(match_data)
        #del minKills, nKills
        #print(match_data.shape)
        
        # ---------- 小队 平均，最大，最小
        ''' 
        group mean, max, min
        '''
        X = pd.concat([
            group.size().to_frame('g.players') # 小队求玩家人数
            ,group.mean() # 小队求平均
            #.rename(columns = lambda s: 'g.mean.' + s),
            ,group[agg_col].max().rename(columns = lambda s: 'g.max.' + s) # 小队求最大
            ,group[agg_col].min().rename(columns = lambda s: 'g.min.' + s) # 小队求最小
            ], axis=1).reset_index()
        X = reduce_mem_usage(X)
        
        print_divide_line()
        print('进行以小队为对象 求和 求小队人数 求最大 求最小')
        print('聚合后的特征 : ' , X.columns)
        print('聚合后的数据规模: ' , X.shape)
        print_divide_line()
        
        # ---------- 聚合特征
        # 选取所有数值类型 除预测目标
        numcols = X.select_dtypes(include='number').columns.values
        numcols = numcols[numcols != 'winPlacePerc']
        print_divide_line()
        print('数值特征 : ' , numcols)
        print_divide_line()

        # 以比赛为基准进行归一化
        ''' 
        match summary, max
        '''
        X = pd.merge(X, match_data)
        del match_data
        gc.collect()
        
        X['enemy.players'] = X['m.players'] - X['g.players']# 敌方玩家
        for c in sum_col:
            #all_data['enemy.' + c] = (all_data['m.sum.' + c] - all_data['sum.' + c]) / all_data['enemy.players']
            #all_data['p.sum_msum.' + c] = all_data['sum.' + c] / all_data['m.sum.' + c]
            #all_data['p.max_mmean.' + c] = all_data['max.' + c] / all_data['m.mean.' + c]
            X['p.max_msum.' + c] = X['g.max.' + c] / X['m.sum.' + c] # group max / match sum 
            X['p.max_mmax.' + c] = X['g.max.' + c] / X['m.max.' + c] # group max / match max
            X.drop(['m.sum.' + c, 'm.max.' + c], axis=1, inplace=True)
        fillInf(X, 0)
        print_divide_line()
        print('以组为基准进行聚合标准化后的特征 : ' , X.columns)
        print('以组为基准进行聚合标准化后的数据规模 : ' , X.shape)
        print_divide_line()
        
        ''' 
        match rank
        '''
        # 对每个数值型特征进行排序
        match = X.groupby('matchId')
        matchRank = match[numcols].rank(pct=True).rename(columns = lambda s: 'm.rank.' + s)# 以比赛为基准队所有的数值特征进行排序
        X = reduce_mem_usage(pd.concat([X , matchRank], axis=1))
        rank_col = matchRank.columns

        print_divide_line()
        print('以比赛为基准进行排序的特征 ： ', rank_col)
        print('以比赛为基准进行排序的数据子集规模 : ' , matchRank.shape)
        print_divide_line()
        
        del matchRank
        gc.collect()
        
        # instead of rank(pct=True, method='dense')
        # 对每个rank进行归一化。
        match = X.groupby('matchId')
        matchRank = match[rank_col].max().rename(columns = lambda s: 'm.max.' + s).reset_index() # rank的最大值
        X = pd.merge(X , matchRank)
        for c in numcols:
            X['m.rank.' + c] = X['m.rank.' + c] / X['m.max.m.rank.' + c] # rank/ rank的最大,归一化
            X.drop(['m.max.m.rank.' + c], axis=1, inplace=True)
        del matchRank
        gc.collect()
        
        print_divide_line()
        print('以比赛为基准的归一化的特征 : ', X.columns)
        print('以比赛为基准的归一化的数据规模 : ' , X.shape)
        print_divide_line()
        
        data[i] = X
     
    return data

In [35]:
def fe_drop_constant(data):
    for X in data:
        # drop constant column
        constant_column = [col for col in X.columns if X[col].nunique() == 1] # 删除一些值都是固定的特征
        print('drop columns:', constant_column)
        X.drop(constant_column, axis=1, inplace=True)
        
    return data

In [36]:
def fe_encode(data):
    for X in data:
        # one hot matchType
        X = pd.concat([X, pd.get_dummies(X['matchType'])], axis=1)
        X.drop(['matchType'], axis=1, inplace=True)

        # 转换成整数
        X['matchId'] = X['matchId'].apply(lambda x: int(x,16))
        X['groupId'] = X['groupId'].apply(lambda x: int(x,16))
        
        # X.drop([],axis=1,inplace=True)
        cols = [col for col in X.columns if col not in ['Id','matchId','groupId']]
        for i, t in X.loc[: , cols].dtypes.iteritems():
            if t == object:
                # 将所有object类型的特征转换为编号。
                X[i] = pd.factorize(X[i])[0]

        X = reduce_mem_usage(X)
        
    return data

In [37]:
#del train, test, data
gc.collect()

60

In [38]:
#train, _ = load_data()
#gp = train.groupby('groupId').size()
#gp.shape
# group predict 2026744 shape

In [39]:
def get_data():
    train , test = load_data()
    data = [train, test]
    featureEngineering = [
        fe_new_rankpct
        ,fe_new_distance
        ,fe_new_user
        ,fe_drop
        ,fe_group
        ,fe_drop_constant
        ,fe_encode
    ]
    
    print('before fe X shape : ', train.shape)
    for fe in featureEngineering:
        print_divide_line()
        print('running : ', fe.__name__)
        data = fe(data)
        print('finished : ' , fe.__name__)
        print('after : ' , fe.__name__ , 'train shape : ', data[0].shape)
        print_divide_line()
        
    print_divide_line()
    
    y_train = X_train.pop('winPlacePerc')
    # X_test_grp = X_test[['matchId','groupId']].copy()
    #train_matchId = X_train['matchId']

    # drop matchId,groupId
    X_train.drop(['matchId','groupId'], axis=1, inplace=True)
    X_test.drop(['matchId','groupId'], axis=1, inplace=True)
    
    return X_train,y_train ,X_test    

In [40]:
def predict():
    X_train ,y_train ,X_test = get_data()
    print_divide_line()
    print('开始输出预测结果 : ')
    params = {
        'learning_rate': 0.05
        ,'objective':'mae'
        ,'metric':'mae'
        ,'num_leaves': 128
        ,'verbose': 1
        ,'random_state':42
        ,'bagging_fraction': 0.7
        ,'feature_fraction': 0.7
    }

    reg = lgb.LGBMRegressor(**params, n_estimators=100)
    reg.fit(X_train, Y_train)
    pred = reg.predict(X_test, num_iteration=reg.best_iteration_)
    print_divide_line()
    return pred

pred = predict()

Memory usage of dataframe is 983.90 MB --> 339.28 MB (Decreased by 65.5%)
Memory usage of dataframe is 413.18 MB --> 140.19 MB (Decreased by 66.1%)
null count: winPlacePerc    1
dtype: int64
before fe X shape :  (4446965, 29)
--------------------------------------------------
running :  fe_new_rankpct
finished :  fe_new_rankpct
after :  fe_new_rankpct train shape :  (4446965, 33)
--------------------------------------------------
--------------------------------------------------
running :  fe_new_distance
finished :  fe_new_distance
after :  fe_new_distance train shape :  (4446965, 34)
--------------------------------------------------
--------------------------------------------------
running :  fe_new_user
finished :  fe_new_user
after :  fe_new_user train shape :  (4446965, 38)
--------------------------------------------------
--------------------------------------------------
running :  fe_drop
finished :  fe_drop
after :  fe_drop train shape :  (4446965, 25)
--------------------

Unnamed: 0,matchId,m.players,m.sum.kills,m.sum.killPlace,m.sum.damageDealt,m.sum.walkDistance,m.sum._healthItems,m.max.kills,m.max.killPlace,m.max.damageDealt,...,m.mean.damageDealt,m.mean.walkDistance,m.mean._healthItems,groupId,matchType,g.sum.kills,g.sum.killPlace,g.sum.damageDealt,g.sum.walkDistance,g.sum._healthItems
0,0000a43bce5eec,95,90.0,4560.0,13539.771,119082.789,224.0,7,95,983.7,...,142.524,1253.503,2.358,18b16ec699d8b6,squad,2.0,82.0,219.35,263.37,2.0
1,0000a43bce5eec,95,90.0,4560.0,13539.771,119082.789,224.0,7,95,983.7,...,142.524,1253.503,2.358,236ab9e9c081b9,squad,0.0,543.0,287.93,119.591,0.0
2,0000a43bce5eec,95,90.0,4560.0,13539.771,119082.789,224.0,7,95,983.7,...,142.524,1253.503,2.358,3a6addfa0df938,squad,0.0,189.0,0.0,9.636,0.0
3,0000a43bce5eec,95,90.0,4560.0,13539.771,119082.789,224.0,7,95,983.7,...,142.524,1253.503,2.358,4bf06994bd4c9a,squad,0.0,119.0,23.4,1118.1,1.0
4,0000a43bce5eec,95,90.0,4560.0,13539.771,119082.789,224.0,7,95,983.7,...,142.524,1253.503,2.358,4d1bbbc19b9084,squad,6.0,28.0,681.9,6585.0,9.0
5,0000a43bce5eec,95,90.0,4560.0,13539.771,119082.789,224.0,7,95,983.7,...,142.524,1253.503,2.358,599d924f8a02db,squad,0.0,141.0,119.18,7260.0,9.0
6,0000a43bce5eec,95,90.0,4560.0,13539.771,119082.789,224.0,7,95,983.7,...,142.524,1253.503,2.358,6620b219ed2ee2,squad,4.0,106.0,770.6,11118.0,15.0
7,0000a43bce5eec,95,90.0,4560.0,13539.771,119082.789,224.0,7,95,983.7,...,142.524,1253.503,2.358,6c44ef4381fe8d,squad,10.0,39.0,1253.3,6657.0,11.0


--------------------------------------------------
Memory usage of dataframe is 633.98 MB --> 411.70 MB (Decreased by 35.1%)
--------------------------------------------------
进行以小队为对象 求和 求小队人数 求最大 求最小
聚合后的特征 :  Index(['matchId', 'groupId', 'matchType', 'g.players', 'assists',
       'damageDealt', 'killPlace', 'kills', 'longestKill', 'maxPlace',
       'numGroups', 'revives', 'teamKills', 'walkDistance', 'weaponsAcquired',
       'winPlacePerc', 'killsPerc', 'killPlacePerc', 'walkDistancePerc',
       'walkPerc_killsPerc', '_totalDistance', '_healthItems',
       '_headshotKillRate', '_killPlaceOverMaxPlace', '_killsOverWalkDistance',
       'g.max.assists', 'g.max.damageDealt', 'g.max.killPlace', 'g.max.kills',
       'g.max.longestKill', 'g.max.revives', 'g.max.teamKills',
       'g.max.walkDistance', 'g.max.weaponsAcquired', 'g.max.killsPerc',
       'g.max.killPlacePerc', 'g.max.walkDistancePerc',
       'g.max.walkPerc_killsPerc', 'g.max._totalDistance',
       'g.max._healthItem

Unnamed: 0,matchId,m.players,m.sum.kills,m.sum.killPlace,m.sum.damageDealt,m.sum.walkDistance,m.sum._healthItems,m.max.kills,m.max.killPlace,m.max.damageDealt,...,m.mean.damageDealt,m.mean.walkDistance,m.mean._healthItems,groupId,matchType,g.sum.kills,g.sum.killPlace,g.sum.damageDealt,g.sum.walkDistance,g.sum._healthItems
0,0008c31a9be4a7,98,89.0,4851.0,13270.87,135042.719,239.0,9,98,1261.0,...,135.417,1377.987,2.439,01fb9c20f6abc2,squad,0.0,228.0,83.0,956.4,1.0
1,0008c31a9be4a7,98,89.0,4851.0,13270.87,135042.719,239.0,9,98,1261.0,...,135.417,1377.987,2.439,0943c3f283b976,squad,1.0,22.0,104.0,2855.0,8.0
2,0008c31a9be4a7,98,89.0,4851.0,13270.87,135042.719,239.0,9,98,1261.0,...,135.417,1377.987,2.439,11b26f1f710257,squad,17.0,24.0,2108.1,10213.0,29.0
3,0008c31a9be4a7,98,89.0,4851.0,13270.87,135042.719,239.0,9,98,1261.0,...,135.417,1377.987,2.439,1568e092a99583,squad,1.0,23.0,135.7,1457.0,6.0
4,0008c31a9be4a7,98,89.0,4851.0,13270.87,135042.719,239.0,9,98,1261.0,...,135.417,1377.987,2.439,26d4045668cf95,squad,0.0,74.0,0.0,323.4,0.0
5,0008c31a9be4a7,98,89.0,4851.0,13270.87,135042.719,239.0,9,98,1261.0,...,135.417,1377.987,2.439,298bb0348ccd3a,squad,1.0,241.0,466.41,2493.7,2.0
6,0008c31a9be4a7,98,89.0,4851.0,13270.87,135042.719,239.0,9,98,1261.0,...,135.417,1377.987,2.439,3cd4258cebec3d,squad,1.0,100.0,186.1,1389.0,1.0
7,0008c31a9be4a7,98,89.0,4851.0,13270.87,135042.719,239.0,9,98,1261.0,...,135.417,1377.987,2.439,3f91e7fec60224,squad,7.0,45.0,849.49,5469.0,38.0


--------------------------------------------------
Memory usage of dataframe is 273.84 MB --> 175.80 MB (Decreased by 35.8%)
--------------------------------------------------
进行以小队为对象 求和 求小队人数 求最大 求最小
聚合后的特征 :  Index(['matchId', 'groupId', 'matchType', 'g.players', 'assists',
       'damageDealt', 'killPlace', 'kills', 'longestKill', 'maxPlace',
       'numGroups', 'revives', 'teamKills', 'walkDistance', 'weaponsAcquired',
       'killsPerc', 'killPlacePerc', 'walkDistancePerc', 'walkPerc_killsPerc',
       '_totalDistance', '_healthItems', '_headshotKillRate',
       '_killPlaceOverMaxPlace', '_killsOverWalkDistance', 'g.max.assists',
       'g.max.damageDealt', 'g.max.killPlace', 'g.max.kills',
       'g.max.longestKill', 'g.max.revives', 'g.max.teamKills',
       'g.max.walkDistance', 'g.max.weaponsAcquired', 'g.max.killsPerc',
       'g.max.killPlacePerc', 'g.max.walkDistancePerc',
       'g.max.walkPerc_killsPerc', 'g.max._totalDistance',
       'g.max._healthItems', 'g.max._head

Memory usage of dataframe is 433.58 MB --> 441.19 MB (Decreased by -1.8%)
finished :  fe_encode
after :  fe_encode train shape :  (2026744, 138)
--------------------------------------------------
--------------------------------------------------


UnboundLocalError: local variable 'all_data' referenced before assignment