In [1]:
### TODO:
# 1. Do feature engineering on group level instead of user level
# 2. Separate data between first person mode and free for all mode
# 3. Eliminate cheaters and anomalies
# 4. Develop prediction funcgion
#    - Final ranking per match can be determined using this formula -> 100/maxPlace, as ranking interval

In [4]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# import xgboost
from matplotlib import pyplot as plt

%precision %.4f

'%.4f'

In [6]:
### Read training data
train = pd.read_csv('../../input/train_V2.csv')

In [8]:
train['winPlacePerc'].values

array([0.4444, 0.64  , 0.7755, ..., 0.4815, 0.8   , 0.5464])

In [None]:
train.columns

In [None]:
train.head(5)

In [None]:
from pylab import rcParams
rcParams['figure.figsize'] = 10, 8

In [None]:
matchTypes = train.groupby('matchId')['matchType'].first().value_counts()

In [None]:
matchTypes.index

In [None]:
fig, ax = plt.subplots()
ax.bar(np.arange(len(matchTypes.index)),matchTypes.values,align='center')
ax.set_xticks(np.arange(len(matchTypes.index)))
ax.set_xticklabels(matchTypes.index,rotation=70)
plt.title("Match type queue distribution")
plt.show()


### 1. Group Level Inspection and Feature Engineering

Features need to be generated on a group level. Below is the explanation. Let's inspect data for matchId **a10357fd1a4a91**

In [None]:
### Inspect a match data
matchA_df = train[train.matchId == 'a10357fd1a4a91']

In [None]:
### Inspect a group
matchA_df[matchA_df.groupId == '654c638629b8fc']

In [None]:
### select groupIds in a match
playerGroups = matchA_df[['Id','groupId']]

In [None]:
### Number of players per group
playerCountGroup = playerGroups.groupby('groupId',as_index=False).agg({'Id':'count'}).sort_values('Id').rename(columns={"Id":"players"}).reset_index(drop=True)

In [None]:
playerCountGroup

Various number of players in a group may exist in one match. Player queuing solo will be in a group with only 1 member.

In [None]:
### Total players in this match
print("Total players: {}".format(playerCountGroup.players.sum()))
print("Total groups: {}".format(playerCountGroup.groupId.count()))

Now notice the distribution of placement ranking which we would like to predict

In [None]:
### Notice the percentage ranking
matchA_df[['winPlacePerc']].drop_duplicates().sort_values('winPlacePerc').reset_index(drop=True)

There are 26 placements which actually correspond to the number of groups. The interval could be retrieved using:

In [None]:
### The increment of winPlacePerc is retrieved using:
print((100/float(26))/100)

Because the ranking is spread based on the number of groups in one match, group level features need to be generated!

In [None]:
### Generate group level features    

def generate_group_level_features(dataset,feature_columns=['kills','assists','boosts']):
    features = dataset[["matchId","groupId",*feature_columns]].reset_index(drop=True)
    matchGroups = features[["matchId","groupId"]].drop_duplicates().reset_index(drop=True)
    
    ### predefined basic statistic operations
    _stats = ['max','min','sum','mean','std']
    
    ### calculate group level features
    for f in feature_columns:
        for s in _stats:
            new_field = '{s}_{f}'.format(s=s,f=f)
            print(new_field)
            matchGroups = pd.merge(matchGroups,
                features.groupby(["matchId","groupId"],as_index=False)\
                .agg({f:s}).rename(columns={f:new_field}).fillna(0)[["matchId","groupId",new_field]].drop_duplicates(),
                on=['matchId','groupId'],how='inner'
            )
            
    return matchGroups.reset_index(drop=True)

In [None]:
### sample of group level features
#generate_group_level_features(matchA_df)

In [None]:
import time
s = time.time()
groupLevelFeatures_train = generate_group_level_features(train)
e = time.time()
print("elapsed {}s".format(e-s))

In [None]:
groupLevelFeatures_train.to_csv("groupLevelFeatures_train.csv",index=False)

In [None]:
# dummy
# a = pd.DataFrame(data=[{"a":1,"b":2},{"a":3,"b":6}])

# for i,r in a.iterrows():
#     print(r['a'])

# def dum(x):
#     x['c'] = x['a'] + x['b']
#     return x

# a = a.apply(lambda x: dum(x),1)

# a.iloc[0]['c'] = 12

# a

### 2. Separate Game Modes data

(TODO) <br>
There are several game modes / match types in PUBG <br>
https://pubg.gamepedia.com/Game_Modes <br>
Patterns might differ for example between First Person Mode and Third Person Mode even though the players are on solo queue game.

### 3. Eliminate Anomalies

(TODO) <br>
There are already existing kernel out there mentioning anomalies or cheaters in PUBG matches. <br>
We need to adopt some of them.

### 4. Prediction Functions

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [4]:
# groupLevelFeatures_train.head()

In [5]:
# len(groupLevelFeatures_train['matchId'].unique()), len(groupLevelFeatures_train['groupId'].unique())

In [6]:
# train.head()
train.sort_values(['matchId', 'winPlacePerc']).head()

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
3599150,fc62a751955351,3a6addfa0df938,0000a43bce5eec,0,0,0.0,0,0,0,95,...,0,0.0,0,0.0,0,0,9.636,1,1557,0.0
3895898,7ad5883d71d42e,3a6addfa0df938,0000a43bce5eec,0,0,0.0,0,0,0,94,...,0,0.0,0,0.0,0,0,0.0,0,1413,0.0
1425950,5a3afae17b53c0,236ab9e9c081b9,0000a43bce5eec,0,0,66.65,0,0,0,93,...,0,0.0,0,0.0,0,0,17.51,1,1430,0.037
1998699,e431d8a8f6c99b,236ab9e9c081b9,0000a43bce5eec,0,0,21.29,0,0,0,92,...,0,0.0,0,0.0,0,0,5.201,1,1503,0.037
2290792,e7b325d63d8393,236ab9e9c081b9,0000a43bce5eec,0,0,0.0,0,0,0,91,...,0,0.0,0,0.0,0,0,18.65,1,1450,0.037


In [7]:
train = train[train['winPlacePerc'].notnull()].reset_index(drop=True)

In [66]:
# col_metrics = ['assists', 'boosts', 'damageDealt', 'DBNOs',
#        'headshotKills', 'heals', 'killPlace', 'killPoints', 'kills',
#        'killStreaks', 'longestKill', 'matchDuration', 'maxPlace',
#        'numGroups', 'rankPoints', 'revives', 'rideDistance', 'roadKills',
#        'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance',
#        'weaponsAcquired', 'winPoints']
# col_drop = ['Id', 'groupId', 'matchId']

col_metrics = ['boosts', 'damageDealt', 'DBNOs',
       'headshotKills', 'killPlace', 
       'killStreaks', 'longestKill',
       'walkDistance',
       'weaponsAcquired']
col_drop = ['Id', 'groupId', 'matchId']

In [67]:
X_train, X_test, y_train, y_test = train_test_split(train.drop('winPlacePerc', axis=1), 
                                    train[['Id', 'winPlacePerc']], test_size=0.33, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2979466, 28), (1467499, 28), (2979466, 2), (1467499, 2))

In [68]:
X_train_2 = X_train.drop(col_drop, axis=1)
X_train_2.head()

Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,...,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints
1486370,0,1,449.6,5,1,1,6,0,4,1,...,1500,0,0.0,0,0.0,0,0,1455.0,5,0
1208781,1,0,87.9,0,0,0,74,0,0,0,...,1445,0,151.8,0,0.0,0,0,379.9,3,0
3427006,0,0,180.5,0,0,0,36,0,1,1,...,1504,0,0.0,0,0.0,0,0,298.5,2,0
1658192,0,0,387.2,3,1,0,13,1547,3,2,...,-1,0,0.0,0,0.0,0,0,129.0,1,1562
2555464,0,0,0.0,0,0,0,79,0,0,0,...,1506,0,0.0,0,0.0,0,0,120.9,3,0


# Decision Tree

In [69]:
classifier = DecisionTreeRegressor(random_state=42)
classifier.fit(X_train_2[col_metrics], y_train['winPlacePerc'])
del X_train_2

# Random Forest

In [70]:
# classifier = RandomForestRegressor(n_jobs=4, n_estimators=10, random_state=42)
# classifier.fit(X_train_2[col_metrics], y_train['winPlacePerc'])
# del X_train_2

In [97]:
X_test_2 = X_test.drop(col_drop, axis=1)
y_pred = classifier.predict(X_test_2[col_metrics])
del X_test_2
y_pred, X_test.shape, len(y_pred)

(array([0.5208, 0.9053, 0.7938, ..., 0.8125, 0.1481, 0.9381]),
 (1467499, 28),
 1467499)

In [72]:
mean_absolute_error(y_test['winPlacePerc'], y_pred)

0.10329007607145285

In [73]:
X_test_copy = X_test.copy()
X_test_grp = X_test[['matchId','groupId']].copy()
X_test_copy.drop(['matchId','groupId'], axis=1, inplace=True)
X_test_grp['winPlacePerc'] = y_pred
X_test_grp['y_pred'] = y_pred
X_test_grp.head()

Unnamed: 0,matchId,groupId,winPlacePerc,y_pred
4270779,e1c02c1421bdd4,6dfb23f52dc772,0.5208,0.5208
1687951,7669a0409fa32a,43b509a591d8f1,0.9053,0.9053
3732582,a3eae2eaede76e,3d2944992d2111,0.7938,0.7938
3386216,a22b2e7366205a,405f55ab35614a,0.1099,0.1099
243580,7f9e3909d86e2e,cb396c5f1bb96e,0.7609,0.7609


In [74]:
# X_test_copy.head()
y_test['winPlacePerc_true'] = y_test['winPlacePerc']
y_test.drop(['winPlacePerc'], axis=1, inplace=True)
y_test.head()

Unnamed: 0,Id,winPlacePerc_true
4270779,ef2d51d18a38cf,0.28
1687951,0ccf6e163b5009,0.8571
3732582,948c2d231060b2,0.7812
3386216,3a5872249139fc,0.1379
243580,a78cc239213b35,0.7263


In [75]:
group = X_test_grp.groupby('matchId')
X_test_grp['_rank.winPlacePerc'] = group['winPlacePerc'].rank(method='min')
print(X_test_grp[['matchId', 'groupId', 
                  '_rank.winPlacePerc', 'y_pred']].sort_values(['matchId', '_rank.winPlacePerc']).head())
X_test_copy = pd.concat([X_test_copy, X_test_grp], axis=1)
X_test_copy = pd.merge(X_test_copy, y_test)
X_test_copy.sort_values('Id').head()

                matchId         groupId  _rank.winPlacePerc    y_pred
609078   0000a43bce5eec  e8ff1c0fe7f6aa                 1.0  0.000000
3599149  0000a43bce5eec  3a6addfa0df938                 1.0  0.000000
1998699  0000a43bce5eec  236ab9e9c081b9                 1.0  0.000000
3895897  0000a43bce5eec  3a6addfa0df938                 4.0  0.002988
4367295  0000a43bce5eec  236ab9e9c081b9                 5.0  0.044400


Unnamed: 0,Id,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,...,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,matchId,groupId,winPlacePerc,y_pred,_rank.winPlacePerc,winPlacePerc_true
838752,0000053e9d70a9,0,2,183.0,0,1,1,15,0,2,...,0,2065.0,2,0,e65e89709875ca,ea6619b280f5c9,0.9684,0.9684,31.0,0.8125
1056895,00002299115341,0,1,114.7,0,0,0,86,0,0,...,0,125.1,2,0,67402a2f879c8f,92041a706ac1ff,0.1667,0.1667,2.0,0.0
747646,000035d735aafb,0,1,100.0,1,0,0,74,0,0,...,0,432.1,6,0,f15961b94c9268,4b474ef3331a92,0.1786,0.1786,5.0,0.2222
1444670,000044511e367a,1,2,224.6,3,1,1,9,0,3,...,0,2757.0,3,0,e946a37ec1dc3f,61562e23b34da7,0.92,0.92,24.0,0.8889
149030,000056883d8510,0,0,0.0,0,0,0,70,0,0,...,0,31.24,1,0,ca72d76111cf62,8d0fe23814f136,0.2692,0.2692,13.0,0.3333


In [76]:
# X_test_copy[['Id', 'numGroups', 'maxPlace', 'winPlacePerc', '_rank.winPlacePerc']].head(10)

In [77]:
fullgroup = (X_test_copy['numGroups'] == X_test_copy['maxPlace'])

print(sum(fullgroup))
# full group (201366) --> calculate from rank
subset = X_test_copy.loc[fullgroup]
X_test_copy.loc[fullgroup, 'winPlacePerc'] = (subset['_rank.winPlacePerc'].values - 1) / (subset['maxPlace'].values - 1)

# not full group (684872) --> align with maxPlace
subset = X_test_copy.loc[~fullgroup]
gap = 1.0 / (subset['maxPlace'].values - 1)
new_perc = np.around(subset['winPlacePerc'].values / gap) * gap  # half&up
X_test_copy.loc[~fullgroup, 'winPlacePerc'] = new_perc

X_test_copy['winPlacePerc'] = X_test_copy['winPlacePerc'].clip(lower=0,upper=1)


408636


In [96]:
print(X_test_copy.loc[~fullgroup]['matchId'].unique())
X_test_copy[['Id', 'matchId', 'groupId', 'winPlacePerc', 'y_pred', 
                   '_rank.winPlacePerc', 'winPlacePerc_true', '_pred.winPlace']].sort_values(['matchId', 'groupId']).head(50)

['7669a0409fa32a' 'a3eae2eaede76e' 'a22b2e7366205a' ... 'ba580ea8f2f61b'
 '5ddd684bf067f8' 'd6be8e6354f722']


Unnamed: 0,Id,matchId,groupId,winPlacePerc,y_pred,_rank.winPlacePerc,winPlacePerc_true,_pred.winPlace
795458,1845bd80ef1e31,0000a43bce5eec,236ab9e9c081b9,0.148148,0.0444,5.0,0.037,
1268099,e431d8a8f6c99b,0000a43bce5eec,236ab9e9c081b9,0.0,0.0,1.0,0.037,
513634,7ad5883d71d42e,0000a43bce5eec,3a6addfa0df938,0.111111,0.002988,4.0,0.0,
1095864,fc62a751955351,0000a43bce5eec,3a6addfa0df938,0.0,0.0,1.0,0.0,
996011,9db8cb6643530a,0000a43bce5eec,4d1bbbc19b9084,1.0,0.9286,30.0,1.0,
1318594,ffc5bf94121ebc,0000a43bce5eec,599d924f8a02db,0.925926,0.8846,26.0,0.5926,
3051,a638435c730f4e,0000a43bce5eec,6620b219ed2ee2,0.814815,0.6667,23.0,0.7778,
387568,93825c1a755f53,0000a43bce5eec,6620b219ed2ee2,0.777778,0.5833,22.0,0.7778,
447386,51bf1d40b394c6,0000a43bce5eec,6c44ef4381fe8d,0.555556,0.3929,16.0,0.7037,
1235761,8c5828852148b5,0000a43bce5eec,767819928e6279,0.481481,0.3404,14.0,0.2593,


In [79]:
X_test_copy.loc[~fullgroup, '_pred.winPlace'] = np.around(X_test_copy.loc[~fullgroup, 'winPlacePerc'].values / gap) + 1
X_test_copy.loc[~fullgroup & (X_test_copy['matchId'] == '12acd71ccf720e'),
           ['matchId','groupId','winPlacePerc','maxPlace','numGroups','_pred.winPlace','_rank.winPlacePerc']
          ].sort_values(['matchId','_pred.winPlace','_rank.winPlacePerc'])

Unnamed: 0,matchId,groupId,winPlacePerc,maxPlace,numGroups,_pred.winPlace,_rank.winPlacePerc
459159,12acd71ccf720e,ab0ecc017f4a78,0.0,96,92,1.0,1.0
7,12acd71ccf720e,47458e09570b3f,0.0,96,92,1.0,2.0
430418,12acd71ccf720e,47458e09570b3f,0.021053,96,92,3.0,3.0
476844,12acd71ccf720e,d1174936d621cd,0.021053,96,92,3.0,4.0
657120,12acd71ccf720e,24666a091c34f1,0.052632,96,92,6.0,5.0
1029598,12acd71ccf720e,f6a7f887aa3d95,0.073684,96,92,8.0,6.0
1150802,12acd71ccf720e,d34c177da570eb,0.178947,96,92,18.0,7.0
374318,12acd71ccf720e,d7ce9908c04deb,0.210526,96,92,21.0,8.0
214278,12acd71ccf720e,260751ef30f92a,0.221053,96,92,22.0,9.0
1084024,12acd71ccf720e,b96724d88e1285,0.242105,96,92,24.0,10.0


In [80]:
y_test.head(), X_test_copy['winPlacePerc'].head()

(                     Id  winPlacePerc_true
 4270779  ef2d51d18a38cf             0.2800
 1687951  0ccf6e163b5009             0.8571
 3732582  948c2d231060b2             0.7812
 3386216  3a5872249139fc             0.1379
 243580   a78cc239213b35             0.7263, 0    0.440000
 1    0.908163
 2    0.791667
 3    0.103448
 4    0.273684
 Name: winPlacePerc, dtype: float64)

In [84]:
final_df = pd.merge(y_test, X_test_copy[['Id', 'matchId', 'groupId', 'winPlacePerc', '_pred.winPlace', '_rank.winPlacePerc', 'y_pred']])
final_df.head(50)

Unnamed: 0,Id,winPlacePerc_true,matchId,groupId,winPlacePerc,_pred.winPlace,_rank.winPlacePerc,y_pred
0,ef2d51d18a38cf,0.28,e1c02c1421bdd4,6dfb23f52dc772,0.44,,12.0,0.5208
1,0ccf6e163b5009,0.8571,7669a0409fa32a,43b509a591d8f1,0.908163,90.0,26.0,0.9053
2,948c2d231060b2,0.7812,a3eae2eaede76e,3d2944992d2111,0.791667,77.0,29.0,0.7938
3,3a5872249139fc,0.1379,a22b2e7366205a,405f55ab35614a,0.103448,4.0,9.0,0.1099
4,a78cc239213b35,0.7263,7f9e3909d86e2e,cb396c5f1bb96e,0.273684,,27.0,0.7609
5,c9c0eddbf63319,0.1923,625cc032ef1253,9a78ed7d69a464,0.269231,8.0,11.0,0.2553
6,9c754cf2e39552,0.9091,549073ce7cb13a,73b690d428df6e,0.954545,43.0,39.0,0.9579
7,6f0a3fc5134110,0.0,12acd71ccf720e,47458e09570b3f,0.0,1.0,2.0,0.003669
8,4f4a1fe62d1638,0.0833,a4f184e45b529c,2d5fcfc06e5a99,0.0625,,4.0,0.0213
9,660a11813c1435,0.1364,32b7d4d1461614,66fc34e84c2e09,0.318182,,8.0,0.2857


In [87]:
final_df.sort_values(['matchId', 'groupId', 'winPlacePerc']).head(50)

Unnamed: 0,Id,winPlacePerc_true,matchId,groupId,winPlacePerc,_pred.winPlace,_rank.winPlacePerc,y_pred
1268099,e431d8a8f6c99b,0.037,0000a43bce5eec,236ab9e9c081b9,0.0,,1.0,0.0
795458,1845bd80ef1e31,0.037,0000a43bce5eec,236ab9e9c081b9,0.148148,,5.0,0.0444
1095864,fc62a751955351,0.0,0000a43bce5eec,3a6addfa0df938,0.0,,1.0,0.0
513634,7ad5883d71d42e,0.0,0000a43bce5eec,3a6addfa0df938,0.111111,,4.0,0.002988
996011,9db8cb6643530a,1.0,0000a43bce5eec,4d1bbbc19b9084,1.0,,30.0,0.9286
1318594,ffc5bf94121ebc,0.5926,0000a43bce5eec,599d924f8a02db,0.925926,,26.0,0.8846
387568,93825c1a755f53,0.7778,0000a43bce5eec,6620b219ed2ee2,0.777778,,22.0,0.5833
3051,a638435c730f4e,0.7778,0000a43bce5eec,6620b219ed2ee2,0.814815,,23.0,0.6667
447386,51bf1d40b394c6,0.7037,0000a43bce5eec,6c44ef4381fe8d,0.555556,,16.0,0.3929
1235761,8c5828852148b5,0.2593,0000a43bce5eec,767819928e6279,0.481481,,14.0,0.3404


In [90]:
X_test_copy[X_test_copy['groupId'] == '3a6addfa0df938']

Unnamed: 0,Id,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,...,walkDistance,weaponsAcquired,winPoints,matchId,groupId,winPlacePerc,y_pred,_rank.winPlacePerc,winPlacePerc_true,_pred.winPlace
513634,7ad5883d71d42e,0,0,0.0,0,0,0,94,1306,0,...,0.0,0,1413,0000a43bce5eec,3a6addfa0df938,0.111111,0.002988,4.0,0.0,
1095864,fc62a751955351,0,0,0.0,0,0,0,95,1458,0,...,9.636,1,1557,0000a43bce5eec,3a6addfa0df938,0.0,0.0,1.0,0.0,


In [83]:
mean_absolute_error(final_df['winPlacePerc'], final_df['winPlacePerc_true'])

0.11922039296180897

# Result

Attributes | MAE
--- | ---
**Decision Tree** |
All | 0.0822435372194249
All with Ranking | 0.020896060576532312, 0.10
All with Ranking (with ascending False) | 0.19069269005752446
Features from Chicken Dinner Notebook (including maxplace, numgroups) | 0.024114451660075895 
Features from Chicken Dinner Notebook | 0.11921477140699023, 0.11922039296180897
**Random Forest** |
All with Ranking | 0.0004156307577606391
Features from Chicken Dinner Notebook | 0.09820843398398921

(TODO) <br>
Because how dynamic a winPlacePerc value can be, custom prediction function needs to be developed

In [None]:
# final_df[['Id', 'winPlacePerc']]