# PUBG 

## Description

You are given over 65,000 games' worth of anonymized player data, split into training and testing sets, and asked to predict final placement from final in-game stats and initial player ratings.

What's the best strategy to win in PUBG? Should you sit in one spot and hide your way into victory, or do you need to be the top shot? Let's let the data do the talking!

## Evaluation
Submissions are evaluated on Mean Absolute Error between your predicted **winPlacePerc** and the observed **winPlacePerc**.



In [2]:
# Importing the libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import random
random.seed(7)

In [3]:
# Checking your working directory
os.getcwd() 
#os.chdir("/Users/axeljeremy/Documents/Jupyter/PUBG")

'/Users/axeljeremy/Documents/Jupyter/PUBG'

In [4]:
dataset = pd.read_csv('./all/train_V2.csv')

In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4446966 entries, 0 to 4446965
Data columns (total 29 columns):
Id                 object
groupId            object
matchId            object
assists            int64
boosts             int64
damageDealt        float64
DBNOs              int64
headshotKills      int64
heals              int64
killPlace          int64
killPoints         int64
kills              int64
killStreaks        int64
longestKill        float64
matchDuration      int64
matchType          object
maxPlace           int64
numGroups          int64
rankPoints         int64
revives            int64
rideDistance       float64
roadKills          int64
swimDistance       float64
teamKills          int64
vehicleDestroys    int64
walkDistance       float64
weaponsAcquired    int64
winPoints          int64
winPlacePerc       float64
dtypes: float64(6), int64(19), object(4)
memory usage: 983.9+ MB


In [6]:
dataset.head()

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,...,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,...,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,47,...,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.9,0,0,0,75,...,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,45,...,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


In [7]:
dim = dataset.shape
dim

(4446966, 29)

In [8]:
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, dim[1]-1].values

In [9]:
X.shape

(4446966, 28)

In [10]:
Y.shape

(4446966,)

In [11]:
Y[0:15]

array([0.4444, 0.64  , 0.7755, 0.1667, 0.1875, 0.037 , 0.    , 0.7368,
       0.3704, 0.2143, 0.3929, 0.4043, 0.9286, 0.875 , 0.9   ])

In [12]:
dataset['winPlacePerc']

0          0.4444
1          0.6400
2          0.7755
3          0.1667
4          0.1875
5          0.0370
6          0.0000
7          0.7368
8          0.3704
9          0.2143
10         0.3929
11         0.4043
12         0.9286
13         0.8750
14         0.9000
15         0.2766
16         0.7308
17         0.8211
18         0.1923
19         0.9310
20         0.6383
21         0.2143
22         0.7500
23         0.9592
24         0.9231
25         0.8696
26         0.1154
27         0.7234
28         0.9630
29         0.0000
            ...  
4446936    0.7308
4446937    0.7111
4446938    0.0385
4446939    0.3830
4446940    0.6250
4446941    0.1600
4446942    1.0000
4446943    0.1111
4446944    0.6875
4446945    0.1875
4446946    0.7292
4446947    0.3830
4446948    0.7917
4446949    0.1458
4446950    0.5000
4446951    0.1000
4446952    0.8462
4446953    0.5926
4446954    0.5306
4446955    0.4792
4446956    0.1071
4446957    0.4583
4446958    0.0000
4446959    0.0842
4446960   

In [13]:
dataset['winPlacePerc'].describe()

count    4.446965e+06
mean     4.728216e-01
std      3.074050e-01
min      0.000000e+00
25%      2.000000e-01
50%      4.583000e-01
75%      7.407000e-01
max      1.000000e+00
Name: winPlacePerc, dtype: float64

In [14]:
dataset['winPlacePerc'].isna()==True

0          False
1          False
2          False
3          False
4          False
5          False
6          False
7          False
8          False
9          False
10         False
11         False
12         False
13         False
14         False
15         False
16         False
17         False
18         False
19         False
20         False
21         False
22         False
23         False
24         False
25         False
26         False
27         False
28         False
29         False
           ...  
4446936    False
4446937    False
4446938    False
4446939    False
4446940    False
4446941    False
4446942    False
4446943    False
4446944    False
4446945    False
4446946    False
4446947    False
4446948    False
4446949    False
4446950    False
4446951    False
4446952    False
4446953    False
4446954    False
4446955    False
4446956    False
4446957    False
4446958    False
4446959    False
4446960    False
4446961    False
4446962    False
4446963    Fal

In [15]:
np.sum(dataset['winPlacePerc'].isna()==False)

4446965

In [16]:
np.sum([True,True,True,False]==False)

0

In [17]:
np.sum(dataset['winPlacePerc'].isna()==True)

1

In [18]:
dataset['winPlacePerc'].count() # count no NAs

4446965

In [50]:
dataset[dataset['winPlacePerc'].isna()==True]['matchId']

2744604    224a123c53e008
Name: matchId, dtype: object

In [55]:
dataset.loc[dataset['matchId'] == '224a123c53e008']

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
2744604,f70c74418bb064,12dfbede33f92b,224a123c53e008,0,0,0.0,0,0,0,1,...,0,0.0,0,0.0,0,0,0.0,0,0,


In [60]:
dataset['matchId'][2744604]

'224a123c53e008'

In [25]:
newd = dataset[dataset['winPlacePerc'].isna()==False]

In [26]:
newd.shape

(4446965, 29)

In [27]:
trainDataset = dataset
trainDataset = trainDataset.dropna(subset=['winPlacePerc','killPoints']) 
trainDataset.shape

(4446965, 29)

In [28]:
trainDataset = trainDataset.dropna()
trainDataset.shape

(4446965, 29)

In [20]:
dataset.shape

(4446966, 29)

In [22]:
print(dataset.columns.values)
print(dataset.columns)

['Id' 'groupId' 'matchId' 'assists' 'boosts' 'damageDealt' 'DBNOs'
 'headshotKills' 'heals' 'killPlace' 'killPoints' 'kills' 'killStreaks'
 'longestKill' 'matchDuration' 'matchType' 'maxPlace' 'numGroups'
 'rankPoints' 'revives' 'rideDistance' 'roadKills' 'swimDistance'
 'teamKills' 'vehicleDestroys' 'walkDistance' 'weaponsAcquired'
 'winPoints' 'winPlacePerc']
Index(['Id', 'groupId', 'matchId', 'assists', 'boosts', 'damageDealt', 'DBNOs',
       'headshotKills', 'heals', 'killPlace', 'killPoints', 'kills',
       'killStreaks', 'longestKill', 'matchDuration', 'matchType', 'maxPlace',
       'numGroups', 'rankPoints', 'revives', 'rideDistance', 'roadKills',
       'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance',
       'weaponsAcquired', 'winPoints', 'winPlacePerc'],
      dtype='object')


In [76]:
 dataset.columns.values[0]

'Id'

In [79]:
dataset.columns.values[0:-1]

array(['Id', 'groupId', 'matchId', 'assists', 'boosts', 'damageDealt',
       'DBNOs', 'headshotKills', 'heals', 'killPlace', 'killPoints',
       'kills', 'killStreaks', 'longestKill', 'matchDuration',
       'matchType', 'maxPlace', 'numGroups', 'rankPoints', 'revives',
       'rideDistance', 'roadKills', 'swimDistance', 'teamKills',
       'vehicleDestroys', 'walkDistance', 'weaponsAcquired', 'winPoints'],
      dtype=object)

In [29]:
newdf = dataset
newdf2 = newdf.dropna(subset=[dataset.columns.values[2]]) # index
newdf2.shape

(4446966, 29)