Import Libraries

In [35]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import pubg_model_functions
from sklearn.ensemble import RandomForestRegressor

Load Data

In [2]:
trainData = pd.read_csv('train_V2.csv')
testData = pd.read_csv('test_V2.csv')
print(trainData.shape, testData.shape)

(8999, 29) (7505, 28)


In [3]:
trainData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8999 entries, 0 to 8998
Data columns (total 29 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               8999 non-null   object 
 1   groupId          8999 non-null   object 
 2   matchId          8999 non-null   object 
 3   assists          8999 non-null   int64  
 4   boosts           8999 non-null   int64  
 5   damageDealt      8999 non-null   float64
 6   DBNOs            8999 non-null   int64  
 7   headshotKills    8999 non-null   int64  
 8   heals            8999 non-null   int64  
 9   killPlace        8999 non-null   int64  
 10  killPoints       8999 non-null   int64  
 11  kills            8999 non-null   int64  
 12  killStreaks      8999 non-null   int64  
 13  longestKill      8999 non-null   float64
 14  matchDuration    8999 non-null   int64  
 15  matchType        8999 non-null   object 
 16  maxPlace         8999 non-null   int64  
 17  numGroups     

In [4]:
testData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7505 entries, 0 to 7504
Data columns (total 28 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               7505 non-null   object 
 1   groupId          7505 non-null   object 
 2   matchId          7505 non-null   object 
 3   assists          7505 non-null   int64  
 4   boosts           7505 non-null   int64  
 5   damageDealt      7505 non-null   float64
 6   DBNOs            7505 non-null   int64  
 7   headshotKills    7505 non-null   int64  
 8   heals            7505 non-null   int64  
 9   killPlace        7505 non-null   int64  
 10  killPoints       7505 non-null   int64  
 11  kills            7505 non-null   int64  
 12  killStreaks      7505 non-null   int64  
 13  longestKill      7505 non-null   float64
 14  matchDuration    7505 non-null   int64  
 15  matchType        7505 non-null   object 
 16  maxPlace         7505 non-null   int64  
 17  numGroups     

In [5]:
trainData.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
assists,8999.0,0.22847,0.57319,0.0,0.0,0.0,0.0,6.0
boosts,8999.0,1.129459,1.759368,0.0,0.0,0.0,2.0,11.0
damageDealt,8999.0,130.226078,170.845571,0.0,0.0,80.34,183.65,2325.0
DBNOs,8999.0,0.651961,1.132574,0.0,0.0,0.0,1.0,13.0
headshotKills,8999.0,0.226136,0.582475,0.0,0.0,0.0,0.0,8.0
heals,8999.0,1.375042,2.666197,0.0,0.0,0.0,2.0,29.0
killPlace,8999.0,47.900878,27.482678,1.0,24.0,48.0,72.0,100.0
killPoints,8999.0,507.42238,627.486182,0.0,0.0,0.0,1172.0,2026.0
kills,8999.0,0.922102,1.551745,0.0,0.0,0.0,1.0,21.0
killStreaks,8999.0,0.539504,0.713577,0.0,0.0,0.0,1.0,5.0


In [6]:
print(trainData.columns)
print(testData.columns)

Index(['Id', 'groupId', 'matchId', 'assists', 'boosts', 'damageDealt', 'DBNOs',
       'headshotKills', 'heals', 'killPlace', 'killPoints', 'kills',
       'killStreaks', 'longestKill', 'matchDuration', 'matchType', 'maxPlace',
       'numGroups', 'rankPoints', 'revives', 'rideDistance', 'roadKills',
       'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance',
       'weaponsAcquired', 'winPoints', 'winPlacePerc'],
      dtype='object')
Index(['Id', 'groupId', 'matchId', 'assists', 'boosts', 'damageDealt', 'DBNOs',
       'headshotKills', 'heals', 'killPlace', 'killPoints', 'kills',
       'killStreaks', 'longestKill', 'matchDuration', 'matchType', 'maxPlace',
       'numGroups', 'rankPoints', 'revives', 'rideDistance', 'roadKills',
       'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance',
       'weaponsAcquired', 'winPoints'],
      dtype='object')


In [7]:
# Merge all data together to ahve better data set
pubgData = pd.concat([trainData, testData], axis =0)
print(pubgData.shape)
pubgData.head()

(16504, 29)


Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,...,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,...,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,47,...,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.9,0,0,0,75,...,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,45,...,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


In [8]:
pubgData.info()
#none of the input has Null data

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16504 entries, 0 to 7504
Data columns (total 29 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               16504 non-null  object 
 1   groupId          16504 non-null  object 
 2   matchId          16504 non-null  object 
 3   assists          16504 non-null  int64  
 4   boosts           16504 non-null  int64  
 5   damageDealt      16504 non-null  float64
 6   DBNOs            16504 non-null  int64  
 7   headshotKills    16504 non-null  int64  
 8   heals            16504 non-null  int64  
 9   killPlace        16504 non-null  int64  
 10  killPoints       16504 non-null  int64  
 11  kills            16504 non-null  int64  
 12  killStreaks      16504 non-null  int64  
 13  longestKill      16504 non-null  float64
 14  matchDuration    16504 non-null  int64  
 15  matchType        16504 non-null  object 
 16  maxPlace         16504 non-null  int64  
 17  numGroups    

In [9]:
pubgData.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
assists,16504.0,0.228975,0.571904,0.0,0.0,0.0,0.0,10.0
boosts,16504.0,1.124939,1.748059,0.0,0.0,0.0,2.0,13.0
damageDealt,16504.0,131.352608,170.177824,0.0,0.0,82.62,186.425,2325.0
DBNOs,16504.0,0.663112,1.147407,0.0,0.0,0.0,1.0,13.0
headshotKills,16504.0,0.231338,0.60411,0.0,0.0,0.0,0.0,12.0
heals,16504.0,1.391784,2.704147,0.0,0.0,0.0,2.0,32.0
killPlace,16504.0,47.657416,27.556983,1.0,24.0,47.0,71.0,100.0
killPoints,16504.0,509.873425,628.601707,0.0,0.0,0.0,1178.0,2033.0
kills,16504.0,0.933047,1.550793,0.0,0.0,0.0,1.0,21.0
killStreaks,16504.0,0.546352,0.716346,0.0,0.0,0.0,1.0,9.0


In [10]:
# Drop any null values if any
# dropna
pubgData.dropna(inplace=True)

nullCount = pubgData.isnull().sum().sort_values()
print('Null count : ', nullCount[nullCount > 0])


Null count :  Series([], dtype: int64)


Feature Engineering

In [11]:
# Create new features 
# for example total distance travelled includes walking, driving, swimming
# total health including boost and assist from others
pubgData['totalDistance'] = pubgData['rideDistance'] + pubgData['walkDistance'] + pubgData['swimDistance']
pubgData['healthItems'] = pubgData['heals'] + pubgData['boosts']
pubgData['headshotKillRate'] = pubgData['headshotKills'] / pubgData['kills']
pubgData['killPlaceOverMaxPlace'] = pubgData['killPlace'] / pubgData['maxPlace']
pubgData['killsOverWalkDistance'] = pubgData['kills'] / pubgData['walkDistance']

# fill missing or Inf data with 0
#fillInf(pubgData, 0)

In [12]:
# Rank as percentage
match = pubgData.groupby('matchId')
pubgData['killsPerc'] = match['kills'].rank(pct=True).values
pubgData['killPlacePerc'] = match['killPlace'].rank(pct=True).values
pubgData['walkDistancePerc'] = match['walkDistance'].rank(pct=True).values
pubgData['walkPerc_killsPerc'] = pubgData['walkDistancePerc'] / pubgData['killsPerc']


In [13]:
pubgData.columns

Index(['Id', 'groupId', 'matchId', 'assists', 'boosts', 'damageDealt', 'DBNOs',
       'headshotKills', 'heals', 'killPlace', 'killPoints', 'kills',
       'killStreaks', 'longestKill', 'matchDuration', 'matchType', 'maxPlace',
       'numGroups', 'rankPoints', 'revives', 'rideDistance', 'roadKills',
       'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance',
       'weaponsAcquired', 'winPoints', 'winPlacePerc', 'totalDistance',
       'healthItems', 'headshotKillRate', 'killPlaceOverMaxPlace',
       'killsOverWalkDistance', 'killsPerc', 'killPlacePerc',
       'walkDistancePerc', 'walkPerc_killsPerc'],
      dtype='object')

In [14]:
# Drop Features which are irrrelevant
pubgData = pubgData.drop(['boosts', 'heals', 'revives', 'assists', 'headshotKills'], axis = 1)
pubgData = pubgData.drop(['roadKills', 'teamKills', 'rideDistance', 'walkDistance', 'swimDistance'], axis = 1)
pubgData = pubgData.drop(['Id', 'matchDuration', 'longestKill', 'rankPoints', 'killPoints', 'vehicleDestroys'], axis = 1)

In [15]:
pubgData.columns

Index(['groupId', 'matchId', 'damageDealt', 'DBNOs', 'killPlace', 'kills',
       'killStreaks', 'matchType', 'maxPlace', 'numGroups', 'weaponsAcquired',
       'winPoints', 'winPlacePerc', 'totalDistance', 'healthItems',
       'headshotKillRate', 'killPlaceOverMaxPlace', 'killsOverWalkDistance',
       'killsPerc', 'killPlacePerc', 'walkDistancePerc', 'walkPerc_killsPerc'],
      dtype='object')

In [16]:
pubgData.head()

Unnamed: 0,groupId,matchId,damageDealt,DBNOs,killPlace,kills,killStreaks,matchType,maxPlace,numGroups,...,winPlacePerc,totalDistance,healthItems,headshotKillRate,killPlaceOverMaxPlace,killsOverWalkDistance,killsPerc,killPlacePerc,walkDistancePerc,walkPerc_killsPerc
0,4d4b580de459be,a10357fd1a4a91,0.0,0,60,0,0,squad-fpp,28,26,...,0.4444,244.8,0,,2.142857,0.0,1.0,1.0,1.0,1.0
1,684d5656442f9e,aeb375fc57110c,91.47,0,57,0,0,squad-fpp,26,25,...,0.64,1445.0445,0,,2.192308,0.0,0.5,1.0,1.0,2.0
2,6a4a42c3245a74,110163d8bb94ae,68.0,0,47,0,0,duo,50,47,...,0.7755,161.8,0,,0.94,0.0,1.0,1.0,1.0,1.0
3,a930a9c79cd721,f1f1f4ef412d7e,32.9,0,75,0,0,squad-fpp,31,30,...,0.1667,202.7,0,,2.419355,0.0,0.5,1.0,0.333333,0.666667
4,de04010b3458dd,6dc8ff871e21e6,100.0,0,45,1,1,solo-fpp,97,95,...,0.1875,49.75,0,0.0,0.463918,0.020101,1.0,1.0,1.0,1.0


In [17]:
# Group all data type into one
# groupData = pubgData.groupby(['matchId','groupId','matchType'])

In [18]:
# Drop any feaure which has constant values across all data points as it will not add any 
# significance to model
constant_column = [col for col in pubgData.columns if pubgData[col].nunique() == 1]
print('drop columns:', constant_column)
pubgData.drop(constant_column, axis=1, inplace=True)

drop columns: []


In [19]:
pubgData.head()

Unnamed: 0,groupId,matchId,damageDealt,DBNOs,killPlace,kills,killStreaks,matchType,maxPlace,numGroups,...,winPlacePerc,totalDistance,healthItems,headshotKillRate,killPlaceOverMaxPlace,killsOverWalkDistance,killsPerc,killPlacePerc,walkDistancePerc,walkPerc_killsPerc
0,4d4b580de459be,a10357fd1a4a91,0.0,0,60,0,0,squad-fpp,28,26,...,0.4444,244.8,0,,2.142857,0.0,1.0,1.0,1.0,1.0
1,684d5656442f9e,aeb375fc57110c,91.47,0,57,0,0,squad-fpp,26,25,...,0.64,1445.0445,0,,2.192308,0.0,0.5,1.0,1.0,2.0
2,6a4a42c3245a74,110163d8bb94ae,68.0,0,47,0,0,duo,50,47,...,0.7755,161.8,0,,0.94,0.0,1.0,1.0,1.0,1.0
3,a930a9c79cd721,f1f1f4ef412d7e,32.9,0,75,0,0,squad-fpp,31,30,...,0.1667,202.7,0,,2.419355,0.0,0.5,1.0,0.333333,0.666667
4,de04010b3458dd,6dc8ff871e21e6,100.0,0,45,1,1,solo-fpp,97,95,...,0.1875,49.75,0,0.0,0.463918,0.020101,1.0,1.0,1.0,1.0


In [20]:
# We have several match type 
# for example solo, duo, squad
# Do one hot encoding for categorical variables
# Encode simialr label for match type into one.
# Example of mapping is below
'''
solo  <-- solo,solo-fpp,normal-solo,normal-solo-fpp
duo   <-- duo,duo-fpp,normal-duo,normal-duo-fpp,crashfpp,crashtpp
squad <-- squad,squad-fpp,normal-squad,normal-squad-fpp
'''
mapper = lambda x: 'solo' if ('solo' in x) else 'duo' if ('duo' in x) or ('squad' in x) else 'squad'
pubgData['matchType'] = pubgData['matchType'].apply(mapper)
pubgData.head()

Unnamed: 0,groupId,matchId,damageDealt,DBNOs,killPlace,kills,killStreaks,matchType,maxPlace,numGroups,...,winPlacePerc,totalDistance,healthItems,headshotKillRate,killPlaceOverMaxPlace,killsOverWalkDistance,killsPerc,killPlacePerc,walkDistancePerc,walkPerc_killsPerc
0,4d4b580de459be,a10357fd1a4a91,0.0,0,60,0,0,duo,28,26,...,0.4444,244.8,0,,2.142857,0.0,1.0,1.0,1.0,1.0
1,684d5656442f9e,aeb375fc57110c,91.47,0,57,0,0,duo,26,25,...,0.64,1445.0445,0,,2.192308,0.0,0.5,1.0,1.0,2.0
2,6a4a42c3245a74,110163d8bb94ae,68.0,0,47,0,0,duo,50,47,...,0.7755,161.8,0,,0.94,0.0,1.0,1.0,1.0,1.0
3,a930a9c79cd721,f1f1f4ef412d7e,32.9,0,75,0,0,duo,31,30,...,0.1667,202.7,0,,2.419355,0.0,0.5,1.0,0.333333,0.666667
4,de04010b3458dd,6dc8ff871e21e6,100.0,0,45,1,1,solo,97,95,...,0.1875,49.75,0,0.0,0.463918,0.020101,1.0,1.0,1.0,1.0


In [21]:
pubgData.columns

Index(['groupId', 'matchId', 'damageDealt', 'DBNOs', 'killPlace', 'kills',
       'killStreaks', 'matchType', 'maxPlace', 'numGroups', 'weaponsAcquired',
       'winPoints', 'winPlacePerc', 'totalDistance', 'healthItems',
       'headshotKillRate', 'killPlaceOverMaxPlace', 'killsOverWalkDistance',
       'killsPerc', 'killPlacePerc', 'walkDistancePerc', 'walkPerc_killsPerc'],
      dtype='object')

In [22]:
categoricalDummies = pd.get_dummies(pubgData['matchType'], prefix='matchType') # This can be done by OneHotEncoder class from sklearn as well
pubgData = pd.concat([pubgData, categoricalDummies], axis = 1)
pubgData.head()

Unnamed: 0,groupId,matchId,damageDealt,DBNOs,killPlace,kills,killStreaks,matchType,maxPlace,numGroups,...,headshotKillRate,killPlaceOverMaxPlace,killsOverWalkDistance,killsPerc,killPlacePerc,walkDistancePerc,walkPerc_killsPerc,matchType_duo,matchType_solo,matchType_squad
0,4d4b580de459be,a10357fd1a4a91,0.0,0,60,0,0,duo,28,26,...,,2.142857,0.0,1.0,1.0,1.0,1.0,1,0,0
1,684d5656442f9e,aeb375fc57110c,91.47,0,57,0,0,duo,26,25,...,,2.192308,0.0,0.5,1.0,1.0,2.0,1,0,0
2,6a4a42c3245a74,110163d8bb94ae,68.0,0,47,0,0,duo,50,47,...,,0.94,0.0,1.0,1.0,1.0,1.0,1,0,0
3,a930a9c79cd721,f1f1f4ef412d7e,32.9,0,75,0,0,duo,31,30,...,,2.419355,0.0,0.5,1.0,0.333333,0.666667,1,0,0
4,de04010b3458dd,6dc8ff871e21e6,100.0,0,45,1,1,solo,97,95,...,0.0,0.463918,0.020101,1.0,1.0,1.0,1.0,0,1,0


In [23]:
pubgData.columns

Index(['groupId', 'matchId', 'damageDealt', 'DBNOs', 'killPlace', 'kills',
       'killStreaks', 'matchType', 'maxPlace', 'numGroups', 'weaponsAcquired',
       'winPoints', 'winPlacePerc', 'totalDistance', 'healthItems',
       'headshotKillRate', 'killPlaceOverMaxPlace', 'killsOverWalkDistance',
       'killsPerc', 'killPlacePerc', 'walkDistancePerc', 'walkPerc_killsPerc',
       'matchType_duo', 'matchType_solo', 'matchType_squad'],
      dtype='object')

In [24]:
# drop matchId,groupId
pubgData.drop(['matchId','groupId', 'matchType'], axis=1, inplace=True)

In [25]:
pubgData.columns

Index(['damageDealt', 'DBNOs', 'killPlace', 'kills', 'killStreaks', 'maxPlace',
       'numGroups', 'weaponsAcquired', 'winPoints', 'winPlacePerc',
       'totalDistance', 'healthItems', 'headshotKillRate',
       'killPlaceOverMaxPlace', 'killsOverWalkDistance', 'killsPerc',
       'killPlacePerc', 'walkDistancePerc', 'walkPerc_killsPerc',
       'matchType_duo', 'matchType_solo', 'matchType_squad'],
      dtype='object')

In [26]:
pubgData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8999 entries, 0 to 8998
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   damageDealt            8999 non-null   float64
 1   DBNOs                  8999 non-null   int64  
 2   killPlace              8999 non-null   int64  
 3   kills                  8999 non-null   int64  
 4   killStreaks            8999 non-null   int64  
 5   maxPlace               8999 non-null   int64  
 6   numGroups              8999 non-null   int64  
 7   weaponsAcquired        8999 non-null   int64  
 8   winPoints              8999 non-null   int64  
 9   winPlacePerc           8999 non-null   float64
 10  totalDistance          8999 non-null   float64
 11  healthItems            8999 non-null   int64  
 12  headshotKillRate       3829 non-null   float64
 13  killPlaceOverMaxPlace  8999 non-null   float64
 14  killsOverWalkDistance  8805 non-null   float64
 15  kill

Train and test data

In [27]:
# Remove the entries tending towards infinite value or out of float range
pubgData = pubg_model_functions.clean_dataset(pubgData)

In [28]:
from sklearn.model_selection import train_test_split
X = pubgData.drop(['winPlacePerc'], axis=1)
y = pubgData['winPlacePerc']
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.25)
print(type(X))
print(type(y))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [29]:
print(type(X_train), type(y_train), type(X_test), type(y_test))

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'> <class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>


In [30]:
X_train

Unnamed: 0,damageDealt,DBNOs,killPlace,kills,killStreaks,maxPlace,numGroups,weaponsAcquired,winPoints,totalDistance,...,headshotKillRate,killPlaceOverMaxPlace,killsOverWalkDistance,killsPerc,killPlacePerc,walkDistancePerc,walkPerc_killsPerc,matchType_duo,matchType_solo,matchType_squad
77,243.1,1.0,29.0,1.0,1.0,47.0,45.0,4.0,0.0,1749.00,...,1.000000,0.617021,0.000572,1.0,1.0,1.0,1.0,1.0,0.0,0.0
175,151.6,1.0,34.0,1.0,1.0,27.0,27.0,2.0,0.0,179.00,...,0.000000,1.259259,0.005587,1.0,1.0,1.0,1.0,1.0,0.0,0.0
7352,200.0,2.0,22.0,2.0,2.0,25.0,25.0,1.0,0.0,166.20,...,0.000000,0.880000,0.012034,1.0,1.0,1.0,1.0,1.0,0.0,0.0
1835,405.2,3.0,7.0,3.0,3.0,28.0,28.0,5.0,1500.0,2343.00,...,0.000000,0.250000,0.001280,1.0,1.0,1.0,1.0,1.0,0.0,0.0
4371,410.1,1.0,7.0,3.0,2.0,28.0,28.0,6.0,1563.0,7086.00,...,0.333333,0.250000,0.001381,1.0,1.0,1.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7941,100.0,1.0,39.0,1.0,1.0,48.0,48.0,1.0,1488.0,38.53,...,0.000000,0.812500,0.025954,1.0,1.0,1.0,1.0,1.0,0.0,0.0
5247,137.1,0.0,10.0,2.0,1.0,27.0,26.0,5.0,1467.0,1916.60,...,0.000000,0.370370,0.001245,1.0,1.0,1.0,1.0,1.0,0.0,0.0
5334,219.3,0.0,9.0,2.0,1.0,29.0,27.0,7.0,1612.0,5792.00,...,0.000000,0.310345,0.001035,1.0,1.0,1.0,1.0,1.0,0.0,0.0
2620,400.0,2.0,6.0,4.0,2.0,46.0,44.0,3.0,0.0,1607.90,...,0.250000,0.130435,0.006360,1.0,1.0,1.0,1.0,1.0,0.0,0.0


Apply Normalisation

In [31]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [32]:
print(X_train.shape)
print(X_test.shape)
print(type(X_train))
print(type(y_train))

(2870, 21)
(957, 21)
<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>


Build ANN using Sequential and Layers class

In [33]:
X_train

array([[-0.0464621 , -0.25409067,  0.67120847, ...,  0.45262552,
        -0.45150754, -0.02640739],
       [-0.51722879, -0.25409067,  1.08126601, ...,  0.45262552,
        -0.45150754, -0.02640739],
       [-0.26821122,  0.46650244,  0.09712791, ...,  0.45262552,
        -0.45150754, -0.02640739],
       ...,
       [-0.16891289, -0.97468379, -0.96902168, ...,  0.45262552,
        -0.45150754, -0.02640739],
       [ 0.760787  ,  0.46650244, -1.21505621, ...,  0.45262552,
        -0.45150754, -0.02640739],
       [ 1.18370527,  0.46650244, -1.05103319, ...,  0.45262552,
        -0.45150754, -0.02640739]])

In [36]:
# Convert N-d array to 2-d array as it creates matrix multiplication issue in Sequentail layers
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1]))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1]))

In [37]:
print(X_train.shape)
print(X_test.shape)

(2870, 21)
(957, 21)


In [38]:
type(X_train.shape[1])

int

In [37]:
model = RandomForestRegressor(n_estimators=100, criterion='squared_error', random_state=42)

In [49]:
model.fit(X_train, y_train)

Predict results

In [50]:
y_pred = model.predict(X_test)

print(X_test.shape)
print(y_pred.shape)


(957, 21)
(957,)


Accuracy

In [51]:
from sklearn.metrics import r2_score
r2Score = r2_score(y_test, y_pred)
r2Score

0.7734070216434055

In [54]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_pred)
mse

0.019738690883614424