# Data Preprocessing

In [30]:
import pandas as pd
import numpy as np

In [31]:
df_train = pd.read_csv('train_V2.csv')

In [32]:
test_X = pd.read_csv('test_V2.csv')

### 1) Dealing with Null Values

In [33]:
df_train = df_train.dropna()
df_train['winPlacePerc'].isnull().sum()

0

### 2) Dealing with Outliers

In [34]:
# Concatenating two dataset in order to do data preprocessing in both sets
df = pd.concat([df_train,test_X],ignore_index = True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  


In [35]:
# detecting and dropping players' data with extreme killing counts
df.drop(df[df['kills'] > 30].index, inplace=True)
df.drop(df[df['roadKills'] > 10].index, inplace=True)

# detecting and dropping the killing without moving playes' data
df['totalDistance'] = df['rideDistance'] + df['walkDistance'] + df['swimDistance']
df['killsWithoutMoving'] = ((df['kills'] > 0) & (df['totalDistance'] == 0))
df.drop(df[df['killsWithoutMoving'] == True].index, inplace=True)

# detecting and dropping players' data with extreme long distance kills
df.drop(df[df['longestKill'] >= 1000].index, inplace=True)

# detecting and dropping players' data with extreme moving distance
df.drop(df[df['walkDistance'] >= 10000].index, inplace=True)
df.drop(df[df['rideDistance'] >= 20000].index, inplace=True)
df.drop(df[df['swimDistance'] >= 2000].index, inplace=True)

# deleting extreme damageDealt data
df.drop(df[df['damageDealt'] >= (np.percentile(df['damageDealt'], 99))].index, inplace=True)

# deleting extreme acquired weapons and heals data
df.drop(df[df['weaponsAcquired'] >= (np.percentile(df['weaponsAcquired'], 99))].index, inplace=True)
df.drop(df[df['heals'] >= (np.percentile(df['heals'], 99))].index, inplace=True)

In [36]:
# drop unuseful features
df = df.drop(['killsWithoutMoving'],axis=1)

### 3) Normalizing Data

In [8]:
# turning object values into categorical values and get category coding
df['groupId'] = df['groupId'].astype('category')
df['matchId'] = df['matchId'].astype('category')

df['groupId_cat'] = df['groupId'].cat.codes
df['matchId_cat'] = df['matchId'].cat.codes

df.drop(columns=['groupId', 'matchId'], inplace=True)

# Because the test set contains different Ids, Id column won't be useful to the machine learning algorithm
df.drop(columns = ['Id'], inplace=True)

In [9]:
df = pd.get_dummies(df, columns = ['matchType'])

### 4) Splitting Data for Modeling

In [10]:
# split the train and test set in order to do EDA
df_train_pro = df[~df['winPlacePerc'].isnull()]
test_X_pro = df[df['winPlacePerc'].isnull()]
print(df_train_pro.shape)
print(test_X_pro.shape)

(4276555, 44)
(1862905, 44)


In [11]:
test_X_pro = test_X_pro.drop(['winPlacePerc'],axis=1)

# Neural Network Model

In [15]:
!pip install torch torchvision

[33mYou are using pip version 18.0, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [16]:
#Let's get rid of some imports
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
#Define the model 
import torch
import torch.nn as nn
import torch.nn.functional as F

In [17]:
#This will throw and error at import if haven't upgraded. 
# from sklearn.cross_validation  import train_test_split  
from sklearn.model_selection  import train_test_split
#y is the dependent variable.
y = df_train_pro['winPlacePerc']
#As we know, iloc is used to slice the array by index number. Here this is the matrix of 
#independent variables.
X = df_train_pro.iloc[:,0:43]

# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(2993588, 43) (1282967, 43) (2993588,) (1282967,)


In [18]:
#Define training hyperprameters.
batch_size = 50
num_epochs = 200
learning_rate = 0.01
size_hidden= 100

#Calculate some other hyperparameters based on data.  
batch_no = len(X_train) // batch_size  #batches
cols=X_train.shape[1] #Number of columns in input matrix
n_output=1


In [19]:
#Create the model
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Assume that we are on a CUDA machine, then this should print a CUDA device:
print("Executing the model on :",device)
class Net(torch.nn.Module):
    def __init__(self, n_feature, size_hidden, n_output):
        super(Net, self).__init__()
        self.hidden = torch.nn.Linear(cols, size_hidden)   # hidden layer
        self.predict = torch.nn.Linear(size_hidden, n_output)   # output layer

    def forward(self, x):
        x = F.relu(self.hidden(x))      # activation function for hidden layer
        x = self.predict(x)             # linear output
        return x
net = Net(cols, size_hidden, n_output)

Executing the model on : cpu


In [20]:
#Adam is a specific flavor of gradient decent which is typically better
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
#optimizer = torch.optim.SGD(net.parameters(), lr=0.2)
criterion = torch.nn.MSELoss(size_average=False)  # this is for regression mean squared loss



In [21]:
#Change to numpy arraay. 
X_train=X_train.values
y_train=y_train.values
X_test=X_test.values
y_test=y_test.values

In [22]:
from sklearn.utils import shuffle
from torch.autograd import Variable
running_loss = 0.0
for epoch in range(num_epochs):
    #Shuffle just mixes up the dataset between epocs
    X_train, y_train = shuffle(X_train, y_train)
    # Mini batch learning
    for i in range(batch_no):
        start = i * batch_size
        end = start + batch_size
        inputs = Variable(torch.FloatTensor(X_train[start:end]))
        labels = Variable(torch.FloatTensor(y_train[start:end]))
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        #print("outputs",outputs)
        #print("outputs",outputs,outputs.shape,"labels",labels, labels.shape)
        loss = criterion(outputs, torch.unsqueeze(labels,dim=1))
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        
    print('Epoch {}'.format(epoch+1), "loss: ",running_loss)
    running_loss = 0.0

Epoch 1 loss:  1052536289502.3394
Epoch 2 loss:  296830709.6415849
Epoch 3 loss:  276931.01168203354
Epoch 4 loss:  276921.15700507164
Epoch 5 loss:  276922.1037712097
Epoch 6 loss:  276915.3274667263
Epoch 7 loss:  276922.0236167908
Epoch 8 loss:  276905.35077905655
Epoch 9 loss:  276929.7626488209
Epoch 10 loss:  276911.1831550598
Epoch 11 loss:  276900.1951057911
Epoch 12 loss:  276913.04455804825
Epoch 13 loss:  276926.94372558594
Epoch 14 loss:  276896.97618603706
Epoch 15 loss:  276928.90104842186
Epoch 16 loss:  276934.49557590485
Epoch 17 loss:  276909.43905878067
Epoch 18 loss:  276921.11840867996
Epoch 19 loss:  276918.04668188095
Epoch 20 loss:  276920.5670449734
Epoch 21 loss:  276912.0158083439
Epoch 22 loss:  276929.18187737465
Epoch 23 loss:  276917.5998892784
Epoch 24 loss:  276907.48375701904
Epoch 25 loss:  276922.4272685051
Epoch 26 loss:  276911.09183335304
Epoch 27 loss:  276890.95586538315
Epoch 28 loss:  276903.0332374573
Epoch 29 loss:  276897.92585778236
Epoch 

In [23]:
import pandas as pd
from sklearn.metrics import r2_score

X = Variable(torch.FloatTensor(X_train)) 
result = net(X)
pred=result.data[:,0].numpy()
print(len(pred),len(y_train))
r2_score(pred,y_train)

2993588 2993588


-95830221310.6652

In [24]:
import pandas as pd
from sklearn.metrics import r2_score
#This is a little bit tricky to get the resulting prediction.  
def calculate_r2(x,y=[]):
    """
    This function will return the r2 if passed x and y or return predictions if just passed x. 
    """
    # Evaluate the model with the test set. 
    X = Variable(torch.FloatTensor(x))  
    result = net(X) #This outputs the value for regression
    result=result.data[:,0].numpy()
  
    if len(y) != 0:
        r2=r2_score(result, y)
        print("R-Squared", r2)
        #print('Accuracy {:.2f}'.format(num_right / len(y)), "for a total of ", len(y), "records")
        return pd.DataFrame(data= {'actual': y, 'predicted': result})
    else:
        print("returning predictions")
        return result

In [25]:
result1=calculate_r2(X_train,y_train)
result2=calculate_r2(X_test,y_test)

R-Squared -95830221310.6652
R-Squared -473592.1055717208


# LinearRegression Model

In [26]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit( X_train, y_train )

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [27]:
print('R2 for Train)', lm.score( X_train, y_train ))
print('R2 for Test (cross validation)', lm.score(X_test, y_test))

R2 for Train) 1.0
R2 for Test (cross validation) 1.0
