In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import models, layers
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Import our datasets

train_csv = pd.read_csv('dota2Train.csv')
test_csv = pd.read_csv('dota2Test.csv')

In [3]:
train_columns = ['feature_{}'.format(x) for x in range(1, train_csv.shape[1] + 1)]
train_columns[0] = 'target_feature'
train_csv.columns = train_columns
train_csv.head()

Unnamed: 0,target_feature,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_108,feature_109,feature_110,feature_111,feature_112,feature_113,feature_114,feature_115,feature_116,feature_117
0,1,152,2,2,0,0,0,1,0,-1,...,0,0,0,0,0,0,0,0,0,0
1,1,131,2,2,0,0,0,1,0,-1,...,0,0,0,0,0,0,0,0,0,0
2,1,154,2,2,0,0,0,0,0,0,...,-1,0,0,0,0,0,0,0,0,0
3,-1,171,2,3,0,0,0,0,0,-1,...,0,0,0,0,0,0,0,0,0,0
4,1,122,2,3,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,-1


In [4]:
test_columns = ['feature_{}'.format(x) for x in range(1, test_csv.shape[1] + 1)]
test_columns[0] = 'target_feature'
test_csv.columns = test_columns
test_csv.head()

Unnamed: 0,target_feature,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_108,feature_109,feature_110,feature_111,feature_112,feature_113,feature_114,feature_115,feature_116,feature_117
0,1,227,8,2,0,0,0,0,0,0,...,-1,0,0,0,0,0,0,0,0,0
1,-1,136,2,2,1,0,0,0,-1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,227,2,2,-1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,184,2,3,0,0,0,-1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,231,2,2,0,0,0,0,0,0,...,0,0,0,0,0,0,-1,0,0,0


In [5]:
# concatenate the two datasets into one

raw_dota = train_csv.append(test_csv, ignore_index=True, sort=False)
raw_dota.head()

Unnamed: 0,target_feature,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_108,feature_109,feature_110,feature_111,feature_112,feature_113,feature_114,feature_115,feature_116,feature_117
0,1,152,2,2,0,0,0,1,0,-1,...,0,0,0,0,0,0,0,0,0,0
1,1,131,2,2,0,0,0,1,0,-1,...,0,0,0,0,0,0,0,0,0,0
2,1,154,2,2,0,0,0,0,0,0,...,-1,0,0,0,0,0,0,0,0,0
3,-1,171,2,3,0,0,0,0,0,-1,...,0,0,0,0,0,0,0,0,0,0
4,1,122,2,3,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,-1


In [6]:
# Let's check how balanced our target data is

raw_dota['target_feature'].value_counts()

 1    54284
-1    48658
Name: target_feature, dtype: int64

### Feature 2 relevance

Let's find out the relevance of place id, to the winning team, and remove it if none. This is essential, because since this column `feature_2` is categorical and is comprised of many values, it would be extremely costly if we had to create dummy variables for all these ID's and a huge improvement and dimensionality reduction if we could remove this safely, by proving that the winning team (1 or -1) has nothing to do with the ID.

In [7]:
# Let's plot feature 2 realations

group_placeids_by_winning_team = raw_dota.groupby(['feature_2', 'target_feature'])['target_feature'].count()

group_placeids_by_winning_team

feature_2  target_feature
111        -1                 427
            1                 491
112        -1                 439
            1                 462
121        -1                 354
            1                 434
122        -1                 346
            1                 390
123        -1                 345
            1                 403
124        -1                 353
            1                 370
131        -1                 393
            1                 427
132        -1                 409
            1                 436
133        -1                 397
            1                 446
134        -1                 348
            1                 389
135        -1                 335
            1                 353
136        -1                 334
            1                 354
137        -1                 301
            1                 386
138        -1                 377
            1                 398
144        -1         

### Quick Note

We could actually plot or take a sample, but by examining one by one, we can see that almost all outcomes per given place are very close to 50-50 percent, or at least within any statistical threshold, enough to deny any existential correlation.

#### Extra note

A possible method here, could be a **chi-square test** for hypothesis testing. Were we can set all expected values on 50-50 per 
given place, and see how likely is for the observed values to fit our hyppothesis.

In [8]:
# drop feature_2 columns as it has no significant impact location

raw_dota.drop(columns='feature_2', inplace=True)

In [9]:
def make_binary(x):
    return 0 if x == -1 else 1

In [10]:
raw_dota['target_feature'] = raw_dota['target_feature'].apply(make_binary)

In [11]:
# Create the train test  sets by 80-20% proportion

raw_shuffled = raw_dota.sample(frac=1, random_state=101).reset_index(drop=True)
raw_shuffled.head()

Unnamed: 0,target_feature,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,...,feature_108,feature_109,feature_110,feature_111,feature_112,feature_113,feature_114,feature_115,feature_116,feature_117
0,0,2,3,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,8,2,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,2,3,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,2,2,0,-1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,2,3,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
dota_refined = pd.get_dummies(raw_shuffled, columns=test_columns[2:])
dota_refined.head(5)

Unnamed: 0,target_feature,feature_3_1,feature_3_2,feature_3_3,feature_3_4,feature_3_5,feature_3_6,feature_3_7,feature_3_8,feature_3_9,...,feature_114_1,feature_115_-1,feature_115_0,feature_115_1,feature_116_-1,feature_116_0,feature_116_1,feature_117_-1,feature_117_0,feature_117_1
0,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,1,0
1,1,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,1,0,0,1,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,1,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,1,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,1,0


In [23]:
dota_size = dota_refined.shape[0]

train_size = int(dota_size * 0.8)
test_size = dota_size - train_size

print(train_size)
print(test_size)

82353
20589


In [24]:
train_set = dota_refined.loc[0:train_size]
test_set = dota_refined.loc[train_size:dota_size]
print(train_set.shape)
print(test_set.shape)

(82354, 348)
(20589, 348)


In [25]:
Y_train = train_set['target_feature']
X_train = train_set.drop(columns='target_feature')
Y_test = test_set['target_feature']
X_test = test_set.drop(columns='target_feature')
print(X_train.shape)
print(X_test.shape)

(82354, 347)
(20589, 347)


In [28]:
model = models.Sequential()

model.add(layers.Input(shape=(347,)))

model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(256, activation='relu'))

model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer=keras.optimizers.RMSprop(learning_rate=1e-3), loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_18 (Dense)             (None, 256)               89088     
_________________________________________________________________
dense_19 (Dense)             (None, 256)               65792     
_________________________________________________________________
dense_20 (Dense)             (None, 256)               65792     
_________________________________________________________________
dense_21 (Dense)             (None, 256)               65792     
_________________________________________________________________
dense_22 (Dense)             (None, 256)               65792     
_________________________________________________________________
dense_23 (Dense)             (None, 256)               65792     
_________________________________________________________________
dense_24 (Dense)             (None, 256)              

## Model Fit

After preprocessing procedure is over, we can create the architecture of our ANN, and train our model. The combinations we tried were numerous and it would be impossible to depict them in as single jupyter notebook. Just as fact of evidence I will post that we tried all possible combinations by hypertuning the **following parameters**:

1. Number of hidden layers: 2, 3
2. Number of neurons on each hidden layer: 16, 32, 64, 128, 256
3. Activation function on hidden layers: relu, sigmoid, softmax, tanh
4. Output layer: sigmoid neuron with binary cross entropy as loss function, 2 softmax neurons with sparse_categorical_crossentropy as a loss function
5. optimizer: rmsprop, adam, stochastic gradient descent
6. Loss: binary_crossentropy, mse, sparse_categorical_crossentropy

### In other words 

We used a grid search of all values that were described. 

Most of the differences had a small impact on the outcome, but the best performing model on validation split accuracy is depicted below.

In [29]:
# We use a validation test in order to detect overfitting patterns.

model.fit(X_train, Y_train, batch_size=128, epochs=10, validation_split=0.2)

Train on 65883 samples, validate on 16471 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1d016785eb8>

### Quick Notes

We can see that even with the best fit params we could find, accuracy on validation set remains relatively low, and just merely over random choice at 58%.(not sure if is the dataset, the preprocessing or anything else though). 

Also after the sixth epoch the model tends to overfit, as the validation test accuracy remains stable, while train set accuracy get's higher.

In [21]:
test_loss, test_acc = model.evaluate(X_test, Y_test)
print('Accuracy in the testing data:', test_acc)

Accuracy in the testing data: 0.5694303


Test accuracy is also very close to our own validation set, which makes sense. 

### Traditional machine learning approach?

Let's see how another machine learning classifier will perform, and if it can do better. We will choose logistic regression because the dataset is huge and most of the times is the easiest to run (we have almost 100.000 rows and more than 300 columns),so anything else should take a lot of time to run on CPU. 

We could also use KNN but since we don't have too much of numerical data and almost all of the columns are categorical, we chose not too.

In [16]:
# use logistic regression as comparison 

lr_model = LogisticRegression(solver="saga", max_iter=1000, random_state=101).fit(X_train, Y_train)
lr_predictions = lr_model.predict(X_test)
print(accuracy_score(Y_test, lr_predictions))

0.6042061294866191


### Conclusion Note

With almost default params, seems like logistic regression attains the same level of accuracy with our neural network.