## Random Forest

Notebook with implementation of the Random Forest algorithm to predict victory in Dota 2

-------------------------------------------------------------------------------------------------------------------------------

Useful functions to use to explore the data and preprocessing steps before feeding the data into the algorithm:

* df.columns : to see the names of the columns (i.e., features)
* df.dtype : to see the types in the data
* data.head()
* data.info()
* df.describe()

## Time blowout matches

In [1]:
# Import necessary libraries
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve, auc
import statistics as st

In [2]:
# Directory for the time blowout group
cwd = os.getcwd()
root_directory = os.path.dirname(cwd)
time_blowout_data_dir = root_directory + "\\model_features_pre-match\\time_blowout\\"

### Exploration and preprocessing of the data

In [3]:
# Read data
feature_time_blowout_df = pd.read_csv(time_blowout_data_dir + "dota2_time_blowout_features.csv")

In [None]:
# Print feature names
feature_time_blowout_df.columns

In [5]:
# Drop first column (match id)
feature_time_blowout_df = feature_time_blowout_df.drop(['match_id'], axis=1)

In [None]:
# Existing types
feature_time_blowout_df.dtypes

In [None]:
feature_time_blowout_df.head()

In [None]:
feature_time_blowout_df.info()

In [9]:
# Fill in missing values with the median value of the feature
feature_time_blowout_df = feature_time_blowout_df.fillna(feature_time_blowout_df.median())

In [10]:
feature_time_blowout_df['rad_first_pick'] = feature_time_blowout_df['rad_first_pick'].astype(int)

### Model building, training and evaluation

In [11]:
# Import random forest library
from sklearn.ensemble import RandomForestClassifier

In [12]:
# Split into features (X) and label (y)
X, y = feature_time_blowout_df.iloc[:,:-1],feature_time_blowout_df.iloc[:,-1]

### Grid search

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [14]:
# Create the grid of parameters
param_grid = {
    'bootstrap': [True],
    'max_depth': [5, 10, 15, 50],
    'max_features': ['auto', 'sqrt', 'log2'],
    'n_estimators': [50, 100, 200, 300]
}

# Create a based model
rf = RandomForestClassifier()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = 2, verbose = 2)

# Perform search and print the best parameters
grid_search.fit(X_train, y_train)
grid_search.best_params_

Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   15.3s
[Parallel(n_jobs=2)]: Done 144 out of 144 | elapsed:  2.0min finished


{'bootstrap': True,
 'max_depth': 50,
 'max_features': 'auto',
 'n_estimators': 300}

Best parameters:
{'bootstrap': True,
 'max_depth': 50,
 'max_features': 'auto',
 'n_estimators': 300}

In [18]:
# Create the model using the best paramtersauto
model = RandomForestClassifier(bootstrap = True,
                               n_estimators=300, 
                               max_depth=50,
                               max_features = 'auto')

In [19]:
features = [c for c in feature_time_blowout_df.columns if c != 'win_label']
target = 'win_label'

In [20]:
# Define the number of folders for the k-fold cross-validation
kfolds = KFold(n_splits=10, shuffle=True)

In [22]:
auc = list()

for train_idx, test_idx in kfolds.split(X):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]  
    
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    
    auc.append(roc_auc_score(y_test, preds))

'Median AUC: {:.04f}'.format(st.median(auc))

'Median AUC: 0.7318'