In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn import ensemble

import seaborn as sns
import math
import matplotlib.pyplot as plt

In [2]:
train = pd.read_csv("C:/Users/bodak/Desktop/features.csv")
test = pd.read_csv("C:/Users/bodak/Desktop/features_test.csv")

heroes = pd.read_csv("C:/Users/bodak/Desktop/data/dictionaries/heroes.csv")
items = pd.read_csv("C:/Users/bodak/Desktop/data/dictionaries/items.csv")

y = np.array(train['radiant_win'])
index = test['match_id']

In [3]:
#combine data

na = train.shape[0]
train = train.drop(['radiant_win'], axis = 1)
combine = pd.concat((train, test), sort=False).reset_index(drop=True)

In [4]:
#data cleaning

combine = combine.drop(['tower_status_radiant'], axis = 1)
combine = combine.drop(['tower_status_dire'], axis = 1)
combine = combine.drop(['barracks_status_radiant'], axis = 1)
combine = combine.drop(['barracks_status_dire'], axis = 1)
combine = combine.drop(['duration'], axis = 1)
combine = combine.drop(['lobby_type'], axis = 1)
combine = combine.drop(['start_time'], axis = 1)
combine = combine.drop(['match_id'], axis = 1)


In [5]:
#filling missing values

combine['first_blood_player2'] = combine['first_blood_player2'].fillna(-1)
combine['radiant_flying_courier_time'] = combine['radiant_flying_courier_time'].fillna(0)
combine['dire_flying_courier_time'] = combine['dire_flying_courier_time'].fillna(0)
combine['first_blood_player1'] = combine['first_blood_player1'].fillna(-1)
combine['first_blood_team'] = combine['first_blood_team'].fillna(-1)
combine['first_blood_time'] = combine['first_blood_time'].fillna(-1)
combine['dire_bottle_time'] = combine['dire_bottle_time'].fillna(0)
combine['radiant_bottle_time'] = combine['radiant_bottle_time'].fillna(0)
combine['radiant_first_ward_time'] = combine['radiant_first_ward_time'].fillna(0)
combine['dire_first_ward_time'] = combine['dire_first_ward_time'].fillna(0)
combine['radiant_courier_time'] = combine['radiant_courier_time'].fillna(0)
combine['dire_courier_time'] = combine['dire_courier_time'].fillna(0)

In [6]:
#feature engineering

combine['r_gold'] = combine['r1_gold'] + combine['r2_gold'] + combine['r3_gold'] + combine['r4_gold'] + combine['r5_gold']
combine['d_gold'] = combine['d1_gold'] + combine['d2_gold'] + combine['d3_gold'] + combine['d4_gold'] + combine['d5_gold']
combine['diff_gold'] = combine['r_gold'] - combine['d_gold']

combine['r_xp'] = combine['r1_xp'] + combine['r2_xp'] + combine['r3_xp'] + combine['r4_xp'] + combine['r5_xp']
combine['d_xp'] = combine['d1_xp'] + combine['d2_xp'] + combine['d3_xp'] + combine['d4_xp'] + combine['d5_xp']
combine['diff_xp'] = combine['r_xp'] - combine['d_xp']

combine['r_lh'] = combine['r1_lh'] + combine['r2_lh'] + combine['r3_lh'] + combine['r4_lh'] + combine['r5_lh']
combine['d_lh'] = combine['d1_lh'] + combine['d2_lh'] + combine['d3_lh'] + combine['d4_lh'] + combine['d5_lh']
combine['diff_lh'] = combine['r_lh'] - combine['d_lh']

combine['r_deaths'] = combine['r1_deaths'] + combine['r2_deaths'] + combine['r3_deaths'] + combine['r4_deaths'] + combine['r5_deaths']
combine['d_deaths'] = combine['d1_deaths'] + combine['d2_deaths'] + combine['d3_deaths'] + combine['d4_deaths'] + combine['d5_deaths']
combine['diff_deaths'] = combine['r_deaths'] - combine['d_deaths']

combine['r_kills'] = combine['r1_kills'] + combine['r2_kills'] + combine['r3_kills'] + combine['r4_kills'] + combine['r5_kills']
combine['d_kills'] = combine['d1_kills'] + combine['d2_kills'] + combine['d3_kills'] + combine['d4_kills'] + combine['d5_kills']
combine['diff_kills'] = combine['r_kills'] - combine['d_kills']

combine['diff_boots'] = combine['radiant_boots_count'] - combine['dire_boots_count']

combine['diff_tpscroll'] = combine['radiant_tpscroll_count'] - combine['dire_tpscroll_count']

combine['diff_bottle'] = combine['radiant_bottle_time'] - combine['dire_bottle_time']

combine['late_r_courier'] = pd.Series(combine['radiant_courier_time'] > 150).astype('int')
combine['early_r_courier'] = pd.Series(combine['radiant_courier_time'] < 150).astype('int')
combine['no_r_courier'] = pd.Series(combine['radiant_courier_time'] == 0).astype('int')
combine['late_d_courier'] = pd.Series(combine['dire_courier_time'] > 150).astype('int')
combine['early_d_courier'] = pd.Series(combine['dire_courier_time'] < 150).astype('int')
combine['no_d_courier'] = pd.Series(combine['dire_courier_time'] == 0).astype('int')

In [7]:
#some of features is categorical in fact. Now they are completely useless for model
#lets fill 'hero names' and 'hero items' with the right value 


d_heroes = {}
for i in range(len(heroes['id'])):
    d_heroes[heroes['id'][i]] = heroes['name'][i]
    
d_items = {}
for i in range(len(items['id'])):
    d_items[items['id'][i]] = items['name'][i]
    
for column in combine:
        if 'hero' in column:
            combine[column] = combine[column].map(d_heroes)

for column in combine:
        if 'items' in column:
            combine[column] = combine[column].map(d_items)
            


In [8]:
#now we can use get_dummies for object features that we created before

combine_cat_dummies = pd.get_dummies(combine, columns=combine.columns[combine.dtypes == 'object'])

In [9]:
#get test and train datasets from combine

Train = combine_cat_dummies[:na]
Test = combine_cat_dummies[na:]

In [10]:
#split data

x_train, x_test, y_train, y_test = train_test_split(Train, y, test_size = .3, random_state=0)

In [11]:
#scale data

scaler= RobustScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
X_test= scaler.transform(Test)

In [12]:
#Logistic Regression works well with lots of features 

logit = LogisticRegression(random_state=17)

In [13]:
combine_cat_dummies.shape

(114407, 1449)

In [14]:
logit.fit(x_train, y_train)
y_pred_logit = logit.predict(x_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [15]:
roc_auc_score(y_test, y_pred_logit)

0.6738285994678312

In [16]:
#GradientBoostingClassifier

params = {'n_estimators': 100, 'max_depth': 3, 'min_samples_split': 2,
          'learning_rate': 0.01}
gbr = ensemble.GradientBoostingClassifier(**params)
gbr.fit(x_train, y_train)
y_pred = gbr.predict(x_test)

In [17]:
roc_auc_score(y_test, y_pred)

0.6517981991940751

In [18]:
#RandomForestClassifier

forest = RandomForestClassifier(n_estimators = 1000, max_depth=4)

In [19]:
forest.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=4, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [20]:
y_pred_forest = forest.predict(x_test)
roc_auc_score(y_test, y_pred_forest)

0.6475112361399982

In [49]:
#Soft voting entails computing a weighted sum of the predicted probabilities of all models for each class.

ensemble=VotingClassifier(estimators=[('Logistic Regression', logit), ('GBoosting', gbr), ('Random Forest', forest)], voting='soft')

In [50]:
ensemble.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


VotingClassifier(estimators=[('Logistic Regression',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=17,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('GBoosting',
                              GradientBoostingClassifier(ccp_alpha=0.0,
                                                         criterion=...
                                                     criterion='gini',
                  

In [51]:
y_pred_ens = ensemble.predict(x_test)

In [52]:
roc_auc_score(y_test, y_pred_ens)

#ensamble looks the same as Logistic Regression
#but on the test data it showed better result

0.6714396048733009

In [38]:
y_subm_ens = ensemble.predict_proba(X_test)[:, 1]
submission = pd.DataFrame({
        "match_id": index,
        "radiant_win": y_subm_ens })
submission.to_csv('C:/Users/bodak/Desktop/my_submission.csv', index = False)