# All Features Statistical ML Model
- finding a classifier that fits all (or most) match features
- using the data of the `Role.CANONICAL_CARRY` role

In [3]:
import pandas as pd
import joblib
import sys
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

import os
import sys
sys.path.append(os.path.join("..", "..", "src"))
import game_data_collector.parse_game_data_utils as pgdu
import game_data_collector.api as dapi
from database import MongoDB

# data preparation

## mongodb connection

In [2]:
# Connect to the MongoDB instance that stores the recorded matches.
# The credentials can be overridden through the MONGO_USERNAME / MONGO_PASSWORD
# environment variables so real secrets never have to be written into the
# notebook; the fallbacks are the values that were previously hard-coded here.
db = MongoDB(
    username=os.environ.get("MONGO_USERNAME", "root"),
    password=os.environ.get("MONGO_PASSWORD", "example"),
)
db.connect_db()
db.get_database("mmr_predictor_2")
col = db.get_collection("dota_game_collection")

# summarise what is available in the collection before any processing
match_count = col.count_documents({})
player_count = len(col.distinct("players.account_id"))
print(f"Found {match_count} recorded matches with {player_count} unique players in the database.")

Found 6514 recorded matches with 31352 unique players in the database.


## data conversion

In [3]:
# load the KPI table for the canonical-carry role; hero_id is dropped right away
carry_kpis = dapi.get_kpis_by_role(col, dapi.Role.CANONICAL_CARRY)
df_cc = carry_kpis.drop(columns="hero_id")

# rank_group is rank_tier integer-divided by 10, stored as a categorical column
rank_group = df_cc["rank_tier"] // 10
df_cc["rank_group"] = rank_group.astype("category")

print(df_cc.head(5))


  df_cc = dapi.get_kpis_by_role(col, dapi.Role.CANONICAL_CARRY).drop(columns = "hero_id")


collected 3740 rows
   rank_tier    kda  last_hits  actions_per_min  gold_per_min  xp_per_min  \
0         35  35.00        184              237           723         908   
1         72   6.00        262              442           758         831   
2         73   0.46        125              238           328         350   
3         24   3.67        146              297           464         578   
4         53   2.17        321              270           668         973   

  rank_group  
0          3  
1          7  
2          7  
3          2  
4          5  


In [4]:
# the features are used as-is (no scaling is applied);
# rank_group is the target and rank_tier is the column it was derived from,
# so both are removed from the feature frame
rank_columns = ["rank_tier", "rank_group"]
X = df_cc.drop(columns=rank_columns)
Y = df_cc["rank_group"].to_numpy()

In [5]:
# hold out 20 % of the rows for testing; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)


In [6]:
# element-wise NaN mask of the feature frame
# NOTE(review): pandas abbreviates the printed frame, so only the first and
# last rows are visible here - a per-column count would be a more complete check
nan_mask = np.isnan(X)
print(nan_mask)

        kda  last_hits  actions_per_min  gold_per_min  xp_per_min
0     False      False            False         False       False
1     False      False            False         False       False
2     False      False            False         False       False
3     False      False            False         False       False
4     False      False            False         False       False
...     ...        ...              ...           ...         ...
3735  False      False            False         False       False
3736  False      False            False         False       False
3737  False      False            False         False       False
3738  False      False            False         False       False
3739  False      False            False         False       False

[3740 rows x 5 columns]


## random forest model

In [7]:
# baseline random forest: 100 trees, fixed seed so repeated runs give the same model
forest_params = {"n_estimators": 100, "random_state": 42}
clf = RandomForestClassifier(**forest_params)
clf.fit(x_train, y_train)

### k fold cross validation

In [8]:
# apply the model to the test split
y_pred = clf.predict(x_test)
print(y_pred[:5])

# accuracy takes no averaging mode; the remaining scores use the "weighted" average
accuracy = metrics.accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred, average="weighted")
recall = metrics.recall_score(y_test, y_pred, average="weighted")
f1 = metrics.f1_score(y_test, y_pred, average="weighted")

print(f"acc\t= {round(accuracy, 4)}")
print(f"prec\t= {round(precision, 4)}")
print(f"rec\t= {round(recall, 4)}")
print(f"f1\t= {round(f1, 4)}")

[1 7 7 6 7]
acc	= 0.2179
prec	= 0.2225
rec	= 0.2179
f1	= 0.2107
