# Initial Features Statistical ML Model
- finding a classificator that fits to the initially selected match features
- using the data of the `Role.CANONICAL_CARRIES`

In [1]:
import pandas as pd
import joblib
import sys
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

import game_data_collector.parse_game_data_utils as pgdu
import game_data_collector.api as dapi
from database import MongoDB

# data preparation

## mongodb connection

In [2]:
db = MongoDB(username='root', password='example')
db.connect_db()
db.get_database("mmr_predictor")
col = db.get_collection("dota_game_collection")

print(f"Found {col.count_documents({})} recorded matches with {len(col.distinct('players.account_id'))} unique players in the database.")

Found 17048 recorded matches with 73122 unique players in the database.


## data conversion

In [3]:
# get only CC match data with selected features
df_cc = dapi.get_kpis_by_role(col, dapi.Role.CANONICAL_CARRY).drop(columns = "hero_id")
df_cc["rank_group"] = (df_cc["rank_tier"] // 10).astype("category")

print(df_cc.head(n=5))


collected 10003 rows
   rank_tier   kda  last_hits  actions_per_min  gold_per_min  xp_per_min  \
0         74  1.25        125              281           408         460   
1         75  1.83        280              349           578         796   
2         45  5.33        330              154           654         719   
3         51  0.55        160              272           458         472   
4         71  7.00        296              320           760        1001   

  rank_group  
0          7  
1          7  
2          4  
3          5  
4          7  


In [4]:
# normalize the data
X = StandardScaler().fit_transform(df_cc.drop(columns=["rank_tier", "rank_group"])) # z score standardization
Y = df_cc["rank_group"].to_numpy()

In [5]:
# split dataset
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.2, random_state=42)


In [6]:
print(np.isnan(X))

[[False False False False False]
 [False False False False False]
 [False False False False False]
 ...
 [False False False False False]
 [False False False False False]
 [False False False False False]]


## logistic regression model

In [7]:
# basic model
lr_classifier = LogisticRegression(C=1, max_iter=200, n_jobs=-1)
lr_classifier.fit(x_train, y_train)

### k fold cross validation

In [8]:
# Logistic Regression is one of the most simple and commonly used Machine Learning algorithms for two-class classification
# https://www.datacamp.com/tutorial/understanding-logistic-regression-python

# about solving methods
# https://stackoverflow.com/questions/38640109/logistic-regression-python-solvers-definitions

# param_grid = {
#     'C': [0.1,1,10]
# }

# for beefy computers
param_grid = {
    'penalty':['none', 'elasticnet', 'l1', 'l2'],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'C': [0.001,0.01,0.1,1,10,100,1000]
}

lr_classifier_grid = GridSearchCV(LogisticRegression(max_iter=400), param_grid, cv=5, scoring='f1_weighted', verbose=1, n_jobs=-1) # scoring=scoring, refit='f1'

# use parallelization to speed up process (important for larger dataset)
with joblib.parallel_config("threading"):
    lr_classifier_grid.fit(x_train, y_train)
    
print("Best Parameters:", lr_classifier_grid.best_params_)

Fitting 5 folds for each of 140 candidates, totalling 700 fits
Best Parameters: {'C': 100, 'penalty': 'l2', 'solver': 'sag'}


455 fits failed out of a total of 700.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
175 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\cedri\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\cedri\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\cedri\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\cedri\AppData\Local\Programs\Python\Python312\L

In [9]:
# Modell auf Test-Split anwenden
y_pred = lr_classifier_grid.predict(x_test)
print(y_pred[:5])

print(f"acc\t= {round(metrics.accuracy_score(y_test, y_pred), 4)}")
print(f"prec\t= {round(metrics.precision_score(y_test, y_pred, average='weighted'), 4)}")
print(f"rec\t= {round(metrics.recall_score(y_test, y_pred, average='weighted'), 4)}")
print(f"f1\t= {round(metrics.f1_score(y_test, y_pred, average='weighted'), 4)}")

[5 7 4 5 5]
acc	= 0.2324
prec	= 0.1808
rec	= 0.2324
f1	= 0.1663


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
