In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report

from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score
from sklearn.pipeline import Pipeline

# Define your models
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import CategoricalNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imPipeline
import matplotlib.pyplot as plt
import seaborn as sns





In [2]:
# Load and clean data
df = pd.read_csv("players_data.csv", encoding='latin1')
df = df.drop_duplicates()
df.dropna(subset=['character', 'gameId', 'gameRank'], inplace=True)
df.fillna('Unknown', inplace=True)



In [3]:
# Create team ID
df['teamId'] = df['gameId'].astype(str) + "_" + df['gameRank'].astype(str)
df['win'] = df['gameRank'].apply(lambda x: 1 if x <= 3 else 0)



In [4]:
# Aggregate team composition and compute team stats
team_df = df.groupby('teamId').apply(lambda g: pd.Series({
    'gameId': g['gameId'].iloc[0],
    'gameRank': g['gameRank'].iloc[0],
    'win': g['win'].iloc[0],
    'characters': sorted(list(g['character'])),
    'kills': g['Kill'].sum(),
    'deaths': g['Death'].sum(),
    'assists': g['Assist'].sum(),
    'damage': g['Dmg_Player'].sum()
})).reset_index()



  team_df = df.groupby('teamId').apply(lambda g: pd.Series({


In [5]:
# Keep only full teams
team_df = team_df[team_df['characters'].apply(len) == 3]



In [6]:
# Split characters into separate columns
team_df[['character1', 'character2', 'character3']] = pd.DataFrame(team_df['characters'].to_list(), index=team_df.index)
team_df.drop(columns=['characters', 'teamId', 'gameId', 'gameRank'], inplace=True)



In [7]:
print(team_df.head(10))


   win  kills  deaths  assists  damage      character1 character2 character3
0    1     13      11       21   62112           Adina     Lenore  Li Dailin
1    1      7      12       10   38223           Adela     Alonso      Yumin
2    1     11      10       13   46516           Cathy      Katja       Yuki
3    0      9       5       15   38821          Alonso     Celine       Hart
4    0      8      10       12   57173            Emma    Estelle      Katja
5    0      9       3       11   35192  Debi_&_Marlene      Fiora     Nadine
6    0      5       4        6   20419         Hyunwoo      Katja       Yuki
7    0      5       7        4   20290           Darko       Hart    Hyunwoo
8    1     21       0       35   89062         Abigail     Magnus    Shoichi
9    1      7      10       10   51343           Adina    Bernice    Hyunwoo


In [8]:
# Define features and target
X = team_df[['character1', 'character2', 'character3', 'kills', 'deaths', 'assists', 'damage']]
y = team_df['win']



In [9]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)



In [10]:
# Preprocessing
categorical_features = ['character1', 'character2', 'character3']
numerical_features = ['kills', 'deaths', 'assists', 'damage']

preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
], remainder='passthrough')



In [11]:
# Model pipeline (RFC)
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, class_weight='balanced'))
])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.87      0.83      6820
           1       0.78      0.67      0.72      4505

    accuracy                           0.79     11325
   macro avg       0.79      0.77      0.78     11325
weighted avg       0.79      0.79      0.79     11325



In [111]:
models = {
    "Random Forest": RandomForestClassifier(random_state=42, class_weight='balanced'),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42, class_weight='balanced'),
    "SVC": SVC(probability=True),
    "KNN": KNeighborsClassifier(),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "Extra Trees": ExtraTreesClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    "LightGBM": LGBMClassifier(random_state=42)
}

results = {}

for name, model in models.items():
    print(f"Evaluating {name}...")
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
    
    scores = cross_validate(
        pipeline,
        X_train,
        y_train,
        cv=5,
        scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
        return_train_score=False
    )
    
    results[name] = {
        'accuracy': (scores['test_accuracy'].mean(), scores['test_accuracy'].std()),
        'precision': (scores['test_precision_macro'].mean(), scores['test_precision_macro'].std()),
        'recall': (scores['test_recall_macro'].mean(), scores['test_recall_macro'].std()),
        'f1': (scores['test_f1_macro'].mean(), scores['test_f1_macro'].std())
    }

# Convert to DataFrame for easy viewing
import pandas as pd
results_df = pd.DataFrame(results).T.sort_values(by='f1', ascending=False)

# Split means and standard deviations into separate columns
results_df[['accuracy_mean', 'accuracy_std']] = pd.DataFrame(results_df['accuracy'].to_list(), index=results_df.index)
results_df[['precision_mean', 'precision_std']] = pd.DataFrame(results_df['precision'].to_list(), index=results_df.index)
results_df[['recall_mean', 'recall_std']] = pd.DataFrame(results_df['recall'].to_list(), index=results_df.index)
results_df[['f1_mean', 'f1_std']] = pd.DataFrame(results_df['f1'].to_list(), index=results_df.index)

# Drop the original columns containing tuples
results_df = results_df.drop(columns=['accuracy', 'precision', 'recall', 'f1'])

Evaluating Random Forest...
Evaluating Gradient Boosting...
Evaluating Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Evaluating Decision Tree...
Evaluating SVC...
Evaluating KNN...
Evaluating AdaBoost...




Evaluating Extra Trees...
Evaluating XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Evaluating LightGBM...
[LightGBM] [Info] Number of positive: 5716, number of negative: 8669
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000260 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 751
[LightGBM] [Info] Number of data points in the train set: 14385, number of used features: 200
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.397358 -> initscore=-0.416484
[LightGBM] [Info] Start training from score -0.416484
[LightGBM] [Info] Number of positive: 5716, number of negative: 8669
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000363 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 751
[LightGBM] [Info] Number of data points in the train set: 14385, number of used features: 200
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.397358 -> initscore=-0.416484
[LightGBM] [Info] Start training from score -0.

In [112]:
print(results_df)

                     accuracy_mean  accuracy_std  precision_mean  \
Gradient Boosting         0.793572      0.008490        0.788396   
AdaBoost                  0.790124      0.006499        0.782204   
LightGBM                  0.790347      0.010325        0.784655   
Random Forest             0.788567      0.009367        0.782566   
Logistic Regression       0.786788      0.011132        0.782587   
XGBoost                   0.784396      0.009270        0.777825   
Extra Trees               0.777723      0.007279        0.772027   
SVC                       0.775053      0.006644        0.768527   
KNN                       0.749694      0.003254        0.739172   
Decision Tree             0.712602      0.007149        0.700693   

                     precision_std  recall_mean  recall_std   f1_mean  \
Gradient Boosting         0.009390     0.774950    0.008968  0.779821   
AdaBoost                  0.007489     0.776167    0.005450  0.778688   
LightGBM                  0.0114

In [12]:
# Prediction function
def predict_win_rate(char1, char2, char3, kills=0, deaths=0, assists=0, damage=0):
    chars = sorted([char1, char2, char3])
    input_df = pd.DataFrame([{
        'character1': chars[0],
        'character2': chars[1],
        'character3': chars[2],
        'kills': kills,
        'deaths': deaths,
        'assists': assists,
        'damage': damage
    }])
    prob = clf.predict_proba(input_df)[0][1]
    return f"Estimated Win Probability (Top 3): {prob:.2%}"

In [20]:
# Example usage
print(predict_win_rate("Adina", "Lenore", "Li Dailin"))

Estimated Win Probability (Top 3): 45.00%
