In [55]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report

from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score
from sklearn.pipeline import Pipeline

# Define your models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import CategoricalNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imPipeline





In [None]:
# Load and clean data
df = pd.read_csv("players_data.csv")
df = df.drop_duplicates()
df.dropna(subset=['character', 'gameId', 'gameRank'], inplace=True)
df.fillna('Unknown', inplace=True)



In [None]:
# Create team ID
df['teamId'] = df['gameId'].astype(str) + "_" + df['gameRank'].astype(str)
df['win'] = df['gameRank'].apply(lambda x: 1 if x <= 3 else 0)



In [None]:
# Aggregate team composition and compute team stats
team_df = df.groupby('teamId').apply(lambda g: pd.Series({
    'gameId': g['gameId'].iloc[0],
    'gameRank': g['gameRank'].iloc[0],
    'win': g['win'].iloc[0],
    'characters': sorted(list(g['character'])),
    'kills': g['Kill'].sum(),
    'deaths': g['Death'].sum(),
    'assists': g['Assist'].sum(),
    'damage': g['Dmg_Player'].sum()
})).reset_index()



In [None]:
# Keep only full teams
team_df = team_df[team_df['characters'].apply(len) == 3]



In [None]:
# Split characters into separate columns
team_df[['character1', 'character2', 'character3']] = pd.DataFrame(team_df['characters'].to_list(), index=team_df.index)
team_df.drop(columns=['characters', 'teamId', 'gameId', 'gameRank'], inplace=True)



In [None]:
# Define features and target
X = team_df[['character1', 'character2', 'character3', 'kills', 'deaths', 'assists', 'damage']]
y = team_df['win']



In [None]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)



In [None]:
# Preprocessing
categorical_features = ['character1', 'character2', 'character3']
numerical_features = ['kills', 'deaths', 'assists', 'damage']

preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
], remainder='passthrough')



In [30]:
# Model pipeline (RFC)
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, class_weight='balanced'))
])

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.61      0.73      0.66       832
           1       0.41      0.29      0.34       546

    accuracy                           0.56      1378
   macro avg       0.51      0.51      0.50      1378
weighted avg       0.53      0.56      0.54      1378



In [31]:
# Model pipeline (LR)
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000))

])

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.61      0.51      0.55       832
           1       0.40      0.51      0.45       546

    accuracy                           0.51      1378
   macro avg       0.51      0.51      0.50      1378
weighted avg       0.53      0.51      0.51      1378



In [32]:
# Model pipeline (SVC)
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SVC(probability=True, random_state=42, class_weight='balanced'))
])

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.56      0.59       832
           1       0.42      0.49      0.45       546

    accuracy                           0.53      1378
   macro avg       0.52      0.52      0.52      1378
weighted avg       0.54      0.53      0.53      1378



In [35]:
# Model pipeline (DT)
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42, class_weight='balanced'))
])

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.60      0.57      0.59       832
           1       0.39      0.41      0.40       546

    accuracy                           0.51      1378
   macro avg       0.49      0.49      0.49      1378
weighted avg       0.52      0.51      0.51      1378



In [39]:
# Model pipeline (GSNB)
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GaussianNB())
])

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

TypeError: Sparse data was passed for X, but dense data is required. Use '.toarray()' to convert to a dense numpy array.

In [None]:
# Model pipeline (GBC)
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier())
])

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.60      0.67      0.63       832
           1       0.39      0.32      0.35       546

    accuracy                           0.53      1378
   macro avg       0.49      0.49      0.49      1378
weighted avg       0.51      0.53      0.52      1378



In [41]:
# Model pipeline (GSNB)
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(random_state=42))

])

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.60      0.97      0.74       832
           1       0.36      0.02      0.04       546

    accuracy                           0.60      1378
   macro avg       0.48      0.50      0.39      1378
weighted avg       0.51      0.60      0.47      1378



In [None]:
# Model pipeline (ABC)
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', AdaBoostClassifier(random_state=42))
])

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

           0       0.61      0.92      0.73       832
           1       0.43      0.10      0.16       546

    accuracy                           0.59      1378
   macro avg       0.52      0.51      0.44      1378
weighted avg       0.54      0.59      0.50      1378



In [None]:
# Model pipeline (ETC)
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', ExtraTreesClassifier(random_state=42, class_weight='balanced'))
])

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.61      0.66      0.63       832
           1       0.40      0.34      0.37       546

    accuracy                           0.54      1378
   macro avg       0.50      0.50      0.50      1378
weighted avg       0.52      0.54      0.53      1378



In [44]:
# Model pipeline (XGBC)
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, scale_pos_weight=1.5))
])

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.64      0.56      0.60       832
           1       0.43      0.51      0.47       546

    accuracy                           0.54      1378
   macro avg       0.53      0.54      0.53      1378
weighted avg       0.55      0.54      0.55      1378



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [45]:
# Model pipeline (LGMBC)
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LGBMClassifier(random_state=42, class_weight='balanced'))
])

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

[LightGBM] [Info] Number of positive: 2185, number of negative: 3324
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000107 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 330
[LightGBM] [Info] Number of data points in the train set: 5509, number of used features: 165
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
              precision    recall  f1-score   support

           0       0.61      0.51      0.55       832
           1       0.40      0.50      0.44       546

    accuracy                           0.50      1378
   macro avg       0.50      0.50      0.50      1378
weighted avg       0.52      0.50      0.51      1378



In [52]:
# Model pipeline (LGMBC)
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', CatBoostClassifier(verbose=0, random_state=42, class_weights=[1, 2]))

])

cross_val_score(clf, X, y, cv=5, scoring=scorer)

array([0.44339623, 0.44992743, 0.45315904, 0.44371823, 0.45896877])

In [None]:
models = {
    "Random Forest": RandomForestClassifier(random_state=42, class_weight='balanced'),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42, class_weight='balanced'),
    "SVC": SVC(probability=True),
    "KNN": KNeighborsClassifier(),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "Extra Trees": ExtraTreesClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    "LightGBM": LGBMClassifier(random_state=42)
}

# Scorers
scorers = {
    'Accuracy': make_scorer(accuracy_score),
    'Precision': make_scorer(precision_score),
    'Recall': make_scorer(recall_score),
    'F1': make_scorer(f1_score)
}

# Loop through and evaluate each model
for name, model in models.items():
    clf = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    
    print(f"\n {name}")
    for score_name, scorer in scorers.items():
        scores = cross_val_score(clf, X, y, cv=5, scoring=scorer)
        print(f"{score_name}: {scores.mean():.3f} ± {scores.std():.3f}")



 Random Forest
Accuracy: 0.553 ± 0.008
Precision: 0.413 ± 0.017
Recall: 0.303 ± 0.024
F1: 0.349 ± 0.022

 Gradient Boosting
Accuracy: 0.597 ± 0.005
Precision: 0.398 ± 0.073
Recall: 0.033 ± 0.007
F1: 0.060 ± 0.012

 Logistic Regression
Accuracy: 0.579 ± 0.007
Precision: 0.404 ± 0.024
Recall: 0.131 ± 0.009
F1: 0.198 ± 0.013

 Decision Tree
Accuracy: 0.529 ± 0.011
Precision: 0.413 ± 0.014
Recall: 0.444 ± 0.026
F1: 0.427 ± 0.018

 SVC
Accuracy: 0.595 ± 0.005
Precision: 0.452 ± 0.024
Recall: 0.097 ± 0.012
F1: 0.159 ± 0.016

 KNN
Accuracy: 0.539 ± 0.015
Precision: 0.397 ± 0.024
Recall: 0.316 ± 0.023
F1: 0.352 ± 0.024

 AdaBoost




Accuracy: 0.583 ± 0.010




Precision: 0.408 ± 0.043




Recall: 0.117 ± 0.020




F1: 0.182 ± 0.028

 Extra Trees
Accuracy: 0.548 ± 0.008
Precision: 0.412 ± 0.012
Recall: 0.325 ± 0.022
F1: 0.363 ± 0.017

 XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.569 ± 0.008


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Precision: 0.397 ± 0.024


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Recall: 0.166 ± 0.013


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


F1: 0.234 ± 0.017

 LightGBM
[LightGBM] [Info] Number of positive: 2184, number of negative: 3325
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000143 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 326
[LightGBM] [Info] Number of data points in the train set: 5509, number of used features: 163
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.396442 -> initscore=-0.420312
[LightGBM] [Info] Start training from score -0.420312
[LightGBM] [Info] Number of positive: 2185, number of negative: 3324
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000068 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 334
[LightGBM] [Info] Number of data points in the train set: 5509, number of used features: 167
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.396624 -> initscore=-0.419553
[LightGBM] [Info] Start training from score

In [59]:
# Prediction function
def predict_win_rate(char1, char2, char3, kills=0, deaths=0, assists=0, damage=0):
    chars = sorted([char1, char2, char3])
    input_df = pd.DataFrame([{
        'character1': chars[0],
        'character2': chars[1],
        'character3': chars[2],
        'kills': kills,
        'deaths': deaths,
        'assists': assists,
        'damage': damage
    }])
    prob = clf.predict_proba(input_df)[0][1]
    return f"Estimated Win Probability (Top 3): {prob:.2%}"

In [64]:
# Example usage
print(predict_win_rate("Rio", "Yuki", "Alonso"))

Estimated Win Probability (Top 3): 22.90%
