In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Columns fed to the model: one categorical identifier plus per-game numeric stats.
NUMERIC_FEATURES = ['Kill', 'Death', 'Assist', 'Dmg_Player']
FEATURES = ['character'] + NUMERIC_FEATURES

# Load and clean data
df = pd.read_csv("players_data.csv", encoding='latin1').drop_duplicates()

# Drop rows that lack an identifier, the rank, or any numeric model feature.
# The numeric features are passed through to the classifier untouched, so they
# must stay numeric -- they cannot be patched with a text placeholder.
df = df.dropna(subset=['character', 'gameId', 'gameRank'] + NUMERIC_FEATURES)

# Fill the remaining gaps with a placeholder, but only in non-numeric columns.
# A blanket fillna('Unknown') would inject strings into numeric columns and
# silently turn them into object dtype.
non_numeric_cols = df.select_dtypes(exclude='number').columns
df = df.fillna({col: 'Unknown' for col in non_numeric_cols})

# Define win as gameRank == 1 (vectorized; 1 for a win, 0 otherwise)
df['win'] = df['gameRank'].eq(1).astype(int)

# Historical win rate for each character.
# Computed over all cleaned rows (not only the training split); it is used by
# estimate_character_win, not as a model feature.
win_rate = df.groupby('character')['win'].mean()

# Average stats for each character (for use when only the character name is provided)
average_stats = df.groupby('character')[NUMERIC_FEATURES].mean()

# Features and target
X = df[FEATURES]
y = df['win']

# Train/test split, stratified so both splits keep the same win/loss ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Preprocessing: one-hot encode character, leave numeric as-is.
# handle_unknown='ignore' encodes a character unseen during fit as all zeros
# instead of raising at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ('char', OneHotEncoder(handle_unknown='ignore'), ['character']),
    ],
    remainder='passthrough',
)

# Model pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, class_weight='balanced')),
])

# Train model and report held-out performance
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.92      0.95      0.93     31934
           1       0.72      0.60      0.66      6986

    accuracy                           0.89     38920
   macro avg       0.82      0.78      0.79     38920
weighted avg       0.88      0.89      0.88     38920



In [8]:
def estimate_character_win(character, n_games=200):
    """Estimate win probability and expected wins for a character, given only its name.

    The character's average per-game stats (from the module-level
    ``average_stats`` table) are fed to the trained ``model`` to get a model
    win probability. That probability is blended with the character's
    historical win rate (module-level ``win_rate``) using fixed 0.7 / 0.3
    weights, and the blended probability is scaled to ``n_games``.

    Parameters
    ----------
    character : str
        Character name; must be present in the index of ``average_stats``.
    n_games : int, default 200
        Number of games over which expected wins are reported. The same value
        drives both the arithmetic and the label in the returned text, so the
        two cannot drift apart.

    Returns
    -------
    str
        A multi-line report with the model probability, the historical win
        rate and the expected wins, or a "not found" message when the
        character is not in the dataset.
    """
    if character not in average_stats.index:
        return f"Character {character} not found in the dataset."

    # Get the average stats for the given character
    avg_stats = average_stats.loc[character]

    # Single-row frame with the same column names the model was trained on
    input_df = pd.DataFrame([{
        'character': character,
        'Kill': avg_stats['Kill'],
        'Death': avg_stats['Death'],
        'Assist': avg_stats['Assist'],
        'Dmg_Player': avg_stats['Dmg_Player']
    }])

    # Second column of predict_proba is taken as the win probability
    model_prob = model.predict_proba(input_df)[0][1]

    # Get historical win rate for the character
    historical_win_rate = win_rate.loc[character]

    # Combine model prediction with historical win rate (weighted average)
    combined_prob = 0.7 * model_prob + 0.3 * historical_win_rate

    # Expected wins over the requested number of games
    expected_wins = combined_prob * n_games

    return (
        f"Character: {character}\n"
        f"Model Win Probability: {model_prob:.2%}\n"
        f"Historical Win Rate: {historical_win_rate:.2%}\n"
        f"Expected Wins in {n_games} Games: {expected_wins:.1f}"
    )



In [15]:
# Demo: the character name is the only input; the returned report is printed as-is
print(estimate_character_win(character="Katja"))

Character: Katja
Model Win Probability: 6.00%
Historical Win Rate: 17.11%
Expected Wins in 200 Games: 18.7
