In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Read the raw player records; the CSV is decoded as Latin-1
df = pd.read_csv(filepath_or_buffer="players_data.csv", encoding="latin1")

In [2]:
# Audit trait completeness before any rows are dropped
trait_cols = ['Trait_Main1', 'Trait_Main2', 'Trait_Sub1', 'Trait_Sub2']

# Row count at this point, i.e. before any cleaning
total_rows_before = len(df)

# A row counts as "missing" when at least one of its trait columns is null
missing_trait_rows = df[trait_cols].isna().any(axis='columns').sum()

# Report absolute counts together with their share of all rows
filled_trait_rows = total_rows_before - missing_trait_rows
print(f"Total rows before dropping: {total_rows_before}")
print(f"Rows with missing traits: {missing_trait_rows} ({missing_trait_rows / total_rows_before:.2%})")
print(f"Rows with all traits filled: {filled_trait_rows} ({filled_trait_rows / total_rows_before:.2%})")


Total rows before dropping: 201001
Rows with missing traits: 23496 (11.69%)
Rows with all traits filled: 177505 (88.31%)


In [None]:
#clean dataset
# Remove exact duplicate rows, then rows lacking the identifiers / rank that
# the later cells depend on. Built as a new frame rather than mutated in place.
df = df.drop_duplicates().dropna(subset=['character', 'gameId', 'gameRank'])

# Fill remaining gaps with 'Unknown' in non-numeric columns only. A blanket
# df.fillna('Unknown') would also write the string into any numeric column
# that contains NaN, coercing it to object dtype; the per-character means and
# the classifier fit further down need those columns to stay numeric.
# Numeric gaps are therefore left as NaN here.
non_numeric_cols = df.select_dtypes(exclude=['number', 'bool']).columns
df = df.fillna({col: 'Unknown' for col in non_numeric_cols})

In [None]:
#Define win as gameRank == 1
# Vectorised comparison instead of a per-row Python lambda; produces the same
# 0/1 integer column (anything other than rank 1, including NaN, maps to 0).
df['win'] = (df['gameRank'] == 1).astype('int64')



In [None]:
#Historical win rate per character: the mean of the 0/1 win flag within each group
win_rate = (
    df[['character', 'win']]
    .groupby('character')['win']
    .mean()
)



In [None]:
# Per-character mean of the four numeric stats; these stand in for a real
# match when only the character name is supplied
average_stats = (
    df[['character', 'Kill', 'Death', 'Assist', 'Dmg_Player']]
    .groupby('character')
    .mean()
)



In [None]:
#Model inputs (character plus four match stats) and the binary win target
X = df.loc[:, ['character', 'Kill', 'Death', 'Assist', 'Dmg_Player']]
y = df.loc[:, 'win']



In [None]:
#Hold out 20% of rows for testing; stratify on y so both splits keep the
#same win/loss ratio, and fix the seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)



In [None]:
#Preprocessing: one-hot encode the character column; every other column is
#passed through untouched. Categories unseen during fit are ignored.
character_encoder = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[('char', character_encoder, ['character'])],
    remainder='passthrough',
)



In [None]:
#Model pipeline: preprocessing followed by a class-balanced random forest
forest = RandomForestClassifier(class_weight='balanced', random_state=42)
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', forest),
])

In [None]:
#Fit on the training split, predict the held-out split, and print the
#per-class precision / recall / F1 report
y_pred = model.fit(X_train, y_train).predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
#Function to estimate win probability based on character name
def estimate_character_win(character, model_weight=0.7, historical_weight=0.3, n_games=200):
    """Build a text report of a character's estimated win chance.

    The character's average Kill / Death / Assist / Dmg_Player values (looked
    up in the notebook-level ``average_stats``) are passed to the fitted
    ``model`` pipeline. The resulting win probability is blended with the
    character's historical win rate (from the notebook-level ``win_rate``)
    and scaled to an expected number of wins.

    Args:
        character: Character name to look up in ``average_stats``.
        model_weight: Weight applied to the model's win probability.
        historical_weight: Weight applied to the historical win rate.
            ``model_weight + historical_weight`` must equal 1.
        n_games: Number of games the expected-wins figure is scaled to.

    Returns:
        A multi-line report string, or a "not found" message when the
        character has no row in ``average_stats``.

    Raises:
        ValueError: If either weight is negative, the weights do not sum to 1,
            or ``n_games`` is negative.
    """
    if model_weight < 0 or historical_weight < 0:
        raise ValueError("model_weight and historical_weight must be non-negative")
    # Tolerance absorbs float rounding (0.7 + 0.3 is not exactly 1.0).
    if abs(model_weight + historical_weight - 1.0) > 1e-9:
        raise ValueError("model_weight and historical_weight must sum to 1")
    if n_games < 0:
        raise ValueError("n_games must be non-negative")

    if character not in average_stats.index:
        return f"Character {character} not found in the dataset."

    # Get the average stats for the given character
    avg_stats = average_stats.loc[character]

    # Single-row frame with the same columns the model was trained on
    stat_names = ('Kill', 'Death', 'Assist', 'Dmg_Player')
    input_df = pd.DataFrame([
        {'character': character, **{name: avg_stats[name] for name in stat_names}}
    ])

    # Probability of the second class column (the win == 1 class) for that row
    model_prob = model.predict_proba(input_df)[0][1]

    # Get historical win rate for the character
    historical_win_rate = win_rate.loc[character]

    # Weighted average of model prediction and historical win rate
    combined_prob = model_weight * model_prob + historical_weight * historical_win_rate

    # Expected wins over n_games games (200 by default)
    expected_wins = combined_prob * n_games

    return (
        f"Character: {character}\n"
        f"Model Win Probability: {model_prob:.2%}\n"
        f"Historical Win Rate: {historical_win_rate:.2%}\n"
        f"Expected Wins in {n_games} Games: {expected_wins:.1f}"
    )



In [None]:
# Example usage: build the report for one character, then print it
katja_report = estimate_character_win("Katja")
print(katja_report)

Character: Katja
Model Win Probability: 6.00%
Historical Win Rate: 17.11%
Expected Wins in 200 Games: 18.7
