# In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Load the per-player match records.
df = pd.read_csv("players_data.csv")

# Drop rows with missing values for simplicity.
# NOTE(review): this removes a row when *any* column is missing, including
# columns the model never reads. If too many rows are lost, consider
# restricting the check with dropna(subset=features + [target]).
df = df.dropna()

# Columns fed to the model and the column it should predict.
features = [
    'character', 'weapon', 'itemWeapon', 'itemChest', 'itemHead',
    'itemArm', 'itemLeg', 'Trait_Main1', 'Trait_Main2', 'Trait_Sub1', 'Trait_Sub2',
    'TeamKill', 'Kill', 'Death', 'Assist', 'Dmg_Player', 'Dmg_Monster'
]
target = 'gameRank'

# Define features and target variable
X = df[features]
y = df[target]

# Split columns by dtype: object columns are one-hot encoded, everything else
# is standardized.
# NOTE(review): any identifier column stored as a number (item/trait codes,
# for instance) would land in the numeric group — confirm against the CSV.
categorical_cols = X.select_dtypes(include='object').columns.tolist()
numerical_cols = X.select_dtypes(exclude='object').columns.tolist()

# Preprocessing for categorical data; categories unseen during fit are
# encoded as all zeros instead of raising at predict time.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Preprocessing for numeric data
numerical_transformer = StandardScaler()

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols),
        ('num', numerical_transformer, numerical_cols)
    ]
)

# Create a pipeline with preprocessing and model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Evaluate the model on the held-out rows
y_pred = model.predict(X_test)
print("R² Score:", r2_score(y_test, y_pred))
# Take the square root explicitly: the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so relying on it raises TypeError on current releases.
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

# --------------------------
# Function to get predictions
# --------------------------

def predict_character_win_rate(character_name, games=50):
    """Summarize historical and model-predicted results for one character.

    A "win" is a rank of 3 or better (lower rank = better). The function
    reads the module-level ``df``, ``features`` and ``model``.

    Note that the model predicts on every row of ``df`` for the character,
    which includes rows it was trained on, so the predicted win rate is
    largely an in-sample figure.

    Args:
        character_name: Value to match exactly against ``df['character']``.
        games: Number of future games to project wins for. Expected to be
            non-negative.

    Returns:
        A message string when ``df`` has no rows for the character.
        Otherwise a dict with the average historical rank, the historical
        and predicted win rates as percentages rounded to two decimals, the
        projected number of wins, and the ``games`` count that projection
        is based on.
    """
    char_data = df[df['character'] == character_name]
    if char_data.empty:
        return f"No data available for character: {character_name}"

    # Historical performance (top 3 = win)
    historical_avg_rank = char_data['gameRank'].mean()
    historical_win_rate = (char_data['gameRank'] <= 3).mean()

    # Model-predicted rank for each of the character's recorded matches
    predicted_ranks = model.predict(char_data[features])
    predicted_top3 = predicted_ranks <= 3
    predicted_win_rate = predicted_top3.mean()

    # Scale the win count with integer arithmetic. Truncating the float
    # product rate * games loses a win to rounding error: 29 wins in 50 rows
    # gives 0.58 * 50 == 28.999999999999996, which int() turns into 28.
    predicted_win_count = int(predicted_top3.sum())
    projected_wins = int(predicted_win_count * games // len(predicted_top3))

    return {
        "character": character_name,
        "historical_avg_rank": round(historical_avg_rank, 2),
        "historical_win_rate": round(historical_win_rate * 100, 2),
        "predicted_win_rate": round(predicted_win_rate * 100, 2),
        # Key name kept for backward compatibility; the value is projected
        # over `games` games, which is reported alongside it.
        "projected_wins_in_next_50": projected_wins,
        "games": games
    }

# Example run: summarize "Cathy" and project her wins over the next 50 games.
result = predict_character_win_rate(character_name="Cathy", games=50)
print(result)
