In [None]:
# TODO
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, explained_variance_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.impute import SimpleImputer
import warnings

# Suppress every warning of category FutureWarning for the rest of the run.
warnings.filterwarnings('ignore', category=FutureWarning)

# Load the dataset
file_path2 = '/home/chenxi/data/2023_LoL_esports_match_data_from_OraclesElixir.csv'
data2 = pd.read_csv(file_path2)

# Keep only rows whose 'position' is not 'team'.
# The explicit .copy() makes the filtered frame independent of the frame
# returned by read_csv, so later column assignments on data2 neither raise
# SettingWithCopyWarning nor depend on view-vs-copy behaviour.
data2 = data2[data2['position'] != 'team'].copy()


# Replace missing values in the columns that feed the impact score with 0.
features_to_impute = ['kills', 'assists', 'total cs', 'towers', 'dragons']
data2[features_to_impute] = data2[features_to_impute].fillna(0)

# Rebuild 'impact_score' as a sum of log1p terms, one per imputed column.
# 'total cs', 'towers' and 'dragons' are scaled by 0.1, 0.5 and 0.2 before
# the log; 'kills' and 'assists' are used unscaled.
kills_term = np.log1p(data2['kills'])
assists_term = np.log1p(data2['assists'])
cs_term = np.log1p(data2['total cs'] * 0.1)
towers_term = np.log1p(data2['towers'] * 0.5)
dragons_term = np.log1p(data2['dragons'] * 0.2)
data2['impact_score'] = kills_term + assists_term + cs_term + towers_term + dragons_term

# Any score that is still NaN falls back to the median score.
impact_median = data2['impact_score'].median()
data2['impact_score'] = data2['impact_score'].fillna(impact_median)

# Feature Engineering
# early_game_efficiency: kills + assists at 15 plus 0.1 * cs at 15.
early_takedowns = data2['killsat15'] + data2['assistsat15']
data2['early_game_efficiency'] = early_takedowns + 0.1 * data2['csat15']

# gold_xp_ratio_at15: gold at 15 over (xp at 15 + 1); the +1 keeps the
# denominator non-zero when xp is 0.
xp_plus_one = data2['xpat15'] + 1
data2['gold_xp_ratio_at15'] = data2['goldat15'] / xp_plus_one

# Prepare features and target variable
features = ['champion', 'killsat15', 'assistsat15', 'csat15', 'goldat15', 'xpat15', 'damagetochampions', 'early_game_efficiency', 'gold_xp_ratio_at15']

# .copy() gives X its own data, so it can be modified safely without
# SettingWithCopyWarning and without writing back into data2.
X = data2[features].copy()

# Non-inplace fillna: an inplace fill on a Series pulled out of data2 depends
# on pandas' view/copy semantics; rebinding y is unambiguous.
y = data2['impact_score']
y = y.fillna(y.median())

# 'champion' is intentionally left as raw values here. The preprocessing
# pipeline defined further down imputes it with the most frequent value and
# one-hot encodes it. The previous LabelEncoder + astype(str) step was
# redundant with that encoder and turned missing champions into the literal
# category 'nan', which bypassed the pipeline's imputer. LabelEncoder is
# also documented for target labels, not input features.

# Column groups routed to the two preprocessing branches.
numeric_features = [
    'killsat15',
    'assistsat15',
    'csat15',
    'goldat15',
    'xpat15',
    'damagetochampions',
    'early_game_efficiency',
    'gold_xp_ratio_at15',
]
categorical_features = ['champion']

# Numeric branch: fill missing values with the column mean.
numeric_transformer = SimpleImputer(strategy='mean')

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode; handle_unknown='ignore' lets categories unseen during fit
# pass through transform without raising.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Apply each branch to its own column group.
column_branches = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

# Full model: preprocessing followed by a random forest regressor with a
# fixed seed. The step name 'regressor' is what the 'regressor__*' keys of
# the hyperparameter grid refer to.
model_steps = [
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
]
rf_pipeline = Pipeline(steps=model_steps)

# Hold out 20% of the rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate hyperparameters for the pipeline's 'regressor' step
# (2 * 3 * 3 * 3 = 54 combinations).
param_grid = dict(
    regressor__n_estimators=[100, 200],
    regressor__max_depth=[None, 10, 20],
    regressor__min_samples_leaf=[1, 2, 4],
    regressor__min_samples_split=[2, 5, 10],
)

# Exhaustive 5-fold search scored by negative MSE, using all available cores.
grid_search = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best model evaluation: take the best pipeline found by the grid search and
# score it on the held-out test split.
best_rf = grid_search.best_estimator_
predictions = best_rf.predict(X_test)

# RMSE is computed as sqrt(MSE) instead of mean_squared_error(squared=False):
# the `squared` argument was deprecated in scikit-learn 1.4 and scheduled for
# removal in 1.6, and the FutureWarning filter at the top of this file hides
# that deprecation notice. sqrt(MSE) works on every scikit-learn version.
rmse = float(np.sqrt(mean_squared_error(y_test, predictions)))
r2 = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
ev = explained_variance_score(y_test, predictions)

print(f"Random Forest RMSE: {rmse}")
print(f"Random Forest R^2: {r2}")
print(f"Random Forest MAE: {mae}")
print(f"Random Forest Explained Variance: {ev}")