In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, explained_variance_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.impute import SimpleImputer

# Load the dataset
file_path = '/home/chenxi/data/2023_LoL_esports_match_data_from_OraclesElixir.csv'
data = pd.read_csv(file_path)

# Impute missing values for specified features with zeros
features_to_impute = ['killsat15', 'assistsat15', 'csat15', 'goldat15', 'xpat15', 'total cs']
data[features_to_impute] = data[features_to_impute].fillna(0)

# Recalculate 'impact_score' to ensure it's consistent
data['impact_score'] = np.log1p(data['kills']) + np.log1p(data['assists']) + np.log1p(data['total cs'] * 0.1) + np.log1p(data['towers'] * 0.5) + np.log1p(data['dragons'] * 0.2)
data['impact_score'] = data['impact_score'].fillna(data['impact_score'].median())

# Feature Engineering
data['early_game_efficiency'] = data['killsat15'] + data['assistsat15'] + 0.1 * data['csat15']
data['gold_xp_ratio_at15'] = data['goldat15'] / (data['xpat15'] + 1)  # Avoid division by zero

# Prepare features and target variable
features = ['champion', 'killsat15', 'assistsat15', 'csat15', 'goldat15', 'xpat15', 'damagetochampions', 'early_game_efficiency', 'gold_xp_ratio_at15']
X = data[features]
y = data['impact_score']

# Encode 'champion' categorical feature
X['champion'] = LabelEncoder().fit_transform(X['champion'].astype(str))

# Define preprocessing for numeric and categorical features
numeric_features = ['killsat15', 'assistsat15', 'csat15', 'goldat15', 'xpat15', 'damagetochampions', 'early_game_efficiency', 'gold_xp_ratio_at15']
categorical_features = ['champion']

numeric_transformer = SimpleImputer(strategy='mean')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the model pipeline
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter tuning for RandomForestRegressor (example)
param_grid = {
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [None, 10, 20],
    'regressor__min_samples_leaf': [1, 2, 4],
    'regressor__min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(rf_pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model evaluation
best_rf = grid_search.best_estimator_
predictions = best_rf.predict(X_test)

rmse = mean_squared_error(y_test, predictions, squared=False)
r2 = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
ev = explained_variance_score(y_test, predictions)

print(f"Random Forest RMSE: {rmse}")
print(f"Random Forest R^2: {r2}")
print(f"Random Forest MAE: {mae}")
print(f"Random Forest Explained Variance: {ev}")

  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['champion'] = LabelEncoder().fit_transform(X['champion'].astype(str))


Random Forest RMSE: 0.4607419663507044
Random Forest R^2: 0.6848343508621028
Random Forest MAE: 0.14520061213141475
Random Forest Explained Variance: 0.6848659890597746




In [2]:

from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define and train the Linear Regression model
lr_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])
lr_model.fit(X_train, y_train)

# Define and train the XGBoost model
xgb_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(objective='reg:squarederror', random_state=42))
])
xgb_model.fit(X_train, y_train)

# Predict and evaluate Linear Regression model
lr_predictions = lr_model.predict(X_test)
print("Linear Regression RMSE:", mean_squared_error(y_test, lr_predictions, squared=False))

# Predict and evaluate XGBoost model
xgb_predictions = xgb_model.predict(X_test)
print("XGBoost RMSE:", mean_squared_error(y_test, xgb_predictions, squared=False))


Linear Regression RMSE: 0.6243799281715618
XGBoost RMSE: 0.46819779004008183


