## Data Analysis and AI ML Model Predictor

 ##### Welcome! This project is titled: Predicting Ultra-Marathon Performance — A Data-Driven Approach Using Event-Specific Trends and Athlete Metadata

 Made by Bryan Lee Santoso

In [None]:
# Importing all necessary libraries
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import plotly.express as px
import streamlit as st
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the raw race records.
# The path is relative, so the CSV must be in the current working directory.
csv_path = "TWO_CENTURIES_OF_UM_RACES.csv"
df = pd.read_csv(csv_path)

In [None]:
# The full file is large (the author cites 7,461,226 race records from
# 1,641,168 athletes), so work on a reproducible 10% random sample instead.
sample_fraction = 0.10
df_sample = df.sample(frac=sample_fraction, random_state=42)

In [None]:
# Parse the event dates once, then derive the calendar features from the
# parsed values. Unparseable dates become NaT (errors='coerce').
# NOTE(review): the date format is inferred by pandas; if the source uses
# day-first dates, the month (and therefore the quarter) may be misread — confirm.
event_dates = pd.to_datetime(df_sample['Event dates'], errors='coerce')

df_sample['Event dates'] = event_dates
df_sample['Event_year'] = event_dates.dt.year
# Calendar quarter 1-4 (1 = Jan-Mar, 2 = Apr-Jun, ...), used as a season proxy.
df_sample['Event_season'] = event_dates.dt.quarter


In [None]:
# Clean up null values for data validity.
# Print the per-column null counts so the effect of the drop is visible
# (a bare expression that is not the last line of a cell is never displayed).
print(df_sample.isnull().sum())

# 'Athlete club' is discarded later in this notebook, so a missing club must
# not cost an otherwise complete race record: require every other column.
required_columns = df_sample.columns.difference(['Athlete club'])
df_sample = df_sample.dropna(subset=required_columns)

In [None]:
# reset_index returns a new frame; assign it back so the renumbered index is
# actually kept, then display the frame as the cell output.
df_sample = df_sample.reset_index(drop=True)
df_sample


In [None]:
# Keep only the first space-delimited token of each performance string.
# NOTE(review): everything after the first space is discarded — confirm that
# no performance value carries meaningful data in a later token.
performance_tokens = df_sample['Athlete performance'].str.split(' ')
df_sample['Athlete performance'] = performance_tokens.str[0]

In [None]:
# The club name is not used as a model input; remove the column.
df_sample = df_sample.drop(columns=['Athlete club'])

In [None]:
# reset_index returns a new frame; assign it back so the renumbered index is
# actually kept, then display the frame as the cell output.
df_sample = df_sample.reset_index(drop=True)
df_sample


In [None]:
# Count the missing values in every column and print the summary.
missing_per_column = df_sample.isna().sum()
print(missing_per_column)

In [None]:
# Standardise the athlete performance to total elapsed seconds.
# Strings that cannot be parsed as a duration become NaN (errors='coerce').
performance_text = df_sample['Athlete performance'].str.replace(' h', '')
performance_delta = pd.to_timedelta(performance_text, errors='coerce')
df_sample['Athlete performance'] = performance_delta.dt.total_seconds()

In [None]:
# Restrict the analysis to the two distance categories being modelled.
target_distances = ['50km', '100km']
is_target_distance = df_sample['Event distance/length'].isin(target_distances)
df_sample = df_sample[is_target_distance]

In [None]:
# Strip the 'km' unit from the distance label and store the number as a float.
distance_text = df_sample['Event distance/length'].str.replace('km', '')
df_sample['Event distance/length'] = distance_text.astype(float)

In [None]:
# Convert the average speed to a numeric dtype; values that are not valid
# numbers become NaN (errors='coerce').
numeric_speed = pd.to_numeric(df_sample['Athlete average speed'], errors='coerce')
df_sample['Athlete average speed'] = numeric_speed

In [None]:
# Print the remaining null counts (a bare expression that is not the last
# line of a cell is evaluated but never displayed).
print(df_sample.isnull().sum())

# Drop rows whose speed could not be converted to a number and rows whose
# performance could not be parsed into seconds. Both conversions in this
# notebook use errors='coerce', which turns unparseable values into NaN, and
# a NaN target cannot be used to train the regressor.
df_sample = df_sample.dropna(subset=['Athlete average speed', 'Athlete performance'])

In [None]:
# Athlete age at the event = event year minus year of birth.
event_year = df_sample['Year of event']
birth_year = df_sample['Athlete year of birth']
df_sample['Athlete age'] = event_year.sub(birth_year)

In [None]:
# Take the text inside the first pair of parentheses in the event name.
# NOTE(review): presumably this is a country code — verify against the data.
parenthesised_part = df_sample['Event name'].str.extract(r'\((.*?)\)', expand=False)
df_sample['Event country'] = parenthesised_part

In [None]:
# Report how many fully duplicated rows exist, then keep the first occurrence
# of each.
duplicate_count = df_sample.duplicated().sum()
print(duplicate_count)

df_sample = df_sample.drop_duplicates()

In [None]:
# Final check of the cleaned sample: print each column's dtype, then show the
# frame's (rows, columns) shape as the cell's displayed result.
print(df_sample.dtypes)
df_sample.shape

In [None]:
# Split the sample into its two race categories (50 km and 100 km) so each can
# be inspected and cleaned separately.
# .copy() makes each subset an independent frame: later cells assign to
# columns of df_50km / df_100km, and without the copy pandas treats that as
# writing to a slice of df_sample (SettingWithCopyWarning).
df_50km = df_sample[df_sample['Event distance/length'] == 50].copy()
df_100km = df_sample[df_sample['Event distance/length'] == 100].copy()

In [None]:
# One boxplot of finishing performance per distance category, each drawn as
# its own figure.
for distance_df in (df_50km, df_100km):
    sns.boxplot(x='Event distance/length', y='Athlete performance', data=distance_df)
    plt.show()

# Summary statistics of the 50km performances (displayed as the cell output).
df_50km['Athlete performance'].describe()

In [None]:
# The boxplots show outliers in athlete performance. Each distance category is
# examined separately before the two are merged back into one modelling set.

# 50km: quartiles, inter-quartile range, and the 1.5 * IQR fences.
quartiles_50 = df_50km['Athlete performance'].quantile([0.25, 0.75])
Q1_50 = quartiles_50.loc[0.25]
Q3_50 = quartiles_50.loc[0.75]
IQR_50 = Q3_50 - Q1_50

iqr_margin_50 = 1.5 * IQR_50
lower_bound_50 = Q1_50 - iqr_margin_50
upper_bound_50 = Q3_50 + iqr_margin_50

print(f"Lower Bound (50km): {lower_bound_50}, Upper Bound (50km): {upper_bound_50}")



In [None]:
# 100km: quartiles, inter-quartile range, and the 1.5 * IQR fences.
quartiles_100 = df_100km['Athlete performance'].quantile([0.25, 0.75])
Q1_100 = quartiles_100.loc[0.25]
Q3_100 = quartiles_100.loc[0.75]
IQR_100 = Q3_100 - Q1_100

iqr_margin_100 = 1.5 * IQR_100
lower_bound_100 = Q1_100 - iqr_margin_100
upper_bound_100 = Q3_100 + iqr_margin_100

print(f"Lower Bound (100km): {lower_bound_100}, Upper Bound (100km): {upper_bound_100}")



In [None]:
# Removing every point outside the IQR fences would shrink the dataset too
# much, so the 50km performances are winsorized instead: values below the 2nd
# percentile or above the 98th percentile are capped at those percentiles.
caps_50 = df_50km['Athlete performance'].quantile([0.02, 0.98])
lower_cap_50 = caps_50.loc[0.02]
upper_cap_50 = caps_50.loc[0.98]

capped_performance_50 = df_50km['Athlete performance'].clip(lower=lower_cap_50, upper=upper_cap_50)
df_50km['Athlete performance'] = capped_performance_50

In [None]:
# Redraw the 50km boxplot to inspect the distribution after capping, then
# preview the first 10 rows of the 50km subset as the cell output.
sns.boxplot(x='Event distance/length', y='Athlete performance', data=df_50km)
plt.show()
df_50km.head(10)

In [None]:
# Same treatment for the 100km performances: winsorize rather than delete,
# capping values at the 2nd and 98th percentiles of this category.
caps_100 = df_100km['Athlete performance'].quantile([0.02, 0.98])
lower_cap_100 = caps_100.loc[0.02]
upper_cap_100 = caps_100.loc[0.98]

capped_performance_100 = df_100km['Athlete performance'].clip(lower=lower_cap_100, upper=upper_cap_100)
df_100km['Athlete performance'] = capped_performance_100

In [None]:
# Redraw the 100km boxplot to inspect the distribution after capping, then
# preview the first 10 rows of the 100km subset as the cell output.
sns.boxplot(x='Event distance/length', y='Athlete performance', data=df_100km)
plt.show()
df_100km.head(10)

In [None]:
# Stack the two cleaned per-distance subsets back into one frame, with a
# fresh 0..n-1 row index.
cleaned_subsets = [df_50km, df_100km]
final_df = pd.concat(cleaned_subsets, ignore_index=True)

In [None]:
# Remove columns that are not used as model inputs.
unused_columns = [
    'Year of event',
    'Event dates',
    'Athlete year of birth',
    'Athlete average speed',
    'Athlete ID',
]
final_df = final_df.drop(columns=unused_columns)

In [None]:
# Preview the first 10 rows of the merged frame.
final_df.head(10)

In [None]:
# Drop the remaining columns that are not model inputs.
# Each label is listed exactly once.
final_df = final_df.drop(
    columns=['Event number of finishers', 'Athlete country', 'Athlete age category', 'Event_year']
)

In [None]:
# Keep only the part of the event name before the first '(' and trim the
# whitespace left behind by the removed parenthesised suffix, so that names
# compare equal to plain text such as 'Flatland Marathon' (no trailing space).
final_df['Event name'] = final_df['Event name'].str.split('(').str.get(0).str.strip()
# Preview the cleaned names; as the last expression this is what the cell shows.
final_df.head(10)

In [None]:
# Preview the first 10 rows of final_df after the event-name cleanup.
final_df.head(10)

In [None]:
# Beginning the ML model training
# Idea -> Given the athlete's age and gender, the event name, the event distance, and the event season, we predict the athlete's performance
# Constraint: the model only predicts for the kilometre-based race categories kept in this dataset (50 km and 100 km)

In [None]:
# Feature engineering via target encoding: compute the mean athlete
# performance for every event name.
performance_by_event = final_df.groupby('Event name')['Athlete performance']
event_performance_mean = performance_by_event.mean()

In [None]:
# Add a numeric column holding, for each row, the mean performance of its event.
encoded_event_name = final_df['Event name'].map(event_performance_mean)
final_df['Event name (encoded)'] = encoded_event_name
final_df.head(10)  # evaluated but not displayed: not the last expression of the cell
# The extracted country column is not used by the model.
final_df = final_df.drop(columns=['Event country'])


In [None]:
# Split first, then derive the encodings from the training portion only, so
# the event-name encoding never sees the test targets.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Separate the target from the features and hold out 20% of the rows.
target_column = 'Athlete performance'
y = final_df[target_column]
X = final_df.drop(columns=[target_column])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Target encoding: mean training performance per event name.
event_performance_mean = y_train.groupby(X_train['Event name']).mean()
# Fallback value for events that do not occur in the training rows.
global_mean = y_train.mean()

X_train['Event name (encoded)'] = X_train['Event name'].map(event_performance_mean)
# Unseen events map to NaN; replace those with the global training mean.
X_test['Event name (encoded)'] = X_test['Event name'].map(event_performance_mean).fillna(global_mean)

# The raw event name has served its purpose; only the encoded column is kept.
X_train = X_train.drop(columns=['Event name'])
X_test = X_test.drop(columns=['Event name'])

# Encode gender as integers; the encoder is fitted on the training rows and
# reused unchanged for the test rows.
label_encoder = LabelEncoder()
X_train['Athlete gender'] = label_encoder.fit_transform(X_train['Athlete gender'])
X_test['Athlete gender'] = label_encoder.transform(X_test['Athlete gender'])

final_df = final_df.drop(columns=['Event name'])


In [None]:
# Feature engineering: add non-linear and interaction terms.
# The same three columns are added, in the same order, to both splits.
for split_features in (X_train, X_test):
    # Squared age captures non-linear age effects.
    split_features['Age_squared'] = split_features['Athlete age'] ** 2
    # Interaction terms.
    split_features['Age_gender_interaction'] = split_features['Athlete age'] * split_features['Athlete gender']
    split_features['Distance_age_interaction'] = split_features['Event distance/length'] * split_features['Athlete age']

# Next step: re-train the model with these new features.

In [None]:
# The target (athlete performance, in seconds) is continuous, so this is a
# regression problem; start with a Random Forest regressor.

# Initialize the model (random_state is fixed so repeated runs give the same forest)
model = RandomForestRegressor(random_state=42)
# Train the model on the training set; as the last expression of the cell,
# the fitted estimator is also what the cell displays
model.fit(X_train, y_train)

In [None]:
# Predict performance for the held-out test rows with the Random Forest fitted above.
y_pred = model.predict(X_test)

In [None]:

# Model-improvement experiments: tune the Random Forest, try extra features,
# train a Gradient Boosting model, and compare every model on the test set.
# Everything this cell needs (metric functions, baseline scores, default-forest
# scores) is imported or computed here, so it runs on a fresh kernel.
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV


def score_predictions(y_true, y_predicted):
    """Return (R², MAE, MSE) for the given true and predicted target values."""
    return (
        r2_score(y_true, y_predicted),
        mean_absolute_error(y_true, y_predicted),
        mean_squared_error(y_true, y_predicted),
    )


def add_enhanced_features(features):
    """Return a copy of ``features`` with the extra non-linear and interaction columns.

    Age_group is stored as plain integer codes (0-3 for the four age bins,
    -1 for an age outside (0, 100]) so the estimator always receives numeric,
    NaN-free input for that column.
    """
    enhanced = features.copy()
    age = features['Athlete age']
    enhanced['Age_squared'] = age ** 2
    age_bin_codes = pd.cut(age, bins=[0, 30, 40, 50, 100], labels=False)
    enhanced['Age_group'] = age_bin_codes.fillna(-1).astype(int)
    enhanced['Distance_X_Age'] = features['Event distance/length'] * age
    enhanced['Gender_X_Age'] = features['Athlete gender'] * age
    return enhanced


print("IMPROVING RANDOM FOREST PERFORMANCE")
print("-" * 50)
print("Adding non-linear features and interactions...")

X_train_enhanced = add_enhanced_features(X_train)
X_test_enhanced = add_enhanced_features(X_test)

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2, 4]
}

rf_grid = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    scoring='r2'
)

# Reference points for the comparison table.
# Baseline: ordinary linear regression on the same training features.
baseline_model = LinearRegression().fit(X_train, y_train)
baseline_r2, baseline_mae, baseline_mse = score_predictions(y_test, baseline_model.predict(X_test))
# Default Random Forest: the `model` fitted earlier in the notebook.
r2, mae, mse = score_predictions(y_test, model.predict(X_test))

# "Original features" here means X_train exactly as the default model saw it.
print("\nTuning Random Forest with original features...")
rf_grid.fit(X_train, y_train)
best_rf_original = rf_grid.best_estimator_
best_rf_original_pred = best_rf_original.predict(X_test)
best_rf_original_r2, best_rf_original_mae, best_rf_original_mse = score_predictions(
    y_test, best_rf_original_pred
)

print(f"Best parameters: {rf_grid.best_params_}")
print(f"R² Score: {best_rf_original_r2}")
print(f"MAE: {best_rf_original_mae}")
print(f"MSE: {best_rf_original_mse}")

# Re-fitting the same search object builds a new best estimator, so
# best_rf_original keeps pointing at the model tuned on the original features.
print("\nTuning Random Forest with enhanced features...")
rf_grid.fit(X_train_enhanced, y_train)
best_rf_enhanced = rf_grid.best_estimator_
best_rf_enhanced_pred = best_rf_enhanced.predict(X_test_enhanced)
best_rf_enhanced_r2, best_rf_enhanced_mae, best_rf_enhanced_mse = score_predictions(
    y_test, best_rf_enhanced_pred
)

print(f"Best parameters: {rf_grid.best_params_}")
print(f"R² Score: {best_rf_enhanced_r2}")
print(f"MAE: {best_rf_enhanced_mae}")
print(f"MSE: {best_rf_enhanced_mse}")

gb_model = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    random_state=42
)
print("\nTraining Gradient Boosting with original features...")
gb_model.fit(X_train, y_train)
gb_pred = gb_model.predict(X_test)
gb_r2, gb_mae, gb_mse = score_predictions(y_test, gb_pred)

print(f"R² Score: {gb_r2}")
print(f"MAE: {gb_mae}")
print(f"MSE: {gb_mse}")

print("\nMODEL COMPARISON:")
print("-" * 50)
# Each entry is [R², MAE, MSE] measured on the held-out test set.
models = {
    "Linear Regression (Baseline)": [baseline_r2, baseline_mae, baseline_mse],
    "Random Forest (Default)": [r2, mae, mse],
    "Random Forest (Tuned)": [best_rf_original_r2, best_rf_original_mae, best_rf_original_mse],
    "Random Forest (Enhanced Features)": [best_rf_enhanced_r2, best_rf_enhanced_mae, best_rf_enhanced_mse],
    "Gradient Boosting": [gb_r2, gb_mae, gb_mse]
}

# XGBoost results are optional: include them only if they exist in this session.
try:
    models["XGBoost"] = [xgb_r2, xgb_mae, xgb_mse]
except NameError:
    pass

# Size the name column to the longest model name so the table stays aligned.
name_width = max(len(model_name) for model_name in models) + 2
print(f"{'Model':<{name_width}} {'R²':<10} {'MAE':<10} {'MSE':<15}")
print("-" * (name_width + 37))
for model_name, metrics in models.items():
    print(f"{model_name:<{name_width}} {metrics[0]:<10.4f} {metrics[1]:<10.2f} {metrics[2]:<15.2f}")

# The best model is the one with the highest R².
best_model = max(models.items(), key=lambda x: x[1][0])
print(f"\nBest model: {best_model[0]} with R² = {best_model[1][0]:.4f}")

plt.figure(figsize=(12, 6))
plt.bar(models.keys(), [m[0] for m in models.values()])
plt.xlabel('Model')
plt.ylabel('R² Score')
plt.title('Model Performance Comparison')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Pair every training column with the importance the fitted forest assigned
# to it, ordered from most to least important.
importance_table = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': model.feature_importances_
})
feature_importances = importance_table.sort_values(by='Importance', ascending=False)

# Plot feature importances as a horizontal bar chart.
fig, ax = plt.subplots()
ax.barh(feature_importances['Feature'], feature_importances['Importance'])
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Feature Importance')
plt.show()

In [None]:
# import joblib
# joblib.dump(model, 'athlete_performance_model.pkl')
# loaded_model = joblib.load('athlete_performance_model.pkl')

In [None]:
# Example user input. Keys use the same names as the training columns:
# 'Event_season' is the calendar quarter (1-4) derived from the event date.
user_input = {
    'Event name': 'Flatland Marathon',
    'Event distance/length': 50,
    'Event_season': 2,
    'Athlete age': 30,
    'Athlete gender': 'M'
}

# Preprocess the user input with the same steps that were applied to X_train.
user_input_encoded = user_input.copy()

# Target-encode the event name; unknown events fall back to the global training mean.
user_input_encoded['Event name (encoded)'] = event_performance_mean.get(
    user_input['Event name'], global_mean
)
user_input_encoded.pop('Event name')

# Encode gender with the encoder fitted on the training data.
user_input_encoded['Athlete gender'] = label_encoder.transform([user_input['Athlete gender']])[0]

# Recreate the engineered features the model was trained on; without them the
# reindex below would silently feed zeros to the model.
user_age = user_input_encoded['Athlete age']
user_input_encoded['Age_squared'] = user_age ** 2
user_input_encoded['Age_gender_interaction'] = user_age * user_input_encoded['Athlete gender']
user_input_encoded['Distance_age_interaction'] = user_input_encoded['Event distance/length'] * user_age

user_input_df = pd.DataFrame([user_input_encoded])

# Any training column still missing is zero-filled as before, but say so
# explicitly, because a zero-filled feature makes the prediction unreliable.
zero_filled_columns = [column for column in X_train.columns if column not in user_input_df.columns]
if zero_filled_columns:
    print(f"Warning: no value supplied for {zero_filled_columns}; using 0 for these features")

# Put the columns in exactly the order the model was trained with.
user_input_df = user_input_df.reindex(columns=X_train.columns, fill_value=0)

user_prediction = model.predict(user_input_df)
print(f"Predicted Athlete Performance: {user_prediction[0]}")