In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# Load the dataset
file_path = 'summerOly_athletes.csv'
athletes_data = pd.read_csv(file_path)

# Step 1: Data preprocessing
# Simplify Medal column to binary: Medal (1) vs No Medal (0)
athletes_data['Medal'] = athletes_data['Medal'].apply(lambda x: 1 if x != 'No medal' else 0)

# Encode categorical columns using LabelEncoder
categorical_cols = ['Sex', 'Team', 'NOC', 'City', 'Sport', 'Event']
label_encoders = {col: LabelEncoder() for col in categorical_cols}

for col in categorical_cols:
    athletes_data[col] = label_encoders[col].fit_transform(athletes_data[col])

# Select features and target variable
X = athletes_data[['Sex', 'Team', 'NOC', 'Year', 'City', 'Sport', 'Event']]
y = athletes_data['Medal']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 2: Train a Gradient Boosting Classifier
# Set class weight to balance the classes
gbdt_model = GradientBoostingClassifier(random_state=42)
gbdt_model.fit(X_train, y_train)

# Step 3: Evaluate the model on the test set
y_pred = gbdt_model.predict(X_test)
report = classification_report(y_test, y_pred, target_names=['No Medal', 'Medal'])

# Output evaluation results
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

    No Medal       0.86      0.99      0.92     42749
       Medal       0.72      0.07      0.14      7764

    accuracy                           0.85     50513
   macro avg       0.79      0.53      0.53     50513
weighted avg       0.83      0.85      0.80     50513

