# MotoGP Lap Time Prediction

This notebook demonstrates a complete workflow for predicting MotoGP lap times using machine learning. It covers data loading, EDA, preprocessing, model training, evaluation, feature importance, hyperparameter tuning, and submission file creation.

## 1. Import Libraries and Load Data

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBRegressor
# Install a blanket "ignore" filter so no warnings are shown in cell output.
# NOTE(review): this also hides deprecation and other library warnings raised
# by the cells below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Plot defaults applied to every figure drawn later in the notebook.
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

In [None]:
# Read the train and test CSVs (in that order) from the working directory.
train_data, test_data = map(pd.read_csv, ('train.csv', 'test.csv'))

# Quick sanity check on the (rows, columns) of each split.
print("Training data shape:", train_data.shape)
print("Test data shape:", test_data.shape)

## 2. Exploratory Data Analysis (EDA)

In [None]:
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called on its own: wrapping it in print() would append a stray
# "None" line after each report.
print("Training Data Info:")
train_data.info()
print("\nTest Data Info:")
test_data.info()

In [None]:
# Per-column count of missing entries, first for the training split and
# then for the test split.
print("Missing values in training data:")
print(train_data.isna().sum())
print("\nMissing values in test data:")
print(test_data.isna().sum())

In [None]:
# Histogram of the target column, drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(train_data['Lap_Time_Seconds'], bins=50, ax=ax)
ax.set_title('Distribution of Lap Times')
ax.set_xlabel('Lap Time (seconds)')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Pairwise correlations between all numeric columns (target included).
numeric_cols = train_data.select_dtypes(include=[np.number]).columns
correlation_matrix = train_data.loc[:, numeric_cols].corr()

# Diverging colormap centred on 0 so positive and negative correlations
# are visually symmetric.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

## 3. Data Preprocessing

In [None]:
# Split the column names by dtype: object columns are treated as
# categorical, numeric columns as numerical features.
categorical_cols = train_data.select_dtypes(include=['object']).columns
numerical_cols = (
    train_data
    .select_dtypes(include=[np.number])
    .columns
    .drop('Lap_Time_Seconds')  # the target must not be used as a feature
)
print("Categorical columns:", list(categorical_cols))
print("Numerical columns:", list(numerical_cols))

In [None]:
# Column-wise preprocessing: standardise numeric features and one-hot encode
# categorical ones; categories unseen at fit time are ignored at transform time.
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
])

# Separate the target from the features, then hold out 20% for validation.
y = train_data['Lap_Time_Seconds']
X = train_data.drop(columns='Lap_Time_Seconds')
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

## 4. Model Training and Evaluation

In [None]:
# Candidate regressors, all compared on the same train/validation split.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42),
}

# results maps model name -> {'RMSE', 'MAE', 'R2'} measured on the validation split.
results = {}
for name, model in models.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    # Pipeline.fit returns the pipeline itself, so fit and predict can be chained.
    y_pred = pipeline.fit(X_train, y_train).predict(X_val)
    rmse, mae, r2 = (
        np.sqrt(mean_squared_error(y_val, y_pred)),
        mean_absolute_error(y_val, y_pred),
        r2_score(y_val, y_pred),
    )
    results[name] = dict(RMSE=rmse, MAE=mae, R2=r2)
    print(f'\n{name} Results:')
    print(f'RMSE: {rmse:.4f}')
    print(f'MAE: {mae:.4f}')
    print(f'R2 Score: {r2:.4f}')

## 5. Feature Importance Analysis (XGBoost)

In [None]:
# Fit XGBoost on the training split (X_train) to inspect feature importances.
# The preprocessor is cloned because Pipeline.fit fits its steps in place:
# sharing one ColumnTransformer instance between pipelines would let a later
# fit elsewhere change the encoder underneath this already-fitted model.
xgb_pipeline = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('model', XGBRegressor(n_estimators=100, random_state=42))
])
xgb_pipeline.fit(X_train, y_train)

# Rebuild the transformed column names in ColumnTransformer order:
# the numeric columns first, then the one-hot encoded categorical columns.
ohe = xgb_pipeline.named_steps['preprocessor'].named_transformers_['cat']
cat_features = ohe.get_feature_names_out(categorical_cols)
feature_names = np.concatenate([numerical_cols, cat_features])
importances = xgb_pipeline.named_steps['model'].feature_importances_
assert len(feature_names) == len(importances), (
    "feature names and importances are misaligned"
)

# Keep only the 15 highest-scoring features for the plot.
feat_imp = pd.DataFrame({'feature': feature_names, 'importance': importances})
feat_imp = feat_imp.sort_values('importance', ascending=False).head(15)
plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feat_imp)
plt.title('Top 15 Most Important Features (XGBoost)')
plt.show()

## 6. Hyperparameter Tuning (XGBoost)

In [None]:
# Search space for the XGBoost step. The 'model__' prefix routes each
# parameter to the pipeline step named 'model'.
param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [3, 5],
    'model__learning_rate': [0.01, 0.1],
    'model__subsample': [0.8, 1.0],
    'model__colsample_bytree': [0.8, 1.0],
}
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(random_state=42)),
])

# Exhaustive 3-fold cross-validated search on the training split, using all cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print('Best parameters:', grid_search.best_params_)
# The score is a negated MSE, so flip the sign before taking the root.
print('Best RMSE:', np.sqrt(-grid_search.best_score_))

## 7. Final Model Training and Predictions

In [None]:
best_params = grid_search.best_params_
# Grid-search keys carry the pipeline step prefix ('model__'); strip it so
# they can be passed directly to XGBRegressor as keyword arguments.
xgb_params = {k.replace('model__', ''): v for k, v in best_params.items()}

# Clone the preprocessor: Pipeline.fit fits its steps in place, so reusing the
# shared instance here would refit it on the full data and silently change the
# encoder inside every earlier pipeline that still references it.
final_model = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('model', XGBRegressor(**xgb_params, random_state=42))
])

# Train on the complete training set (train + validation rows), then predict
# the test set and write the predictions out as the submission file.
final_model.fit(X, y)
test_predictions = final_model.predict(test_data)
submission = pd.DataFrame({'Lap_Time_Seconds': test_predictions})
submission.to_csv('submission.csv', index=False)
print('Submission file created successfully!')

## 8. Submission File Validation

In [None]:
def validate_submission(submission_file, test_file):
    """Check that a submission CSV is well-formed for the given test CSV.

    The checks run in order and stop at the first failure, printing the reason:
    the 'Lap_Time_Seconds' column exists, the row count matches the test file,
    and the predictions contain no missing, non-numeric, infinite or
    non-positive values.

    Parameters
    ----------
    submission_file : str or path-like
        Path of the submission CSV to validate.
    test_file : str or path-like
        Path of the test CSV the submission must line up with row-for-row.

    Returns
    -------
    bool
        True if every check passes, False otherwise.
    """
    submission = pd.read_csv(submission_file)
    test = pd.read_csv(test_file)
    if 'Lap_Time_Seconds' not in submission.columns:
        print("Error: 'Lap_Time_Seconds' column missing in submission file")
        return False
    lap_times = submission['Lap_Time_Seconds']
    if len(submission) != len(test):
        print(f"Error: Number of rows mismatch. Expected {len(test)}, got {len(submission)}")
        return False
    if lap_times.isnull().any():
        print("Error: Missing values found in predictions")
        return False
    # A text column would make the numeric comparisons below raise TypeError,
    # so report it as a validation failure instead.
    if not pd.api.types.is_numeric_dtype(lap_times):
        print("Error: 'Lap_Time_Seconds' column is not numeric")
        return False
    # isnull() does not flag +/-inf, and +inf also passes the `<= 0` check.
    if np.isinf(lap_times).any():
        print("Error: Found infinite lap times")
        return False
    if (lap_times <= 0).any():
        print("Error: Found non-positive lap times")
        return False
    print("Submission file validation passed!")
    return True
# Validate the submission written to 'submission.csv' earlier in this notebook
# against the 'test.csv' file the test data was loaded from.
validate_submission('submission.csv', 'test.csv')