# MotoGP Lap Time Prediction - Model Improvement

This notebook focuses on improving the model performance through feature importance analysis and hyperparameter tuning.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.feature_selection import SelectFromModel
import warnings
warnings.filterwarnings('ignore')

## Load and Prepare Data

In [None]:
# 1. Load Data
# Read the training and test sets from CSV files in the working directory
# and report their dimensions as a quick sanity check.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
print('Train shape:', train_data.shape)
print('Test shape:', test_data.shape)

## Data Preprocessing

In [None]:
# 2. Outlier Handling (IQR Method)
# Keep only rows whose target lies within the Tukey fences
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]; both bounds are inclusive.
Q1 = train_data['Lap_Time_Seconds'].quantile(0.25)
Q3 = train_data['Lap_Time_Seconds'].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
# The result is a new frame; `train_data` itself is left unmodified.
filtered_train = train_data[(train_data['Lap_Time_Seconds'] >= lower) & (train_data['Lap_Time_Seconds'] <= upper)]
print(f'Original train size: {len(train_data)}, After outlier removal: {len(filtered_train)}')

## Preprocessing and Train/Validation Split

In [None]:
# 3. Preprocessing and Feature Selection
# Split the columns by dtype: object columns are one-hot encoded, numeric
# columns (excluding the target) are standardized.
categorical_cols = filtered_train.select_dtypes(include=['object']).columns
numerical_cols = filtered_train.select_dtypes(include=[np.number]).columns.drop('Lap_Time_Seconds')
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),
    # handle_unknown='ignore' lets categories unseen during fit encode as all zeros
    # instead of raising at transform time.
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
])
X = filtered_train.drop('Lap_Time_Seconds', axis=1)
y = filtered_train['Lap_Time_Seconds']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
# Fit the transformer on the training split only, then reuse the fitted
# statistics for validation and test so no information leaks from them.
X_train_prep = preprocessor.fit_transform(X_train)
X_val_prep = preprocessor.transform(X_val)
# NOTE(review): assumes test_data contains every feature column present in X - confirm.
test_prep = preprocessor.transform(test_data)

## Feature Selection

In [None]:
# Feature selection using XGBoost
# Fit a baseline XGBoost model on the training split and use its feature
# importances to decide which transformed columns to keep.
xgb_selector = XGBRegressor(n_estimators=100, random_state=42)
xgb_selector.fit(X_train_prep, y_train)
# prefit=True reuses the already-fitted estimator; threshold='median' keeps
# the features whose importance is at or above the median importance.
selector = SelectFromModel(xgb_selector, prefit=True, threshold='median')
# Apply the same column mask to every split so the feature sets stay aligned.
X_train_sel = selector.transform(X_train_prep)
X_val_sel = selector.transform(X_val_prep)
test_sel = selector.transform(test_prep)
print('Selected features:', X_train_sel.shape[1])

## Model Training and Evaluation

In [None]:
# 4. Model Training: XGBoost and LightGBM
# Train both gradient-boosting models on the selected features with matching
# hyperparameters, then compare their RMSE on the validation split.
xgb = XGBRegressor(n_estimators=300, learning_rate=0.05, max_depth=7, subsample=0.9, colsample_bytree=0.9, random_state=42)
lgbm = LGBMRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=7,
    subsample=0.9,
    # LightGBM only applies `subsample` when `subsample_freq` > 0 (default 0
    # disables bagging); resample every iteration so subsample=0.9 takes effect.
    subsample_freq=1,
    colsample_bytree=0.9,
    random_state=42,
)
xgb.fit(X_train_sel, y_train)
lgbm.fit(X_train_sel, y_train)
xgb_pred = xgb.predict(X_val_sel)
lgbm_pred = lgbm.predict(X_val_sel)
# RMSE = sqrt(MSE), reported in the same unit as the target column.
print('XGBoost RMSE:', np.sqrt(mean_squared_error(y_val, xgb_pred)))
print('LightGBM RMSE:', np.sqrt(mean_squared_error(y_val, lgbm_pred)))

## Ensemble Evaluation on the Validation Set

In [None]:
# 5. Ensemble (Averaging)
# Average the two models' validation predictions with equal weights and
# score the blended prediction with RMSE against the validation targets.
ensemble_pred = (xgb_pred + lgbm_pred) / 2
ensemble_rmse = np.sqrt(mean_squared_error(y_val, ensemble_pred))
print('Ensemble RMSE:', ensemble_rmse)