In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Load datasets
df_train = pd.read_csv('/kaggle/input/test-sample/train.csv')
df_val = pd.read_csv('/kaggle/input/test-sample/val.csv')
submission_df = pd.read_csv('/kaggle/input/test-sample1/sample_submission.csv')

# Combine train and val for training
df_combined = pd.concat([df_train, df_val], ignore_index=True)

# Drop unnecessary columns
drop_cols = ['team_name', 'shortname', 'bike_name', 'circuit_name', 'rider_name']
df_combined = df_combined.drop(columns=drop_cols, errors='ignore')

# Fill missing 'Penalty' values if exists
if 'Penalty' in df_combined.columns:
    df_combined['Penalty'] = df_combined['Penalty'].fillna('None')

# Normalize 'Unique ID'
df_combined['Unique ID'] = df_combined['Unique ID'].astype(str).str.strip().str.upper()
submission_df['Unique ID'] = submission_df['Unique ID'].astype(str).str.strip().str.upper()

# Detect lap time column
lap_time_cols = [col for col in df_combined.columns if 'lap' in col.lower() and 'time' in col.lower()]
if not lap_time_cols:
    raise ValueError("❌ No Lap Time column found!")
target_col = lap_time_cols[0]
print(f"✅ Detected lap time column: {target_col}")

# Encode categorical features in training data
df_encoded = df_combined.copy()
for col in df_encoded.select_dtypes(include='object').columns:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col].astype(str))

# Separate features and target
X_train = df_encoded.drop(columns=[target_col])
y_train = df_encoded[target_col]

# Train model
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)

# Evaluate RMSE on training set
train_preds = model.predict(X_train)
rmse = np.sqrt(mean_squared_error(y_train, train_preds))
print(f"📊 RMSE on combined training data: {rmse:.4f}")

# Prepare submission features
submission_features = submission_df.copy()

# Drop non-feature columns
submission_features = submission_features.drop(columns=drop_cols + [target_col], errors='ignore')

# Fill missing Penalty if exists in training
if 'Penalty' in X_train.columns and 'Penalty' not in submission_features.columns:
    submission_features['Penalty'] = 'None'

# Encode categorical features
for col in submission_features.select_dtypes(include='object').columns:
    le = LabelEncoder()
    submission_features[col] = le.fit_transform(submission_features[col].astype(str))

# Align columns with training features
missing_cols = set(X_train.columns) - set(submission_features.columns)
for col in missing_cols:
    submission_features[col] = 0  # fill missing with 0

submission_features = submission_features[X_train.columns]  # ensure same order

# Predict on submission data
submission_preds = model.predict(submission_features)

# Save to CSV
output_df = pd.DataFrame({
    'Unique ID': submission_df['Unique ID'],
    target_col: submission_preds
}).drop_duplicates(subset='Unique ID')

output_df.to_csv('solution.csv', index=False)
print("✅ Saved predictions to 'solution.csv'")


✅ Detected lap time column: Lap_Time_Seconds
📊 RMSE on combined training data: 0.0000
✅ Saved predictions to 'solution.csv'
