In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
import warnings
warnings.filterwarnings('ignore')

In [5]:
# Reading the Dataset :
train_df = pd.read_csv(r"E:\Great learnings\Projects\fuel prediction data\Train.csv")
test_df = pd.read_csv(r"E:\Great learnings\Projects\fuel prediction data\Test.csv")
submission_df = pd.read_csv(r"E:\Great learnings\Projects\fuel prediction data\Submission.csv")

In [7]:
# Separate features and target
X_train = train_df.drop(columns=['fuel_efficiency_kmpl'])
y_train = train_df['fuel_efficiency_kmpl']
X_test = test_df.copy()

In [9]:
# Define columns
categorical_cols = ['fuel_category', 'car_brand', 'transmission_type', 'exterior_color']
numerical_cols = ['engine_capacity_cc', 'owner_count']

In [11]:
# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols),
        ('num', StandardScaler(), numerical_cols)
    ])

In [15]:
# Define models to evaluate
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
    'LightGBM': LGBMRegressor(random_state=42, verbose=-1),
    'SVR': SVR()}

In [17]:
# Evaluate models using cross-validation
results = []
for name, model in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

In [21]:
# Cross-validated R²
cv_r2 = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='r2').mean()
# Fit model to compute additional metrics
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_train)
mse = mean_squared_error(y_train, y_pred)
mape = mean_absolute_percentage_error(y_train, y_pred)
results.append({
    'Model': name,
    'CV R²': cv_r2,
    'Train MSE': mse,
    'Train MAPE': mape})
print(f"{name}: CV R² = {cv_r2:.4f}, Train MSE = {mse:.4f}, Train MAPE = {mape:.4f}")


SVR: CV R² = -0.0183, Train MSE = 23.1061, Train MAPE = 0.2687


In [22]:
# Convert results to DataFrame
results_df = pd.DataFrame(results)
print("\nModel Comparison:")
print(results_df.sort_values(by='CV R²', ascending=False))


Model Comparison:
  Model     CV R²  Train MSE  Train MAPE
0   SVR -0.018327  23.106063    0.268672


In [23]:
# Select the best model based on CV R²
best_model_name = results_df.loc[results_df['CV R²'].idxmax(), 'Model']
print(f"\nBest Model: {best_model_name}")


Best Model: SVR


In [24]:
# Hyperparameter tuning for top models (Random Forest, XGBoost, LightGBM, Gradient Boosting)
tuning_params = {
    'Random Forest': {
        'regressor__n_estimators': [100, 200],
        'regressor__max_depth': [None, 10, 20]
    },
    'Gradient Boosting': {
        'regressor__n_estimators': [100, 200],
        'regressor__learning_rate': [0.01, 0.1],
        'regressor__max_depth': [3, 5]
    },
    'XGBoost': {
        'regressor__n_estimators': [100, 200],
        'regressor__learning_rate': [0.01, 0.1],
        'regressor__max_depth': [3, 5]
    },
    'LightGBM': {
        'regressor__n_estimators': [100, 200],
        'regressor__learning_rate': [0.01, 0.1],
        'regressor__max_depth': [3, 5]
    }
}

if best_model_name in tuning_params:
    print(f"\nTuning hyperparameters for {best_model_name}...")
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', models[best_model_name])
    ])
    grid_search = GridSearchCV(
        pipeline,
        tuning_params[best_model_name],
        cv=5,
        scoring='r2',
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    best_cv_r2 = grid_search.best_score_
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Tuned CV R²: {best_cv_r2:.4f}")
else:
    best_model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', models[best_model_name])
    ])
    best_model.fit(X_train, y_train)
    best_cv_r2 = results_df.loc[results_df['Model'] == best_model_name, 'CV R²'].values[0]

In [25]:
# Final evaluation on training data
y_pred = best_model.predict(X_train)
final_r2 = r2_score(y_train, y_pred)
final_mse = mean_squared_error(y_train, y_pred)
final_mape = mean_absolute_percentage_error(y_train, y_pred)
print(f"\nFinal {best_model_name} Performance on Training Data:")
print(f"R²: {final_r2:.4f}")
print(f"MSE: {final_mse:.4f}")
print(f"MAPE: {final_mape:.4f}")


Final SVR Performance on Training Data:
R²: 0.0263
MSE: 23.1061
MAPE: 0.2687


In [26]:
# Generate predictions on test data
predictions = best_model.predict(X_test)
output_df = test_df.copy()
output_df['predicted_fuel_efficiency_kmpl'] = predictions
output_df.to_csv('Test_predictions_best_model.csv', index=False)
print("\nTest Predictions (first 5 rows):")
print(output_df.head())


Test Predictions (first 5 rows):
   engine_capacity_cc fuel_category  owner_count car_brand transmission_type  \
0                1500      Electric            3       Kia         Automatic   
1                1000        Diesel            3      Ford         Automatic   
2                 800        Petrol            5    Nissan         Automatic   
3                1500      Electric            5     Tesla            Manual   
4                3000        Petrol            3      Ford            Manual   

  exterior_color  fuel_efficiency_kmpl  predicted_fuel_efficiency_kmpl  
0          Black                   NaN                       17.424911  
1         Silver                   NaN                       18.069015  
2          Black                   NaN                       17.582531  
3           Blue                   NaN                       19.008192  
4         Silver                   NaN                       18.787700  


In [27]:
# Save model performance summary
results_df.to_csv('model_performance_summary.csv', index=False)