In [46]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
import pickle



In [47]:
# Read the vehicle records from a CSV file in the working directory.
# Replace the file name below with your actual dataset file.
csv_path = 'vehicle_data.csv'
data = pd.read_csv(csv_path)

In [48]:
# Discard every row that contains at least one missing value.
# (Imputation is the alternative if dropping rows loses too much data.)
data = data.dropna(axis=0, how='any')

In [49]:
# Preprocessing: replace each categorical column with integer codes.
# One fitted LabelEncoder is kept per column in `label_encoders`, keyed by
# the column name.
categorical_columns = ['Vehicle Class']
label_encoders = {}
for col in categorical_columns:
    encoder = LabelEncoder()
    data[col] = encoder.fit_transform(data[col])
    label_encoders[col] = encoder

In [50]:
# Encode categorical columns, skipping any column that already has a fitted
# encoder and holds integer codes. Re-fitting a fresh LabelEncoder on the
# codes would leave the data unchanged but replace the stored encoder with
# one whose classes_ are the integers themselves, losing the original
# category names needed to encode new inputs or decode predictions.
for col in categorical_columns:
    if col in label_encoders and pd.api.types.is_integer_dtype(data[col]):
        continue  # already encoded; keep the encoder fitted on the raw labels
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])


In [51]:
# Name of the response column and of the predictor columns
# (adjust both to match your dataset).
target = 'CO2 Emissions(g/km)'
features = [
    'Engine Size(L)',
    'Cylinders',
    'Fuel Consumption City (L/100 km)',
    'Fuel Consumption Hwy (L/100 km)',
    'Fuel Consumption Comb (L/100 km)',
    'Vehicle Class',
]

# Feature matrix (X) and target vector (y).
X = data.loc[:, features]
y = data.loc[:, target]


In [52]:
# Standardize the features (optional but recommended for MLR models).
#
# The scaler's mean and variance are estimated from the training rows only,
# so the held-out test rows do not influence preprocessing (avoids data
# leakage into the test metrics and into the saved scaler).
#
# The row selection uses the same test_size and random_state as the
# train/test split performed on X_scaled afterwards; with the same number of
# rows and the same seed it selects exactly the same training rows.
# Keep these two values in sync with that split.
train_rows, held_out_rows = train_test_split(
    np.arange(len(X)), test_size=0.2, random_state=42
)
scaler = StandardScaler()
scaler.fit(X.iloc[train_rows])

# All rows are transformed with the training-set statistics.
X_scaled = scaler.transform(X)

In [53]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, **split_options)


In [54]:
# 5-fold cross-validation of a plain linear regression on the training split.
# The scorer returns negated MSE per fold, so the mean is negated again
# before taking the square root.
cross_val = cross_val_score(
    estimator=LinearRegression(),
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
cv_rmse = np.sqrt(-cross_val.mean())
print(f"Cross-Validation Mean RMSE: {cv_rmse:.2f}")


Cross-Validation Mean RMSE: 19.92


In [55]:
# Create the multiple linear regression (MLR) estimator and fit it on the
# training split.
mlr_model = LinearRegression()
mlr_model.fit(X=X_train, y=y_train)

In [56]:
# Predict with the fitted MLR model on the training split and on the test
# split, in that order.
y_train_pred, y_test_pred = (
    mlr_model.predict(split) for split in (X_train, X_test)
)

In [57]:
# Evaluate the model on both splits.
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`:
# that argument of mean_squared_error was deprecated in scikit-learn 1.4 and
# removed in 1.6, where it raises a TypeError. The square-root form works on
# every version.
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
train_r2 = r2_score(y_train, y_train_pred)

test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_r2 = r2_score(y_test, y_test_pred)

print(f"Training RMSE: {train_rmse:.2f}, R²: {train_r2:.2f}")
print(f"Testing RMSE: {test_rmse:.2f}, R²: {test_r2:.2f}")

Training RMSE: 19.88, R²: 0.88
Testing RMSE: 19.94, R²: 0.88




In [58]:
# Tune the Ridge regularization strength (alpha) with a 5-fold
# cross-validated grid search scored by negated MSE.
ridge = Ridge()
param_grid = {'alpha': [0.1, 1, 10, 100]}

grid_search = GridSearchCV(
    estimator=ridge,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Keep the estimator selected by the search for later prediction and saving.
best_model = grid_search.best_estimator_
print(f"Best Ridge Model Hyperparameters: {grid_search.best_params_}")

Best Ridge Model Hyperparameters: {'alpha': 10}


In [59]:
# Predict on the held-out test split with the Ridge model chosen by the
# grid search.
y_test_pred_tuned = best_model.predict(X=X_test)

In [60]:
# Evaluate the tuned model on the test split.
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`:
# that argument of mean_squared_error was deprecated in scikit-learn 1.4 and
# removed in 1.6, where it raises a TypeError.
tuned_test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred_tuned))
tuned_test_r2 = r2_score(y_test, y_test_pred_tuned)
print(f"Tuned Model Testing RMSE: {tuned_test_rmse:.2f}, R²: {tuned_test_r2:.2f}")

Tuned Model Testing RMSE: 19.95, R²: 0.88




In [61]:
# Persist the fitted linear model and the fitted scaler as pickle files in
# the working directory (model first, then scaler).
for filename, artifact in (('mlr_model.pkl', mlr_model), ('scaler.pkl', scaler)):
    with open(filename, 'wb') as file:
        pickle.dump(artifact, file)


In [62]:
# Persist the tuned Ridge model as a pickle file in the working directory,
# then report that all three artifacts have been written.
tuned_model_path = 'ridge_tuned_model.pkl'
with open(tuned_model_path, 'wb') as file:
    file.write(pickle.dumps(best_model))

print("MLR Model, Scaler, and Tuned Ridge Model saved!")

MLR Model, Scaler, and Tuned Ridge Model saved!
