In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor
from scipy.stats import uniform, randint

In [6]:
# Read the previously cleaned dataset from the working directory
df_processed = pd.read_csv('df_cleaned.csv')

# Sanity check: print the first five rows of the loaded frame
print(df_processed.head(5))


   Item_Fat_Content_Regular  Item_Type_Breads  Item_Type_Breakfast  \
0                       0.0               0.0                  0.0   
1                       1.0               0.0                  0.0   
2                       0.0               0.0                  0.0   
3                       1.0               0.0                  0.0   
4                       0.0               0.0                  0.0   

   Item_Type_Canned  Item_Type_Dairy  Item_Type_Frozen Foods  \
0               0.0              1.0                     0.0   
1               0.0              0.0                     0.0   
2               0.0              0.0                     0.0   
3               0.0              0.0                     0.0   
4               0.0              0.0                     0.0   

   Item_Type_Fruits and Vegetables  Item_Type_Hard Drinks  \
0                              0.0                    0.0   
1                              0.0                    0.0   
2          

In [8]:
# Candidate regressors keyed by display name; seeds are fixed wherever the
# estimator accepts one so repeated runs give the same scores.
models = {}
models["Linear Regression"] = LinearRegression()
models["Ridge Regression"] = Ridge(alpha=1.0, random_state=42)
models["Lasso Regression"] = Lasso(alpha=0.1, random_state=42)
models["Random Forest"] = RandomForestRegressor(n_estimators=100, random_state=42)
models["XGBoost"] = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    random_state=42,
)

# Function to evaluate models using cross-validation
def evaluate_models(models, X, y, cv=5, scoring='r2'):
    """Cross-validate every model and summarize its fold scores.

    Parameters
    ----------
    models : dict
        Mapping of display name -> estimator; each estimator is passed
        to ``cross_val_score`` as-is.
    X, y
        Features and target forwarded unchanged to ``cross_val_score``.
    cv : int or CV splitter, default 5
        Cross-validation strategy forwarded to ``cross_val_score``.
    scoring : str or callable, default 'r2'
        Metric forwarded to ``cross_val_score``.

    Returns
    -------
    dict
        ``{name: {"Mean R²": <mean of fold scores>,
        "Standard Deviation": <std of fold scores>}}``.
        The "Mean R²" key is kept even when ``scoring`` is not 'r2' so
        that existing callers reading that key keep working.
    """
    results = {}
    for name, model in models.items():
        # One score per fold; cv and scoring default to the original 5-fold R²
        cv_scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
        results[name] = {
            "Mean R²": cv_scores.mean(),
            "Standard Deviation": cv_scores.std()
        }
    return results

# Cross-validate every candidate on X_train / y_train
model_results = evaluate_models(models, X_train, y_train)

# One summary line per model: mean and spread of its fold scores
for model, metrics in model_results.items():
    print(
        f"{model}: Mean R² = {metrics['Mean R²']:.4f}, "
        f"Std Dev = {metrics['Standard Deviation']:.4f}"
    )

# The winner is the entry with the largest mean R² (first one wins on ties)
best_model_name = max(
    model_results.items(),
    key=lambda entry: entry[1]["Mean R²"],
)[0]
print(f"\nBest Model: {best_model_name}")

Linear Regression: Mean R² = 0.7660, Std Dev = 0.0107
Ridge Regression: Mean R² = 0.7660, Std Dev = 0.0107
Lasso Regression: Mean R² = 0.4729, Std Dev = 0.0134
Random Forest: Mean R² = 0.8729, Std Dev = 0.0068
XGBoost: Mean R² = 0.8557, Std Dev = 0.0068

Best Model: Random Forest


In [9]:
# Hyperparameter tuning: candidate values for the random forest grid search
# (3 * 4 * 3 * 3 = 108 combinations)
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)


In [10]:
# Base estimator handed to the grid search below; the fixed seed keeps fits reproducible
rf = RandomForestRegressor(random_state=42)

In [11]:
# Exhaustive search over param_grid_rf: 3-fold CV, scored by R², using all cores
grid_search_rf = GridSearchCV(rf, param_grid_rf, scoring='r2', n_jobs=-1, cv=3)
grid_search_rf.fit(X_train, y_train)

# Keep the winning parameter combination and the estimator refit with it
best_rf_params = grid_search_rf.best_params_
best_rf_model = grid_search_rf.best_estimator_

In [12]:
# Predict on X_test with the best estimator found by the grid search
y_pred_rf = best_rf_model.predict(X_test)


In [13]:
# Score the tuned forest's predictions against y_test
r2_rf = r2_score(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)

print(f"Random Forest Best Model -  R²: {r2_rf} , Mean Squared Error: {mse_rf}")

# Show the hyperparameter combination the grid search selected
print("\n--- Best Hyperparameters ---")
print(f"Random Forest: {best_rf_params}")

Random Forest Best Model -  R²: 0.8802292268046391 , Mean Squared Error: 0.11179801474309381

--- Best Hyperparameters ---
Random Forest: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
