In [1]:
# Importing necessary libraries for data manipulation, model training, and evaluation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score, mean_squared_error
from xgboost import XGBRegressor
from scipy.stats import uniform, randint

In [2]:
# Load the CSV file
df_processed = pd.read_csv('df_cleaned.csv')

# Verify that the data loaded correctly
print(df_processed.head())


   Item_Fat_Content_Regular  Item_Type_Breads  Item_Type_Breakfast  \
0                       0.0               0.0                  0.0   
1                       1.0               0.0                  0.0   
2                       0.0               0.0                  0.0   
3                       1.0               0.0                  0.0   
4                       0.0               0.0                  0.0   

   Item_Type_Canned  Item_Type_Dairy  Item_Type_Frozen Foods  \
0               0.0              1.0                     0.0   
1               0.0              0.0                     0.0   
2               0.0              0.0                     0.0   
3               0.0              0.0                     0.0   
4               0.0              0.0                     0.0   

   Item_Type_Fruits and Vegetables  Item_Type_Hard Drinks  \
0                              0.0                    0.0   
1                              0.0                    0.0   
2          

In [3]:
# Separating features (X) and target variable (y)
X = df_processed.drop('Item_Outlet_Sales', axis=1)  # Features
y = df_processed['Item_Outlet_Sales']  # Target

In [4]:
# Splitting the data into training and test sets (80% train, 20% test)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Using RandomForestRegressor to determine feature importance
feature_selector = RandomForestRegressor(n_estimators=100, random_state=42)
feature_selector.fit(x_train, y_train)

# Selecting the top 10 important features based on feature importance scores
feature_importances = pd.Series(feature_selector.feature_importances_, index=X.columns)
top_features = feature_importances.nlargest(10).index
X_top = X[top_features]

x_train, x_test = x_train[top_features], x_test[top_features]

In [6]:
# Defining multiple regression models for evaluation
models = {
    "Linear Regression": LinearRegression(),
    "Polynomial Regression": make_pipeline(PolynomialFeatures(degree=2), LinearRegression()),
    "Decision Tree": DecisionTreeRegressor(random_state=2),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Support Vector Regression": make_pipeline(StandardScaler(), SVR(kernel='linear')),
    "KNN Regression": KNeighborsRegressor(n_neighbors=5),
    "XGBoost Regression": XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, verbosity=0),
}

# Setting up to perform repeated train-test splits for model stability
n_runs = 10
results = {}

# Loop for each model evaluation
for model_name, model in models.items():
    test_r2_scores = []
    
    # Repeating train-test split for stability in R² scores
    for i in range(n_runs):
        x_train, x_test, y_train, y_test = train_test_split(X_top, y, test_size=0.2, random_state=i)
        
        model.fit(x_train, y_train)
        y_test_pred = model.predict(x_test)
       
        test_r2 = r2_score(y_test, y_test_pred)
        test_r2_scores.append(test_r2)
    
    mean_r2 = np.mean(test_r2_scores)
    std_dev_r2 = np.std(test_r2_scores)
    
    results[model_name] = {
        "Mean Test R2": mean_r2,
        "Standard Deviation of Test R2": std_dev_r2
    }
    
    print(f"\n--- {model_name} ---")
    print(f"Mean Test R2: {results[model_name]['Mean Test R2']}")
    print(f"Standard Deviation of Test R2: {results[model_name]['Standard Deviation of Test R2']}")


--- Linear Regression ---
Mean Test R2: 0.6875082023184321
Standard Deviation of Test R2: 0.018257196067445077

--- Polynomial Regression ---
Mean Test R2: 0.8244922235467511
Standard Deviation of Test R2: 0.017198904906456484

--- Decision Tree ---
Mean Test R2: 0.7803167245588302
Standard Deviation of Test R2: 0.020503712494689543

--- Random Forest ---
Mean Test R2: 0.879117440432801
Standard Deviation of Test R2: 0.010174340297516016

--- Support Vector Regression ---
Mean Test R2: 0.6684668333284747
Standard Deviation of Test R2: 0.02243539608859824

--- KNN Regression ---
Mean Test R2: 0.8282371037682331
Standard Deviation of Test R2: 0.015544817137465288

--- XGBoost Regression ---
Mean Test R2: 0.8662096634280386
Standard Deviation of Test R2: 0.010390493178880118


In [7]:
#Hyperparameter tuning 
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


In [8]:
rf = RandomForestRegressor(random_state=42)

In [11]:
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, scoring='r2', cv=3, n_jobs=-1)


grid_search_rf.fit(x_train, y_train)
best_rf_model = grid_search_rf.best_estimator_
best_rf_params = grid_search_rf.best_params_

In [13]:
y_pred_rf = best_rf_model.predict(x_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f"Random Forest -  R²: {r2_rf} , Mean Squared Error: {mse_rf}")

# Printing the best hyperparameters for both models
print("\n--- Best Hyperparameters ---")
print(f"Random Forest: {best_rf_params}")

Random Forest -  R²: 0.8751764171779072 , Mean Squared Error: 0.12372554338197045

--- Best Hyperparameters ---
Random Forest: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
