##                                                   BigMart Sales Prediction

In [1]:
##########################
# IMPORTING LIBRARIES
##########################

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score, mean_squared_error, root_mean_squared_error

from sklearn.preprocessing import LabelEncoder

In [2]:
# Supress Warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
df= pd.read_csv("Files/model_dataset.csv")

In [4]:
df.head()

Unnamed: 0,Outlet_Identifier,Outlet_Age,Item_Weight,Item_Visibility,Item_MRP,Item_Visibility_Log,Item_Outlet_Sales,Outlet_Size_High,Outlet_Size_Medium,Outlet_Size_Small,...,Outlet_Location_Type_Tier 3,Weight_Bucket_Low,Weight_Bucket_Mode,Weight_Bucket_High,Item_Category_Drinks,Item_Category_Food,Item_Category_Non-Consumables,Visibility_Bucket_Low,Visibility_Bucket_Medium,Visibility_Bucket_High
0,9,14,9.3,0.016047,249.8092,-3.612308,3735.138,0,1,0,...,0,1,0,0,0,1,0,1,0,0
1,3,4,5.92,0.019278,48.2692,-3.612308,443.4228,0,1,0,...,1,1,0,0,1,0,0,1,0,0
2,9,14,17.5,0.01676,141.618,-3.612308,2097.27,0,1,0,...,0,0,0,1,0,1,0,1,0,0
3,0,15,19.2,0.0,182.095,-3.612308,732.38,0,0,1,...,1,0,0,1,0,1,0,0,0,0
4,1,26,8.93,0.0,53.8614,-3.612308,994.7052,1,0,0,...,1,1,0,0,0,0,1,0,0,0


In [5]:
df.shape

(8523, 32)

In [11]:
final_cat_features= ['Item_Fat_Content', 'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type',
                     'Outlet_Type', 'Item_Category', 'Outlet_Age', 'MRP_Bucket', 'Weight_Bucket', 'Visibility_Bucket'
                    ]
final_num_features= ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Item_Visibility_Log']

### Model Experimentations

In [14]:
X= df.drop(columns= 'Item_Outlet_Sales')
y= df['Item_Outlet_Sales']

X_train, X_test, y_train, y_test= train_test_split(X,y, test_size= 0.3, random_state= 42)
X_train.shape, y_train.shape

((5966, 31), (5966,))

In [16]:
def ModelResults(model, params= {}):
    m= model(**params)
    m.fit(X_train, y_train)
    pred_train= m.predict(X_train)
    pred_test= m.predict(X_test)
    
    r2_train= r2_score(y_train, pred_train)
    rmse_train= root_mean_squared_error(y_train, pred_train)
    
    r2_test= r2_score(y_test, pred_test)
    rmse_test= root_mean_squared_error(y_test, pred_test)
    
    df= pd.DataFrame([{
        'model': model.__name__, 
        'R2 Train': r2_train,
        'RMSE Train': rmse_train,
        'R2 Test': r2_test,
        'RMSE Test': rmse_test 
    }])
    return df

In [18]:
ModelResults(XGBRegressor)

Unnamed: 0,model,R2 Train,RMSE Train,R2 Test,RMSE Test
0,XGBRegressor,0.87381,610.901093,0.521718,1157.438658


In [20]:
df_res= pd.DataFrame()
for model in [Lasso, RandomForestRegressor, XGBRegressor, LGBMRegressor, CatBoostRegressor]:
    df_model= ModelResults(model)
    df_res= pd.concat([df_res, df_model], axis= 0)

df_res

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000259 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1061
[LightGBM] [Info] Number of data points in the train set: 5966, number of used features: 31
[LightGBM] [Info] Start training from score 2204.882521
Learning rate set to 0.054292
0:	learn: 1666.3060931	total: 57.5ms	remaining: 57.5s
1:	learn: 1620.1407865	total: 58.4ms	remaining: 29.1s
2:	learn: 1574.8110340	total: 59.3ms	remaining: 19.7s
3:	learn: 1535.5028964	total: 60.3ms	remaining: 15s
4:	learn: 1496.9298788	total: 61.2ms	remaining: 12.2s
5:	learn: 1461.8318754	total: 62.1ms	remaining: 10.3s
6:	learn: 1428.8258300	total: 63ms	remaining: 8.94s
7:	learn: 1398.3091984	total: 63.9ms	remaining: 7.92s
8:	learn: 1370.4589078	total: 64.8ms	remaining: 7.14s
9:	learn: 1346.1665186	total: 65.7ms	remaining: 6.51s
10:	learn: 1322.1473712	total: 66.7ms	remaining: 6s
11:	learn: 1300.6959144	total: 67.6ms	

Unnamed: 0,model,R2 Train,RMSE Train,R2 Test,RMSE Test
0,Lasso,0.560107,1140.593412,0.568279,1099.657874
0,RandomForestRegressor,0.936489,433.392356,0.559261,1111.083521
0,XGBRegressor,0.87381,610.901093,0.521718,1157.438658
0,LGBMRegressor,0.732057,890.182586,0.57395,1092.411074
0,CatBoostRegressor,0.767366,829.457153,0.583086,1080.636097


In [21]:
#df_imp= pd.DataFrame({'features': X.columns, 'importance': (xgb.feature_importances_)*100})
#df_imp.sort_values('importance', ascending= False)

#### Hyper Parameter Tuning for CatBoost Model

In [23]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

X_train_tune, X_val_tune, y_train_tune, y_val_tune=  train_test_split(X, y, test_size= 0.3, random_state= 32)

# Define the objective function
def objective(params):
    model = CatBoostRegressor(
        random_state = 42,
        #iterations= int(params['iterations']),
        iterations= 500,
        depth= int(params['depth']),
        border_count= int(params['border_count']),
    
    # Float parameters
    learning_rate= params['learning_rate'],
    l2_leaf_reg= params['l2_leaf_reg'],
    bagging_temperature= params['bagging_temperature'],
    random_strength= params['random_strength'],
    subsample= params['subsample']
    )
    
    model.fit(X_train_tune, y_train_tune)
    y_pred = model.predict(X_val_tune)
    mse = mean_squared_error(y_val_tune, y_pred)
    return {'loss': mse, 'status': STATUS_OK}

# Define the hyperparameter space
space = {
    # Integer parameters
    #'iterations': hp.quniform('iterations', 500, 2000, 50),
    'depth': hp.quniform('depth', 4, 10, 1),
    'border_count': hp.quniform('border_count', 32, 255, 1),
    
    # Float parameters
    'learning_rate': hp.loguniform('learning_rate', -5, 0),  # ~0.006 to 1
    'l2_leaf_reg': hp.loguniform('l2_leaf_reg', -5, 2),
    'bagging_temperature': hp.uniform('bagging_temperature', 0, 1),
    'random_strength': hp.uniform('random_strength', 0, 1),
    'subsample': hp.uniform('subsample', 0.6, 1.0)
}

# Run the hyperparameter optimization
trials = Trials()
best = fmin(fn= objective, space= space, algo=tpe.suggest, max_evals= 100, trials= trials, verbose= False)


0:	learn: 1695.8947381	total: 699us	remaining: 349ms
1:	learn: 1687.1204212	total: 1.39ms	remaining: 346ms
2:	learn: 1679.0316847	total: 1.96ms	remaining: 326ms
3:	learn: 1671.0286533	total: 2.56ms	remaining: 317ms
4:	learn: 1663.0851262	total: 3.13ms	remaining: 310ms
5:	learn: 1654.8223917	total: 3.72ms	remaining: 307ms
6:	learn: 1646.4538721	total: 4.29ms	remaining: 302ms
7:	learn: 1638.8128815	total: 4.91ms	remaining: 302ms
8:	learn: 1630.9924206	total: 5.49ms	remaining: 299ms
9:	learn: 1622.9053333	total: 6.06ms	remaining: 297ms
10:	learn: 1615.2634676	total: 6.75ms	remaining: 300ms
11:	learn: 1607.8208668	total: 7.39ms	remaining: 300ms
12:	learn: 1600.0850205	total: 7.97ms	remaining: 299ms
13:	learn: 1592.8909290	total: 8.59ms	remaining: 298ms
14:	learn: 1585.7598804	total: 9.2ms	remaining: 297ms
15:	learn: 1578.5219958	total: 9.8ms	remaining: 296ms
16:	learn: 1571.8878852	total: 10.4ms	remaining: 296ms
17:	learn: 1564.8232543	total: 11.1ms	remaining: 296ms
18:	learn: 1557.9809340

In [24]:
print(best)

{'bagging_temperature': 0.48722103623294954, 'border_count': 33.0, 'depth': 4.0, 'l2_leaf_reg': 0.10935304065248312, 'learning_rate': 0.012568872787051584, 'random_strength': 0.14884817645172904, 'subsample': 0.8999111848006902}


In [25]:
X_train

Unnamed: 0,Outlet_Identifier,Outlet_Age,Item_Weight,Item_Visibility,Item_MRP,Item_Visibility_Log,Outlet_Size_High,Outlet_Size_Medium,Outlet_Size_Small,Item_Fat_Content_Low Fat,...,Outlet_Location_Type_Tier 3,Weight_Bucket_Low,Weight_Bucket_Mode,Weight_Bucket_High,Item_Category_Drinks,Item_Category_Food,Item_Category_Non-Consumables,Visibility_Bucket_Low,Visibility_Bucket_Medium,Visibility_Bucket_High
1921,0,15,7.720000,0.147904,116.6466,-1.911190,0,0,1,0,...,1,1,0,0,0,1,0,0,0,1
655,8,16,19.750000,0.014301,103.0332,-3.612308,0,0,1,1,...,0,0,0,1,0,0,1,1,0,0
2229,5,28,12.857645,0.054221,129.1310,-2.914694,0,1,0,0,...,1,0,1,0,0,1,0,0,1,0
2537,7,11,19.350000,0.118342,222.6088,-2.134174,0,0,1,1,...,0,0,0,1,0,1,0,0,0,1
3528,1,26,18.350000,0.092150,184.8266,-2.384338,1,0,0,0,...,1,0,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,0,15,9.395000,0.286345,139.1838,-1.250558,0,0,1,0,...,1,1,0,0,0,1,0,0,0,1
5191,2,6,15.600000,0.117575,75.6670,-2.140683,0,0,1,1,...,0,0,0,1,0,1,0,0,0,1
5390,7,11,17.600000,0.018944,237.3590,-3.612308,0,0,1,1,...,0,0,0,1,0,0,1,1,0,0
860,2,6,20.350000,0.054363,117.9466,-2.912077,0,0,1,1,...,0,0,0,1,0,1,0,0,1,0


In [26]:
#best['iterations'] = int(best['iterations'])
best['depth'] = int(best['depth'])
best['border_count'] = int(best['border_count'])

# ---------------------------
# 5. Train Final Model with Best Params
# ---------------------------
best_model = CatBoostRegressor(**best, random_state=42, iterations= 500)
best_model.fit(X_train, y_train)

# Final Evaluation
y_pred = best_model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred)
print(f"Final RMSE after Optuna tuning: {rmse:.2f}")

print("R2 Score Train: ", r2_score(y_train, best_model.predict(X_train)))
print("R2 Score: ", r2_score(y_test, y_pred))

0:	learn: 1707.8638015	total: 504us	remaining: 252ms
1:	learn: 1696.2244853	total: 1.1ms	remaining: 273ms
2:	learn: 1684.7983612	total: 2.1ms	remaining: 349ms
3:	learn: 1673.5835151	total: 2.58ms	remaining: 320ms
4:	learn: 1662.5758041	total: 3.14ms	remaining: 311ms
5:	learn: 1651.6932291	total: 3.65ms	remaining: 300ms
6:	learn: 1641.0899198	total: 4.12ms	remaining: 290ms
7:	learn: 1630.5738021	total: 4.61ms	remaining: 284ms
8:	learn: 1620.2708333	total: 5.1ms	remaining: 278ms
9:	learn: 1610.1540634	total: 5.59ms	remaining: 274ms
10:	learn: 1600.2996663	total: 6.21ms	remaining: 276ms
11:	learn: 1590.6324976	total: 6.85ms	remaining: 279ms
12:	learn: 1581.0469029	total: 7.38ms	remaining: 276ms
13:	learn: 1571.6377517	total: 7.86ms	remaining: 273ms
14:	learn: 1562.3543097	total: 8.34ms	remaining: 270ms
15:	learn: 1553.2302241	total: 8.82ms	remaining: 267ms
16:	learn: 1544.4562971	total: 9.26ms	remaining: 263ms
17:	learn: 1535.7449027	total: 9.71ms	remaining: 260ms
18:	learn: 1527.0080432	

### Test Predictions

In [28]:
df_test= pd.read_csv("Files/test_AbJTz2l.csv")

In [None]:
df_test.head()

In [None]:
df_test.shape

In [None]:
df_test.isna().sum()

#### Test data preprocessing and Feature Engg.

In [None]:
def Preprocessing(dataset):
    dataset['Item_Weight'].fillna(dataset['Item_Weight'].mean(), inplace= True)
    dataset['Outlet_Size'] = dataset.groupby('Outlet_Type')['Outlet_Size'].transform(lambda x: x.fillna(x.mode()[0]))
    dataset['Item_Fat_Content'].replace({'LF': 'Low Fat', 'reg': 'Regular', 'low fat': 'Low Fat'}, inplace= True)
    return dataset

# Define mapping
food_items = ['Dairy', 'Meat', 'Fruits and Vegetables', 'Snack Foods',
              'Breads', 'Breakfast', 'Frozen Foods', 'Canned', 'Starchy Foods']
drink_items = ['Soft Drinks', 'Hard Drinks']
non_consumables = ['Household', 'Health and Hygiene', 'Others']

def categorize_item(item):
    if item in food_items:
        return 'Food'
    elif item in drink_items:
        return 'Drinks'
    else:
        return 'Non-Consumables'

def FeatureEng(dataset):
    dataset['Item_Category'] = dataset['Item_Type'].apply(categorize_item)
    dataset['Outlet_Age']= 2013- dataset['Outlet_Establishment_Year']
    dataset['MRP_Bucket'] = pd.cut(dataset['Item_MRP'], 
                          bins=[0, 65, 135, 200, 270], 
                          labels=['Low', 'Medium', 'High', 'Very_High'])

    dataset['Weight_Bucket']= pd.cut(dataset['Item_Weight'], 
                          bins=[0, 12, 13, 22], 
                          labels=['Low', 'Mode', 'High'])

    dataset['Visibility_Bucket']= pd.cut(dataset['Item_Visibility'], 
                          bins=[0, 0.03, 0.08, 0.4], 
                          labels=['Low', 'Medium', 'High'])

    dataset['Item_Visibility_Log']= np.log(dataset['Item_Visibility'])
    dataset['Item_Visibility_Log']= np.where(dataset['Item_Visibility_Log']< np.quantile(dataset['Item_Visibility_Log'], 0.25), 
                                    np.quantile(dataset['Item_Visibility_Log'], 0.25), dataset['Item_Visibility_Log'])

    return dataset
    

df_test= Preprocessing(df_test)
df_test= FeatureEng(df_test)

In [None]:
df_test_final= df_test[final_cat_features+ final_num_features]

In [None]:
df_test_final.columns

In [None]:
label= LabelEncoder()
df_test_final['Outlet_Identifier']= label.fit_transform(df_test_final['Outlet_Identifier'])

df_test_final= pd.get_dummies(df_test_final, columns= list(set(final_cat_features)- set(['Outlet_Identifier', 'Outlet_Age'])), dtype= int)

In [None]:
print(best)

#### Final Modelling and Submission File

In [None]:
final_model = CatBoostRegressor(**best, random_state= 42, iterations= 100)
final_model.fit(X, y)

In [None]:
final_pred= final_model.predict(df_test_final[X.columns])

In [None]:
df_sub= pd.read_csv("Files/sample_submission_8RXa3c6.csv")

In [None]:
df_sub

In [None]:
df_sub['Item_Outlet_Sales']= final_pred.copy()

In [None]:
df_sub.to_csv("Submissions/submission_9.csv", index= False)