In [3]:
# 1. Imports
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [5]:
# 2. Load Data
# Single place to edit when the CSV files live somewhere else; the three reads
# below resolve to exactly the same paths as the previous hard-coded literals.
DATA_DIR = "C:/Users/pc/Documents/My Palettes"

train_raw = pd.read_csv(f"{DATA_DIR}/train.csv")
test_raw = pd.read_csv(f"{DATA_DIR}/test.csv")
submission = pd.read_csv(f"{DATA_DIR}/SampleSubmission.csv")

In [7]:
# 3. Copy Data
# Work on copies so the frames loaded from disk stay untouched.
train, test = train_raw.copy(), test_raw.copy()

In [9]:
# Preview the first ten rows of the training frame
train.head(n=10)

Unnamed: 0,Item_ID,Store_ID,Item_Store_ID,Item_Weight,Item_Sugar_Content,Item_Visibility,Item_Type,Item_Price,Store_Start_Year,Store_Size,Store_Location_Type,Store_Type,Item_Store_Returns
0,DRA12,BABATUNJI010,DRA12_BABATUNJI010,11.6,Low Sugar,0.068535,Soft Drinks,357.54,2005,,Cluster 3,Grocery Store,709.08
1,DRA12,BABATUNJI013,DRA12_BABATUNJI013,11.6,Low Sugar,0.040912,Soft Drinks,355.79,1994,High,Cluster 3,Supermarket Type1,6381.69
2,DRA12,BABATUNJI017,DRA12_BABATUNJI017,11.6,Low Sugar,0.041178,Soft Drinks,350.79,2014,,Cluster 2,Supermarket Type1,6381.69
3,DRA12,BABATUNJI018,DRA12_BABATUNJI018,11.6,Low Sugar,0.041113,Soft Drinks,355.04,2016,Medium,Cluster 3,Supermarket Type2,2127.23
4,DRA12,BABATUNJI035,DRA12_BABATUNJI035,11.6,Ultra Low Sugar,0.0,Soft Drinks,354.79,2011,Small,Cluster 2,Supermarket Type1,2481.77
5,DRA12,BABATUNJI045,DRA12_BABATUNJI045,11.6,Low Sugar,0.0,Soft Drinks,354.04,2009,,Cluster 2,Supermarket Type1,9572.54
6,DRA24,BABATUNJI010,DRA24_BABATUNJI010,19.35,Normal Sugar,0.066832,Soft Drinks,409.72,2005,,Cluster 3,Grocery Store,818.93
7,DRA24,BABATUNJI013,DRA24_BABATUNJI013,19.35,Normal Sugar,0.039895,Soft Drinks,406.22,1994,High,Cluster 3,Supermarket Type1,11055.61
8,DRA24,BABATUNJI017,DRA24_BABATUNJI017,19.35,Normal Sugar,0.040154,Soft Drinks,411.72,2014,,Cluster 2,Supermarket Type1,2866.27
9,DRA24,BABATUNJI019,DRA24_BABATUNJI019,,Normal Sugar,0.069909,Soft Drinks,408.22,1992,Small,Cluster 1,Grocery Store,1228.4


In [11]:
# Preview the first ten rows of the test frame
test.head(n=10)

Unnamed: 0,Item_ID,Store_ID,Item_Store_ID,Item_Weight,Item_Sugar_Content,Item_Visibility,Item_Type,Item_Price,Store_Start_Year,Store_Size,Store_Location_Type,Store_Type
0,DRA59,BABATUNJI010,DRA59_BABATUNJI010,8.27,Normal Sugar,0.214125,Soft Drinks,459.98,2005,,Cluster 3,Grocery Store
1,DRA59,BABATUNJI013,DRA59_BABATUNJI013,8.27,Normal Sugar,0.127821,Soft Drinks,464.98,1994,High,Cluster 3,Supermarket Type1
2,DRB01,BABATUNJI013,DRB01_BABATUNJI013,7.39,Low Sugar,0.082171,Soft Drinks,477.38,1994,High,Cluster 3,Supermarket Type1
3,DRB13,BABATUNJI010,DRB13_BABATUNJI010,6.115,Normal Sugar,0.011791,Soft Drinks,472.63,2005,,Cluster 3,Grocery Store
4,DRB13,BABATUNJI013,DRB13_BABATUNJI013,6.115,Normal Sugar,0.007038,Soft Drinks,473.13,1994,High,Cluster 3,Supermarket Type1
5,DRB25,BABATUNJI017,DRB25_BABATUNJI017,12.3,Low Sugar,0.069853,Soft Drinks,268.73,2014,,Cluster 2,Supermarket Type1
6,DRB25,BABATUNJI027,DRB25_BABATUNJI027,,Low Sugar,0.069123,Soft Drinks,265.23,1992,Medium,Cluster 3,Supermarket Type3
7,DRB25,BABATUNJI035,DRB25_BABATUNJI035,12.3,Low Sugar,0.069447,Soft Drinks,265.98,2011,Small,Cluster 2,Supermarket Type1
8,DRB48,BABATUNJI017,DRB48_BABATUNJI017,16.75,Normal Sugar,0.024994,Soft Drinks,93.71,2014,,Cluster 2,Supermarket Type1
9,DRB48,BABATUNJI027,DRB48_BABATUNJI027,,Normal Sugar,0.024733,Soft Drinks,100.71,1992,Medium,Cluster 3,Supermarket Type3


In [13]:
# Handling missing values and feature engineering
# NOTE: this cell consumes `Store_Start_Year`, so running it a second time
# without first re-creating `train`/`test` raises a KeyError.

# Year that store age is measured against (previously an inline literal).
REFERENCE_YEAR = 2025

# Impute Item_Weight (median): the statistic is learned from train only and
# then reused for test.
imputer_weight = SimpleImputer(strategy='median')
train['Item_Weight'] = imputer_weight.fit_transform(train[['Item_Weight']]).ravel()
test['Item_Weight'] = imputer_weight.transform(test[['Item_Weight']]).ravel()

# Impute Store_Size (most frequent): again fit on train, applied to both.
imputer_store_size = SimpleImputer(strategy='most_frequent')
train['Store_Size'] = imputer_store_size.fit_transform(train[['Store_Size']]).ravel()
test['Store_Size'] = imputer_store_size.transform(test[['Store_Size']]).ravel()

# Create Store_Age and retire Store_Start_Year. assign/drop return new frames,
# so nothing is mutated in place; the resulting column order is unchanged
# (Store_Age is appended last, Store_Start_Year is removed).
train = (
    train
    .assign(Store_Age=REFERENCE_YEAR - train['Store_Start_Year'])
    .drop(columns=['Store_Start_Year'])
)
test = (
    test
    .assign(Store_Age=REFERENCE_YEAR - test['Store_Start_Year'])
    .drop(columns=['Store_Start_Year'])
)

In [15]:
# Encode categorical features

cat_cols = ['Item_Sugar_Content', 'Item_Type', 'Store_Size', 'Store_Location_Type', 'Store_Type']

# One fitted encoder per column. A single shared encoder is overwritten on
# every iteration, which makes it impossible to map the integer codes of the
# earlier columns back to their labels afterwards.
label_encoders = {}

for col in cat_cols:
    # Fit on train and test together so a label that appears in only one of
    # the two frames still receives a code.
    all_vals = pd.concat([train[col], test[col]])
    le = LabelEncoder()
    le.fit(all_vals)
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])
    label_encoders[col] = le

# After the loop `le` is the encoder of the last column, as it was before.

In [17]:
def _add_price_and_visibility_features(frame):
    """Append the four engineered columns to `frame` in place.

    Columns added, in this order: Price_per_kg, Item_Price_Diff,
    Weight_x_Visibility, Store_Visibility. The per-item mean price used for
    Item_Price_Diff is computed from the rows of `frame` itself.
    """
    price = frame['Item_Price']
    weight = frame['Item_Weight']
    visibility = frame['Item_Visibility']

    # Price divided by weight; the small offset keeps the divisor away from zero
    frame['Price_per_kg'] = price / (weight + 1e-3)

    # Price deviation from the mean price of the same Item_ID
    item_mean_price = frame.groupby('Item_ID')['Item_Price'].transform('mean')
    frame['Item_Price_Diff'] = price - item_mean_price

    # Interaction features
    frame['Weight_x_Visibility'] = weight * visibility
    frame['Store_Visibility'] = frame['Store_Age'] * visibility


_add_price_and_visibility_features(train)
_add_price_and_visibility_features(test)

In [19]:
# Define features and target

id_cols = ['Item_ID', 'Store_ID', 'Item_Store_ID']
target_col = 'Item_Store_Returns'
drop_cols = id_cols + [target_col]

y = train[target_col]
X = train.drop(columns=drop_cols)
# The test frame has no target column, so only the identifiers are removed.
X_test = test.drop(columns=id_cols)

# Train/validation split: 20% of the rows are held out, with a fixed seed
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [21]:
# Log transform target: log(1 + y) of the training target only
y_train_log = y_train.pipe(np.log1p)

In [23]:
# Train models

# XGBoost — fitted on the log-transformed target
xgb_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train_log)

# Random Forest — fitted on the untransformed target
rf_params = dict(n_estimators=200, max_depth=12, random_state=42)
rf = RandomForestRegressor(**rf_params)
rf.fit(X_train, y_train)

# Gradient Boosting — fitted on the untransformed target
gb_params = dict(n_estimators=200, learning_rate=0.07, max_depth=6, random_state=42)
gb = GradientBoostingRegressor(**gb_params)
gb.fit(X_train, y_train)

In [31]:
# Validate using ensemble

# Predict on the hold-out rows
pred_xgb = np.expm1(xgb_model.predict(X_valid))  # reverse the log1p applied to the XGBoost target
pred_rf = rf.predict(X_valid)
pred_gb = gb.predict(X_valid)

# Ensemble (unweighted average of the three models)
ensemble_valid = (pred_xgb + pred_rf + pred_gb) / 3

# RMSE. The `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the square root explicitly;
# this gives the same number on every version.
rmse = float(np.sqrt(mean_squared_error(y_valid, ensemble_valid)))
print("Validation RMSE (Ensemble):", rmse)

Validation RMSE (Ensemble): 3049.4935746626675




In [33]:
# Predict on test set
# XGBoost output is passed through expm1; the other two models are used as-is.
pred_test_xgb = np.expm1(xgb_model.predict(X_test))
pred_test_rf, pred_test_gb = rf.predict(X_test), gb.predict(X_test)

# Final ensemble prediction: unweighted mean of the three prediction vectors
summed_test_preds = pred_test_xgb + pred_test_rf + pred_test_gb
final_preds = summed_test_preds / 3

In [35]:
# Prepare submission
# Predictions are attached by row position, then the frame is written without its index.
output_file = "submission_boosted.csv"
submission = submission.assign(Item_Store_Returns=final_preds)
submission.to_csv(output_file, index=False)
print("Submission file saved as: submission_boosted.csv ✅")

Submission file saved as: submission_boosted.csv ✅


In [39]:
# Read back the file written by `submission.to_csv("submission_boosted.csv", ...)`.
# That write uses a path relative to the working directory, so the read must
# use the same relative path; an absolute directory here could silently load
# a stale file (or fail) when the notebook runs from a different folder.
submission_boosted = pd.read_csv("submission_boosted.csv")
submission_boosted

Unnamed: 0,Item_Store_ID,Item_Store_Returns
0,DRA59_BABATUNJI010,1322.441417
1,DRA59_BABATUNJI013,9690.180912
2,DRB01_BABATUNJI013,8359.966146
3,DRB13_BABATUNJI010,343.582497
4,DRB13_BABATUNJI013,6296.399747
...,...,...
3527,NCZ42_BABATUNJI010,1050.256038
3528,NCZ42_BABATUNJI013,9052.215392
3529,NCZ42_BABATUNJI049,7708.501123
3530,NCZ53_BABATUNJI010,1199.524414
