In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.impute import SimpleImputer

In [3]:
# Locations of the raw competition files.
TRAIN_CSV = "C:/Users/pc/Documents/My Palettes/train.csv"
TEST_CSV = "C:/Users/pc/Documents/My Palettes/test.csv"

# Load both files as-is; no parsing options are overridden.
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

In [5]:
# Preview the first 10 rows of the training data exactly as loaded from disk.
train_df.head(10)

Unnamed: 0,Item_ID,Store_ID,Item_Store_ID,Item_Weight,Item_Sugar_Content,Item_Visibility,Item_Type,Item_Price,Store_Start_Year,Store_Size,Store_Location_Type,Store_Type,Item_Store_Returns
0,DRA12,BABATUNJI010,DRA12_BABATUNJI010,11.6,Low Sugar,0.068535,Soft Drinks,357.54,2005,,Cluster 3,Grocery Store,709.08
1,DRA12,BABATUNJI013,DRA12_BABATUNJI013,11.6,Low Sugar,0.040912,Soft Drinks,355.79,1994,High,Cluster 3,Supermarket Type1,6381.69
2,DRA12,BABATUNJI017,DRA12_BABATUNJI017,11.6,Low Sugar,0.041178,Soft Drinks,350.79,2014,,Cluster 2,Supermarket Type1,6381.69
3,DRA12,BABATUNJI018,DRA12_BABATUNJI018,11.6,Low Sugar,0.041113,Soft Drinks,355.04,2016,Medium,Cluster 3,Supermarket Type2,2127.23
4,DRA12,BABATUNJI035,DRA12_BABATUNJI035,11.6,Ultra Low Sugar,0.0,Soft Drinks,354.79,2011,Small,Cluster 2,Supermarket Type1,2481.77
5,DRA12,BABATUNJI045,DRA12_BABATUNJI045,11.6,Low Sugar,0.0,Soft Drinks,354.04,2009,,Cluster 2,Supermarket Type1,9572.54
6,DRA24,BABATUNJI010,DRA24_BABATUNJI010,19.35,Normal Sugar,0.066832,Soft Drinks,409.72,2005,,Cluster 3,Grocery Store,818.93
7,DRA24,BABATUNJI013,DRA24_BABATUNJI013,19.35,Normal Sugar,0.039895,Soft Drinks,406.22,1994,High,Cluster 3,Supermarket Type1,11055.61
8,DRA24,BABATUNJI017,DRA24_BABATUNJI017,19.35,Normal Sugar,0.040154,Soft Drinks,411.72,2014,,Cluster 2,Supermarket Type1,2866.27
9,DRA24,BABATUNJI019,DRA24_BABATUNJI019,,Normal Sugar,0.069909,Soft Drinks,408.22,1992,Small,Cluster 1,Grocery Store,1228.4


In [7]:
# Preview the first 10 rows of the test data exactly as loaded from disk.
test_df.head(10)

Unnamed: 0,Item_ID,Store_ID,Item_Store_ID,Item_Weight,Item_Sugar_Content,Item_Visibility,Item_Type,Item_Price,Store_Start_Year,Store_Size,Store_Location_Type,Store_Type
0,DRA59,BABATUNJI010,DRA59_BABATUNJI010,8.27,Normal Sugar,0.214125,Soft Drinks,459.98,2005,,Cluster 3,Grocery Store
1,DRA59,BABATUNJI013,DRA59_BABATUNJI013,8.27,Normal Sugar,0.127821,Soft Drinks,464.98,1994,High,Cluster 3,Supermarket Type1
2,DRB01,BABATUNJI013,DRB01_BABATUNJI013,7.39,Low Sugar,0.082171,Soft Drinks,477.38,1994,High,Cluster 3,Supermarket Type1
3,DRB13,BABATUNJI010,DRB13_BABATUNJI010,6.115,Normal Sugar,0.011791,Soft Drinks,472.63,2005,,Cluster 3,Grocery Store
4,DRB13,BABATUNJI013,DRB13_BABATUNJI013,6.115,Normal Sugar,0.007038,Soft Drinks,473.13,1994,High,Cluster 3,Supermarket Type1
5,DRB25,BABATUNJI017,DRB25_BABATUNJI017,12.3,Low Sugar,0.069853,Soft Drinks,268.73,2014,,Cluster 2,Supermarket Type1
6,DRB25,BABATUNJI027,DRB25_BABATUNJI027,,Low Sugar,0.069123,Soft Drinks,265.23,1992,Medium,Cluster 3,Supermarket Type3
7,DRB25,BABATUNJI035,DRB25_BABATUNJI035,12.3,Low Sugar,0.069447,Soft Drinks,265.98,2011,Small,Cluster 2,Supermarket Type1
8,DRB48,BABATUNJI017,DRB48_BABATUNJI017,16.75,Normal Sugar,0.024994,Soft Drinks,93.71,2014,,Cluster 2,Supermarket Type1
9,DRB48,BABATUNJI027,DRB48_BABATUNJI027,,Normal Sugar,0.024733,Soft Drinks,100.71,1992,Medium,Cluster 3,Supermarket Type3


In [9]:
# Work on copies so the frames loaded from disk are never modified.
train, test = train_df.copy(), test_df.copy()

In [11]:
# Report (rows, columns) for each working frame.
for label, frame in (("Train shape:", train), ("Test shape:", test)):
    print(label, frame.shape)

Train shape: (4990, 13)
Test shape: (3532, 12)


In [13]:
# Handling missing values
#
# Each imputer learns its fill value from the training data only
# (fit_transform) and the same value is then applied to the test data
# (transform), so nothing is learned from the test set.
#
# SimpleImputer returns a 2-D array of shape (n_rows, 1); .ravel() flattens it
# so a plain 1-D column is assigned. Both columns are handled the same way.

# Item_Weight: numeric -> fill gaps with the training median.
imputer_weight = SimpleImputer(strategy='median')
train['Item_Weight'] = imputer_weight.fit_transform(train[['Item_Weight']]).ravel()
test['Item_Weight'] = imputer_weight.transform(test[['Item_Weight']]).ravel()

# Store_Size: categorical -> fill gaps with the most frequent training value.
imputer_store_size = SimpleImputer(strategy='most_frequent')
train['Store_Size'] = imputer_store_size.fit_transform(train[['Store_Size']]).ravel()
test['Store_Size'] = imputer_store_size.transform(test[['Store_Size']]).ravel()

In [15]:
# Feature engineering

REFERENCE_YEAR = 2025   # year that store ages are measured against
WEIGHT_EPSILON = 1e-3   # added to the weight so the price ratio never divides by zero


def add_engineered_features(df, reference_year=REFERENCE_YEAR, weight_eps=WEIGHT_EPSILON):
    """Return a new frame with the derived model features added.

    Added columns (appended in this order):
        Store_Age        -- ``reference_year - Store_Start_Year``
        Price_per_kg     -- ``Item_Price / (Item_Weight + weight_eps)``
        Visibility_Score -- ``Item_Visibility * Store_Age``

    ``Store_Start_Year`` is dropped because ``Store_Age`` replaces it.
    The input frame itself is not modified.
    """
    store_age = reference_year - df['Store_Start_Year']
    return (
        df.assign(
            Store_Age=store_age,
            Price_per_kg=df['Item_Price'] / (df['Item_Weight'] + weight_eps),
            # Interaction feature: how visible the item is and how old the store is.
            Visibility_Score=df['Item_Visibility'] * store_age,
        )
        .drop(columns=['Store_Start_Year'])
    )


# Apply the identical transformation to both frames so their feature columns
# cannot drift apart.
train = add_engineered_features(train)
test = add_engineered_features(test)

In [17]:
# Inspect the first 10 rows after imputation and feature engineering.
# A bare last expression uses the notebook's table rendering; print() would
# wrap a frame this wide into several hard-to-read text blocks.
train.head(10)

  Item_ID      Store_ID       Item_Store_ID  Item_Weight Item_Sugar_Content  \
0   DRA12  BABATUNJI010  DRA12_BABATUNJI010        11.60          Low Sugar   
1   DRA12  BABATUNJI013  DRA12_BABATUNJI013        11.60          Low Sugar   
2   DRA12  BABATUNJI017  DRA12_BABATUNJI017        11.60          Low Sugar   
3   DRA12  BABATUNJI018  DRA12_BABATUNJI018        11.60          Low Sugar   
4   DRA12  BABATUNJI035  DRA12_BABATUNJI035        11.60    Ultra Low Sugar   
5   DRA12  BABATUNJI045  DRA12_BABATUNJI045        11.60          Low Sugar   
6   DRA24  BABATUNJI010  DRA24_BABATUNJI010        19.35       Normal Sugar   
7   DRA24  BABATUNJI013  DRA24_BABATUNJI013        19.35       Normal Sugar   
8   DRA24  BABATUNJI017  DRA24_BABATUNJI017        19.35       Normal Sugar   
9   DRA24  BABATUNJI019  DRA24_BABATUNJI019        12.60       Normal Sugar   

   Item_Visibility    Item_Type  Item_Price Store_Size Store_Location_Type  \
0         0.068535  Soft Drinks      357.54     Medi

In [22]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.1/150.0 MB 326.8 kB/s eta 0:07:39
   ---------------------------------------- 0.1/150.0 MB 403.5 kB/s eta 0:06:12
   ---------------------------------------- 0.1/150.0 MB 403.5 kB/s eta 0:06:12
   ---------------------------------------- 0.1/150.0 MB 403.5 kB/s eta 0:06:12
   ---------------------------------------- 0.1/150.0 MB 256.0 kB/s eta 0:09:46
   ---------------------------------------- 0.1/150.0 MB 303.9 kB/s eta 0:08:13
   --------------------

In [21]:
# Encode the categorical columns as integer codes for the model.
categorical_cols = [
    'Item_Sugar_Content',
    'Item_Type',
    'Store_Size',
    'Store_Location_Type',
    'Store_Type'
]

from sklearn.preprocessing import LabelEncoder

# One fitted encoder per column, kept so the integer codes can be mapped back
# to their labels later via label_encoders[col].inverse_transform(...).
label_encoders = {}

for col in categorical_cols:
    # Columns that are already numeric were encoded by an earlier run of this
    # cell, so leave them alone. Testing for "not numeric" rather than
    # dtype == 'object' also catches columns held as `category` or pandas
    # string dtype, which would otherwise be skipped and stay unencoded.
    if pd.api.types.is_numeric_dtype(train[col]):
        continue

    # Fit on train and test together so a label that occurs only in the test
    # set still receives a code.
    all_values = pd.concat([train[col], test[col]])
    le = LabelEncoder()
    le.fit(all_values)
    label_encoders[col] = le

    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

In [23]:
# List the dtype of every column in the training frame.
print(train.dtypes)

Item_ID                 object
Store_ID                object
Item_Store_ID           object
Item_Weight            float64
Item_Sugar_Content       int32
Item_Visibility        float64
Item_Type                int32
Item_Price             float64
Store_Size               int32
Store_Location_Type      int32
Store_Type               int32
Item_Store_Returns     float64
Store_Age                int64
Price_per_kg           float64
Visibility_Score       float64
dtype: object


In [25]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Identifier columns are excluded from the features; the target is split off as y.
id_cols = ['Item_ID', 'Store_ID', 'Item_Store_ID']
target_col = 'Item_Store_Returns'
drop_cols = id_cols + [target_col]

X = train.drop(columns=drop_cols)
y = train[target_col]
X_test = test.drop(columns=id_cols)

# Hold out 20% of the training rows for validation (fixed seed for repeatability).
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the XGBoost regressor on the training split.
xgb_params = {
    'n_estimators': 200,
    'max_depth': 6,
    'learning_rate': 0.1,
    'random_state': 42,
}
xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)

# Score the held-out split with root mean squared error.
y_pred = xgb_model.predict(X_valid)
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
print("XGBoost RMSE:", rmse)

XGBoost RMSE: 3129.430980056639


In [27]:
# Predict on test set
test_preds = xgb_model.predict(X_test)

# Prepare submission
submission = pd.read_csv("SampleSubmission.csv")

# The predictions are in `test` row order and are written into the template by
# position. Verify that the template lists the same Item_Store_IDs in the same
# order; otherwise predictions would be silently attached to the wrong rows.
if not np.array_equal(
    submission['Item_Store_ID'].to_numpy(),
    test['Item_Store_ID'].to_numpy(),
):
    raise ValueError(
        "SampleSubmission.csv rows do not line up with the test set "
        "(Item_Store_ID differs); predictions cannot be assigned by position."
    )

submission['Item_Store_Returns'] = test_preds
submission.to_csv("xgb_submission.csv", index=False)

print("Submission saved: xgb_submission.csv ✅")

Submission saved: xgb_submission.csv ✅


In [31]:
# Re-load the submission from the same relative path it was written to, so
# this check inspects the file produced by this notebook rather than a copy in
# a hard-coded user directory.
submission_saved = pd.read_csv("xgb_submission.csv")
submission_saved

Unnamed: 0,Item_Store_ID,Item_Store_Returns
0,DRA59_BABATUNJI010,1156.41500
1,DRA59_BABATUNJI013,8205.27000
2,DRB01_BABATUNJI013,9084.07900
3,DRB13_BABATUNJI010,434.66913
4,DRB13_BABATUNJI013,5660.27830
...,...,...
3527,NCZ42_BABATUNJI010,1421.18140
3528,NCZ42_BABATUNJI013,7533.61800
3529,NCZ42_BABATUNJI049,8591.56700
3530,NCZ53_BABATUNJI010,3099.00420
