In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
def clean_df(df : pd.DataFrame, reference_year: int = 2013) -> pd.DataFrame:
  """Return a cleaned copy of the sales frame with engineered columns added.

  The input frame is never modified, so calling this function repeatedly on
  the same raw frame always gives the same result.

  Transformations applied to the copy:
    * ``Item_Fat_Content``: the spellings 'LF' and 'low fat' become
      'Low Fat', and 'reg' becomes 'Regular'.
    * ``Outlet_Age``: ``reference_year - Outlet_Establishment_Year``.
    * ``Item_Type_Category``: first two characters of ``Item_Identifier``.
    * ``Item_Visibility``: values equal to 0 are replaced with NaN.

  Args:
    df: Frame containing at least the columns ``Item_Fat_Content``,
      ``Outlet_Establishment_Year``, ``Item_Identifier`` and
      ``Item_Visibility``.
    reference_year: Year that outlet ages are measured against.
      Defaults to 2013, the value previously hard-coded here.

  Returns:
    A new DataFrame with the cleaned and derived columns.

  Raises:
    KeyError: If one of the required columns is missing.
  """
  # Work on a copy so the caller's frame is left exactly as it was passed in.
  cleaned = df.copy()

  cleaned['Item_Fat_Content'] = cleaned['Item_Fat_Content'].replace({
      'LF': 'Low Fat', 'reg': 'Regular', 'low fat': 'Low Fat'
      })

  cleaned['Outlet_Age'] = reference_year - cleaned['Outlet_Establishment_Year']

  # Vectorised slice: unlike a Python-level lambda it passes missing
  # identifiers through as missing instead of raising a TypeError.
  cleaned['Item_Type_Category'] = cleaned['Item_Identifier'].str[:2]

  # A visibility of exactly 0 is treated as "not recorded" rather than a value.
  cleaned['Item_Visibility'] = cleaned['Item_Visibility'].replace(0, np.nan)

  return cleaned

In [3]:
# Read the raw training file and run it through the cleaning step.
df = clean_df(pd.read_csv('/content/Train.csv'))

# Predictors are every column except the target; the target itself is
# modelled on the log1p scale.
X = df.drop(columns='Item_Outlet_Sales')
y = np.log1p(df['Item_Outlet_Sales'])

# Hold out 20% of the rows for evaluation, with a fixed seed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

df

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Outlet_Age,Item_Type_Category
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380,14,FD
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,4,DR
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700,14,FD
3,FDX07,19.200,Regular,,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.3800,15,FD
4,NCD19,8.930,Low Fat,,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,26,NC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834,26,FD
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,,Tier 2,Supermarket Type1,549.2850,11,FD
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136,9,NC
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976,4,FD


In [4]:
# Object-dtype columns get an explicit "Missing" level; whatever is still
# missing afterwards is filled with the sentinel -999.
cat_cols = X_train.select_dtypes(include=['object']).columns
cat_fill_values = {col: "Missing" for col in cat_cols}

# Rebind to new frames instead of assigning into / filling in place on the
# frames returned by train_test_split: in-place edits on those can trigger
# pandas' SettingWithCopyWarning, while the resulting values are the same.
X_train = X_train.fillna(cat_fill_values).fillna(-999)
X_test = X_test.fillna(cat_fill_values).fillna(-999)

In [5]:
!pip install catboost



In [7]:
from catboost import CatBoostRegressor

# Positional indices of the object-dtype columns, passed to CatBoost as its
# categorical features.
categorical_features_indices = np.flatnonzero(X.dtypes == object)

model = CatBoostRegressor(
    loss_function='RMSE',
    iterations=1500,
    depth=6,
    learning_rate=0.05,
    cat_features=categorical_features_indices,
    random_seed=42,
    verbose=200,  # log the training loss every 200 iterations
)
model.fit(X_train, y_train)

# The target was transformed with log1p, so both the held-out targets and the
# predictions are mapped back with expm1 before the error is computed.
y_pred = model.predict(X_test)
rmse = np.sqrt(
    mean_squared_error(np.expm1(y_test), np.expm1(y_pred))
)

print(f"RMSE: {rmse}")

0:	learn: 0.9785046	total: 47.9ms	remaining: 1m 11s
200:	learn: 0.5058901	total: 6.15s	remaining: 39.8s
400:	learn: 0.4922524	total: 11.9s	remaining: 32.7s
600:	learn: 0.4800450	total: 14s	remaining: 21s
800:	learn: 0.4668637	total: 16.2s	remaining: 14.1s
1000:	learn: 0.4560310	total: 18.4s	remaining: 9.15s
1200:	learn: 0.4452235	total: 20.5s	remaining: 5.11s
1400:	learn: 0.4349100	total: 23.8s	remaining: 1.68s
1499:	learn: 0.4300826	total: 24.9s	remaining: 0us
RMSE: 1038.7855577622722
