<a href="https://colab.research.google.com/github/alaaguedda/python-Colab-Trainer/blob/main/restaurant_inventory_forecasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# 1) Install libs (run this cell)
!pip install --quiet datasets xgboost scikit-learn pandas pyarrow

# 2) Imports
from datasets import load_dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

# 3) Fetch only the "train" split of FreshRetailNet-50K from the Hugging Face Hub
#    (the first run downloads a parquet file of roughly 100 MB).
ds = load_dataset(
    path="Dingdong-Inc/FreshRetailNet-50K",
    split="train",
)

# 4) Inspect the schema: column names first, then the typed feature spec.
print("columns:", ds.column_names)
print(ds.features)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.parquet:   0%|          | 0.00/106M [00:00<?, ?B/s]

eval.parquet:   0%|          | 0.00/8.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4500000 [00:00<?, ? examples/s]

Generating eval split:   0%|          | 0/350000 [00:00<?, ? examples/s]

columns: ['city_id', 'store_id', 'management_group_id', 'first_category_id', 'second_category_id', 'third_category_id', 'product_id', 'dt', 'sale_amount', 'hours_sale', 'stock_hour6_22_cnt', 'hours_stock_status', 'discount', 'holiday_flag', 'activity_flag', 'precpt', 'avg_temperature', 'avg_humidity', 'avg_wind_level']
{'city_id': Value('int64'), 'store_id': Value('int64'), 'management_group_id': Value('int64'), 'first_category_id': Value('int64'), 'second_category_id': Value('int64'), 'third_category_id': Value('int64'), 'product_id': Value('int64'), 'dt': Value('string'), 'sale_amount': Value('float64'), 'hours_sale': List(Value('float64')), 'stock_hour6_22_cnt': Value('int32'), 'hours_stock_status': List(Value('int64')), 'discount': Value('float64'), 'holiday_flag': Value('int32'), 'activity_flag': Value('int32'), 'precpt': Value('float64'), 'avg_temperature': Value('float64'), 'avg_humidity': Value('float64'), 'avg_wind_level': Value('float64')}


In [5]:
# 5) List the distinct product ids available in the dataset
prod_ids = ds.unique("product_id")
print("num unique products:", len(prod_ids))
print("first 20 product_ids:", prod_ids[:20])

# 6) Choose 3 product_ids to act as proxies (you'll pick ones that resemble meat/chicken/bread)
chosen = prod_ids[:3]   # <-- change indexes after inspection
print("chosen product_ids:", chosen)

# 7) Filter dataset to those products (keeps data small)
# The predicate runs in batches and receives only the `product_id` column, so the
# list-valued columns are not materialised for every one of the rows just to be
# discarded. A set gives O(1) membership checks. The selected rows are the same
# as with a per-row `ex["product_id"] in chosen` test.
chosen_ids = set(chosen)
small = ds.filter(
    lambda product_ids: [pid in chosen_ids for pid in product_ids],
    input_columns=["product_id"],
    batched=True,
)   # returns a Dataset
df = small.to_pandas()

num unique products: 865
first 20 product_ids: [38, 834, 411, 686, 580, 596, 740, 379, 4, 600, 699, 548, 72, 644, 638, 496, 296, 631, 310, 633]
chosen product_ids: [38, 834, 411]


Filter:   0%|          | 0/4500000 [00:00<?, ? examples/s]

In [6]:
# Preview the first rows of the frame built from the filtered dataset.
df.head()

Unnamed: 0,city_id,store_id,management_group_id,first_category_id,second_category_id,third_category_id,product_id,dt,sale_amount,hours_sale,stock_hour6_22_cnt,hours_stock_status,discount,holiday_flag,activity_flag,precpt,avg_temperature,avg_humidity,avg_wind_level
0,0,0,0,5,6,65,38,2024-03-28,0.1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, ...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.0,0,0,1.6999,15.48,73.54,1.97
1,0,0,0,5,6,65,38,2024-03-29,0.1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, ...",1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.0,0,0,3.019,15.08,76.56,1.71
2,0,0,0,5,6,65,38,2024-03-30,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,"[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.0,1,0,2.0942,15.91,76.47,1.73
3,0,0,0,5,6,65,38,2024-03-31,0.1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, ...",11,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, ...",1.0,1,0,1.5618,16.13,77.4,1.76
4,0,0,0,5,6,65,38,2024-04-01,0.2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, ...",8,"[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",1.0,0,0,3.5386,15.37,78.26,1.25


In [7]:
# Down-sample to at most 80,000 rows with a fixed seed so the subset is reproducible.
# The request is clamped to the frame size: DataFrame.sample raises ValueError when
# n is larger than the number of rows (sampling without replacement), which would
# happen as soon as a different, smaller set of products is chosen upstream.
sample_size = min(80000, len(df))
df_sample = df.sample(n=sample_size, random_state=42)

# Reset index for cleanliness
df_sample = df_sample.reset_index(drop=True)

# Save reduced dataset for faster reloads.
# NOTE: CSV has no list type, so list-valued columns (hours_sale,
# hours_stock_status) are written as their text representation.
df_sample.to_csv("freshretailnet_80k.csv", index=False)

print("Original size:", len(df))
print("Reduced size:", len(df_sample))
df_sample.head()

Original size: 99540
Reduced size: 80000


Unnamed: 0,city_id,store_id,management_group_id,first_category_id,second_category_id,third_category_id,product_id,dt,sale_amount,hours_sale,stock_hour6_22_cnt,hours_stock_status,discount,holiday_flag,activity_flag,precpt,avg_temperature,avg_humidity,avg_wind_level
0,0,19,0,28,72,154,834,2024-05-25,1.6,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3, ...",12,"[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, ...",1.0,1,0,1.0087,23.1,67.44,2.02
1,0,246,0,5,6,65,38,2024-04-28,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.75,1,1,3.1548,20.56,76.74,2.16
2,0,137,0,28,72,154,834,2024-04-01,2.3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3, 0.1, ...",7,"[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.0,0,0,3.5385,15.37,78.26,1.25
3,12,224,0,28,72,154,834,2024-04-29,0.5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, ...",9,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...",1.0,0,0,4.0912,20.48,80.65,2.24
4,0,172,0,28,72,154,834,2024-05-29,0.9,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2, ...",8,"[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",1.0,0,0,0.2114,22.75,63.18,1.84


In [17]:
# Continue with the sampled rows under the shorter name `df`.
# Plain assignment makes no copy: `df` and `df_sample` refer to the same object.
df = df_sample
df.head()


Unnamed: 0,city_id,store_id,management_group_id,first_category_id,second_category_id,third_category_id,product_id,dt,sale_amount,hours_sale,stock_hour6_22_cnt,hours_stock_status,discount,holiday_flag,activity_flag,precpt,avg_temperature,avg_humidity,avg_wind_level
0,0,19,0,28,72,154,834,2024-05-25,1.6,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3, ...",12,"[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, ...",1.0,1,0,1.0087,23.1,67.44,2.02
1,0,246,0,5,6,65,38,2024-04-28,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.75,1,1,3.1548,20.56,76.74,2.16
2,0,137,0,28,72,154,834,2024-04-01,2.3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3, 0.1, ...",7,"[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.0,0,0,3.5385,15.37,78.26,1.25
3,12,224,0,28,72,154,834,2024-04-29,0.5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, ...",9,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...",1.0,0,0,4.0912,20.48,80.65,2.24
4,0,172,0,28,72,154,834,2024-05-29,0.9,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2, ...",8,"[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",1.0,0,0,0.2114,22.75,63.18,1.84


In [19]:
# Distinct product ids present in `df`, in order of first appearance.
df["product_id"].unique()

array([834,  38, 411])

In [20]:
# Human-readable proxy names for the three selected product ids.
product_map = {
    38: "bread",
    411: "chicken",
    834: "meat",
}

# Take an independent copy: with a plain `df_filtred = df` alias, adding a column
# to `df_filtred` would also modify `df` and `df_sample`, since all three names
# would point at the same DataFrame.
df_filtred = df.copy()

In [21]:
# Translate numeric product ids into the proxy names defined in `product_map`.
product_names = df_filtred["product_id"].map(product_map)

# Series.map yields NaN for ids that are absent from the dict. Fail loudly instead
# of carrying unnamed rows forward, e.g. when the chosen product ids were changed
# but `product_map` was not updated to match.
unmapped_ids = df_filtred.loc[product_names.isna(), "product_id"].unique()
if len(unmapped_ids) > 0:
    raise ValueError(
        f"product_map has no name for product_id(s): {sorted(unmapped_ids.tolist())}"
    )

df_filtred["product_name"] = product_names

In [None]:
# Row count per mapped product name. value_counts drops NaN by default, so ids
# without an entry in the mapping would not show up here.
df_filtred["product_name"].value_counts()

In [24]:
# Remove the city / management-group / category-hierarchy id columns;
# drop returns a new frame, which is bound back to `df_filtred`.
df_filtred = df_filtred.drop(
    labels=[
        'city_id',
        'management_group_id',
        'first_category_id',
        'second_category_id',
        'third_category_id',
    ],
    axis="columns",
)

In [29]:
# Columns to keep, in display order.
cols = [
    # identifiers and date
    'store_id',
    'product_name',
    'dt',
    # daily sales figure
    'sale_amount',
    # pricing / calendar / promotion flags
    'discount',
    'holiday_flag',
    'activity_flag',
    # weather readings
    'precpt',
    'avg_temperature',
    'avg_humidity',
    'avg_wind_level',
]

In [32]:
# Keep every row but only the columns named in `cols`, in that order.
df_filtred = df_filtred.loc[:, cols]

In [None]:
# Preview the frame after restricting it to the columns in `cols`.
df_filtred.head()

In [34]:
# Rebind `df` to the trimmed frame. From here on `df` has only the columns in
# `cols` (no `product_id`), so earlier cells that read df["product_id"] will fail
# if re-run after this one without first re-running the cells above them.
df = df_filtred

In [36]:
# Show the first rows of `df` now that it refers to the trimmed frame.
df.head()

Unnamed: 0,store_id,product_name,dt,sale_amount,discount,holiday_flag,activity_flag,precpt,avg_temperature,avg_humidity,avg_wind_level
0,19,meat,2024-05-25,1.6,1.0,1,0,1.0087,23.1,67.44,2.02
1,246,bread,2024-04-28,0.0,0.75,1,1,3.1548,20.56,76.74,2.16
2,137,meat,2024-04-01,2.3,1.0,0,0,3.5385,15.37,78.26,1.25
3,224,meat,2024-04-29,0.5,1.0,0,0,4.0912,20.48,80.65,2.24
4,172,meat,2024-05-29,0.9,1.0,0,0,0.2114,22.75,63.18,1.84
