## Tasks
### Feature Engineering
- [ ] One-hot encoding (OHE) for low-cardinality features
- [ ] Entity embeddings for high-cardinality features
- [ ] Make time features
- [ ] Lag features
- [ ] Rolling features
- [ ] 1 - Discount
- [ ] scaling numerical features



**Advanced (optional)**
- [ ] Cyclical time features (sine, cosine)
- [ ] outliers flag


### Preparing data for models
- [ ] remove unwanted columns
- [ ] Split the data chronologically (preserve time order)
- [ ] Create sliding-window splits

### Demo models
- [ ] Build LSTM
- [ ] Build GRU
- [ ] Build TimesNet


### Evaluate the models
- [ ] Tune hyperparameters based on evaluation results

### Final models
- [ ] retrain the model using all train data
- [ ] Evaluate the model and compare models


In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt
import seaborn as sns


In [17]:
# Read the training and evaluation CSV files (paths are relative to the
# notebook's working directory) into two DataFrames.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ("train", "eval"))


In [18]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,city_id,store_id,management_group_id,first_category_id,second_category_id,third_category_id,product_id,dt,sale_amount,hours_sale,stock_hour6_22_cnt,hours_stock_status,discount,holiday_flag,activity_flag,precpt,avg_temperature,avg_humidity,avg_wind_level
0,0,0,0,0,5,6,65,38,2024-03-28,0.1,[0. 0. 0. 0. 0. 0. 0. 0.1 0. 0. 0. 0...,0,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0],1.0,0,0,1.6999,15.48,73.54,1.97
1,1,0,0,0,5,6,65,38,2024-03-29,0.1,[0. 0. 0. 0. 0. 0. 0.1 0. 0. 0. 0. 0...,1,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1],1.0,0,0,3.019,15.08,76.56,1.71
2,2,0,0,0,5,6,65,38,2024-03-30,0.0,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,0,[1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0],1.0,1,0,2.0942,15.91,76.47,1.73
3,3,0,0,0,5,6,65,38,2024-03-31,0.1,[0. 0. 0. 0. 0. 0. 0. 0. 0.1 0. 0. 0...,11,[0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1],1.0,1,0,1.5618,16.13,77.4,1.76
4,4,0,0,0,5,6,65,38,2024-04-01,0.2,[0. 0. 0. 0. 0. 0. 0.1 0. 0. 0. 0. 0...,8,[1 1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1],1.0,0,0,3.5386,15.37,78.26,1.25


### Feature Engineering

In [None]:
def feature_Engineering_TreeBasedModels(df):
    """Build calendar, lag, rolling and per-SKU features for tree-based models.

    A "SKU" here is one (store_id, product_id) pair. All lag and rolling
    features are computed strictly inside each SKU and in chronological
    order of ``dt``, so no value from another SKU can enter a window.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``store_id``, ``product_id``, ``dt``, ``sale_amount``
        and ``discount``. The columns ``Unnamed: 0``, ``hours_sale`` and
        ``hours_stock_status`` are dropped when present.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the same rows, in the
        same order and with the same index as the input, plus:

        * ``day_of_week``, ``week_of_year``, ``month``, ``day``,
          ``is_month_start``, ``is_month_end``
        * ``lag_1``, ``lag_7``, ``lag_14``
        * ``rolling_mean_{3,7,14}`` and ``rolling_median_{3,7,14}`` over the
          previous rows of the same SKU (current row excluded)
        * ``discount`` replaced by ``1 - discount``
        * ``sku_mean``, ``sku_std``, ``sku_median``
    """
    sku_keys = ["store_id", "product_id"]

    # errors="ignore" lets the function run on frames from which these
    # columns were already removed.
    df = df.drop(
        columns=["Unnamed: 0", "hours_sale", "hours_stock_status"],
        errors="ignore",
    )

    # Work on a unique positional index so per-group results align back to
    # the right rows; the caller's index is restored before returning.
    original_index = df.index
    df = df.reset_index(drop=True)

    ### Time features
    df["dt"] = pd.to_datetime(df["dt"])

    df["day_of_week"] = df["dt"].dt.dayofweek
    df["week_of_year"] = df["dt"].dt.isocalendar().week.astype(int)
    df["month"] = df["dt"].dt.month
    df["day"] = df["dt"].dt.day

    df["is_month_start"] = df["dt"].dt.is_month_start.astype(int)
    df["is_month_end"] = df["dt"].dt.is_month_end.astype(int)

    # shift()/rolling() operate on row order, so rows must be chronological
    # inside every SKU before any lag or window is taken.
    df = df.sort_values(sku_keys + ["dt"])
    sales_by_sku = df.groupby(sku_keys)["sale_amount"]

    ### Lag features
    for lag in (1, 7, 14):
        df[f"lag_{lag}"] = sales_by_sku.shift(lag)

    ### Rolling statistics over previous rows only
    # groupby(...).shift(1) returns a plain Series; rolling over it directly
    # would let a window span two SKUs. Re-grouping the shifted values keeps
    # every window inside a single SKU.
    previous_sales = sales_by_sku.shift(1)
    previous_by_sku = previous_sales.groupby(
        [df[key] for key in sku_keys], sort=False
    )

    ### Rolling mean
    for window in (3, 7, 14):
        windowed = previous_by_sku.rolling(window, min_periods=1)
        # The result is indexed by (store_id, product_id, row position);
        # dropping the two key levels leaves the row position for alignment.
        df[f"rolling_mean_{window}"] = windowed.mean().droplevel([0, 1])

    ### Rolling median
    for window in (3, 7, 14):
        windowed = previous_by_sku.rolling(window, min_periods=1)
        df[f"rolling_median_{window}"] = windowed.median().droplevel([0, 1])

    ### Handling discount
    df["discount"] = 1 - df["discount"]

    ### SKU statistics
    # NOTE(review): these aggregate over every row of the SKU in `df`,
    # including the current and later rows - confirm this is acceptable for
    # the intended train/validation split.
    df["sku_mean"] = sales_by_sku.transform("mean")
    df["sku_std"] = sales_by_sku.transform("std")
    df["sku_median"] = sales_by_sku.transform("median")

    # Put the rows back in the caller's order and restore the caller's index.
    df = df.sort_index()
    df.index = original_index
    return df

In [22]:
# Remove the leftover 'Unnamed: 0' column and the two array-valued hours_*
# columns in a single call. errors='ignore' keeps the cell safe to re-run
# once the columns are already gone.
train_df = train_df.drop(
    columns=['Unnamed: 0', 'hours_sale', 'hours_stock_status'],
    errors='ignore',
)
    

In [None]:

# Categorical columns treated as low-cardinality and one-hot encoded.
low_card_cols = ['management_group_id', 'first_category_id']

# One-hot encode the loaded training frame. pd.get_dummies returns a new
# frame, so train_df itself is left unchanged.
df_encoded = pd.get_dummies(train_df, columns=low_card_cols, dtype=int)

# Last expression of the cell: rendered with the notebook's rich display.
df_encoded.head()

In [26]:
import matplotlib

In [44]:

pip install --upgrade matplotlib==3.4.2


Collecting matplotlib==3.4.2Note: you may need to restart the kernel to use updated packages.

  Downloading matplotlib-3.4.2.tar.gz (37.3 MB)
     ---------------------------------------- 0.0/37.3 MB ? eta -:--:--
     ---------------------------------------- 0.0/37.3 MB ? eta -:--:--
      --------------------------------------- 0.8/37.3 MB 3.0 MB/s eta 0:00:12
     - -------------------------------------- 1.6/37.3 MB 3.4 MB/s eta 0:00:11
     -- ------------------------------------- 2.1/37.3 MB 3.3 MB/s eta 0:00:11
     --- ------------------------------------ 2.9/37.3 MB 3.3 MB/s eta 0:00:11
     --- ------------------------------------ 3.4/37.3 MB 3.3 MB/s eta 0:00:11
     ---- ----------------------------------- 4.2/37.3 MB 3.3 MB/s eta 0:00:11
     ----- ---------------------------------- 4.7/37.3 MB 3.3 MB/s eta 0:00:10
     ----- ---------------------------------- 5.5/37.3 MB 3.3 MB/s eta 0:00:10
     ------ --------------------------------- 6.3/37.3 MB 3.3 MB/s eta 0:00:10
  

  error: subprocess-exited-with-error
  
  python setup.py egg_info did not run successfully.
  exit code: 1
  
  [28 lines of output]
  !!
  
          ********************************************************************************
          Please remove any references to `setuptools.command.test` in all supported versions of the affected package.
  
          This deprecation is overdue, please update your project and remove deprecated
          calls to avoid build errors in the future.
          ********************************************************************************
  
  !!
    from setuptools.command.test import test as TestCommand
  Traceback (most recent call last):
    File "<string>", line 2, in <module>
    File "<pip-setuptools-caller>", line 34, in <module>
    File "C:\Users\Abanob\AppData\Local\Temp\pip-install-l4k4vk7s\matplotlib_cf34137a178649af99ddb49cdaa622b4\setup.py", line 54, in <module>
      __version__ = versioneer.get_version()
                    ^^

In [None]:
import torch
import torch.nn as nn

class SequenceModel(nn.Module):
    """LSTM forecaster that embeds product and store IDs.

    At every time step the product embedding, the store embedding and the
    numeric features are concatenated and fed to a single-layer LSTM; the
    hidden state of the last time step is projected to one output value.

    Args:
        num_products: Size of the product embedding table; product IDs must
            lie in ``[0, num_products)``.
        product_emb_dim: Width of each product embedding vector.
        num_stores: Size of the store embedding table; store IDs must lie in
            ``[0, num_stores)``.
        store_emb_dim: Width of each store embedding vector.
        numeric_features: Number of numeric features per time step.
        hidden_size: Width of the LSTM hidden state. Defaults to 64.
    """

    def __init__(self, num_products, product_emb_dim, num_stores, store_emb_dim,
                 numeric_features, hidden_size=64):
        super().__init__()

        # 1. Embedding layers for the two ID inputs
        self.product_embedding = nn.Embedding(num_embeddings=num_products, embedding_dim=product_emb_dim)
        self.store_embedding = nn.Embedding(num_embeddings=num_stores, embedding_dim=store_emb_dim)

        # 2. LSTM layer
        # Per-step input width = product_emb + store_emb + numeric features
        input_size = product_emb_dim + store_emb_dim + numeric_features
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)

        # 3. Final prediction head: one value per sequence
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, product_ids, store_ids, continuous_data):
        """Predict one value per input sequence.

        Args:
            product_ids: Integer tensor of shape ``[batch, seq_len]``.
            store_ids: Integer tensor of shape ``[batch, seq_len]``.
            continuous_data: Float tensor of shape
                ``[batch, seq_len, numeric_features]``.

        Returns:
            Tensor of shape ``[batch, 1]``.
        """
        # Look up embeddings: each becomes [batch, seq_len, emb_dim]
        prod_emb = self.product_embedding(product_ids)
        store_emb = self.store_embedding(store_ids)

        # Concatenate along the feature axis -> [batch, seq_len, total_features]
        combined = torch.cat([prod_emb, store_emb, continuous_data], dim=-1)

        lstm_out, _ = self.lstm(combined)

        # Use the output of the last time step for forecasting
        last_step = lstm_out[:, -1, :]
        return self.fc(last_step)