In [None]:
import operator

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from mlforecast import MLForecast
from mlforecast.lag_transforms import Combine, RollingMean
from sklearn.inspection import permutation_importance
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Load the three input tables. Every frame gets its store identifier renamed to
# `unique_id`; the two dated frames get `date` -> `ds`, and the sales target
# becomes `y`. `date` is parsed as a datetime column while reading.
future_values = (
    pd.read_csv('future_values.csv', parse_dates=['date'])
    .rename(columns={'date': 'ds', 'store_id': 'unique_id'})
)
metadata = (
    pd.read_csv('metadata.csv')
    .rename(columns={'store_id': 'unique_id'})
)
sales_data = (
    pd.read_csv('sales_data.csv', parse_dates=['date'])
    .rename(columns={'date': 'ds', 'store_id': 'unique_id', 'sales': 'y'})
)

# Quick sanity check of the parsed column types.
sales_data.dtypes

In [None]:
# Grouping data by each time series (one group per unique_id)
grouped = sales_data.groupby('unique_id')

print("\nDate range per unique_id:")
date_range = grouped['ds'].agg(['min', 'max', 'count'])
print(date_range)

# Creating a summary dataframe for visualizing data completeness:
# one row per series with its row count and its first / last timestamp.
summary = grouped.agg(
    count_observed=('ds', 'count'),
    start_date=('ds', 'min'),
    end_date=('ds', 'max')
).reset_index()

# Number of rows a gap-free series would have between its first and last date.
# The span is counted in daily periods so that it is comparable with the row
# count: the rows are bucketed into weeks (several rows per week) further down
# in this notebook, so the raw data is assumed to be daily -- TODO confirm.
# Counting the span in months, as before, compared months against rows and
# could practically never flag a gap.
summary['expected_count'] = (
    (summary['end_date'].dt.to_period('D') - summary['start_date'].dt.to_period('D')).apply(lambda x: x.n) + 1
)

# Identifying which time series are irregular (fewer rows than the span implies)
summary['is_irregular'] = summary['count_observed'] < summary['expected_count']

# `is_irregular` is already boolean, so it can be used as a mask directly.
filtered = summary[summary['is_irregular']]
display(filtered)

In [None]:
#Checking na
# A notebook cell only renders its last bare expression, so the per-column
# missing-value counts of every frame are displayed explicitly instead of
# relying on the implicit output (which showed `sales_data` only).
for frame_name, frame in (
    ('future_values', future_values),
    ('metadata', metadata),
    ('sales_data', sales_data),
):
    print(f'Missing values per column in {frame_name}:')
    display(frame.isna().sum())

In [None]:
# Attach the metadata columns to the historical and the future frames.
# A left join on `unique_id` keeps every row of the dated frame, even when no
# matching metadata row exists (its metadata columns are then NaN).
sales_merged = sales_data.merge(metadata, how='left', on='unique_id')
future_merged = future_values.merge(metadata, how='left', on='unique_id')

# Preview the merged history.
sales_merged.head()

In [None]:
# Aggregate the merged history into one row per (unique_id, week).
# (pandas is already imported in the notebook's import cell.)

# Ensure datetime
sales_merged['ds'] = pd.to_datetime(sales_merged['ds'])

# Create a weekly bucket.
# NOTE(review): 'W-MON' periods END on Monday, so `start_time` is the preceding
# Tuesday and each bucket runs Tuesday..Monday. If Monday..Sunday weeks were
# intended, the anchor would have to be 'W-SUN' -- confirm before changing.
sales_merged['week'] = sales_merged['ds'].dt.to_period('W-MON').dt.start_time

# Make sure state_holiday is string type, so every value becomes a text label
# before it is used as a column name below.
sales_merged['state_holiday'] = sales_merged['state_holiday'].astype(str)

# Count how many times each state_holiday type appears per week per store
holiday_counts = (
    sales_merged
    .groupby(['unique_id', 'week', 'state_holiday'])
    .size()
    .unstack(fill_value=0)  # one column per holiday type, 0 where it is absent
    .reset_index()
    .rename_axis(None, axis=1)  # remove column name
)

# Rename the per-type columns for clarity and remember them for the fill below.
holiday_cols = [f'state_holiday_{col}' for col in holiday_counts.columns[2:]]
holiday_counts.columns = ['unique_id', 'week'] + holiday_cols

# Now aggregate the normal weekly data: sums for the per-row measures, first
# value for the metadata columns that were merged in per store.
weekly_data = sales_merged.groupby(['unique_id', 'week'], as_index=False).agg({
    'y': 'sum',
    'customers': 'sum',
    'promo': 'sum',
    'open': 'sum',
    'school_holiday': 'sum',
    'store_type': 'first',
    'assortment': 'first',
    'competition_distance': 'first'
})

# Merge the holiday counts in
weekly_data = weekly_data.merge(holiday_counts, on=['unique_id', 'week'], how='left')

# Fill in 0 where a holiday type didn't occur that week.
# Only the holiday-count columns are filled: a frame-wide fillna(0) would also
# turn missing metadata (e.g. an unknown competition_distance) into a literal
# 0, which is a different statement than "missing". Such gaps stay NaN here and
# should be imputed explicitly where needed.
weekly_data = weekly_data.fillna(dict.fromkeys(holiday_cols, 0))

weekly_data.head()

In [None]:
# Weeks in which all three state-holiday types (a, b and c) occur at least once.
all_types_mask = (
    (weekly_data['state_holiday_a'] > 0)
    & (weekly_data['state_holiday_b'] > 0)
    & (weekly_data['state_holiday_c'] > 0)
)

# Positional row numbers of the matching weeks (a tuple holding one array, as
# returned by np.where); printed for a quick look at where they sit.
filtered_values = np.where(all_types_mask)
print(filtered_values)

# Select with the boolean mask rather than with the positions from np.where:
# `.loc` is label-based, so feeding it positions is only correct while the
# index happens to be the default 0..n-1 range.
display(weekly_data.loc[all_types_mask])