# Day-1

# Understand business problem
 # 1.Objective:
 Predict the next month’s product sales based on historical sales data.

Evaluate predictions using MAPE (Mean Absolute Percentage Error) and RMSE (Root Mean Squared Error).
# 2. Collect Dataset
In this step, we load the historical retail sales dataset.  
The dataset contains monthly sales data for different products over several years.  
We will use this data to analyze sales trends and forecast next month's sales.


<!-- Objective: -->

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta


In [29]:
# Modeling libraries
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
import pmdarima as pm
from prophet import Prophet

In [40]:
# Plotting setup
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
pd.set_option('display.max_columns', None)

In [30]:
df = pd.read_csv("Walmart.csv")

In [31]:
df.head()

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,05-02-2010,1643690.9,0,42.31,2.572,211.096358,8.106
1,1,12-02-2010,1641957.44,1,38.51,2.548,211.24217,8.106
2,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,05-03-2010,1554806.68,0,46.5,2.625,211.350143,8.106


In [32]:
df.columns

Index(['Store', 'Date', 'Weekly_Sales', 'Holiday_Flag', 'Temperature',
       'Fuel_Price', 'CPI', 'Unemployment'],
      dtype='object')

In [33]:
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)


In [34]:
df = df.sort_values('Date')
df.head()


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,2010-02-05,1643690.9,0,42.31,2.572,211.096358,8.106
1287,10,2010-02-05,2193048.75,0,54.34,2.962,126.442065,9.765
5148,37,2010-02-05,536006.73,0,45.97,2.572,209.852966,8.554
2288,17,2010-02-05,789036.02,0,23.11,2.666,126.442065,6.548
4147,30,2010-02-05,465108.52,0,39.05,2.572,210.752605,8.324


In [35]:
df.shape

(6435, 8)

# Day-2

In [36]:
# Step 1
df.isnull().sum()

Store           0
Date            0
Weekly_Sales    0
Holiday_Flag    0
Temperature     0
Fuel_Price      0
CPI             0
Unemployment    0
dtype: int64

In [37]:
df.columns

Index(['Store', 'Date', 'Weekly_Sales', 'Holiday_Flag', 'Temperature',
       'Fuel_Price', 'CPI', 'Unemployment'],
      dtype='object')

In [38]:
df = df.sort_values(["Store", "Date"])
df.head()


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,2010-02-05,1643690.9,0,42.31,2.572,211.096358,8.106
1,1,2010-02-12,1641957.44,1,38.51,2.548,211.24217,8.106
2,1,2010-02-19,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,2010-02-26,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,2010-03-05,1554806.68,0,46.5,2.625,211.350143,8.106


In [39]:
df['Lag_1'] = df.groupby('Store')['Weekly_Sales'].shift(1)
df['Lag_2'] = df.groupby('Store')['Weekly_Sales'].shift(2)
df['Lag_3'] = df.groupby('Store')['Weekly_Sales'].shift(3)

df[['Lag_1','Lag_2','Lag_3']] = df[['Lag_1','Lag_2','Lag_3']].fillna(0)
df.head()

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Lag_1,Lag_2,Lag_3
0,1,2010-02-05,1643690.9,0,42.31,2.572,211.096358,8.106,0.0,0.0,0.0
1,1,2010-02-12,1641957.44,1,38.51,2.548,211.24217,8.106,1643690.9,0.0,0.0
2,1,2010-02-19,1611968.17,0,39.93,2.514,211.289143,8.106,1641957.44,1643690.9,0.0
3,1,2010-02-26,1409727.59,0,46.63,2.561,211.319643,8.106,1611968.17,1641957.44,1643690.9
4,1,2010-03-05,1554806.68,0,46.5,2.625,211.350143,8.106,1409727.59,1611968.17,1641957.44


# Day 3 – Modeling
### •	Build ARIMA, Prophet, XGBoost models
### •	Cross-validation


In [48]:
# ============================================================================
# 1. LOAD AND PREPARE DATA
# ============================================================================
print("="*60)
print("STEP 1: LOAD AND PREPARE DATA")
print("="*60)

# Let's create/load time series data
# If you have saved data from Day 2, load it:
try:
    # Try loading engineered data from Day 2
    df = pd.read_csv('Walmart.csv', index_col=0, parse_dates=True)
    print("✓ Loaded engineered data from Day 2")
except:
    # Create sample data if no saved file exists
    print("Creating sample data...")
    np.random.seed(42)
    date_range = pd.date_range(start='2023-01-01', end='2023-12-31', freq='D')
    n_samples = len(date_range)
    
    
    # Create some basic features (if you don't have Day 2 features)
    df['day_of_week'] = df.index.dayofweek
    df['month'] = df.index.month
    df['day_of_year'] = df.index.dayofyear

print(f"\nDataset shape: {df.shape}")
print(f"Date range: {df.index.min()} to {df.index.max()}")
print(f"Frequency: Daily")

# Display data
print("\nData Overview:")
display(df.head())
print("\n" + "="*60)

STEP 1: LOAD AND PREPARE DATA
✓ Loaded engineered data from Day 2

Dataset shape: (6435, 7)
Date range: 1 to 45
Frequency: Daily

Data Overview:


Unnamed: 0_level_0,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
Store,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,05-02-2010,1643690.9,0,42.31,2.572,211.096358,8.106
1,12-02-2010,1641957.44,1,38.51,2.548,211.24217,8.106
1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106
1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106
1,05-03-2010,1554806.68,0,46.5,2.625,211.350143,8.106





In [51]:
print("\n" + "="*60)
print("STEP 4: PREPARE FOR MODELING")
print("="*60)

if target_col and target_col in df_model.columns:
    # Set target and features
    y = df_model[target_col]
    
    # Features are all columns except the target
    X = df_model.drop(columns=[target_col])
    
    print(f"Target: {target_col}")
    print(f"Number of features: {X.shape[1]}")
    print(f"Feature names: {list(X.columns)}")
    
    # Split data (80% train, 20% test)
    train_size = int(len(X) * 0.8)
    X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
    y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]
    
    print(f"\nTrain set: {X_train.shape[0]} samples")
    print(f"Test set:  {X_test.shape[0]} samples")
    
    # Simple visualization
    plt.figure(figsize=(10, 4))
    plt.plot(y_train.index, y_train.values, 'b-', label='Train', alpha=0.7)
    plt.plot(y_test.index, y_test.values, 'r-', label='Test', alpha=0.7)
    plt.axvline(x=y_train.index[-1], color='k', linestyle='--', label='Split')
    plt.title(f'Train-Test Split: {target_col}')
    plt.xlabel('Time')
    plt.ylabel(target_col)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()
    
else:
    print(f"Target column '{target_col}' not found in dataset!")
    print("Available columns:", list(df_model.columns))


STEP 4: PREPARE FOR MODELING


NameError: name 'df_model' is not defined