# Retail Store Sales Forecasting

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Confirmation message printed once the import statements above have run without error.
print("Library installed")

In [None]:
# Read the retail dataset (path is relative to the working directory) into `df`.
csv_path = "retail_store.csv"
df = pd.read_csv(csv_path)
print("Data Loaded")

In [None]:
# Parse the 'Date' column to datetimes; errors='coerce' turns unparseable entries into NaT
# instead of raising.
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')


In [None]:
# Make sure 'Date' is datetime64. No explicit format is passed: a hand-written format with a
# stray space (such as "%H: %M") raises on raw strings that do not match it exactly, and it
# has no effect at all once the column has already been parsed by the cell above.
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

In [None]:
# Show the dtype of the 'Date' column to confirm the parsing above took effect.
print(df.dtypes['Date'])


In [None]:
# Preview the first five rows of the frame.
df.head(5)

In [None]:
# Count the null entries in every column and print the per-column totals.
print("\nMissing values per column")
null_counts = df.isna().sum()
print(null_counts)

# *Overall Sales trend*

In [None]:
# Total 'Units Sold' over all rows sharing a date, returned as a two-column frame
# ('Date', 'Units Sold') for plotting.
units_per_date = df.groupby('Date')['Units Sold'].agg('sum')
daily_sales = units_per_date.reset_index()


In [None]:
# Line chart of the daily totals, drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(
    daily_sales['Date'],
    daily_sales['Units Sold'],
    color='blue',
    alpha=0.6,
    label="Daily Sales",
)
ax.set_xlabel("Date")
ax.set_ylabel("Units Sold")
ax.set_title("Daily Total Units Sold Over Time")
ax.legend()
plt.show()


# BAR CHART 

In [None]:
# Same daily totals as above, rendered as one bar per date.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(daily_sales['Date'], daily_sales['Units Sold'])
ax.set_xlabel("Date")
ax.set_ylabel("Units Sold")
ax.set_title("Daily Total Units Sold Over Time")
plt.show()


# *Sales by Seasonality and promotions*


# *-------Monthly Sales------*

In [None]:
# Month number (1-12) of every row; rows from different years share a month bucket.
df['Month'] = df['Date'].dt.month
units_per_month = df.groupby('Month')['Units Sold'].agg('sum')
monthly_sales = units_per_month.reset_index()

# One bar per calendar month.
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(data=monthly_sales, x='Month', y='Units Sold', palette="viridis", ax=ax)
ax.set_title("Total Units Sold over months")
plt.show()



# *Sales by Holidays/Promotion*

In [None]:
# Mean units sold for each value of the 'Holiday/Promotion' flag.
mean_units_by_promo = df.groupby('Holiday/Promotion')['Units Sold'].agg('mean')
promo_sales = mean_units_by_promo.reset_index()

fig, ax = plt.subplots(figsize=(4, 4))
sns.barplot(data=promo_sales, x='Holiday/Promotion', y='Units Sold', palette="Set2", ax=ax)
ax.set_title("Average Units Sold: Promotion vs Non-Promotion")
# Relabel the two bars; assumes the flag takes exactly the values 0 and 1 — TODO confirm.
ax.set_xticks([0, 1])
ax.set_xticklabels(["No Promo", "Promo"])
plt.show()

# *Region and Category*

In [None]:
# Total units sold per region and per category, largest first.
region_sales = df.groupby('Region')['Units Sold'].sum().sort_values(ascending=False)
category_sales = df.groupby('Category')['Units Sold'].sum().sort_values(ascending=False)

# Both charts share the same recipe; only the data, size, palette and title differ.
bar_specs = (
    (region_sales, (6, 4), "mako", "Total Units Sold by Region"),
    (category_sales, (8, 4), "Set2", "Total Units Sold by Category"),
)
for totals, fig_size, colors, heading in bar_specs:
    plt.figure(figsize=fig_size)
    sns.barplot(x=totals.index, y=totals.values, palette=colors)
    plt.title(heading)
    plt.xticks(rotation=45)
    plt.show()

# *Data Preprocessing*

# 5.1 Handle Missing Values
# Fill numeric with 0 or mean, categorical with mode

In [None]:
# Numeric columns where a missing value is treated as zero.
for zero_col in ('Units Sold', 'Discount'):
    df[zero_col] = df[zero_col].fillna(0)

# Numeric columns imputed with their own column mean.
for mean_col in ('Price', 'Competitor Pricing'):
    df[mean_col] = df[mean_col].fillna(df[mean_col].mean())

# Categorical columns imputed with their most frequent value.
for mode_col in ('Region', 'Category'):
    df[mode_col] = df[mode_col].fillna(df[mode_col].mode()[0])

print("It working")


# **Feature Engineering**

In [None]:
# Step 3: Feature Engineering
# ===============================

import numpy as np

# Extract Date parts
date_parts = df['Date'].dt
df['Day'] = date_parts.day
df['Week'] = date_parts.isocalendar().week.astype(int)
df['Month'] = date_parts.month
df['Year'] = date_parts.year
df['DayOfWeek'] = date_parts.dayofweek
df['IsWeekend'] = df['DayOfWeek'].isin([5, 6]).astype(int)  # dayofweek: 5 = Sat, 6 = Sun

# Extra Features
if 'Price' in df.columns and 'Discount' in df.columns:
    # Simulate competitor pricing (if missing) as price +/- up to 10%.
    # A seeded generator keeps the simulated column identical across re-runs.
    if 'Competitor Pricing' not in df.columns:
        rng = np.random.default_rng(42)
        df['Competitor Pricing'] = df['Price'] * (1 + rng.uniform(-0.1, 0.1, len(df)))

    df['Price_Diff'] = df['Price'] - df['Competitor Pricing']
    df['Discount_Flag'] = (df['Discount'] > 0).astype(int)

# Lag / rolling features.
# Rows are ordered chronologically within each store/product series before shifting, so
# "previous row" means an earlier record of the same series rather than whichever row
# happened to sit above in the file.
# NOTE(review): assumes 'Store ID' + 'Product ID' identify one series — confirm against the data.
series_keys = [c for c in ('Store ID', 'Product ID') if c in df.columns]
df = df.sort_values(series_keys + ['Date'])
# With no ID columns present, fall back to a single group covering the whole frame.
grouper = [df[c] for c in series_keys] or [pd.Series(0, index=df.index)]

# Lag features (previous sales)
units_by_series = df['Units Sold'].groupby(grouper)
for lag in (1, 7, 30):
    df[f'Lag_{lag}'] = units_by_series.shift(lag)

# Rolling statistics are taken over Lag_1 (i.e. strictly earlier rows). Rolling directly over
# 'Units Sold' would include the current row's value — the prediction target — in its own feature.
prior_units = df['Lag_1'].groupby(grouper)
df['Roll_7'] = prior_units.transform(lambda s: s.rolling(7).mean())
df['Roll_30'] = prior_units.transform(lambda s: s.rolling(30).mean())
df['Roll_7_STD'] = prior_units.transform(lambda s: s.rolling(7).std())

# Restore the original row order, then drop NA rows created by lags/rolling
df = df.sort_index()
df = df.dropna()

print("✅ Feature Engineering Completed")
print(df.head())


In [None]:
# Drop every row that still contains at least one missing value.
df = df.dropna()


# **Train/ Test/ Validate**

In [None]:
# Remove leading/trailing whitespace from every column label, then list the labels.
df.columns = df.columns.map(str.strip)
print(df.columns)   # Check again



In [None]:
# ===============================
# Step 5: Feature Engineering & Data Split
# ===============================

from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Load your data
df = pd.read_csv("retail_store.csv")

# 1. Make sure Date is datetime
df['Date'] = pd.to_datetime(df['Date'])

# 2. Order every store/product series chronologically so that shifting looks back in time
#    within the same series.
#    NOTE(review): assumes 'Store ID' + 'Product ID' identify one series — confirm against the data.
series_keys = ['Store ID', 'Product ID']
df = df.sort_values(series_keys + ['Date'])

# 3. Lag features (past sales of the same store/product); computed while the IDs still exist
units_by_series = df.groupby(series_keys)['Units Sold']
for lag in (1, 7, 30):
    df[f'Lag_{lag}'] = units_by_series.shift(lag)

# 4. Rolling statistics over strictly earlier rows (built on Lag_1). Rolling directly over
#    'Units Sold' would put the current row's target value into its own feature.
prior_units = df.groupby(series_keys)['Lag_1']
df['Roll_7'] = prior_units.transform(lambda s: s.rolling(7).mean())
df['Roll_30'] = prior_units.transform(lambda s: s.rolling(30).mean())
df['Roll_7_STD'] = prior_units.transform(lambda s: s.rolling(7).std())

# 5. Drop IDs (not used as model features)
df = df.drop(columns=series_keys)

# 6. Encode categorical columns as integer codes (values are cast to str first).
#    The single encoder is re-fitted per column, so afterwards it only holds the
#    classes of the last column in `cat_cols`.
cat_cols = ['Category', 'Region', 'Holiday/Promotion', 'Seasonality']
encoder = LabelEncoder()
for col in cat_cols:
    df[col] = encoder.fit_transform(df[col].astype(str))

# 7. Create time-based features
df['Day'] = df['Date'].dt.day
df['Week'] = df['Date'].dt.isocalendar().week.astype(int)  # plain int instead of nullable UInt32
df['Month'] = df['Date'].dt.month
df['Year'] = df['Date'].dt.year
df['DayOfWeek'] = df['Date'].dt.dayofweek
df['IsWeekend'] = df['DayOfWeek'].isin([5, 6]).astype(int)

# 8. Price features. The column is named 'Competitor Pricing' (with a space), matching
#    the name used in `feature_cols` below.
df['Price_Diff'] = df['Price'] - df['Competitor Pricing']
df['Discount_Flag'] = (df['Discount'] > 0).astype(int)

# 9. Drop NaN (caused by lag/rolling) and return to chronological order for the split.
#    mergesort is stable, so rows sharing a date keep their relative order.
df = df.dropna().sort_values('Date', kind='mergesort')

# 10. Define features & target
feature_cols = [
    'Price','Discount','Competitor Pricing','Price_Diff', 'Discount_Flag',
    'Day','Week','Month','Year','DayOfWeek','IsWeekend',
    'Lag_1','Lag_7','Lag_30','Roll_7','Roll_30','Roll_7_STD',
    'Category','Region','Holiday/Promotion','Seasonality'
]
target_col = 'Units Sold'

# 11. Train/Validation/Test split (time-based): first 70% / next 15% / remainder of the rows
train_size = int(0.7 * len(df))
valid_size = int(0.15 * len(df))

train = df.iloc[:train_size]
valid = df.iloc[train_size:train_size+valid_size]
test = df.iloc[train_size+valid_size:]

X_train, y_train = train[feature_cols], train[target_col]
X_valid, y_valid = valid[feature_cols], valid[target_col]
X_test, y_test = test[feature_cols], test[target_col]

print("✅ Step 5 Done:")
print("Train:", X_train.shape, "Valid:", X_valid.shape, "Test:", X_test.shape)


In [None]:
# ===============================
# Step 5 + Step 6: Feature Engineering, Data Split, and Modeling
# ===============================

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

%matplotlib inline

# ---- Step 5: Feature Engineering ----
# Make sure Date is datetime
df['Date'] = pd.to_datetime(df['Date'])

# Sort by Date (important for time-series)
df = df.sort_values('Date')

# Price difference against the competitor. The column is 'Competitor Pricing' (with a
# space), as used elsewhere in this notebook; the underscore spelling raises KeyError.
df['Price_Diff'] = df['Price'] - df['Competitor Pricing']

# 1 when any discount is applied, else 0
df['Discount_Flag'] = (df['Discount'] > 0).astype(int)

# Extract useful date features
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['DayOfWeek'] = df['Date'].dt.dayofweek
df['IsWeekend'] = df['DayOfWeek'].isin([5, 6]).astype(int)

# Define features and target (names must match the frame's actual column labels)
feature_cols = ['Price', 'Competitor Pricing', 'Discount',
                'Price_Diff', 'Discount_Flag',
                'Year', 'Month', 'DayOfWeek', 'IsWeekend']
target_col = 'Units Sold'

# Time-based split on date quantiles: rows before the 0.8 quantile train, rows between the
# 0.8 and 0.9 quantiles validate, rows from the 0.9 quantile onward are the final test set.
cutoff_valid = df['Date'].quantile(0.8)
cutoff_test = df['Date'].quantile(0.9)

train = df[df['Date'] < cutoff_valid]
valid = df[(df['Date'] >= cutoff_valid) & (df['Date'] < cutoff_test)]
test  = df[df['Date'] >= cutoff_test]

X_train, y_train = train[feature_cols], train[target_col]
X_valid, y_valid = valid[feature_cols], valid[target_col]
X_test, y_test   = test[feature_cols], test[target_col]

print("Shapes:")
print("Train:", X_train.shape, y_train.shape)
print("Valid:", X_valid.shape, y_valid.shape)
print("Test:", X_test.shape, y_test.shape)


# ---- Step 6: Modeling ----
def evaluate_model(model, X_train, y_train, X_valid, y_valid):
    """Fit ``model`` on the training split and score it on the validation split.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
    X_train, y_train : training features and target
    X_valid, y_valid : validation features and target

    Returns
    -------
    tuple
        ``(rmse, mae, preds)`` where ``preds`` are the predictions for ``X_valid``.
    """
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    # Take the root explicitly: the `squared=False` keyword of mean_squared_error is
    # deprecated and later removed in scikit-learn, while sqrt(MSE) works on every version.
    rmse = np.sqrt(mean_squared_error(y_valid, preds))  # RMSE
    mae = mean_absolute_error(y_valid, preds)           # MAE
    return rmse, mae, preds

# Store results
results, predictions = {}, {}

# Candidate regressors, in the order they are evaluated
lr = LinearRegression()
rf = RandomForestRegressor(n_estimators=100, random_state=42)
xgb = XGBRegressor(n_estimators=200, learning_rate=0.1, random_state=42, objective='reg:squarederror')

# Fit each candidate on the training split and record its validation metrics/predictions
for label, estimator in (('Linear Regression', lr), ('Random Forest', rf), ('XGBoost', xgb)):
    rmse, mae, fold_preds = evaluate_model(estimator, X_train, y_train, X_valid, y_valid)
    results[label] = {'RMSE': rmse, 'MAE': mae}
    predictions[label] = fold_preds

# Per-model prediction arrays under their individual names
preds_lr = predictions['Linear Regression']
preds_rf = predictions['Random Forest']
preds_xgb = predictions['XGBoost']

# ---- Show results ----
print("\n=== Model Evaluation Results ===")
for model, metrics in results.items():
    print(f"{model}: RMSE={metrics['RMSE']:.2f}, MAE={metrics['MAE']:.2f}")


# Features for modeling

In [None]:
# List the column labels carried by each split.
for split_label, split_frame in (("Train", train), ("Valid", valid), ("Test", test)):
    print(f"{split_label} columns:", split_frame.columns.tolist())


In [None]:
# Requested feature names that are not columns of the training frame (order preserved).
missing = [name for name in feature_cols if name not in train.columns]
print("Missing features:", missing)



In [None]:
# Keep only the requested features that exist in the training frame, preserving their order.
available = set(train.columns)
feature_cols = [name for name in feature_cols if name in available]

# Re-derive the feature matrices and target vectors for all three splits.
X_train, X_valid, X_test = (part[feature_cols] for part in (train, valid, test))
y_train, y_valid, y_test = (part[target_col] for part in (train, valid, test))


In [None]:
# Final feature list and target used for modelling.
feature_cols = [
    'Price','Discount','Competitor Pricing','Price_Diff','Discount_Flag',
    'Day','Week','Month','Year','DayOfWeek','IsWeekend',
    'Lag_1','Lag_7','Lag_30','Roll_7','Roll_30','Roll_7_STD',
    'Holiday/Promotion','Seasonality'
]

target_col = 'Units Sold'

# Rebuild the matrices from the list just defined. Without this, X_train/X_valid/X_test
# would still hold whatever feature set an earlier cell selected, and the models compared
# below would be trained on different columns than the ones later read from `feature_cols`.
X_train, y_train = train[feature_cols], train[target_col]
X_valid, y_valid = valid[feature_cols], valid[target_col]
X_test,  y_test  = test[feature_cols],  test[target_col]




In [None]:
# Report (rows, columns) of each feature matrix.
for split_label, matrix in (("Train", X_train), ("Valid", X_valid), ("Test", X_test)):
    print(f"{split_label} shape:", matrix.shape)

# Model Training and Evaluation

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline


print("Library loaded")

In [None]:
# function to train and evaluate
def evaluate_model(model, X_train, y_train, X_valid, y_valid):
    """Fit ``model`` on the training split and score it on the validation split.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
    X_train, y_train : training features and target
    X_valid, y_valid : validation features and target

    Returns
    -------
    tuple
        ``(rmse, mae, preds)`` where ``preds`` are the predictions for ``X_valid``.
    """
    model.fit(X_train, y_train)                         # Train model
    # predict() takes the features only; passing y_valid as a second argument is an error.
    preds = model.predict(X_valid)                      # Predictions on validation data
    rmse = np.sqrt(mean_squared_error(y_valid, preds))  # Root Mean Square Error
    mae = mean_absolute_error(y_valid, preds)           # Mean Absolute Error
    return rmse, mae, preds

results = {}
predictions = {}

# For Linear Regression: actually fit and score the model here, so the metrics below do not
# depend on a `preds` variable that no earlier cell defines at notebook level.
lr = LinearRegression()
rmse_lr, mae_lr, preds_lr = evaluate_model(lr, X_train, y_train, X_valid, y_valid)
results['Linear Regression'] = {'RMSE': rmse_lr, 'MAE': mae_lr}
predictions['Linear Regression'] = preds_lr

print("Linear Regreessions RMSE:",rmse_lr, "And MAE: ",mae_lr)

In [None]:
# Shapes of the (features, target) pair for the training and validation splits.
for features, target in ((X_train, y_train), (X_valid, y_valid)):
    print(features.shape, target.shape)


In [None]:
# ===============================
# Step 6: Model Training & Evaluation
# ===============================

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

# Helper function to train & evaluate
def evaluate_model(model, X_train, y_train, X_valid, y_valid):
    """Train ``model`` and measure its error on the validation data.

    The model is fitted on ``(X_train, y_train)`` and asked to predict ``X_valid``.
    Returns a 3-tuple ``(rmse, mae, preds)``: root mean squared error, mean absolute
    error (both against ``y_valid``) and the validation predictions themselves.
    """
    model.fit(X_train, y_train)
    valid_preds = model.predict(X_valid)
    mse = mean_squared_error(y_valid, valid_preds)
    return np.sqrt(mse), mean_absolute_error(y_valid, valid_preds), valid_preds

# Dictionaries to store per-model metrics and validation predictions
results = {}
predictions = {}

# The three candidate regressors
lr = LinearRegression()
rf = RandomForestRegressor(n_estimators=100, random_state=42)
xgb = XGBRegressor(n_estimators=200, learning_rate=0.1, random_state=42, objective='reg:squarederror')
candidate_models = {'Linear Regression': lr, 'Random Forest': rf, 'XGBoost': xgb}

# Train and score every candidate in turn (insertion order: LR, RF, XGBoost)
for label, estimator in candidate_models.items():
    rmse, mae, fold_preds = evaluate_model(estimator, X_train, y_train, X_valid, y_valid)
    results[label] = {'RMSE': rmse, 'MAE': mae}
    predictions[label] = fold_preds

# Individual prediction arrays, used by the plots further down
preds_lr, preds_rf, preds_xgb = (
    predictions[label] for label in ('Linear Regression', 'Random Forest', 'XGBoost')
)

# Show results
print("\n=== Model Evaluation Results ===")
for model, metrics in results.items():
    print(f"{model}: RMSE={metrics['RMSE']:.2f}, MAE={metrics['MAE']:.2f}")

# Visualization (Actual vs Predicted)
def plot_predictions(y_valid, preds, title, n_points=50):
    """Plot actual against predicted values for the leading part of a validation set.

    Parameters
    ----------
    y_valid : array-like
        Actual target values (a pandas Series, NumPy array or plain sequence).
    preds : array-like
        Predicted values, in the same order as ``y_valid``.
    title : str
        Figure title.
    n_points : int or None, default 50
        Number of leading points to draw; ``None`` draws every point.
    """
    # np.asarray accepts Series, arrays and lists alike, so callers are not tied to pandas.
    actual = np.asarray(y_valid)[:n_points]
    predicted = np.asarray(preds)[:n_points]

    plt.figure(figsize=(10,5))
    plt.plot(actual, label="Actual", marker='o')
    plt.plot(predicted, label="Predicted", marker='x')
    plt.title(title)
    plt.xlabel("Time Steps (sampled)")
    plt.ylabel("Units Sold")
    plt.legend()
    plt.show()

# Plot actual vs predicted for each model
for label, fold_preds in (
    ("Linear Regression", preds_lr),
    ("Random Forest", preds_rf),
    ("XGBoost", preds_xgb),
):
    plot_predictions(y_valid, fold_preds, f"{label} - Actual vs Predicted")

print(results)

# The best model is the one with the lowest validation RMSE (first one wins on a tie).
best_model = min(results, key=lambda name: results[name]['RMSE'])
best_metrics = results[best_model]
print(f"\n✅ Best Model: {best_model} with RMSE = {best_metrics['RMSE']:.2f} and MAE = {best_metrics['MAE']:.2f}")

In [None]:
# Step 7: Final Testing on Unseen Data
from sklearn.base import clone

# Hold out the last 60 days as the test set; everything earlier is train+validation.
cutoff_test = df['Date'].max() - pd.Timedelta(days=60)

train_valid = df[df['Date'] < cutoff_test]
test = df[df['Date'] >= cutoff_test]

X_train_valid = train_valid[feature_cols]
y_train_valid = train_valid[target_col]
X_test = test[feature_cols]
y_test = test[target_col]

# Retrain the model that actually scored the lowest validation RMSE, instead of assuming
# it is Linear Regression. clone() yields an unfitted copy with the same hyper-parameters,
# so the estimators fitted during model comparison are left untouched.
best_name = min(results, key=lambda name: results[name]['RMSE'])
candidates = {'Linear Regression': lr, 'Random Forest': rf, 'XGBoost': xgb}
best_model = clone(candidates[best_name])
best_model.fit(X_train_valid, y_train_valid)

# Predictions on Test Set
preds_test = best_model.predict(X_test)

# Evaluation Metrics
rmse_test = np.sqrt(mean_squared_error(y_test, preds_test))
mae_test = mean_absolute_error(y_test, preds_test)

print(f"=== Final Test Evaluation (Unseen Data - {best_name}) ===")
print(f"RMSE: {rmse_test:.2f}, MAE: {mae_test:.2f}")

# Plot Actual vs Predicted over the whole test window
plt.figure(figsize=(10,5))
plt.plot(y_test.values, label="Actual", marker='o')
plt.plot(preds_test, label="Predicted", marker='x')
plt.title(f"{best_name} - Final Test Performance (Unseen Data)")
plt.xlabel("Time Steps (sampled)")
plt.ylabel("Units Sold")
plt.legend()
plt.show()
