# In [None]:
# Task 2: Store Sales Prediction

## Import Libraries
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Load Data
# Read the cleaned dataset from disk and echo its first rows as a quick sanity check.
data = pd.read_csv(filepath_or_buffer='../data/processed/cleaned_data.csv')
print("Data Loaded Successfully.")
print(data.head(n=5))

# Feature Engineering
# Derive calendar features from the 'Date' column.

# read_csv leaves 'Date' as plain strings unless told otherwise, and the `.dt`
# accessor only works on datetime data, so convert explicitly (a no-op when the
# column is already datetime).
data['Date'] = pd.to_datetime(data['Date'])

# (month, day) of the holidays considered: New Year's Day and Christmas Day.
_HOLIDAY_MONTH_DAYS = ((1, 1), (12, 25))


def _days_to_next_holiday(date):
    """Return the number of days from *date* to the next holiday on or after it.

    Holidays are evaluated in the date's own year and the following year, so
    the result is never negative and does not depend on a hard-coded year.
    Missing dates (NaT) yield NaN.
    """
    if pd.isna(date):
        return np.nan
    day = date.normalize()
    upcoming = [
        day.replace(year=year, month=month, day=day_of_month)
        for year in (day.year, day.year + 1)
        for month, day_of_month in _HOLIDAY_MONTH_DAYS
    ]
    # Next year's January 1st is always >= day, so the generator is never empty.
    return min((holiday - day).days for holiday in upcoming if holiday >= day)


data['DayOfWeek'] = data['Date'].dt.dayofweek
# dayofweek is 0 for Monday through 6 for Sunday, so 5 and 6 are the weekend.
data['IsWeekend'] = (data['DayOfWeek'] >= 5).astype(int)
data['IsMonthStart'] = data['Date'].dt.is_month_start.astype(int)
data['IsMonthEnd'] = data['Date'].dt.is_month_end.astype(int)
data['DaysToHoliday'] = data['Date'].apply(_days_to_next_holiday)
print("Feature Engineering Completed.")

# Train-Test Split
# 'Sales' is the target; every remaining column except the raw 'Date' is a feature.
# NOTE(review): the split is a random 80/20 shuffle, so train and test rows are
# interleaved in time -- confirm that is intended for this data.
y = data['Sales']
X = data.drop(columns=['Sales', 'Date'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Model Training (Random Forest)
# Two-stage pipeline: standardize the features, then fit a 100-tree random forest.
scaling_step = ('scaler', StandardScaler())
forest_step = ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
pipeline = Pipeline(steps=[scaling_step, forest_step])
pipeline.fit(X_train, y_train)
print("Model Training Completed.")

# Predictions and Evaluation
# Score the fitted pipeline on the held-out rows with squared and absolute error.
y_pred = pipeline.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")

# Feature Importance
# Pair each training column with the forest's importance score, rank them from
# most to least important, and draw the ranking as a horizontal bar chart.
fitted_forest = pipeline.named_steps['regressor']
feature_importances = fitted_forest.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': feature_importances}
).sort_values('Importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance_df, x='Importance', y='Feature')
plt.title('Feature Importance')
plt.show()

# Save the Model
# Create the output directory first: joblib.dump raises if the folder is missing,
# which would throw away the freshly trained pipeline.
os.makedirs('../data/models', exist_ok=True)
joblib.dump(pipeline, '../data/models/random_forest.pkl')
print("Model Saved.")

# Deep Learning (LSTM)
# Rescale the sales series to [-1, 1] as a single-feature column vector.
sales = data['Sales'].values.reshape(-1, 1)
scaler = MinMaxScaler(feature_range=(-1, 1))
sales_scaled = scaler.fit_transform(sales)

# Prepare Time Series Data
# Each sample is `window_size` consecutive scaled values; its target is the value
# that immediately follows the window.
# NOTE(review): windows follow the row order of `data` -- this assumes the rows
# form one chronologically ordered series; confirm before trusting the model.
window_size = 30
n_windows = len(sales_scaled) - window_size
if n_windows <= 0:
    # Without this guard the arrays come out empty and Keras fails later with an
    # unrelated-looking shape error.
    raise ValueError(
        f"Need more than {window_size} sales rows to build LSTM windows, "
        f"got {len(sales_scaled)}."
    )
# X_lstm has shape (n_windows, window_size, 1); y_lstm has shape (n_windows, 1).
X_lstm = np.array([sales_scaled[i:i + window_size] for i in range(n_windows)])
y_lstm = sales_scaled[window_size:]

# Build and Train LSTM
# One 50-unit LSTM layer over (window_size, 1) inputs feeding a single-output
# dense layer, trained with Adam on mean squared error.
model = tf.keras.Sequential()
model.add(tf.keras.layers.LSTM(50, activation='relu', input_shape=(window_size, 1)))
model.add(tf.keras.layers.Dense(1))
model.compile(optimizer='adam', loss='mse')
history = model.fit(X_lstm, y_lstm, epochs=10, batch_size=32)
print("LSTM Training Completed.")

# Save LSTM Model
# Create the output directory first so a missing folder does not make the save
# fail after training has already finished.
os.makedirs('../data/models', exist_ok=True)
model.save('../data/models/lstm_model.h5')
print("LSTM Model Saved.")

# Plot Training Loss
# Draw the per-epoch training loss recorded by model.fit.
loss_per_epoch = history.history['loss']
plt.plot(loss_per_epoch, label='Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('LSTM Training Loss')
plt.legend()
plt.show()
