# Rossmann Pharmaceuticals Sales Forecast Analysis

In [None]:
import sys
sys.path.append('../src')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from preprocessor import Preprocessor
from model_builder import ModelBuilder
from evaluator import Evaluator

## Data Loading and Preprocessing

In [None]:
# Load data: the training set, the set to predict on, and the store table
# that is joined onto both of them below.
train = pd.read_csv('../resources/Data/train.csv')
test = pd.read_csv('../resources/Data/test.csv')
store = pd.read_csv('../resources/Data/store.csv')

# Merge data: left joins on 'Store' keep every train/test row, even when the
# store table has no matching entry (those rows get NaN store attributes).
train = pd.merge(train, store, on='Store', how='left')
test = pd.merge(test, store, on='Store', how='left')

# Preprocess data: the same Preprocessor instance handles both frames.
# NOTE(review): whether preprocess() modifies its argument in place is not
# visible from this notebook -- confirm before reusing `train`/`test` later.
preprocessor = Preprocessor()
train_processed = preprocessor.preprocess(train)
test_processed = preprocessor.preprocess(test)

print(train_processed.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
train_processed.info()

## Model Training and Evaluation

In [None]:
# Separate the features from the target. 'Customers' is removed together with
# 'Sales' -- presumably because it is unknown for the rows to be predicted;
# TODO confirm.
non_feature_columns = ['Sales', 'Customers']
X = train_processed.drop(columns=non_feature_columns)
y = train_processed['Sales']

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the model on the training portion via the project ModelBuilder.
model_builder = ModelBuilder()
model = model_builder.build_model(X_train, y_train)

# Score the held-out rows with the project Evaluator.
y_pred = model.predict(X_val)
evaluator = Evaluator()
metrics = evaluator.evaluate(y_val, y_pred)

# Report each metric as an upper-cased name with four decimal places.
print("Model Performance Metrics:")
for name, score in metrics.items():
    print(f"{name.upper()}: {score:.4f}")

## Visualization of Results

In [None]:
# Every diagnostic plot is handed the same output directory.
figures_dir = './figures'

# Residuals of the validation predictions
evaluator.plot_residuals(y_val, y_pred, figures_dir)
plt.show()

# Actual versus predicted sales on the validation rows
evaluator.plot_actual_vs_predicted(y_val, y_pred, figures_dir)
plt.show()

# Feature importance, labelled with the feature matrix's column names
feature_names = X.columns
evaluator.plot_feature_importance(model, feature_names, figures_dir)
plt.show()

## Analysis of Results

In [None]:
# Read the previously saved predictions for the test set.
test_predictions = pd.read_csv('../predictions/test_predictions.csv')

# Summary statistics of every numeric column in the predictions file
prediction_summary = test_predictions.describe()
print(prediction_summary)

# Histogram (with a KDE overlay) of the predicted sales values
predicted_sales = test_predictions['Predicted_Sales']
plt.figure(figsize=(10, 6))
sns.histplot(predicted_sales, kde=True)
plt.xlabel('Predicted Sales')
plt.ylabel('Count')
plt.title('Distribution of Predicted Sales')
plt.show()

# Attach the store attributes to each prediction (left join on 'Store') so
# the predictions can be grouped by 'StoreType'.
test_with_store = test_predictions.merge(store, on='Store', how='left')

# One box per store type showing the spread of predicted sales
plt.figure(figsize=(12, 6))
sns.boxplot(data=test_with_store, x='StoreType', y='Predicted_Sales')
plt.title('Predicted Sales by Store Type')
plt.show()

# Time series plot of predictions
# Recover each prediction's date by joining on the row identifier 'Id'.
test_with_date = pd.merge(test_predictions, test[['Id', 'Date']], on='Id', how='left')
test_with_date['Date'] = pd.to_datetime(test_with_date['Date'])
test_with_date = test_with_date.sort_values('Date')

# The predictions are per row ('Id') and carry a 'Store' column, so a single
# date can appear many times. Drawing a line through the unaggregated rows
# would zigzag between stores instead of showing a trend, so the predictions
# are summed per date first (groupby returns the dates in ascending order).
daily_predicted_sales = test_with_date.groupby('Date')['Predicted_Sales'].sum()

plt.figure(figsize=(15, 6))
plt.plot(daily_predicted_sales.index, daily_predicted_sales.values)
plt.title('Time Series of Predicted Sales')
plt.xlabel('Date')
plt.ylabel('Predicted Sales')
plt.xticks(rotation=45)
plt.show()

## Conclusion and Insights

Based on the analysis above, we can draw the following conclusions:

1. Model Performance: Our Random Forest model achieved [insert metrics here], indicating [interpretation of performance].

2. Feature Importance: The most important features for predicting sales are [list top features]. This suggests that [interpretation of feature importance].

3. Sales Distribution: The distribution of predicted sales shows [describe distribution], which aligns with [or differs from] our expectations based on historical data.

4. Store Type Impact: There are notable differences in predicted sales across store types, with [describe differences]. This suggests that [interpretation of store type impact].

5. Temporal Trends: The time series plot of predicted sales reveals [describe any patterns or trends observed], which could be attributed to [possible explanations].

These insights can help Rossmann Pharmaceuticals in the following ways:
- [List practical applications of the insights]
- [Suggest areas for further investigation or improvement]

Future work could include:
- [Suggest potential enhancements or extensions to the current model]
- [Propose additional analyses that could provide more insights]