# Rossmann Pharmaceuticals Sales Forecast Analysis

In [10]:
import sys
sys.path.append('../src')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Evict any cached copies of the project modules from the import cache so the
# import statements that follow load them afresh in this kernel session.
for module_name in ('preprocessor', 'model_builder', 'evaluator'):
    sys.modules.pop(module_name, None)

from preprocessor import Preprocessor
from model_builder import ModelBuilder
from evaluator import Evaluator

## Data Loading and Preprocessing

In [11]:
# Load data
# StateHoliday is read explicitly as text so the column gets one consistent
# dtype in both frames instead of whatever pandas infers chunk by chunk.
# NOTE(review): presumably this is the column behind the mixed-type warning
# pandas emits while reading train.csv - confirm against the raw file.
csv_dtypes = {'StateHoliday': str}
train = pd.read_csv('../resources/Data/train.csv', dtype=csv_dtypes)
test = pd.read_csv('../resources/Data/test.csv', dtype=csv_dtypes)
store = pd.read_csv('../resources/Data/store.csv')

# Merge data
# Left joins keep every train/test row and attach the per-store attributes.
# validate='many_to_one' makes the merge raise if store.csv ever contains a
# duplicated Store id, which would otherwise silently multiply rows.
train = pd.merge(train, store, on='Store', how='left', validate='many_to_one')
test = pd.merge(test, store, on='Store', how='left', validate='many_to_one')


  train = pd.read_csv('../resources/Data/train.csv')


In [12]:
# Preprocess data
preprocessor = Preprocessor()
train_processed = preprocessor.preprocess(train)
# NOTE(review): preprocess() is called a second time on the test frame; confirm
# it reuses the encoders/scalers learned from train rather than refitting them.
test_processed = preprocessor.preprocess(test)

print(train_processed.head())
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the cell output.
train_processed.info()

2024-09-21 15:34:19,902 - INFO - Initialized Preprocessor with 12 holidays
2024-09-21 15:34:19,904 - INFO - Starting preprocessing...
2024-09-21 15:34:19,906 - INFO - Initial DataFrame shape: (1017209, 18)
 columns: ['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval']
2024-09-21 15:34:19,907 - INFO - Handling missing values, encoding categorical variables, and scaling numerical variables...


2024-09-21 15:34:25,426 - INFO - Missing value handling, encoding, and scaling completed in 5.52 seconds
2024-09-21 15:34:25,426 - INFO - DataFrame shape after processing: (1017209, 28)
2024-09-21 15:34:25,998 - INFO - Final DataFrame shape: (1017209, 32)
2024-09-21 15:34:26,014 - INFO - Preprocessing completed.
2024-09-21 15:34:26,047 - INFO - Starting preprocessing...
2024-09-21 15:34:26,048 - INFO - Initial DataFrame shape: (41088, 17)
 columns: ['Id', 'Store', 'DayOfWeek', 'Date', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval']
2024-09-21 15:34:26,049 - INFO - Handling missing values, encoding categorical variables, and scaling numerical variables...
2024-09-21 15:34:26,278 - INFO - Missing value handling, encoding, and scaling completed in 0.23 seconds
2024-09-21 15:34:26,278 - INFO - DataFrame shape after pro

      Sales  Customers      Open     Promo  SchoolHoliday  \
0 -0.132683  -0.168269  0.452399  1.273237       2.144211   
1  0.075373  -0.017540  0.452399  1.273237       2.144211   
2  0.659800   0.404499  0.452399  1.273237       2.144211   
3  2.135414   1.862258  0.452399  1.273237       2.144211   
4 -0.247231  -0.159656  0.452399  1.273237       2.144211   

   CompetitionDistance  CompetitionOpenSinceMonth  CompetitionOpenSinceYear  \
0            -0.539900                   0.669941                 -0.139458   
1            -0.630746                   1.423898                 -0.341504   
2             1.129083                   1.800876                 -0.543550   
3            -0.624257                   0.669941                  0.062588   
4             3.177025                  -1.214950                  1.274865   

     Promo2  Promo2SinceWeek  ...  Assortment_b  Assortment_c  \
0 -1.001128     2.849875e-15  ...           0.0           0.0   
1  0.998873    -1.029693e+00

In [13]:
# Column-by-column summary (non-null counts and dtypes) of the preprocessed training frame.
train_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 32 columns):
 #   Column                          Non-Null Count    Dtype         
---  ------                          --------------    -----         
 0   Sales                           1017209 non-null  float64       
 1   Customers                       1017209 non-null  float64       
 2   Open                            1017209 non-null  float64       
 3   Promo                           1017209 non-null  float64       
 4   SchoolHoliday                   1017209 non-null  float64       
 5   CompetitionDistance             1017209 non-null  float64       
 6   CompetitionOpenSinceMonth       1017209 non-null  float64       
 7   CompetitionOpenSinceYear        1017209 non-null  float64       
 8   Promo2                          1017209 non-null  float64       
 9   Promo2SinceWeek                 1017209 non-null  float64       
 10  Promo2SinceYear                 1017209 no

## Model Training and Evaluation

In [None]:
# Target is Sales; Customers is dropped from the features alongside it.
y = train_processed['Sales']
X = train_processed.drop(columns=['Sales', 'Customers'])

# Hold out 20% of the rows for validation, with a fixed seed for repeatability.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build and fit the model on the training split.
model_builder = ModelBuilder()
model = model_builder.build_model(X_train, y_train)



### Predict, Evaluate, and Save the Model

In [None]:
# Predict on the validation split.
y_pred = model.predict(X_val)

# Score the predictions against the held-out targets.
evaluator = Evaluator()
metrics = evaluator.evaluate(y_val, y_pred)

# Report each metric by upper-cased name, formatted to four decimal places.
print("Model Performance Metrics:")
for metric in metrics:
    value = metrics[metric]
    print(f"{metric.upper()}: {value:.4f}")

# Save model
model_builder.save_model('./models')

## Visualization of Results

In [None]:
# Output location handed to every plotting helper below.
figure_target = './figures/model'

# Residuals of the validation predictions.
evaluator.plot_residuals(y_val, y_pred, figure_target)
plt.show()

# Actual versus predicted values on the validation split.
evaluator.plot_actual_vs_predicted(y_val, y_pred, figure_target)
plt.show()

# Feature importance, labelled with the training feature names.
feature_names = X.columns
evaluator.plot_feature_importance(model, feature_names, figure_target)
plt.show()

## Save Test Predictions

In [None]:
 # Make predictions on test set
from pathlib import Path

# The raw test columns logged by the preprocessor contain no Sales/Customers,
# so drop them only when present instead of raising KeyError.
test_features = test_processed.drop(columns=['Sales', 'Customers'], errors='ignore')

# Present the model with exactly the columns (and column order) it was trained
# on: extra test-only columns are discarded and absent ones are filled with 0.
# NOTE(review): 0 is assumed to be a neutral value for absent (presumably
# one-hot) columns - confirm against the Preprocessor.
absent_features = [col for col in X.columns if col not in test_features.columns]
if absent_features:
    print(f"Features absent from the test set, filled with 0: {absent_features}")
test_features = test_features.reindex(columns=X.columns, fill_value=0)

test_predictions = model.predict(test_features)

# Inverse transform the scaled predictions; ravel() guards against the result
# coming back as a column vector, which cannot be assigned as a single column.
test_predictions = np.ravel(preprocessor.inverse_transform_sales(test_predictions))

# Save predictions (to_csv does not create missing directories itself).
test['Predicted_Sales'] = test_predictions
predictions_path = Path('./models/predictions/test_predictions.csv')
predictions_path.parent.mkdir(parents=True, exist_ok=True)
test[['Id', 'Predicted_Sales']].to_csv(predictions_path, index=False)

## Analysis of Results

In [None]:
# Load test predictions
test_predictions = pd.read_csv('./models/predictions/test_predictions.csv')

# Analyze predictions
print(test_predictions.describe())

# Plot distribution of predicted sales
plt.figure(figsize=(10, 6))
sns.histplot(test_predictions['Predicted_Sales'], kde=True)
plt.title('Distribution of Predicted Sales')
plt.xlabel('Predicted Sales')
plt.ylabel('Count')
plt.show()

# Analyze predictions by store type
# The saved CSV holds only Id and Predicted_Sales, so Store has to be
# recovered from the test frame (via Id) before the store table can be joined;
# merging on 'Store' directly raises KeyError.
test_with_store = (
    test_predictions
    .merge(test[['Id', 'Store']], on='Id', how='left')
    .merge(store, on='Store', how='left')
)
plt.figure(figsize=(12, 6))
sns.boxplot(x='StoreType', y='Predicted_Sales', data=test_with_store)
plt.title('Predicted Sales by Store Type')
plt.show()

# Time series plot of predictions
test_with_date = pd.merge(test_predictions, test[['Id', 'Date']], on='Id', how='left')
test_with_date['Date'] = pd.to_datetime(test_with_date['Date'])
test_with_date = test_with_date.sort_values('Date')

plt.figure(figsize=(15, 6))
plt.plot(test_with_date['Date'], test_with_date['Predicted_Sales'])
plt.title('Time Series of Predicted Sales')
plt.xlabel('Date')
plt.ylabel('Predicted Sales')
plt.xticks(rotation=45)
plt.show()

## Conclusion and Insights

Based on the analysis above, we can draw the following conclusions:

1. Model Performance: Our Random Forest model achieved [insert metrics here], indicating [interpretation of performance].

2. Feature Importance: The most important features for predicting sales are [list top features]. This suggests that [interpretation of feature importance].

3. Sales Distribution: The distribution of predicted sales shows [describe distribution], which aligns with [or differs from] our expectations based on historical data.

4. Store Type Impact: There are notable differences in predicted sales across store types, with [describe differences]. This suggests that [interpretation of store type impact].

5. Temporal Trends: The time series plot of predicted sales reveals [describe any patterns or trends observed], which could be attributed to [possible explanations].

These insights can help Rossmann Pharmaceuticals in the following ways:
- [List practical applications of the insights]
- [Suggest areas for further investigation or improvement]

Future work could include:
- [Suggest potential enhancements or extensions to the current model]
- [Propose additional analyses that could provide more insights]