In [35]:
# MACHINE LEARNING MODELS WITH SKLEARN PIPELINES
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import pickle
import joblib
from datetime import datetime
import warnings
# NOTE(review): called with only the 'ignore' action and no category/module
# filter, so this silences every warning for the rest of the session --
# including deprecation and dtype warnings from pandas/sklearn. Consider
# narrowing it to the specific warnings that are known to be noise.
warnings.filterwarnings('ignore')

# ML Libraries
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
import xgboost as xgb


In [41]:
# Set up logging
# The format is given explicitly so log lines render as
# "<timestamp> - <LEVEL> - <message>" on a fresh kernel, instead of relying on
# a format configured by some earlier (possibly deleted) cell.
# force=True (Python 3.8+) replaces any handlers already attached to the root
# logger; without it basicConfig() silently does nothing when the root logger
# is already configured, so re-running this cell would have no effect.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

# Notebook-wide logger used by the cells below.
logger = logging.getLogger(__name__)

In [43]:
# Load the data
# Reads the preprocessed training set and the cleaned test set into
# `train_df` / `test_df`. Both paths are relative to the notebook's working
# directory.
logger.info("Loading Rossmann sales preprocessed data...")

try:
    train_df = pd.read_csv('processed-data-set/train_preprocessed.csv')
    # low_memory=False makes pandas process the file in one piece rather than
    # in internal chunks (see the read_csv docs on mixed-type inference).
    test_df = pd.read_csv('processed-data-set/test_cleaned.csv',low_memory=False)
except FileNotFoundError:
    logger.error("Data files not found.")
    # Re-raise: every later cell needs train_df/test_df, so continuing would
    # only surface as a confusing NameError further down the notebook.
    raise
else:
    # Only reached when both files were read successfully.
    logger.info(f"Data loaded successfully. Train shape: {train_df.shape}, Test shape: {test_df.shape}")

2025-06-16 15:42:33,226 - INFO - Loading Rossmann sales preprocessed data...
2025-06-16 15:42:42,779 - INFO - Data loaded successfully. Train shape: (1017209, 44), Test shape: (41088, 17)


In [45]:
# TASK 2.2: BUILD MODELS WITH SKLEARN PIPELINES

In [47]:
# Build the feature matrix and target from the preprocessed training frame
# (expects `train_df` to have been loaded by an earlier cell).
logger.info("Starting Task 2.2: Building ML Pipeline")

# Target is the 'Sales' column; every remaining column is kept as a feature.
y = train_df['Sales']
X = train_df.drop(columns=['Sales'])

# Integer-encode every object-dtype feature column except 'Date'.
# Values are cast to str before fitting, and the fitted encoder for each
# column is kept in `label_encoders`, keyed by column name.
categorical_cols = X.select_dtypes(include=['object']).columns
label_encoders = {}

for col in categorical_cols:
    if col == 'Date':
        # 'Date' is deliberately left un-encoded here.
        continue
    encoder = LabelEncoder()
    X[col] = encoder.fit_transform(X[col].astype(str))
    label_encoders[col] = encoder

2025-06-16 15:44:10,203 - INFO - Starting Task 2.2: Building ML Pipeline
