# In [None]:
from abc import ABC, abstractmethod
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb

class ForecastModel(ABC):
    """Abstract base class describing the interface of a forecasting model.

    Concrete strategies must implement both ``train`` and ``predict``; the
    class cannot be instantiated until they do.
    """

    @abstractmethod
    def train(self, X_train, y_train):
        """Fit the model on the features ``X_train`` and targets ``y_train``."""

    @abstractmethod
    def predict(self, X):
        """Return the model's predictions for the features ``X``."""

class RandomForestModel(ForecastModel):
    """Concrete strategy wrapping a ``RandomForestRegressor``."""

    def __init__(self, params=None):
        """Build the regressor, forwarding ``params`` as keyword arguments.

        A missing or empty ``params`` yields a regressor with default settings.
        """
        hyperparams = params if params else {}
        self.model = RandomForestRegressor(**hyperparams)

    def train(self, X_train, y_train):
        """Fit the wrapped regressor on ``X_train`` / ``y_train``."""
        regressor = self.model
        regressor.fit(X_train, y_train)

    def predict(self, X):
        """Return the wrapped regressor's predictions for ``X``."""
        predictions = self.model.predict(X)
        return predictions

class XGBoostModel(ForecastModel):
    """Concrete strategy wrapping an ``xgb.XGBRegressor``."""

    def __init__(self, params=None):
        """Build the regressor, forwarding ``params`` as keyword arguments.

        A missing or empty ``params`` yields a regressor with default settings.
        """
        hyperparams = params if params else {}
        self.model = xgb.XGBRegressor(**hyperparams)

    def train(self, X_train, y_train):
        """Fit the wrapped regressor on ``X_train`` / ``y_train``."""
        regressor = self.model
        regressor.fit(X_train, y_train)

    def predict(self, X):
        """Return the wrapped regressor's predictions for ``X``."""
        predictions = self.model.predict(X)
        return predictions
    
class LGBMModel(ForecastModel):
    """Concrete strategy wrapping an ``lgb.LGBMRegressor``."""

    def __init__(self, params=None):
        """Build the regressor, forwarding ``params`` as keyword arguments.

        A missing or empty ``params`` yields a regressor with default settings.
        """
        hyperparams = params if params else {}
        self.model = lgb.LGBMRegressor(**hyperparams)

    def train(self, X_train, y_train):
        """Fit the wrapped regressor on ``X_train`` / ``y_train``."""
        regressor = self.model
        regressor.fit(X_train, y_train)

    def predict(self, X):
        """Return the wrapped regressor's predictions for ``X``."""
        predictions = self.model.predict(X)
        return predictions
    
    
class SalesForecastingFacade:
    """Facade offering a simplified interface to the sales-forecasting workflow.

    Expected call order: ``load_data`` -> ``preprocess_data`` ->
    ``train_model`` -> ``evaluate_model`` / ``make_submission``.

    Args:
        data_dir: Directory containing ``train.csv`` and ``test.csv``.
        validation_size: Fraction (``0 <= f < 1``) of the labelled rows, taken
            from the chronological end, that is kept out of fitting and used
            by ``evaluate_model``. Pass ``0`` to fit on every labelled row
            (``evaluate_model`` is then unavailable).

    Raises:
        ValueError: If ``validation_size`` is outside ``[0, 1)``.
    """

    def __init__(self, data_dir, validation_size=0.2):
        if not 0 <= validation_size < 1:
            raise ValueError(
                f"validation_size must be in [0, 1), got {validation_size!r}"
            )
        self.data_dir = data_dir
        self.validation_size = validation_size
        # None until load_data() runs; afterwards a dict of DataFrames/Series.
        self.data = None

    def load_data(self):
        """Read ``train.csv`` and ``test.csv`` from ``data_dir``.

        The ``date`` column of both files is converted to datetime and the two
        frames are stored in ``self.data`` under ``"train"`` and ``"test"``.
        """
        train = pd.read_csv(f"{self.data_dir}/train.csv")
        train['date'] = pd.to_datetime(train['date'])

        test = pd.read_csv(f"{self.data_dir}/test.csv")
        test['date'] = pd.to_datetime(test['date'])

        self.data = {"train": train, "test": test}

    def preprocess_data(self):
        """Engineer features and split the data into fit / hold-out / test sets.

        Replaces ``self.data`` with a dict holding:

        * ``"train"`` / ``"test"``: processed labelled / unlabelled rows.
        * ``"X_train"`` / ``"y_train"``: features and targets used for fitting.
        * ``"X_val"`` / ``"y_val"``: chronological hold-out of the labelled
          rows (empty when ``validation_size`` is 0).
        * ``"X_test"``: features of the rows to forecast.
        * ``"test_data"``: ``"test"`` without the ``date`` column.

        Raises:
            RuntimeError: If ``load_data`` has not been called yet.
        """
        if self.data is None:
            raise RuntimeError(
                "load_data() must be called before preprocess_data()"
            )

        train = self.data["train"]
        test = self.data["test"]

        # Process both sets together so they end up with identical encodings
        # and identical dummy columns.
        df = pd.concat([train, test], sort=False, ignore_index=True)

        # Calendar features derived from the date.
        df['year'] = df['date'].dt.year
        df['month'] = df['date'].dt.month
        df['day'] = df['date'].dt.day
        df['dayofweek'] = df['date'].dt.dayofweek

        # Turn text categories into integer codes (pd.factorize numbers the
        # categories by order of first appearance; missing values become -1).
        categorical_cols = ['family']
        for col in categorical_cols:
            if col in df.columns:
                df[col] = pd.factorize(df[col])[0]

        # One-hot encode only the candidate columns that are actually present.
        columns_to_encode = ['store', 'item', 'dayofweek', 'month']
        existing_columns = [col for col in columns_to_encode if col in df.columns]
        df = pd.get_dummies(df, columns=existing_columns)

        # Rows with a known target are training rows; the rest must be
        # forecast. The stable sort keeps file order within a date and makes
        # the tail of the labelled rows the most recent observations.
        has_target = df['sales'].notnull()
        train = df.loc[has_target].sort_values('date', kind='stable')
        test = df.loc[~has_target]

        non_feature_cols = ['date', 'sales', 'id']
        X_labelled = train.drop(non_feature_cols, axis=1)
        y_labelled = train['sales']
        X_test = test.drop(non_feature_cols, axis=1)

        # Hold out the chronological tail for evaluation: the unlabelled test
        # rows have no target, so they can never be used to score a model.
        n_val = int(len(X_labelled) * self.validation_size)
        split_at = len(X_labelled) - n_val
        X_train, X_val = X_labelled.iloc[:split_at], X_labelled.iloc[split_at:]
        y_train, y_val = y_labelled.iloc[:split_at], y_labelled.iloc[split_at:]

        test_data = test.drop('date', axis=1)

        self.data = {
            "train": train,
            "test": test,
            "X_train": X_train,
            "y_train": y_train,
            "X_val": X_val,
            "y_val": y_val,
            "X_test": X_test,
            "test_data": test_data,
        }

    def train_model(self, model):
        """Fit ``model`` on the ``X_train`` / ``y_train`` partition."""
        model.train(self.data['X_train'], self.data['y_train'])

    def evaluate_model(self, model):
        """Print and return the RMSE of ``model`` on the hold-out partition.

        Returns:
            The root-mean-squared error as a float.

        Raises:
            ValueError: If no hold-out rows are available (for example
                ``validation_size`` was 0).
        """
        X_val = self.data.get('X_val')
        y_val = self.data.get('y_val')
        if X_val is None or y_val is None or len(X_val) == 0:
            raise ValueError(
                "no hold-out data available; use validation_size > 0 and "
                "call preprocess_data() first"
            )

        predictions = model.predict(X_val)
        # Take the square root explicitly: the ``squared=False`` keyword is
        # not accepted by every scikit-learn release.
        rmse = float(np.sqrt(mean_squared_error(y_val, predictions)))
        print(f"{model.__class__.__name__} RMSE: {rmse:.3f}")
        return rmse

    def make_submission(self, model):
        """Write ``<ModelClass>_submission.csv`` with ``id`` and ``sales``.

        The file is created in the current working directory. The stored
        ``test_data`` frame is left untouched so repeated calls with different
        models do not interfere with each other.
        """
        predictions = model.predict(self.data['X_test'])
        submission = pd.DataFrame({
            'id': self.data['test_data']['id'].to_numpy(),
            'sales': predictions,
        })
        submission.to_csv(f"{model.__class__.__name__}_submission.csv", index=False)


if __name__ == "__main__":
    # Assumes the CSV files live in a Google Drive folder named
    # 'store_sales_data'; download them from
    # https://www.kaggle.com/competitions/store-sales-time-series-forecasting/data
    data_dir = "/content/drive/MyDrive/store_sales_data"

    print("Pipeline de Previsão de Vendas de Lojas")

    facade = SalesForecastingFacade(data_dir=data_dir)

    # Run the two preparation stages in order, announcing each one first.
    for banner, stage in (
        ("Carregando dados...", facade.load_data),
        ("Pré-processando dados...", facade.preprocess_data),
    ):
        print(banner)
        stage()

    # Every strategy goes through the same train -> evaluate -> submit cycle.
    candidate_models = (RandomForestModel, XGBoostModel, LGBMModel)
    for model_cls in candidate_models:
        print(f"\nTreinando e avaliando {model_cls.__name__}...")
        model = model_cls()
        facade.train_model(model)
        facade.evaluate_model(model)
        print(f"Gerando submissão para {model_cls.__name__}...")
        facade.make_submission(model)

    print("Pipeline de previsão concluído.")
