In [4]:
import warnings
# Silence two families of known-noisy warnings for the rest of the session.
# `message` is a regular expression matched against the *start* of the warning
# text, so these rules hide FutureWarnings beginning with "Passing" and
# UserWarnings beginning with "Implicit"; all other warnings are still shown.
# NOTE(review): presumably these are emitted by the libraries imported in the
# next cell -- confirm which ones, and drop the filters once they are fixed upstream.
warnings.filterwarnings("ignore", message=r"Passing", category=FutureWarning)
warnings.filterwarnings("ignore", message=r"Implicit", category=UserWarning)

In [5]:
import carla.models.catalog.load_model as loading_utils
import carla.models.catalog.train_model as training_utils
import numpy as np
import pandas as pd

from carla import log
from carla import MLModelCatalog
from carla.data.catalog import DataCatalog
from sklearn.model_selection import train_test_split
from typing import List

In [9]:
class DynamicCsvCatalog(DataCatalog):
    """
    DataCatalog backed by a .csv file with a caller-controlled train/test split.

    Similar to the built-in CsvCatalog, but exposes `test_size` so that the
    experiments can control how much of the data is held out.

    Args:
        file_path (str):
            Path to the .csv file containing the dataset.
        categorical (List[str]):
            Names of the columns describing categorical features.
        continuous (List[str]):
            Names of the columns describing continuous (i.e. numerical) features.
        immutables (List[str]):
            Names of the columns describing immutable features, not supported by all generators.
        target (str):
            Name of the column that contains the target variable.
        test_size (float):
            Proportion of the dataset withheld as an independent test set;
            forwarded to `train_test_split`.
        scaling_method (str):
            Forwarded unchanged to the DataCatalog constructor.
        encoding_method (str):
            Forwarded unchanged to the DataCatalog constructor.
    """

    def __init__(
        self,
        file_path: str,
        categorical: List[str],
        continuous: List[str],
        immutables: List[str],
        target: str,
        test_size: float,
        scaling_method: str = "MinMax",
        encoding_method: str = "OneHot_drop_binary",
    ):
        # Column metadata is stored before the parent constructor runs.
        # NOTE(review): presumably DataCatalog.__init__ reads it through the
        # properties below -- keep this ordering unless that is verified otherwise.
        self._target = target
        self._immutables = immutables
        self._continuous = continuous
        self._categorical = categorical

        # Read the full dataset and carve out the held-out portion.
        full_frame = pd.read_csv(file_path)
        train_frame, test_frame = train_test_split(full_frame, test_size=test_size)

        super().__init__(
            "custom",
            full_frame,
            train_frame,
            test_frame,
            scaling_method,
            encoding_method,
        )

    @property
    def categorical(self) -> List[str]:
        """Names of the categorical feature columns, as given to the constructor."""
        return self._categorical

    @property
    def continuous(self) -> List[str]:
        """Names of the continuous feature columns, as given to the constructor."""
        return self._continuous

    @property
    def immutables(self) -> List[str]:
        """Names of the immutable feature columns, as given to the constructor."""
        return self._immutables

    @property
    def target(self) -> str:
        """Name of the target column, as given to the constructor."""
        return self._target

In [8]:
class DynamicMLModelCatalog(MLModelCatalog):
    """
    Wrapper class for the MLModelCatalog that introduces additional functions
    allowing for the efficient and unbiased measurement of the dynamics of recourse.

    Args:
        data (DataCatalog):
            Dataset which will be used to train a model and conduct experiments.
        model_type (str):
            Black-box model used for classification; `retrain` supports only
            "ann" and "linear" (logistic regression).
        backend (str):
            Framework used on the backend; `retrain` supports only "pytorch".
        cache (bool):
            If True, the framework will attempt to load a model that was previously cached.
        models_home (str):
            Path to the directory where models should be saved after they are trained.
        load_online (bool):
            If True, a pretrained model will be loaded.
        kwargs (dict):
            Optional keyword arguments forwarded to MLModelCatalog.
    """

    def __init__(self, data: DataCatalog, model_type: str, backend: str = "pytorch",
                 cache: bool = True, models_home: str = None, load_online: bool = True,
                 **kwargs) -> None:
        super().__init__(data, model_type, backend, cache,
                         models_home, load_online, **kwargs)

    def retrain(self, learning_rate=0.01, epochs=5, batch_size=1, hidden_size=(18, 9, 3)):
        """
        Loads a cached model and retrains it on the current (possibly updated) dataset.

        The retrained model replaces `self._model` and is saved again under the
        same name it was loaded from.

        Args:
            learning_rate (float):
                Learning rate forwarded to the PyTorch training routine.
            epochs (int):
                Number of training epochs forwarded to the PyTorch training routine.
            batch_size (int):
                Number of samples used at once in a gradient descent step, if '1' the procedure is stochastic.
            hidden_size (Sequence[int]):
                Layer sizes of the cached ANN; only used to rebuild the name under
                which an "ann" model was saved, so it must match the value used when
                the model was first trained.
                NOTE(review): the default is meant to mirror the default of
                CARLA's `MLModelCatalog.train` -- confirm against the installed version.

        Raises:
            ValueError: If the backend is not PyTorch, or if no saved model is found.
            NotImplementedError: If the model type is neither "linear" nor "ann".
        """

        # TODO: Apply similar strategy for other backends available in CARLA
        if self.backend != 'pytorch':
            raise ValueError("Only PyTorch models are currently supported")

        # Rebuild the name under which the model was cached
        if self.model_type == "linear":
            save_name = f"{self.model_type}"
        elif self.model_type == "ann":
            layer_string = "_".join(str(size) for size in hidden_size)
            save_name = f"{self.model_type}_layers_{layer_string}"
        else:
            raise NotImplementedError(f"Model type not supported: {self.model_type}")

        # Attempt to load the saved model; keep it in a local first so that a
        # failed lookup does not overwrite a model that is already in memory
        cached_model = loading_utils.load_trained_model(save_name=save_name,
                                                        data_name=self.data.name,
                                                        backend=self.backend)

        # This method should only be used when a model is already available
        if cached_model is None:
            raise ValueError(f"No trained model found for {save_name}")
        self._model = cached_model

        # Sanity check to see if loaded model accuracy makes sense
        self._test_accuracy()

        # Get preprocessed data; all dataframes may have possibly changed
        df_train = self.data.df_train
        df_test = self.data.df_test
        target = self.data.target

        x_train = df_train.drop(columns=[target])
        y_train = df_train[target]
        x_test = df_test.drop(columns=[target])
        y_test = df_test[target]

        # Order data (column-wise) before training
        x_train = self.get_ordered_features(x_train)
        x_test = self.get_ordered_features(x_test)

        log.info(f"Current balance: train set {y_train.mean()}, test set {y_test.mean()}")

        # Access the data in a format expected by PyTorch
        train_dataset = training_utils.DataFrameDataset(x_train, y_train)
        train_loader = training_utils.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        test_dataset = training_utils.DataFrameDataset(x_test, y_test)
        test_loader = training_utils.DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

        # Retrain the model
        training_utils._training_torch(self._model, train_loader, test_loader,
                                       learning_rate, epochs)

        # Persist the retrained model under the same name it was loaded from
        loading_utils.save_model(model=self._model, save_name=save_name,
                                 data_name=self.data.name, backend=self.backend)