In [1]:
from pathlib import Path
from typing import Dict, List

from pydantic import BaseModel, ValidationError
from strictyaml import load

### Entendendo o Pydantic

In [2]:
class Formulario(BaseModel):
    """Minimal form model used to demonstrate Pydantic validation.

    Pydantic coerces compatible input to the declared field types: in this
    notebook the string ``"30"`` supplied for ``idade`` comes back as the
    float ``30.0`` from ``model_dump()``.
    """
    nome: str
    idade: float
    email: str

In [7]:
# Raw form payload: every value is a string, including the numeric "idade",
# so the model has to coerce it during validation.
dados_do_formulario = dict(
    nome="Alice",
    idade="30",
    email="alice@example.com",
)

In [8]:
# Validate the raw payload. Only the model construction sits inside the
# ``try`` so that the "validation error" label is printed exclusively for
# genuine Pydantic validation failures; any other problem surfaces with its
# own traceback instead of being mislabelled.
try:
    formulario_validado = Formulario(**dados_do_formulario)
except ValidationError as e:
    # The ValidationError message enumerates each offending field.
    print("Erro na validação:", e)
else:
    print("Dados válidos:", formulario_validado.model_dump())

Dados válidos: {'nome': 'Alice', 'idade': 30.0, 'email': 'alice@example.com'}


### Gerando modelos de dados a serem validados

In [10]:
# Resolve project locations relative to the notebook's working directory:
# the package root is the directory one level above it, and the YAML config
# lives in its "assets" sub-folder.
PACKAGE_ROOT = Path.cwd().resolve().parents[0]
ASSETS_PATH = PACKAGE_ROOT.joinpath("assets")
CONFIG_FILE_PATH = ASSETS_PATH.joinpath("config.yml")

# Show the three resolved paths, one per line.
print(PACKAGE_ROOT, ASSETS_PATH, CONFIG_FILE_PATH, sep="\n")

C:\Users\Carolina\OneDrive\Github\mlops
C:\Users\Carolina\OneDrive\Github\mlops\assets
C:\Users\Carolina\OneDrive\Github\mlops\assets\config.yml


In [11]:
class ModelConfig(BaseModel):
    """Settings for model training and feature engineering.

    Field meanings noted below are inferred from the field names only —
    TODO confirm against ``config.yml``.
    """

    # Presumably the name of the column to predict.
    target: str
    # Names of the features given to the model.
    features: List[str]
    # File name / location settings, all kept as plain strings.
    trained_model_file: str
    train_data_path: str
    result_data_path: str
    predict_data_path: str
    # Numeric threshold tied to the R² score (presumably a minimum) — confirm.
    r2_score_limit: float

In [12]:
class DataConfig(BaseModel):
    """Settings for data sanitization and the transformer classes.

    Every list holds column (or mapping) names; how each one is consumed is
    decided by code outside this notebook, so the notes below that go beyond
    the declared types are assumptions to confirm.
    """

    # Column lists for the training and prediction inputs; in the parsed
    # config the training list additionally contains 'price'.
    input_data_train: List[str]
    input_data_pred: List[str]

    # Columns grouped by kind.
    categorical_variables: List[str]
    numerical_variables: List[str]

    # Names of the ``*_encoded`` mapping fields declared further down.
    map_variables: List[str]
    # Presumably the columns that receive rare-label encoding / scaling.
    rare_encode: List[str]
    scale_vars: List[str]

    # Derived column name -> source column (e.g. 'tempo_imovel' -> 'yr_built').
    temporal_vars: Dict[str, str]

    # Integer-keyed lookup tables, one per encoded categorical column.
    zipcode_encoded: Dict[int, str]
    view_encoded: Dict[int, str]
    condition_encoded: Dict[int, str]
    grade_encoded: Dict[int, str]

In [13]:

class DataSchema(BaseModel):
    """
    Schema of a single input record.

    Declares one typed field per input column; the field names are the same
    sixteen names listed under ``input_data_pred`` in the parsed config.
    """
    bedrooms: int
    bathrooms: float
    sqft_living: int
    sqft_lot: int
    floors: float
    waterfront: int
    view: int
    condition: int
    grade: int
    sqft_above: int
    sqft_basement: int
    yr_built: int
    yr_renovated: int
    zipcode: int
    sqft_living15: int
    sqft_lot15: int

In [14]:

class Config(BaseModel):
    """Master config object bundling the data and model settings."""

    data_config: DataConfig
    ml_config: ModelConfig

In [15]:

class MultipleDataSchema(BaseModel):
    """Batch wrapper holding a list of ``DataSchema`` records under ``inputs``."""
    inputs: List[DataSchema]

In [16]:
# Read the YAML config file. Only a missing file is reported with the
# "Did not find config file" message (with the original error chained);
# other I/O errors and YAML syntax errors propagate with their own details
# instead of being mislabelled as a missing file.
try:
    # YAML is read as UTF-8 explicitly rather than via the platform default.
    with open(CONFIG_FILE_PATH, "r", encoding="utf-8") as conf_file:
        config_text = conf_file.read()
except FileNotFoundError as exc:
    raise OSError(f"Did not find config file at path: {CONFIG_FILE_PATH}") from exc

# Parse outside the ``try`` so parsing failures are never masked.
parsed_config = load(config_text)

In [18]:
# Inspect the parsed YAML as plain Python containers (dict / list).
parsed_config.data

{'input_data_pred': ['bedrooms',
  'bathrooms',
  'sqft_living',
  'sqft_lot',
  'floors',
  'waterfront',
  'view',
  'condition',
  'grade',
  'sqft_above',
  'sqft_basement',
  'yr_built',
  'yr_renovated',
  'zipcode',
  'sqft_living15',
  'sqft_lot15'],
 'input_data_train': ['bedrooms',
  'bathrooms',
  'sqft_living',
  'sqft_lot',
  'floors',
  'waterfront',
  'view',
  'condition',
  'grade',
  'sqft_above',
  'sqft_basement',
  'yr_built',
  'yr_renovated',
  'zipcode',
  'sqft_living15',
  'sqft_lot15',
  'price'],
 'categorical_variables': ['zipcode',
  'waterfront',
  'view',
  'condition',
  'grade'],
 'numerical_variables': ['bedrooms',
  'bathrooms',
  'sqft_living',
  'sqft_lot',
  'floors',
  'sqft_above',
  'sqft_basement',
  'sqft_living15'],
 'map_variables': ['zipcode_encoded',
  'view_encoded',
  'condition_encoded',
  'grade_encoded'],
 'rare_encode': ['zipcode', 'view', 'condition', 'grade'],
 'temporal_vars': {'tempo_imovel': 'yr_built',
  'tempo_renovacao': 'yr

In [19]:
def create_and_validate_config(cfg_path = CONFIG_FILE_PATH) -> Config:
    """Load the YAML config file and validate it into a ``Config`` object.

    Args:
        cfg_path: Location of the YAML file to read. Defaults to the
            module-level ``CONFIG_FILE_PATH`` (bound when this function is
            defined).

    Returns:
        A ``Config`` whose ``data_config`` and ``ml_config`` sections are both
        built from the same parsed mapping; each model reads the keys it
        declares.

    Raises:
        OSError: If no file exists at ``cfg_path``. Other I/O errors and YAML
            parsing errors propagate unchanged.
        pydantic.ValidationError: If the parsed values do not satisfy the
            ``DataConfig`` / ``ModelConfig`` field declarations.
    """
    try:
        # Honour the caller-supplied path and read YAML as UTF-8 explicitly.
        with open(cfg_path, "r", encoding="utf-8") as conf_file:
            config_text = conf_file.read()
    except FileNotFoundError as exc:
        # Keep the original exception type/message, but chain the cause.
        raise OSError(f"Did not find config file at path: {cfg_path}") from exc

    # Parsing and validation happen outside the ``try`` so their errors are
    # not mislabelled as a missing file.
    settings = load(config_text).data

    return Config(
        data_config=DataConfig(**settings),
        ml_config=ModelConfig(**settings),
    )

In [20]:
# Build the validated config from the default CONFIG_FILE_PATH.
config = create_and_validate_config()

In [21]:
# Display the validated Config (repr of both the data and model sections).
config

Config(data_config=DataConfig(input_data_train=['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'sqft_living15', 'sqft_lot15', 'price'], input_data_pred=['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'sqft_living15', 'sqft_lot15'], categorical_variables=['zipcode', 'waterfront', 'view', 'condition', 'grade'], numerical_variables=['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'sqft_above', 'sqft_basement', 'sqft_living15'], map_variables=['zipcode_encoded', 'view_encoded', 'condition_encoded', 'grade_encoded'], rare_encode=['zipcode', 'view', 'condition', 'grade'], scale_vars=['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'sqft_living15', 'sqft_lot15'], temporal_vars={'tempo_imovel': 'yr_built', 'tempo_ren

In [24]:
# Feature names stored in the model section of the config.
config.ml_config.features

['sqft_living',
 'sqft_lot',
 'sqft_above',
 'sqft_basement',
 'sqft_living15',
 'sqft_lot15',
 'bedrooms',
 'bathrooms',
 'floors',
 'view_encoded',
 'condition_encoded',
 'grade_encoded',
 'tempo_imovel',
 'tempo_renovacao',
 'zipcode_encoded']

In [25]:
# Column names the data section lists as categorical.
config.data_config.categorical_variables

['zipcode', 'waterfront', 'view', 'condition', 'grade']