![](image.png)

### Deployment de Modelo de Regressão com PyTorch Usando MLflow

In [0]:
# Choose the catalog and schema to work in (replace with your own), then create
# both if they do not exist yet.
CATALOG_NAME = "baraldi_catalog_new"
SCHEMA_NAME = "regression"

# NOTE(review): the next cell calls %restart_python; on Databricks that presumably resets
# the Python state, so these two names may not survive into later cells — confirm.
# `spark` is not defined in this notebook; presumably the session provided by the runtime.
spark.sql(f"CREATE CATALOG IF NOT EXISTS {CATALOG_NAME}")
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG_NAME}.{SCHEMA_NAME}")

DataFrame[]

In [0]:
# Install the required libraries (-U upgrades to the newest release; versions are not pinned),
# then restart the Python process so the upgraded packages can be imported.
%pip install -Uqqq mlflow pytorch-lightning optuna skorch uv optuna-integration[pytorch_lightning]
%restart_python

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
# Importe as bibliotecas
from typing import Tuple, Optional, Dict, List, Any

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

import mlflow
from mlflow.models import infer_signature
from mlflow.tracking import MlflowClient
from mlflow.entities import Metric, Param  

import optuna
from optuna.integration import PyTorchLightningPruningCallback

import time

In [0]:
# Configure the MLflow model registry URI ("databricks-uc").
mlflow.set_registry_uri("databricks-uc")

In [0]:
# Build the synthetic data
def create_regression_data(
    n_samples: int,
    n_features: int,
    seed: int = 1994,
    noise_level: float = 0.3,
    nonlinear: bool = True
) -> Tuple[pd.DataFrame, pd.Series]:
    """Build a reproducible synthetic regression dataset.

    Base features are drawn uniformly from [-5, 5]. The target is a linear
    combination of the first ``n_features // 2`` features, optionally enriched
    with nonlinear terms, plus Gaussian noise scaled by the target's standard
    deviation. Extra columns that do not feed the target are appended to the
    frame afterwards.

    Args:
        n_samples (int): Number of rows to generate.
        n_features (int): Number of base ``feature_<i>`` columns.
        seed (int, optional): Seed for ``np.random.RandomState``. Defaults to 1994.
        noise_level (float, optional): Noise standard deviation expressed as a
            fraction of the noiseless target's standard deviation. Defaults to 0.3.
        nonlinear (bool, optional): Whether to add the nonlinear terms. Defaults to True.

    Returns:
        Tuple[pd.DataFrame, pd.Series]:
            - pd.DataFrame: the base features plus ``feature_correlated``
              (only when ``n_features >= 7``), ``feature_cyclical`` and
              ``feature_with_outliers``.
            - pd.Series: the target, named ``target``.

    Example:
        >>> df, target = create_regression_data(n_samples=1000, n_features=10)
    """
    rng = np.random.RandomState(seed)

    # Base feature matrix and its labelled DataFrame view
    base = rng.uniform(-5, 5, size=(n_samples, n_features))
    frame = pd.DataFrame(base, columns=[f"feature_{idx}" for idx in range(n_features)])

    # Linear signal: only the first half of the features is informative
    n_informative = n_features // 2
    coefs = rng.uniform(-2, 2, size=n_informative)
    target = np.dot(base[:, :n_informative], coefs)

    if nonlinear:
        # Squared term on the first feature (always applied)
        target += 0.5 * base[:, 0]**2

        # Remaining terms are applied only when enough features exist,
        # in this exact order: interaction, sine, exponential, threshold.
        gated_terms = (
            (3, lambda m: 1.5 * m[:, 1] * m[:, 2]),
            (4, lambda m: 2 * np.sin(m[:, 3])),
            (5, lambda m: 0.1 * np.exp(m[:, 4] / 2)),
            (6, lambda m: 3 * (m[:, 5] > 1.5).astype(float)),
        )
        for min_features, term in gated_terms:
            if n_features >= min_features:
                target += term(base)

    # Gaussian noise proportional to the spread of the noiseless target
    target += rng.normal(0, noise_level * target.std(), size=n_samples)

    # Extra columns (none of them contributes to the target)

    # Noisy copy of feature_0
    if n_features >= 7:
        frame['feature_correlated'] = frame['feature_0'] * 0.8 + rng.normal(0, 0.2, size=n_samples)

    # Two full sine periods across the row order
    frame['feature_cyclical'] = np.sin(np.linspace(0, 4*np.pi, n_samples))

    # Standard-normal column where 1% of the rows are replaced by values in [10, 15)
    frame['feature_with_outliers'] = rng.normal(0, 1, size=n_samples)
    outlier_rows = rng.choice(n_samples, size=n_samples//100, replace=False)
    frame.loc[outlier_rows, 'feature_with_outliers'] = rng.uniform(10, 15, size=len(outlier_rows))

    return frame, pd.Series(target, name='target')

In [0]:
# Exploratory data analysis helpers
def plot_feature_distributions(X: pd.DataFrame, y: pd.Series, n_cols: int = 3) -> plt.Figure:
    """
    Draw one histogram (with KDE overlay) per feature on a grid of subplots.

    Args:
        X (pd.DataFrame): DataFrame containing features.
        y (pd.Series): Target variable; accepted but not used by this plot.
        n_cols (int): Number of columns in the grid layout.

    Returns:
        plt.Figure: The figure holding the histograms; it is closed before
        being returned.
    """
    feature_names = X.columns
    total = len(feature_names)
    n_rows = (total + n_cols - 1) // n_cols  # ceiling division

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows))
    # plt.subplots returns a bare Axes for a 1x1 grid, so wrap it in a list
    if n_rows * n_cols > 1:
        panels = axes.flatten()
    else:
        panels = [axes]

    for panel, name in zip(panels, feature_names):
        sns.histplot(X[name], ax=panel, kde=True, color='skyblue')
        panel.set_title(f'Distribution of {name}')

    # Hide the grid cells that have no feature to show
    for spare in panels[total:]:
        spare.set_visible(False)

    plt.tight_layout()
    fig.suptitle('Feature Distributions', y=1.02, fontsize=16)
    plt.close(fig)
    return fig

def plot_correlation_heatmap(X: pd.DataFrame, y: pd.Series) -> plt.Figure:
    """
    Draw an annotated heatmap of the correlations between all features and the target.

    Args:
        X (pd.DataFrame): DataFrame containing features.
        y (pd.Series): Target variable, added to the matrix as column 'target'.

    Returns:
        plt.Figure: The figure holding the heatmap; it is closed before being
        returned.
    """
    # Work on a copy so the caller's frame does not gain a 'target' column
    combined = X.copy()
    combined['target'] = y
    correlations = combined.corr()

    fig, ax = plt.subplots(figsize=(12, 10))

    palette = sns.diverging_palette(220, 10, as_cmap=True)
    sns.heatmap(
        correlations,
        annot=True,
        fmt='.2f',
        cmap=palette,
        center=0,
        square=True,
        linewidths=0.5,
        ax=ax,
    )

    ax.set_title('Feature Correlation Heatmap', fontsize=16)
    plt.close(fig)
    return fig

def plot_feature_target_relationships(X: pd.DataFrame, y: pd.Series, n_cols: int = 3) -> plt.Figure:
    """
    Draw one scatter plot with a fitted regression line per feature against the target.

    Args:
        X (pd.DataFrame): DataFrame containing features.
        y (pd.Series): Series containing the target variable.
        n_cols (int): Number of columns in the grid layout.

    Returns:
        plt.Figure: The figure holding the scatter plots; it is closed before
        being returned.
    """
    feature_names = X.columns
    total = len(feature_names)
    n_rows = (total + n_cols - 1) // n_cols  # ceiling division

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows))
    # plt.subplots returns a bare Axes for a 1x1 grid, so wrap it in a list
    if n_rows * n_cols > 1:
        panels = axes.flatten()
    else:
        panels = [axes]

    for panel, name in zip(panels, feature_names):
        sns.regplot(
            x=X[name],
            y=y,
            ax=panel,
            scatter_kws={'alpha': 0.5, 'color': 'blue'},
            line_kws={'color': 'red'},
        )
        panel.set_title(f'{name} vs Target')

    # Hide the grid cells that have no feature to show
    for spare in panels[total:]:
        spare.set_visible(False)

    plt.tight_layout()
    fig.suptitle('Feature vs Target Relationships', y=1.02, fontsize=16)
    plt.close(fig)
    return fig

def plot_pairwise_relationships(X: pd.DataFrame, y: pd.Series, features: List[str]) -> plt.Figure:
    """
    Creates a pairplot showing relationships between selected features and the target.

    Names in ``features`` that are not columns of ``X`` are ignored. When none
    of them is valid, a placeholder figure carrying a short message is
    returned instead of a pairplot.

    Args:
        X (pd.DataFrame): DataFrame containing features.
        y (pd.Series): Series containing the target variable.
        features (List[str]): List of feature names to include in the plot.

    Returns:
        plt.Figure: The figure containing the pairplot (or the placeholder).
        On every path the figure is closed before being returned, so it is
        not tracked by pyplot any more.
    """
    # Keep only the requested features that actually exist in the DataFrame
    valid_features = [f for f in features if f in X.columns]

    if not valid_features:
        fig, ax = plt.subplots()
        ax.text(0.5, 0.5, "No valid features provided", ha='center', va='center')
        # Close the placeholder as well, like every other figure built by these helpers
        plt.close(fig)
        return fig

    # Combine the selected features with the target
    data = X[valid_features].copy()
    data['target'] = y

    # corner=True draws only the lower triangle of the grid
    pairgrid = sns.pairplot(data, diag_kind="kde",
                          plot_kws={"alpha": 0.6, "s": 50},
                          corner=True)

    pairgrid.fig.suptitle("Pairwise Feature Relationships", y=1.02, fontsize=16)
    plt.close(pairgrid.fig)
    return pairgrid.fig

def plot_outliers(X: pd.DataFrame, n_cols: int = 3) -> plt.Figure:
    """
    Draw one horizontal box plot per feature to make outliers visible.

    Args:
        X (pd.DataFrame): DataFrame containing features.
        n_cols (int): Number of columns in the grid layout.

    Returns:
        plt.Figure: The figure holding the box plots; it is closed before
        being returned.
    """
    feature_names = X.columns
    total = len(feature_names)
    n_rows = (total + n_cols - 1) // n_cols  # ceiling division

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows))
    # plt.subplots returns a bare Axes for a 1x1 grid, so wrap it in a list
    if n_rows * n_cols > 1:
        panels = axes.flatten()
    else:
        panels = [axes]

    for panel, name in zip(panels, feature_names):
        sns.boxplot(x=X[name], ax=panel, color='skyblue')
        panel.set_title(f'Outlier Detection for {name}')
        panel.set_xlabel(name)

    # Hide the grid cells that have no feature to show
    for spare in panels[total:]:
        spare.set_visible(False)

    plt.tight_layout()
    fig.suptitle('Outlier Detection for Features', y=1.02, fontsize=16)
    plt.close(fig)
    return fig

def plot_residuals(y_true: pd.Series, y_pred: np.ndarray) -> plt.Figure:
    """
    Scatter the residuals (true minus predicted) against the predicted values.

    Args:
        y_true (pd.Series): True target values.
        y_pred (np.ndarray): Predicted target values.

    Returns:
        plt.Figure: The figure holding the residual plot; it is closed before
        being returned.
    """
    errors = y_true - y_pred

    fig, ax = plt.subplots(figsize=(10, 6))

    ax.scatter(y_pred, errors, alpha=0.5)
    # Zero line: points above it are under-predictions, points below over-predictions
    ax.axhline(y=0, color='r', linestyle='-')

    ax.set_xlabel('Predicted Values')
    ax.set_ylabel('Residuals')
    ax.set_title('Residual Plot')

    plt.tight_layout()
    plt.close(fig)
    return fig

In [0]:
# Neural network for regression
class RegressionNN(nn.Module):
    """
    A flexible feedforward neural network for regression tasks.

    Each hidden layer is Linear -> (optional LayerNorm) -> ReLU -> (optional
    Dropout); a final Linear layer maps to a single output per sample.

    Attributes:
        input_dim (int): Number of input features.
        hidden_dims (List[int]): Hidden layer sizes (a private copy of the argument).
        dropout_rate (float): Dropout probability for regularization.
        use_layer_norm (bool): Whether to use layer normalization.
    """

    def __init__(
        self,
        input_dim: int,
        hidden_dims: List[int] = [64, 32],
        dropout_rate: float = 0.1,
        use_layer_norm: bool = True
    ):
        """
        Initialize the neural network.

        Args:
            input_dim (int): Number of input features.
            hidden_dims (List[int]): List of hidden layer dimensions.
            dropout_rate (float): Dropout probability; no Dropout layer is
                added when it is not greater than 0.
            use_layer_norm (bool): Whether to use layer normalization.
        """
        super().__init__()

        self.input_dim = input_dim
        # Store a copy: keeping the argument itself would alias the caller's list
        # and, for the default, the single list object shared by every instance.
        self.hidden_dims = list(hidden_dims)
        self.dropout_rate = dropout_rate
        self.use_layer_norm = use_layer_norm

        # Build the layers dynamically
        layers = []
        prev_dim = input_dim

        # Hidden layers
        for dim in self.hidden_dims:
            layers.append(nn.Linear(prev_dim, dim))

            if use_layer_norm:
                layers.append(nn.LayerNorm(dim))

            layers.append(nn.ReLU())

            if dropout_rate > 0:
                layers.append(nn.Dropout(dropout_rate))

            prev_dim = dim

        # Output layer: one value per sample
        layers.append(nn.Linear(prev_dim, 1))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Forward pass; drops the trailing size-1 output dimension.

        Only the last dimension is squeezed, so a batch holding a single row
        yields shape (1,) rather than a 0-d tensor, which keeps predictions
        aligned with a target of shape (batch,).
        """
        return self.model(x).squeeze(-1)

    def get_params(self) -> Dict[str, Any]:
        """Return model parameters as a dictionary for MLflow logging."""
        return {
            "input_dim": self.input_dim,
            "hidden_dims": self.hidden_dims,
            "dropout_rate": self.dropout_rate,
            "use_layer_norm": self.use_layer_norm
        }

In [0]:
# Lightning wrapper around the regression network
class RegressionLightningModule(pl.LightningModule):
    """
    PyTorch Lightning module that trains and evaluates a RegressionNN.

    It owns the network, an MSE loss and an Adam optimizer, and logs the loss
    (plus RMSE and MAE for validation and test) on every step.
    """

    def __init__(
        self,
        input_dim: int,
        hidden_dims: List[int] = [64, 32],
        dropout_rate: float = 0.1,
        use_layer_norm: bool = True,
        learning_rate: float = 1e-3,
        weight_decay: float = 1e-5
    ):
        """
        Build the wrapped network and record the hyperparameters.

        Args:
            input_dim (int): Number of input features.
            hidden_dims (List[int]): List of hidden layer dimensions.
            dropout_rate (float): Dropout probability for regularization.
            use_layer_norm (bool): Whether to use layer normalization.
            learning_rate (float): Learning rate passed to Adam.
            weight_decay (float): Weight decay passed to Adam.
        """
        super().__init__()

        # Makes the constructor arguments available through self.hparams
        self.save_hyperparameters()

        self.model = RegressionNN(
            input_dim=input_dim,
            hidden_dims=hidden_dims,
            dropout_rate=dropout_rate,
            use_layer_norm=use_layer_norm
        )
        self.loss_fn = nn.MSELoss()

    def forward(self, x):
        """Delegate the forward pass to the wrapped network."""
        return self.model(x)

    def configure_optimizers(self):
        """Return the Adam optimizer configured from the stored hyperparameters."""
        return torch.optim.Adam(
            self.parameters(),
            lr=self.hparams.learning_rate,
            weight_decay=self.hparams.weight_decay
        )

    def _score_batch(self, batch):
        """Return (mse, rmse, mae) for one (inputs, targets) batch."""
        inputs, targets = batch
        preds = self(inputs)
        mse = self.loss_fn(preds, targets)
        return mse, torch.sqrt(mse), torch.mean(torch.abs(preds - targets))

    def training_step(self, batch, batch_idx):
        """Compute and log the MSE training loss for one batch."""
        inputs, targets = batch
        mse = self.loss_fn(self(inputs), targets)
        self.log('train_loss', mse, prog_bar=True)
        return mse

    def validation_step(self, batch, batch_idx):
        """Log validation loss, RMSE and MAE for one batch; return the loss."""
        mse, rmse, mae = self._score_batch(batch)
        for name, value in (('val_loss', mse), ('val_rmse', rmse), ('val_mae', mae)):
            self.log(name, value, prog_bar=True)
        return mse

    def test_step(self, batch, batch_idx):
        """Log test loss, RMSE and MAE for one batch; return the loss."""
        mse, rmse, mae = self._score_batch(batch)
        for name, value in (('test_loss', mse), ('test_rmse', rmse), ('test_mae', mae)):
            self.log(name, value)
        return mse

    def get_params(self) -> Dict[str, Any]:
        """Return the stored hyperparameters as a plain dict for MLflow logging."""
        names = (
            "input_dim",
            "hidden_dims",
            "dropout_rate",
            "use_layer_norm",
            "learning_rate",
            "weight_decay",
        )
        return {name: getattr(self.hparams, name) for name in names}

In [0]:
# Data loading
def prepare_dataloader(
    X_train, y_train, X_val, y_val, X_test, y_test, batch_size: int = 32
):
    """
    Standardize the three splits and wrap them in PyTorch DataLoaders.

    The scaler is fit on the training features only and then applied to the
    validation and test features. Only the training loader shuffles.

    Args:
        X_train, y_train: Training data and labels.
        X_val, y_val: Validation data and labels.
        X_test, y_test: Test data and labels.
        batch_size (int): Batch size for the DataLoaders.

    Returns:
        Tuple of (train_loader, val_loader, test_loader, scaler)
    """
    scaler = StandardScaler()

    # Evaluated left to right, so the fit on the training split happens first
    scaled_features = (
        scaler.fit_transform(X_train),
        scaler.transform(X_val),
        scaler.transform(X_test),
    )
    labels_per_split = (y_train, y_val, y_test)
    loader_options = ({"shuffle": True}, {}, {})

    loaders = []
    for features, labels, options in zip(scaled_features, labels_per_split, loader_options):
        # float32 tensors for both inputs and targets; labels must expose `.values`
        dataset = TensorDataset(
            torch.tensor(features, dtype=torch.float32),
            torch.tensor(labels.values, dtype=torch.float32),
        )
        loaders.append(DataLoader(dataset, batch_size=batch_size, **options))

    train_loader, val_loader, test_loader = loaders
    return train_loader, val_loader, test_loader, scaler

In [0]:
# Build the regression dataset
n_samples = 1000
n_features = 10
X, y = create_regression_data(n_samples=n_samples, n_features=n_features, nonlinear=True)

# Build the exploratory plots (kept as Figure objects, not displayed here)
dist_plot = plot_feature_distributions(X, y)
corr_plot = plot_correlation_heatmap(X, y)
scatter_plot = plot_feature_target_relationships(X, y)
# The pairplot only uses the four features most correlated (in absolute value) with the target
corr_with_target = X.corrwith(y).abs().sort_values(ascending=False)
top_features = corr_with_target.head(4).index.tolist()
pairwise_plot = plot_pairwise_relationships(X, y, top_features)
outlier_plot = plot_outliers(X)

# Split 70% train / 15% validation / 15% test
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Build the DataLoaders (features are standardized with a scaler fit on the training split)
batch_size = 32
train_loader, val_loader, test_loader, scaler = prepare_dataloader(
    X_train, y_train, X_val, y_val, X_test, y_test, batch_size=batch_size)

# Model hyperparameters
input_dim = X_train.shape[1]
hidden_dims = [64, 32]
dropout_rate = 0.1
use_layer_norm = True
learning_rate = 1e-3
weight_decay = 1e-5

# NOTE(review): weight initialization and batch shuffling are not seeded here, so
# metrics will differ between runs even though the data and the split are seeded.
model = RegressionLightningModule(
    input_dim=input_dim,
    hidden_dims=hidden_dims,
    dropout_rate=dropout_rate,
    use_layer_norm=use_layer_norm,
    learning_rate=learning_rate,
    weight_decay=weight_decay
)

# Stop when val_loss has not improved for 10 consecutive validation runs
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    mode='min'
)

# Keep only the single best checkpoint according to val_loss
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    dirpath='./checkpoints',
    filename='pytorch-regression-{epoch:02d}-{val_loss:.4f}',
    save_top_k=1,
    mode='min'
)

trainer = pl.Trainer(
    max_epochs=100,
    callbacks=[early_stopping, checkpoint_callback],
    enable_progress_bar=True,
    log_every_n_steps=5
)

# Train the model
trainer.fit(model, train_loader, val_loader)

# NOTE(review): the in-memory `model` is evaluated below; the best checkpoint saved by
# checkpoint_callback is not reloaded — confirm that this is intended.
test_results = trainer.test(model, test_loader)

# Collect predictions on the test set
model.eval()
pred_batches = []
true_batches = []

with torch.no_grad():
    # Use dedicated loop names: unpacking into `x, y` would overwrite the global
    # target Series `y` with the last batch tensor.
    for x_batch, y_batch in test_loader:
        y_pred = model(x_batch)
        # reshape(-1) keeps a one-sample batch (possibly 0-d) concatenable
        pred_batches.append(y_pred.numpy().reshape(-1))
        true_batches.append(y_batch.numpy().reshape(-1))

test_preds = np.concatenate(pred_batches)
true_values = np.concatenate(true_batches)

# Test metrics
rmse = np.sqrt(mean_squared_error(true_values, test_preds))
mae = mean_absolute_error(true_values, test_preds)
r2 = r2_score(true_values, test_preds)

# Residual plot
residual_plot = plot_residuals(pd.Series(true_values), test_preds)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/local_disk0/.ephemeral_nfs/envs/pythonEnv-0ba82a78-e76e-45b6-b42e-e208ee580462/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
/local_disk0/.ephemeral_nfs/envs/pythonEnv-0ba82a78-e76e-45b6-b42e-e208ee580462/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:751: Checkpoint directory /Workspace/Users/daniel.baraldi@databricks.com/LaboratorioDatabricks/checkpoints exists and is not empty.

  | Na

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/local_disk0/.ephemeral_nfs/envs/pythonEnv-0ba82a78-e76e-45b6-b42e-e208ee580462/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
/local_disk0/.ephemeral_nfs/envs/pythonEnv-0ba82a78-e76e-45b6-b42e-e208ee580462/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

/local_disk0/.ephemeral_nfs/envs/pythonEnv-0ba82a78-e76e-45b6-b42e-e208ee580462/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

In [0]:
# Log the model and the training results with MLflow
with mlflow.start_run() as run:
    # Client used to log metrics and params in a single batch call
    mlflow_client = MlflowClient()
    run_id = run.info.run_id

    # Final losses reported by the trainer (None when the trainer does not expose them)
    train_loss_value = trainer.callback_metrics.get("train_loss")
    val_loss_value = trainer.callback_metrics.get("val_loss")
    final_train_loss = train_loss_value.item() if train_loss_value is not None else None
    final_val_loss = val_loss_value.item() if val_loss_value is not None else None

    # Model hyperparameters to log
    model_params = model.get_params()

    # Metrics for the batch call. MLflow expects the timestamp in milliseconds since
    # the epoch; a literal 0 would date every metric to 1970.
    logged_at_ms = int(time.time() * 1000)
    metric_values = {
        "train_loss": final_train_loss,
        "val_loss": final_val_loss,
        "test_rmse": rmse,
        "test_mae": mae,
        "test_r2": r2,
    }
    all_metrics = [
        Metric(key=name, value=float(value), timestamp=logged_at_ms, step=0)
        for name, value in metric_values.items()
        if value is not None
    ]

    # Run-level params for the batch call (Param values must be strings)
    all_params = [
        Param(key="batch_size", value=str(batch_size)),
        Param(key="early_stopping_patience", value=str(early_stopping.patience)),
        Param(key="max_epochs", value=str(trainer.max_epochs)),
        Param(key="actual_epochs", value=str(trainer.current_epoch))
    ]

    # Build the model signature from one training row.
    # The scaler was fit on a DataFrame, so it is given a DataFrame slice here as well.
    input_example = X_train.iloc[[0]].values.astype(np.float32)
    input_example_scaled = scaler.transform(X_train.iloc[[0]]).astype(np.float32)

    model.eval()
    with torch.no_grad():
        tensor_input = torch.tensor(input_example_scaled, dtype=torch.float32)
        signature_preds = model(tensor_input)

    # The network consumes standardized features, so both the signature and the logged
    # example use the scaled row that actually produced `signature_preds`.
    signature = infer_signature(
        input_example_scaled,
        signature_preds.numpy().reshape(-1).astype(np.float32),
    )

    # Log the model hyperparameters first
    mlflow.log_params(model_params)

    # Then all metrics and the remaining params in one request
    mlflow_client.log_batch(
        run_id=run_id,
        metrics=all_metrics,
        params=all_params
    )

    # Log the model and register it in the model registry.
    # NOTE(review): the scaler is not packaged with the model, so callers must
    # standardize inputs themselves — confirm this is acceptable for serving.
    # NOTE(review): the catalog in this name ("baraldi_catalog") differs from the one
    # created in the first cell ("baraldi_catalog_new") — confirm which is intended.
    model_info = mlflow.pytorch.log_model(
        model,
        artifact_path="model",
        input_example=input_example_scaled,
        signature=signature,
        registered_model_name="baraldi_catalog.regression.pytorch_regression_model",
    )

    # Log the figures
    mlflow.log_figure(dist_plot, "feature_distributions.png")
    mlflow.log_figure(corr_plot, "correlation_heatmap.png")
    mlflow.log_figure(scatter_plot, "feature_target_relationships.png")
    mlflow.log_figure(pairwise_plot, "pairwise_relationships.png")
    mlflow.log_figure(outlier_plot, "outlier_detection.png")
    mlflow.log_figure(residual_plot, "residual_plot.png")

    # Evaluation frame (features plus a "label" column). It is only prepared here:
    # mlflow.evaluate is skipped for now to avoid typing problems.
    evaluation_data = X_test.copy()
    evaluation_data["label"] = y_test

    print(f"Model logged: {model_info.model_uri}")
    print(f"Test RMSE: {rmse:.4f}")
    print(f"Test MAE: {mae:.4f}")
    print(f"Test R²: {r2:.4f}")

🔗 View Logged Model at: https://e2-demo-field-eng.cloud.databricks.com/ml/experiments/2964497923484677/models/m-9b952b2f8a8d43b48dff347da285bf6a?o=1444828305810485
Registered model 'baraldi_catalog.regression.pytorch_regression_model' already exists. Creating a new version of this model...


Uploading artifacts:   0%|          | 0/12 [00:00<?, ?it/s]

🔗 Created version '3' of model 'baraldi_catalog.regression.pytorch_regression_model': https://e2-demo-field-eng.cloud.databricks.com/explore/data/models/baraldi_catalog/regression/pytorch_regression_model/version/3?o=1444828305810485


Model logged: models:/m-9b952b2f8a8d43b48dff347da285bf6a
Test RMSE: 6.1826
Test MAE: 4.9699
Test R²: 0.8468


In [0]:
# Local pruning callback used as a fallback
class PyTorchLightningPruningCallback(pl.Callback):
    """PyTorch Lightning callback to prune unpromising trials.

    This is a simplified version for use when the optuna-integration package isn't available.
    Defining it here rebinds the name, so it takes the place of any class imported
    under the same name earlier in the notebook.

    Args:
        trial: Optuna trial that receives the intermediate values.
        monitor: Key of the metric in ``trainer.callback_metrics`` to report.
    """

    def __init__(self, trial, monitor):
        super().__init__()
        self._trial = trial
        self.monitor = monitor

    def on_validation_end(self, trainer, pl_module):
        """Report the monitored metric to Optuna and prune the trial if requested.

        Raises:
            optuna.TrialPruned: When the trial's pruner decides to stop it.
        """
        # Lightning also fires this hook for the pre-training sanity check. Reporting
        # there would use up step 0 with a value computed on a few batches only, and
        # the real epoch-0 report for the same step would then be discarded.
        if trainer.sanity_checking:
            return

        current_score = trainer.callback_metrics.get(self.monitor)
        if current_score is None:
            # Metric not logged (yet): nothing to report
            return

        self._trial.report(current_score.item(), trainer.current_epoch)

        if self._trial.should_prune():
            message = "Trial was pruned at epoch {}.".format(trainer.current_epoch)
            raise optuna.TrialPruned(message)

# Build a larger synthetic dataset for hyperparameter tuning
n_samples = 2000
n_features = 10

X, y = create_regression_data(n_samples=n_samples, n_features=n_features, nonlinear=True)

# Split 70% train / 30% holdout, then halve the holdout into validation and
# test (15% each); both splits use a fixed random_state
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Test features plus a "label" column holding the targets
# NOTE(review): evaluation_data is not read again in the cells visible here
evaluation_data = X_test.copy()
evaluation_data["label"] = y_test

# Build the data loaders; prepare_dataloader also returns the scaler it used
# (presumably fitted on the training features — confirm in its definition)
batch_size = 32
train_loader, val_loader, test_loader, scaler = prepare_dataloader(
    X_train, y_train, X_val, y_val, X_test, y_test, batch_size=batch_size)

def objective(trial):
    """Optuna objective: train one candidate network and return its validation loss.

    Samples an architecture and optimizer settings from the trial, trains a
    ``RegressionLightningModule`` on the notebook-level ``train_loader`` /
    ``val_loader`` inside a nested MLflow run, and logs the hyperparameters
    and validation metrics to that run.

    Args:
        trial: The Optuna trial used to sample hyperparameters. It is also
            handed to the pruning callback and receives the trained module
            as the ``"model"`` user attribute.

    Returns:
        float: The ``val_loss`` value present in ``trainer.callback_metrics``
        when training stops. This is the most recently logged value, which is
        not necessarily the lowest one seen during training.

    Raises:
        optuna.TrialPruned: Propagated from the pruning callback during
            ``trainer.fit``.
        RuntimeError: If no ``val_loss`` metric is available after training.
    """
    # Search space. The suggest_* calls keep a fixed order: depth first, then
    # one width per layer, then the regularization/optimizer settings.
    n_layers = trial.suggest_int("n_layers", 1, 3)
    hidden_dims = [
        trial.suggest_int(f"hidden_dim_{i}", 16, 128) for i in range(n_layers)
    ]
    dropout_rate = trial.suggest_float("dropout_rate", 0.0, 0.5)
    learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True)
    use_layer_norm = trial.suggest_categorical("use_layer_norm", [True, False])

    # One nested MLflow run per trial
    with mlflow.start_run(nested=True) as child_run:
        mlflow_client = MlflowClient()
        run_id = child_run.info.run_id

        param_dict = {
            "n_layers": n_layers,
            "hidden_dims": str(hidden_dims),
            "dropout_rate": dropout_rate,
            "learning_rate": learning_rate,
            "weight_decay": weight_decay,
            "use_layer_norm": use_layer_norm,
            "batch_size": batch_size
        }
        params_list = [Param(key, str(value)) for key, value in param_dict.items()]

        # Log the hyperparameters before training. A pruned trial leaves this
        # block through an exception raised inside trainer.fit, so logging
        # afterwards would leave its child run without any parameters.
        mlflow_client.log_batch(run_id, params=params_list)

        model = RegressionLightningModule(
            input_dim=X_train.shape[1],
            hidden_dims=hidden_dims,
            dropout_rate=dropout_rate,
            use_layer_norm=use_layer_norm,
            learning_rate=learning_rate,
            weight_decay=weight_decay
        )

        early_stopping = EarlyStopping(
            monitor='val_loss',
            patience=5,
            mode='min'
        )

        pruning_callback = PyTorchLightningPruningCallback(
            trial, monitor="val_loss"
        )

        trainer = pl.Trainer(
            max_epochs=50,
            callbacks=[early_stopping, pruning_callback],
            enable_progress_bar=False,
            log_every_n_steps=10
        )

        trainer.fit(model, train_loader, val_loader)

        # Validation loss of the model as it stands when training stops. This
        # matches the weights stored on the trial below, since no checkpoint
        # is restored.
        val_loss_metric = trainer.callback_metrics.get("val_loss")
        if val_loss_metric is None:
            raise RuntimeError(
                "No 'val_loss' metric was logged during training; cannot score this trial."
            )
        final_val_loss = val_loss_metric.item()
        # NOTE(review): sqrt is an RMSE only if val_loss is a mean squared
        # error — confirm in RegressionLightningModule.
        val_rmse = float(np.sqrt(final_val_loss))

        current_time = int(time.time() * 1000)
        metrics_list = [
            Metric("val_loss", final_val_loss, current_time, 0),
            Metric("val_rmse", val_rmse, current_time, 0)
        ]
        mlflow_client.log_batch(run_id, metrics=metrics_list)

    # Keep the trained module on the trial so the best one can be retrieved
    # after the study. This holds every trial's model in memory.
    trial.set_user_attr("model", model)

    # Value minimized by the study
    return final_val_loss

# Registered version of the tuned model; read by the alias cell further down.
best_model_version = None
with mlflow.start_run() as run:
    # Parent run: objective() opens one nested child run per Optuna trial.
    mlflow_client = MlflowClient()
    run_id = run.info.run_id

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=20)

    # objective() stores each trial's trained module under the "model" user attribute
    best_trial = study.best_trial
    best_model = best_trial.user_attrs["model"]

    # Run Lightning's test loop on the best model
    trainer = pl.Trainer(
        enable_progress_bar=True,
        log_every_n_steps=5
    )
    test_results = trainer.test(best_model, test_loader)

    # Collect test-set predictions batch by batch.
    # The loop variables are deliberately not named `x` / `y`: this code runs
    # at notebook scope, so `x, y = batch` would overwrite the global target
    # Series `y` with the last batch's tensor.
    best_model.eval()
    pred_batches = []
    target_batches = []

    with torch.no_grad():
        for batch_features, batch_targets in test_loader:
            pred_batches.append(best_model(batch_features).numpy())
            target_batches.append(batch_targets.numpy())

    test_preds = np.concatenate(pred_batches)
    true_values = np.concatenate(target_batches)

    # Test-set metrics
    rmse = np.sqrt(mean_squared_error(true_values, test_preds))
    mae = mean_absolute_error(true_values, test_preds)
    r2 = r2_score(true_values, test_preds)

    # Winning hyperparameters, prefixed with "best_" on the parent run
    best_params_list = [
        Param(f"best_{key}", str(value)) for key, value in best_trial.params.items()
    ]

    current_time = int(time.time() * 1000)
    metrics_list = [
        Metric("best_val_loss", best_trial.value, current_time, 0),
        Metric("test_rmse", rmse, current_time, 0),
        Metric("test_mae", mae, current_time, 0),
        Metric("test_r2", r2, current_time, 0)
    ]

    mlflow_client.log_batch(run_id, metrics=metrics_list, params=best_params_list)

    # Build the model signature from one training row.
    # NOTE(review): the signature input (and the logged input_example) is the
    # unscaled row, while the output is predicted from the scaled copy. Only
    # the network is logged below, not `scaler` — confirm whether callers of
    # the registered model are expected to scale inputs themselves.
    input_example = X_train.iloc[[0]].values.astype(np.float32)
    input_example_scaled = scaler.transform(input_example).astype(np.float32)

    # best_model is still in eval mode from the prediction loop above
    with torch.no_grad():
        tensor_input = torch.tensor(input_example_scaled, dtype=torch.float32)
        signature_preds = best_model(tensor_input)

    signature = infer_signature(input_example, signature_preds.numpy().reshape(-1).astype(np.float32))

    # Log the model and register it in Unity Catalog.
    # NOTE(review): the registered name is hard-coded and does not use
    # CATALOG_NAME / SCHEMA_NAME from the first cell; the alias and model_uri
    # cells below hard-code the same name, so change all three together.
    model_info = mlflow.pytorch.log_model(
        best_model,
        artifact_path="model",
        input_example=input_example,
        signature=signature,
        registered_model_name="baraldi_catalog.regression.pytorch_regression_optimized",
    )

    # Residual plot for the tuned model
    residual_plot = plot_residuals(pd.Series(true_values), test_preds)

    # Log the figures.
    # NOTE(review): every figure except residual_plot comes from earlier cells
    # and presumably describes the earlier dataset, not the one generated for
    # tuning — confirm that is intended.
    mlflow.log_figure(dist_plot, "feature_distributions.png")
    mlflow.log_figure(corr_plot, "correlation_heatmap.png")
    mlflow.log_figure(scatter_plot, "feature_target_relationships.png")
    mlflow.log_figure(pairwise_plot, "pairwise_relationships.png")
    mlflow.log_figure(outlier_plot, "outlier_detection.png")
    mlflow.log_figure(residual_plot, "residual_plot.png")

    # mlflow.evaluate is intentionally not called here
    print(f"Best model logged: {model_info.model_uri}")
    print(f"Best parameters: {best_trial.params}")
    print(f"Test RMSE: {rmse:.4f}")
    print(f"Test MAE: {mae:.4f}")
    print(f"Test R²: {r2:.4f}")

    best_model_version = model_info.registered_model_version

[I 2025-09-24 17:23:07,542] A new study created in memory with name: no-name-9761ab9f-4cef-4e11-9c3f-d30c12db5e74
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name    | Type         | Params | Mode 
-------------------------------------------------
0 | model   | RegressionNN | 13.2 K | train
1 | loss_fn | MSELoss      | 0      | train
-------------------------------------------------
13.2 K    Trainable params
0         Non-trainable params
13.2 K    Total params
0.053     Total estimated model params size (MB)
12        Modules in train mode
0         Modules in eval mode
/local_disk0/.ephemeral_nfs/envs/pythonEnv-0ba82a78-e76e-45b6-b42e-e208ee580462/lib/python3.10/site-packages/pytorch_lightning/trainer/connector

Testing: |          | 0/? [00:00<?, ?it/s]

🔗 View Logged Model at: https://e2-demo-field-eng.cloud.databricks.com/ml/experiments/2964497923484677/models/m-57fc0bf92d054d22a4327d4d2ccec1d7?o=1444828305810485
Registered model 'baraldi_catalog.regression.pytorch_regression_optimized' already exists. Creating a new version of this model...


Uploading artifacts:   0%|          | 0/12 [00:00<?, ?it/s]

🔗 Created version '3' of model 'baraldi_catalog.regression.pytorch_regression_optimized': https://e2-demo-field-eng.cloud.databricks.com/explore/data/models/baraldi_catalog/regression/pytorch_regression_optimized/version/3?o=1444828305810485


Best model logged: models:/m-57fc0bf92d054d22a4327d4d2ccec1d7
Best parameters: {'n_layers': 1, 'hidden_dim_0': 71, 'dropout_rate': 0.09295014938940827, 'learning_rate': 0.006342731825213631, 'weight_decay': 6.949821071041785e-05, 'use_layer_norm': True}
Test RMSE: 6.1250
Test MAE: 4.8221
Test R²: 0.8517


In [0]:
from mlflow import MlflowClient

# Create the MLflow client used for registry operations
client = MlflowClient()

# best_model_version starts as None and is only filled in when the tuning
# cell registers a model; fail with a clear message instead of the opaque
# TypeError that int(None) would raise.
if best_model_version is None:
    raise RuntimeError(
        "best_model_version is not set; run the hyperparameter tuning cell "
        "so the model is registered before assigning the alias."
    )

# Point the "best" alias at the version registered by the tuning run, so the
# model can be referenced programmatically without a hard-coded version number
client.set_registered_model_alias(
    name="baraldi_catalog.regression.pytorch_regression_optimized",
    alias="best",
    version=int(best_model_version),
)

In [0]:
# Reference the registered model through its "best" alias
model_uri = "models:/baraldi_catalog.regression.pytorch_regression_optimized@best"

# Smoke-test the registered model: run a prediction on the test features with
# env_manager="uv" to check that the model loads and predicts
mlflow.models.predict(model_uri=model_uri, input_data=X_test, env_manager="uv")

Downloading artifacts:   0%|          | 0/12 [00:00<?, ?it/s]

2025/09/24 17:28:01 INFO mlflow.models.flavor_backend_registry: Selected backend for flavor 'python_function'


Downloading artifacts:   0%|          | 0/12 [00:00<?, ?it/s]

2025/09/24 17:28:02 INFO mlflow.utils.virtualenv: Creating a new environment in /tmp/virtualenv_envs/mlflow-062a26c0832d863b3ba4dd142d5ea52c63c5e637 with python version 3.10.12 using uv
Using CPython 3.10.12 interpreter at: [36m/usr/bin/python3.10[39m
Creating virtual environment at: [36m/tmp/virtualenv_envs/mlflow-062a26c0832d863b3ba4dd142d5ea52c63c5e637[39m
Activate with: [32msource /tmp/virtualenv_envs/mlflow-062a26c0832d863b3ba4dd142d5ea52c63c5e637/bin/activate[39m
2025/09/24 17:28:03 INFO mlflow.utils.virtualenv: Installing dependencies
[2mUsing Python 3.10.12 environment at: /tmp/virtualenv_envs/mlflow-062a26c0832d863b3ba4dd142d5ea52c63c5e637[0m
[2mResolved [1m3 packages[0m [2min 123ms[0m[0m
[36m[1mDownloading[0m[39m pip [2m(2.0MiB)[0m
[36m[1mDownloading[0m[39m setuptools [2m(1.2MiB)[0m
 [32m[1mDownloading[0m[39m setuptools
 [32m[1mDownloading[0m[39m pip
[2mPrepared [1m3 packages[0m [2min 115ms[0m[0m
[2mInstalled [1m3 packages[0m [2min 

{"predictions": [4.560913562774658, 68.66831970214844, 21.464685440063477, -2.942098617553711, -29.286977767944336, 0.31037479639053345, 60.218746185302734, 27.46315574645996, 2.072153329849243, -19.889680862426758, 11.099003791809082, -27.871129989624023, -1.8581024408340454, 2.1966629028320312, 18.643047332763672, 9.982495307922363, 13.43415641784668, -15.176512718200684, -7.65878438949585, -23.674821853637695, 61.83961868286133, -30.394691467285156, 34.45264434814453, -6.393752574920654, 11.448152542114258, -12.735569953918457, -3.6670424938201904, 43.3541259765625, -4.210549354553223, 28.95589828491211, 51.63481903076172, 38.606719970703125, 37.61429977416992, -18.438140869140625, 10.493430137634277, -29.15499496459961, 23.041114807128906, -18.788503646850586, 46.033809661865234, 17.189565658569336, -2.350978374481201, 42.12752914428711, -10.552478790283203, 30.31363296508789, 13.198495864868164, 5.433516502380371, 30.524444580078125, 20.052837371826172, -23.371023178100586, 1.6279

In [0]:
# Cast the test features to float32; the model signature was inferred from a
# float32 input example. X_test is rebound here, and the Spark cell that
# follows reads this float32 version.
X_test = X_test.astype('float32')

# Load the aliased model through the generic pyfunc interface
loaded_model = mlflow.pyfunc.load_model(model_uri=model_uri)

# Predict on the test features
predictions = loaded_model.predict(X_test)

print(f"Shape das predições: {predictions.shape}")
print(f"Primeiras 5 predições: {predictions[:5]}")
print(f"Primeiros 5 valores reais: {y_test.values[:5]}")

Downloading artifacts:   0%|          | 0/12 [00:00<?, ?it/s]

Shape das predições: (300,)
Primeiras 5 predições: [  4.5609136  68.66832    21.464685   -2.9420986 -29.286978 ]
Primeiros 5 valores reais: [  4.88613324  47.23380529  14.22780521 -11.25203095 -35.36736806]


In [0]:
from pyspark.sql.functions import array, col

# Turn the pandas test features into a Spark DataFrame
X_spark = spark.createDataFrame(X_test)

# Pack all feature columns into a single array column.
# The PyTorch model expects an input tensor of shape [-1, 13], so the UDF must
# receive each row as one array of 13 values. Passing the 13 columns
# separately would not match that tensor structure.
X_spark_with_array = X_spark.withColumn(
    "features_array", array(*map(col, X_spark.columns))
)

# Wrap the registered model as a Spark UDF
model_udf = mlflow.pyfunc.spark_udf(spark, model_uri=model_uri)

# Score every row by applying the UDF to the packed feature array
X_spark_with_predictions = X_spark_with_array.withColumn(
    "prediction", model_udf("features_array")
)

display(X_spark_with_predictions.limit(5))

Downloading artifacts:   0%|          | 0/12 [00:00<?, ?it/s]



Downloading artifacts:   0%|          | 0/13 [00:00<?, ?it/s]

2025/09/24 17:29:20 INFO mlflow.models.flavor_backend_registry: Selected backend for flavor 'python_function'


feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_correlated,feature_cyclical,feature_with_outliers,features_array,prediction
0.7972175,-1.7150403,-0.383587,1.1108135,-1.1922657,-2.426833,1.6496313,-2.6474361,3.225119,2.1215987,0.53807855,-0.9909859,0.027851623,"List(0.7972175, -1.7150403, -0.383587, 1.1108135, -1.1922657, -2.426833, 1.6496313, -2.6474361, 3.225119, 2.1215987, 0.53807855, -0.9909859, 0.027851623)",List(4.5609136)
4.467177,-4.731319,-4.3519573,-3.2893207,-2.328453,4.3216467,-1.8794254,4.6037183,1.223765,-1.1317749,3.729179,-0.97028726,-0.5052138,"List(4.467177, -4.731319, -4.3519573, -3.2893207, -2.328453, 4.3216467, -1.8794254, 4.6037183, 1.223765, -1.1317749, 3.729179, -0.97028726, -0.5052138)",List(68.66832)
-3.3402305,-0.55784345,-2.9934723,0.02449961,-3.8878224,4.8379717,-4.2266774,-2.5728123,-2.286605,-0.3515369,-2.5367458,0.62058926,0.39984718,"List(-3.3402305, -0.55784345, -2.9934723, 0.02449961, -3.8878224, 4.8379717, -4.2266774, -2.5728123, -2.286605, -0.3515369, -2.5367458, 0.62058926, 0.39984718)",List(21.464685)
2.7361856,-0.72013474,4.019054,-2.2003388,-4.8401675,4.219471,-0.17828454,3.6004436,4.0995502,0.4238995,2.2600062,-0.97252566,0.39559528,"List(2.7361856, -0.72013474, 4.019054, -2.2003388, -4.8401675, 4.219471, -0.17828454, 3.6004436, 4.0995502, 0.4238995, 2.2600062, -0.97252566, 0.39559528)",List(-2.9420986)
-4.0238957,-4.995931,4.066411,-0.64817244,-4.279194,-2.181568,-2.3677466,-0.06761297,0.92267203,3.686881,-3.2872674,-0.97537553,0.10561319,"List(-4.0238957, -4.995931, 4.066411, -0.64817244, -4.279194, -2.181568, -2.3677466, -0.06761297, 0.92267203, 3.686881, -3.2872674, -0.97537553, 0.10561319)",List(-29.28698)
