In [31]:
import kfp
import kfp.compiler as cpl
from kfp import components
from kfp import dsl
from kfp.components import create_component_from_func, InputPath, OutputPath

In [32]:
# Dataset query component, defined in a YAML file next to this notebook.
chicago_taxi_dataset_op = components.load_component(
    filename='chicago_data_component.yaml',
)

# Reusable components from the kubeflow/pipelines repository.
# Every URL is pinned to a commit hash so the definitions cannot change underneath us.
pandas_transform_csv_op = components.load_component_from_url(
    url='https://raw.githubusercontent.com/kubeflow/pipelines/6162d55998b176b50267d351241100bb0ee715bc/components/pandas/Transform_DataFrame/in_CSV_format/component.yaml',
)
download_op = components.load_component_from_url(
    url='https://raw.githubusercontent.com/kubeflow/pipelines/240543e483076ae718f82c6f280441daa2f041fd/components/web/Download/component.yaml',
)
create_fully_connected_pytorch_network_op = components.load_component_from_url(
    url='https://raw.githubusercontent.com/kubeflow/pipelines/4e1facea1a270535b515a9e8cc59422d1ad76a9e/components/PyTorch/Create_fully_connected_network/component.yaml',
)
convert_to_onnx_from_pytorch_script_module_op = components.load_component_from_url(
    url='https://raw.githubusercontent.com/kubeflow/pipelines/e011e4affa85542ef2b24d63fdac27f8d939bbee/components/PyTorch/Convert_to_OnnxModel_from_PyTorchScriptModule/component.yaml',
)
create_pytorch_model_archive_op = components.load_component_from_url(
    url='https://raw.githubusercontent.com/kubeflow/pipelines/abc180be2b2b5538d19eb87124684629ec45e620/components/PyTorch/Create_PyTorch_Model_Archive/component.yaml',
)

In [33]:
def train_pytorch_model_from_csv(
    model_path: InputPath('PyTorchScriptModule'),
    training_data_path: InputPath('CSV'),
    trained_model_path: OutputPath('PyTorchScriptModule'),
    label_column_name: str,
    loss_function_name: str = 'mse_loss',
    number_of_epochs: int = 1,
    learning_rate: float = 0.1,
    optimizer_name: str = 'Adadelta',
    optimizer_parameters: dict = None,
    batch_size: int = 32,
    batch_log_interval: int = 100,
    random_seed: int = 0,
):
    '''Trains PyTorch model

    Loads a TorchScript module, trains it on the numeric columns of a CSV
    file and saves the trained module.

    Args:
        model_path: Path of the TorchScript module to load with `torch.jit.load`.
        training_data_path: Path of the CSV file holding the training data.
        trained_model_path: Path the trained module is saved to.
        label_column_name: Name of the (numeric) column used as the target.
        loss_function_name: Attribute name looked up, in order, on `torch`,
            `torch.nn` and `torch.nn.functional`.
        number_of_epochs: Number of passes over the whole dataset.
        learning_rate: Passed to the optimizer as `lr` (overrides any `lr`
            given in `optimizer_parameters`).
        optimizer_name: Name of a class in `torch.optim`.
        optimizer_parameters: Extra keyword arguments for the optimizer.
        batch_size: Number of rows per training batch.
        batch_log_interval: Log the loss every this many batches.
            A value of 0 disables per-batch logging.
        random_seed: Seed passed to `torch.manual_seed`.

    Raises:
        ValueError: If the optimizer or the loss function cannot be found.
    '''
    # Imports are local so that the function body is self-contained.
    import pandas
    import torch

    torch.manual_seed(random_seed)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    model = torch.jit.load(model_path)
    model.to(device)
    model.train()

    optimizer_class = getattr(torch.optim, optimizer_name, None)
    if not optimizer_class:
        raise ValueError(f'Optimizer "{optimizer_name}" was not found.')

    # Work on a copy so the dict passed in by the caller is never mutated.
    optimizer_parameters = dict(optimizer_parameters or {})
    optimizer_parameters['lr'] = learning_rate
    optimizer = optimizer_class(model.parameters(), **optimizer_parameters)

    loss_function = getattr(torch, loss_function_name, None) or getattr(torch.nn, loss_function_name, None) or getattr(torch.nn.functional, loss_function_name, None)
    if not loss_function:
        raise ValueError(f'Loss function "{loss_function_name}" was not found.')

    class CsvDataset(torch.utils.data.Dataset):
        '''Dataset over the numeric columns of a CSV file.'''

        def __init__(self, file_path: str, label_column_name: str, drop_nan_columns_or_rows: str = 'columns'):
            dataframe = pandas.read_csv(file_path)
            # Preventing error: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found object
            if drop_nan_columns_or_rows == 'columns':
                non_nan_data = dataframe.dropna(axis='columns')
                removed_columns = set(dataframe.columns) - set(non_nan_data.columns)
                if removed_columns:
                    print('Skipping columns with NaNs: ' + str(removed_columns))
                dataframe = non_nan_data
            elif drop_nan_columns_or_rows == 'rows':
                non_nan_data = dataframe.dropna(axis='index')
                number_of_removed_rows = len(dataframe) - len(non_nan_data)
                if number_of_removed_rows:
                    print(f'Skipped {number_of_removed_rows} rows with NaNs.')
                dataframe = non_nan_data
            numerical_data = dataframe.select_dtypes(include='number')
            non_numerical_data = dataframe.select_dtypes(exclude='number')
            if not non_numerical_data.empty:
                print('Skipping non-number columns:')
                print(non_numerical_data.dtypes)
            self._dataframe = dataframe
            self.labels = numerical_data[[label_column_name]]
            self.features = numerical_data.drop(columns=[label_column_name])
            # Materialize float32 arrays once. Items are then fetched by
            # position, which stays correct when rows were dropped (the
            # DataFrame index has gaps then) and avoids a slow per-row
            # pandas lookup.
            self._label_values = self.labels.to_numpy(dtype='float32')
            self._feature_values = self.features.to_numpy(dtype='float32')

        def __len__(self):
            return len(self._dataframe)

        def __getitem__(self, index):
            # The sampler yields positions in range(len(self)), not index labels.
            return [self._feature_values[index], self._label_values[index]]

    dataset = CsvDataset(
        file_path=training_data_path,
        label_column_name=label_column_name,
    )
    train_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=True,
    )

    last_full_batch_loss = None
    for epoch in range(1, number_of_epochs + 1):
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = loss_function(output, target)
            loss.backward()
            optimizer.step()
            if len(data) == batch_size:
                last_full_batch_loss = loss.item()
            # An interval of 0 disables logging instead of dividing by zero.
            if batch_log_interval and batch_idx % batch_log_interval == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.item()))
        if last_full_batch_loss is None:
            # Happens when the dataset holds fewer rows than `batch_size`;
            # formatting None with ':.6f' would raise TypeError.
            print(f'Training epoch {epoch} completed. No full batch was processed.')
        else:
            print(f'Training epoch {epoch} completed. Last full batch loss: {last_full_batch_loss:.6f}')

    # print(optimizer.state_dict())
    model.save(trained_model_path)

In [34]:
# Wrap the training function as a pipeline component and also write its
# definition to a YAML file so it can be reloaded or shared.
train_pytorch_model_from_csv_op = create_component_from_func(
    func=train_pytorch_model_from_csv,
    base_image='pytorch/pytorch:1.7.1-cuda11.0-cudnn8-runtime',
    packages_to_install=['pandas==1.1.5'],
    output_component_file='train_pytorch_model_from_csv_component.yaml',
)

In [35]:
# Rebind the op to the component definition stored in the YAML file.
train_pytorch_model_from_csv_op = components.load_component(
    filename='train_pytorch_model_from_csv_component.yaml',
)

In [36]:
@dsl.pipeline(
    name='Pipeline Chicago Taxi',
    description='Exemplo Pytorch Chicago Taxi'
)
def pytorch_pipeline():
    '''Pipeline that trains a fully connected PyTorch network on Chicago Taxi data.

    Steps, in order: create the network, query the dataset, clean the CSV,
    train the model, convert the trained model to ONNX and build a PyTorch
    model archive using a downloaded handler.
    '''
    # "trip_total" is excluded to avoid data leakage.
    feature_columns = ['trip_seconds', 'trip_miles', 'pickup_community_area', 'dropoff_community_area', 'fare', 'tolls', 'extras']
    label_column = 'tips'

    network = create_fully_connected_pytorch_network_op(
        layer_sizes=[len(feature_columns), 100, 10, 1],
        activation_name='elu',
    ).output

    raw_training_data = chicago_taxi_dataset_op(
        where='trip_start_timestamp >= "2019-01-01" AND trip_start_timestamp < "2019-02-01"',
        select=','.join([label_column] + feature_columns),
        limit=10000,
    ).output

    # The transform code must not carry leading indentation on any line:
    # an indented continuation line is an "unexpected indent" for Python.
    # NOTE(review): presumably the component executes this string as-is - confirm.
    training_data = pandas_transform_csv_op(
        table=raw_training_data,
        transform_code=(
            "df = df.fillna({'tolls': 0.0, 'extras': 0.0})\n"
            "df = df.dropna(axis='index')"
        ),
    ).output

    trained_model = train_pytorch_model_from_csv_op(
        model=network,
        training_data=training_data,
        label_column_name=label_column,
        loss_function_name='mse_loss',
        # Optional:
        batch_size=32,
        number_of_epochs=2,
        random_seed=0,
        learning_rate=0.1,
        optimizer_name='Adadelta',
        optimizer_parameters={},
    ).outputs['trained_model']

    convert_to_onnx_from_pytorch_script_module_op(
        model=trained_model,
        list_of_input_shapes=[[len(feature_columns)]],
    )

    # TODO: Use a real working regression handler here. See https://github.com/pytorch/serve/issues/987
    serving_handler = download_op('https://raw.githubusercontent.com/pytorch/serve/5c03e711a401387a1d42fc01072fcc38b4995b66/ts/torch_handler/base_handler.py').output

    # The archive output is not consumed by any later step, so the task is
    # created without binding its output to a name.
    create_pytorch_model_archive_op(
        model=trained_model,
        handler=serving_handler,
        # model_name="model",  # Optional
        # model_version="1.0",  # Optional
    )

In [None]:
# Address of the Kubeflow Pipelines API this notebook talks to.
KFP_ENDPOINT = 'http://localhost:8080/pipeline'

# The pipeline takes no parameters, so the run arguments are empty.
args = {}

client = kfp.Client(host=KFP_ENDPOINT)

In [None]:
# Submit a run directly from the pipeline function.
client.create_run_from_pipeline_func(
    pipeline_func=pytorch_pipeline,
    arguments={},
)

In [16]:
# Compile the pipeline function into a package file.
pipeline_compiler = cpl.Compiler()
pipeline_compiler.compile(
    pipeline_func=pytorch_pipeline,
    package_path='taxi_pytorch_pipeline.yaml',
)

In [18]:
# Submit a run from the compiled pipeline package.
client.create_run_from_pipeline_package(
    arguments=args,
    pipeline_file='taxi_pytorch_pipeline.yaml',
)

RunPipelineResult(run_id=5aae479f-2da9-4e2a-ac5b-a5749dd536de)