In [None]:
import yaml
from kfp import dsl
from kfp.dsl import (
    component,
    Metrics,
    Dataset,
    Input,
    Model,
    Artifact,
    OutputPath,
    Output,
)
from kfp import compiler
import google.cloud.aiplatform as aiplatform
import os

In [None]:
@component(
    base_image="asia-south1-docker.pkg.dev/<projectname>/kubeflow_pipelines/demo_model"
)
def data_ingestion(input_data_path: str, input_data: Output[Dataset]):
    """Load a CSV file and publish it as a pipeline Dataset artifact.

    Args:
        input_data_path: Location of the source CSV; handed unchanged to
            ``pandas.read_csv``.
        input_data: Output artifact. The loaded frame is written to
            ``input_data.path`` as CSV without the index column.
    """
    # Imported inside the body because the component runs in the container
    # image given by ``base_image``, not in the notebook kernel.
    import pandas as pd

    raw_df = pd.read_csv(input_data_path)
    raw_df.to_csv(input_data.path, index=False)

In [None]:
@component(
    base_image="asia-south1-docker.pkg.dev/<projectname>/kubeflow_pipelines/demo_model"
)
def preprocessing(train_df: Input[Dataset], input_data_preprocessed: Output[Dataset]):
    """Run the project's preprocessing step over a Dataset artifact.

    Args:
        train_df: Input artifact; the CSV at ``train_df.path`` is loaded
            with ``pandas.read_csv``.
        input_data_preprocessed: Output artifact. The frame returned by
            ``get_preprocessing`` is written to its ``path`` as CSV without
            the index column.
    """
    # Imported inside the body because the component runs in the container
    # image given by ``base_image``, not in the notebook kernel.
    import pandas as pd

    # NOTE(review): ``src.preprocess`` is not defined in this notebook; it is
    # assumed to be shipped inside the base image — confirm.
    from src.preprocess import get_preprocessing

    raw_df = pd.read_csv(train_df.path)
    processed_df = get_preprocessing(raw_df)
    processed_df.to_csv(input_data_preprocessed.path, index=False)

In [None]:
@component(
    base_image="asia-south1-docker.pkg.dev/<projectname>/kubeflow_pipelines/demo_model"
)
def train_test_data_split(
    dataset_in: Input[Dataset],
    target_column: str,
    dataset_train: Output[Dataset],
    dataset_test: Output[Dataset],
    test_size: float = 0.2,
):
    """Split a Dataset artifact into train and test Dataset artifacts.

    Args:
        dataset_in: Input artifact; the CSV at ``dataset_in.path`` is loaded
            with ``pandas.read_csv``.
        target_column: Forwarded to ``get_train_test_splits``.
        dataset_train: Output artifact that receives the first frame
            returned by ``get_train_test_splits`` (CSV, no index column).
        dataset_test: Output artifact that receives the second frame
            returned by ``get_train_test_splits`` (CSV, no index column).
        test_size: Forwarded to ``get_train_test_splits``; defaults to 0.2.
            Presumably the fraction held out for testing — confirm against
            ``src.train_test_splits``.
    """
    # Imported inside the body because the component runs in the container
    # image given by ``base_image``, not in the notebook kernel.
    import logging

    import pandas as pd

    # NOTE(review): ``src.train_test_splits`` is not defined in this
    # notebook; it is assumed to be shipped inside the base image — confirm.
    from src.train_test_splits import get_train_test_splits

    # The original referenced ``logger`` without ever creating it, which
    # raised NameError before either split was written.
    logger = logging.getLogger(__name__)

    data = pd.read_csv(dataset_in.path)
    train_split, test_split = get_train_test_splits(data, target_column, test_size)

    # Extra positional arguments to ``Logger.info`` are %-formatted into the
    # message, so each message needs a ``%s`` placeholder for the value.
    logger.info("Dataset train Shape: %s", train_split.shape)
    logger.info("Dataset test Shape: %s", test_split.shape)
    logger.info("X_train columns: %s", list(train_split.columns))

    train_split.to_csv(dataset_train.path, index=False)
    test_split.to_csv(dataset_test.path, index=False)