In [1]:
! pip3 install --upgrade --quiet google-cloud-aiplatform \
                                        google-cloud-storage \
                                        "numpy<2" \
                                        google-cloud-pipeline-components

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
aider-chat 0.75.1 requires urllib3==2.3.0, but you have urllib3 1.26.20 which is incompatible.
auth0-python 4.7.2 requires urllib3<3.0.0,>=2.0.7, but you have urllib3 1.26.20 which is incompatible.
crewai 0.98.0 requires litellm==1.57.4, but you have litellm 1.61.9 which is incompatible.
poetry 1.8.3 requires requests-toolbelt<2.0.0,>=1.0.0, but you have requests-toolbelt 0.10.1 which is incompatible.
sagemaker 2.232.2 requires attrs<24,>=23.1.0, but you have attrs 25.1.0 which is incompatible.
sagemaker 2.232.2 requires importlib-metadata<7.0,>=1.4.0, but you have importlib-metadata 7.2.1 which is incompatible.
streamlit 1.39.0 requires pandas<3,>=1.4.0, but you have pandas 1.3.5 which is incompatible.
transformers 4.46.3 requires tokenizers<0.21,>=0.20, but you have tokenizers 0.19.1 which is incompatible.


In [2]:
PROJECT_ID = !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_ID[0]

LOCATION = "us-central1"
LOCATION = "us-central1"  # @param {type:"string"}

In [3]:
BUCKET_URI = f"gs://{PROJECT_ID}-unique"  # @param {type:"string"}
PIPELINE_ROOT = f"{BUCKET_URI}/pca_pipeline"

In [4]:
! gsutil mb -l {LOCATION} -p {PROJECT_ID} {BUCKET_URI}

Creating gs://my-project-0004-346516-unique/...
ServiceException: 409 A Cloud Storage bucket named 'my-project-0004-346516-unique' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.


In [5]:
PIPELINE_ROOT = f"{BUCKET_URI}/pca_pipeline"  # This is where all pipeline artifacts are sent. You'll need to ensure the bucket is created ahead of time
PIPELINE_ROOT
print(f"PIPELINE_ROOT: {PIPELINE_ROOT}")

PIPELINE_ROOT: gs://my-project-0004-346516-unique/pca_pipeline


In [6]:
SERVICE_ACCOUNT = "[your-service-account]"  # @param {type:"string"}

In [7]:
import sys

IS_COLAB = "google.colab" in sys.modules
if (
    SERVICE_ACCOUNT == ""
    or SERVICE_ACCOUNT is None
    or SERVICE_ACCOUNT == "[your-service-account]"
):
    # Get your service account from gcloud
    if not IS_COLAB:
        shell_output = !gcloud auth list 2>/dev/null
        SERVICE_ACCOUNT = shell_output[2].replace("*", "").strip()

    if IS_COLAB:
        shell_output = ! gcloud projects describe  $PROJECT_ID
        project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
        SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"

    print("Service Account:", SERVICE_ACCOUNT)

Service Account: 255766800726-compute@developer.gserviceaccount.com


In [8]:
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectCreator $BUCKET_URI

! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewer $BUCKET_URI

No changes made to gs://my-project-0004-346516-unique/
No changes made to gs://my-project-0004-346516-unique/


In [9]:
from typing import NamedTuple

import kfp
from google.cloud import aiplatform
from kfp import compiler, dsl
from kfp.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                     OutputPath, component)

In [10]:
aiplatform.init(project=PROJECT_ID, staging_bucket=BUCKET_URI)

In [12]:
import kfp
from kfp.dsl import component
from kfp.dsl import Output, Metrics
from typing import NamedTuple
import os

@component(
    packages_to_install=[
        "scikit-learn==1.0.2",
        "pandas==1.3.5",
        "matplotlib==3.5.1",
        "numpy<2"
    ],
    base_image="python:3.9")
def perform_pca( data: Output[Dataset],metrics: Output[Metrics], n_components: int = 2):
    """
    Performs PCA on the Iris dataset, saves the transformed data,
    and generates a scree plot.

    Args:
        n_components (int): The number of principal components to retain.

    Returns:
        None
    """
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.decomposition import PCA
    from sklearn.datasets import load_iris
    from sklearn.preprocessing import StandardScaler
    import os

    # Load the Iris dataset
    iris = load_iris()
    X = iris.data
    y = iris.target

    # Standardize the data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Perform PCA
    pca = PCA(n_components=n_components)
    X_reduced = pca.fit_transform(X_scaled)

    # Create a Pandas DataFrame for the transformed data
    df = pd.DataFrame(X_reduced, columns=[f'PC{i+1}' for i in range(n_components)])
    df['target'] = y  # Add the target variable for potential analysis

    # Ensure the output directory exists
    os.makedirs(data.path, exist_ok=True)

    # Save the transformed data to a CSV file
    output_path = os.path.join(data.path, "pca_transformed_data.csv")
    df.to_csv(output_path, index=False)

    # Explained variance ratio
    explained_variance = pca.explained_variance_ratio_
    print("Explained Variance Ratio:", explained_variance)

    # Log metrics
    metrics.log_metric("explained_variance_ratio_PC1", explained_variance[0])
    if n_components > 1:
        metrics.log_metric("explained_variance_ratio_PC2", explained_variance[1])

    # Cumulative explained variance
    cum_var_exp = np.cumsum(explained_variance)

    # Scree plot
    plt.figure(figsize=(8, 6))
    plt.plot(range(1, len(explained_variance) + 1), explained_variance, 'ro-', linewidth=2)
    plt.title('Scree Plot')
    plt.xlabel('Principal Component')
    plt.ylabel('Variance Explained')
    plt.savefig(os.path.join(data.path, "scree_plot.png"))  # Save the plot to a file
    plt.close()

    metrics.log_artifact("scree_plot", data.path + "/scree_plot.png")

    return


In [13]:
@dsl.pipeline(
    name="pca-pipeline",
    description="A pipeline that performs PCA on the Iris dataset"
)
def pca_pipeline():
    pca_task = perform_pca(
        n_components=2
    )
    pca_task.set_caching_options(enable_caching=False)

In [14]:
compiler.Compiler().compile(
    pipeline_func=pca_pipeline, package_path="pca_pipeline.yaml"
)

In [15]:
job = aiplatform.PipelineJob(
    display_name="pca-pipeline",
    template_path="pca_pipeline.yaml",
    pipeline_root=PIPELINE_ROOT,
)

job.run()

Creating PipelineJob
PipelineJob created. Resource name: projects/255766800726/locations/us-central1/pipelineJobs/pca-pipeline-20250225235707
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/255766800726/locations/us-central1/pipelineJobs/pca-pipeline-20250225235707')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/pca-pipeline-20250225235707?project=255766800726
PipelineJob projects/255766800726/locations/us-central1/pipelineJobs/pca-pipeline-20250225235707 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/255766800726/locations/us-central1/pipelineJobs/pca-pipeline-20250225235707 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/255766800726/locations/us-central1/pipelineJobs/pca-pipeline-20250225235707 current state:
PipelineState.PIPELINE_STATE_RUNNING


RuntimeError: Job failed with:
code: 9
message: " The DAG failed because some tasks failed. The failed tasks are: [perform-pca].; Job (project_id = my-project-0004-346516, job_id = 7279514781222961152) is failed due to the above error.; Failed to handle the job: {project_number = 255766800726, job_id = 7279514781222961152}"


### Suggestion - can be deleted

In [18]:
# web_downloader_op = kfp.components.load_component_from_url(

#     'https://raw.githubusercontent.com/kubeflow/pipelines/master/components/web/Download/component-sdk-v2.yaml')


In [19]:
@component(

    packages_to_install=['pandas==1.1.4'],

    output_component_file='component.yaml'

)

def merge_csv(tar_data: Input[Artifact], output_csv: Output[Dataset]):

  import glob

  import pandas as pd

  import tarfile




  tarfile.open(name=tar_data.path, mode="r|gz").extractall('data')

  df = pd.concat(

      [pd.read_csv(csv_file, header=None) 

       for csv_file in glob.glob('data/*.csv')])

  df.to_csv(output_csv.path, index=False, header=False)


  @component(
  def merge_csv(tar_data: Input[Artifact], output_csv: Output[Dataset]):
  return component_factory.create_component_from_func(


In [21]:
@dsl.pipeline(
    name="pca-pipeline",
    description="A pipeline that performs PCA on the Iris dataset"
)
def pca_pipeline():
    pca_task = perform_pca(
        n_components=2
    )
    pca_task.set_caching_options(enable_caching=False)

In [23]:
kfp.compiler.Compiler().compile(

    pipeline_func=pca_pipeline,

    package_path='pca_pipeline.yaml')



In [None]:
job = aiplatform.PipelineJob(
    display_name="pca-pipeline",
    template_path="pca_pipeline.yaml",
    pipeline_root=PIPELINE_ROOT,
)

job.run()

Creating PipelineJob
PipelineJob created. Resource name: projects/255766800726/locations/us-central1/pipelineJobs/pca-pipeline-20250226001212
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/255766800726/locations/us-central1/pipelineJobs/pca-pipeline-20250226001212')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/pca-pipeline-20250226001212?project=255766800726
PipelineJob projects/255766800726/locations/us-central1/pipelineJobs/pca-pipeline-20250226001212 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/255766800726/locations/us-central1/pipelineJobs/pca-pipeline-20250226001212 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/255766800726/locations/us-central1/pipelineJobs/pca-pipeline-20250226001212 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/255766800726/locations/us-central1/pipelineJobs/pca-pipeline-2025022