In [1]:
# Google Cloud project ID handed to the pipeline job when it is submitted.
GOOGLE_CLOUD_PROJECT = "nyc-transit-426211"
# User handle; interpolated into the gs://{USER} base path given to the pipeline.
USER = "andrub818"

In [None]:
from datetime import date

import google.cloud.aiplatform as aip
from kfp import dsl, compiler
from kfp.dsl import component

@component(
    base_image='python:3.13',
    packages_to_install=[
        'pandas',
        'prophet',
        'google-cloud-bigquery',
        'db-dtypes',  # needed by google-cloud-bigquery's to_dataframe()
        'google-cloud-storage',
        'gcsfs',
        'joblib==1.3.2',
    ]
)
def preprocess_and_train(base_path: str, dt: str) -> str:
    """Fit a Prophet model on daily crash counts and save it to GCS.

    Queries BigQuery for the number of crashes per ``crash_date`` up to and
    including the date 61 days before today, fits a default ``Prophet`` model
    on the resulting ``ds``/``y`` frame, and writes the fitted model to
    ``{base_path}/trained_models/model_{dt}.pickle`` with ``joblib``.

    Args:
        base_path: GCS prefix (for example ``gs://<bucket>``) under which the
            ``trained_models/`` folder is written.
        dt: Run identifier embedded in the model file name.

    Returns:
        ``base_path`` unchanged, so a downstream component can consume it.

    Raises:
        ValueError: If the query returns fewer than two rows, which is not
            enough history to fit a model.
    """
    # KFP lightweight components run this function standalone inside the
    # container, so everything it needs is imported here.
    from datetime import date, timedelta

    import gcsfs
    import joblib
    import pandas as pd
    from google.cloud import bigquery
    from prophet import Prophet

    model_path = f"{base_path}/trained_models/model_{dt}.pickle"

    # The most recent 60 days are excluded from training. The cutoff is based
    # on the container's current date, not on `dt`.
    # NOTE(review): confirm whether the cutoff should be relative to `dt`.
    cutoff_date = (date.today() - timedelta(days=61)).isoformat()

    client = bigquery.Client()
    # cutoff_date is computed above (never user-supplied), so interpolating it
    # into the SQL text is safe.
    query = f"""
        select
            crash_date as ds,
            count(*) as y
        from `nyc-transit-426211.motor_vehicle_crashes.crashes`
        where crash_date <= '{cutoff_date}'
        group by 1
        order by 1 asc
    """

    daily_counts = client.query(query).to_dataframe()
    if len(daily_counts) < 2:
        raise ValueError(
            f"Query returned {len(daily_counts)} row(s) on or before {cutoff_date}; "
            "at least 2 are required to fit the model."
        )

    # BigQuery returns extension dtypes (db-dtypes dates, nullable Int64);
    # convert to the plain datetime/float columns Prophet works with.
    daily_counts["ds"] = pd.to_datetime(daily_counts["ds"])
    daily_counts["y"] = daily_counts["y"].astype("float64")

    model = Prophet()
    model.fit(daily_counts)
    print("Training complete!")

    # Save the model to GCS.
    # NOTE(review): Prophet's documentation recommends
    # prophet.serialize.model_to_json over pickling; joblib is kept here so the
    # file format on disk does not change.
    fs = gcsfs.GCSFileSystem()
    with fs.open(model_path, 'wb') as f:
        joblib.dump(model, f)

    return base_path
    

@component(
    base_image='python:3.9',
    packages_to_install=['pandas', 'scikit-learn', 'google-cloud-storage', 'gcsfs', 'joblib==1.3.2']
)
def evaluate_model(base_path: str) -> None:
    """Load a saved model and test set from GCS and log a classification report.

    Reads ``{base_path}/trained_models/model.joblib`` plus the test features
    and labels CSVs under ``{base_path}/preprocessed_data/``, predicts on the
    features, and logs scikit-learn's ``classification_report`` at INFO level.

    NOTE(review): ``preprocess_and_train`` in this notebook saves its model as
    ``trained_models/model_{dt}.pickle`` (a Prophet forecaster), which is not
    the path or model type read here. Confirm that the upstream step really
    produces the artifacts this component consumes.

    Args:
        base_path: GCS prefix that contains ``trained_models/`` and
            ``preprocessed_data/``.
    """
    import logging

    import gcsfs
    import joblib
    import pandas as pd
    from sklearn.metrics import classification_report

    model_path = f"{base_path}/trained_models/model.joblib"
    test_features_path = f"{base_path}/preprocessed_data/test_features.csv"
    test_labels_path = f"{base_path}/preprocessed_data/test_labels.csv"

    fs = gcsfs.GCSFileSystem()

    # Load the held-out features and labels.
    with fs.open(test_features_path, 'r') as f:
        features_test_standardized = pd.read_csv(f)

    with fs.open(test_labels_path, 'r') as f:
        target_test_encoded = pd.read_csv(f)

    # Load the model from GCS. joblib.load unpickles the file, so only point
    # this at artifacts written by a trusted pipeline run.
    with fs.open(model_path, 'rb') as model_file:
        model = joblib.load(model_file)

    predictions = model.predict(features_test_standardized)

    # classification_report expects (y_true, y_pred); passing them the other
    # way round swaps precision with recall in the report.
    report = classification_report(target_test_encoded, predictions)

    # Log the rendered report text, not the classification_report function.
    logging.info(report)

In [None]:
@dsl.pipeline(
    name='iris-classification-pipeline',
    description='An example pipeline that trains and evaluates an Iris classification model.'
)
def iris_pipeline(dt: str = date.today().isoformat()):
    """Run training, then evaluation, against the user's GCS bucket.

    Args:
        dt: Run identifier forwarded to ``preprocess_and_train``, which embeds
            it in the saved model's file name. The default is today's date in
            ISO format, evaluated once when this cell runs and therefore baked
            into the compiled template unless overridden at submission.
    """
    # `dt` is a required input of preprocess_and_train; omitting it makes the
    # pipeline definition fail.
    preprocess_and_train_task = preprocess_and_train(
        base_path=f"gs://{USER}",
        dt=dt,
    )
    # Consuming the training task's output makes evaluation run after it.
    evaluate_model(base_path=preprocess_and_train_task.output)

# Compile the pipeline function into a JSON template on local disk.
pipeline_func = iris_pipeline
pipeline_filename = "preprocess_train_evaluate.json"
compiler.Compiler().compile(
    pipeline_func=pipeline_func,
    package_path=pipeline_filename,
)

# Build a pipeline job from the compiled template and submit it.
job = aip.PipelineJob(
    display_name="my_pipeline",
    template_path=pipeline_filename,
    project=GOOGLE_CLOUD_PROJECT,
)
job.submit()