In [None]:
# PARAMETERIZED VARIABLES
# NOTE(review): every value below is a `{{NAME}}` placeholder - presumably substituted by a
# templating step before this notebook is executed (confirm against the deployment tooling).
# Quoted placeholders end up as Python string literals; the unquoted ones (FEATURE_LIST,
# AUDIENCE_ID, MAX_MSISDN_COUNT) must be replaced with a valid Python literal
# (a list of column names and two integers, going by how the pipeline parameters are typed).

# GCP project used for the Vertex AI client and passed to every component.
PROJECT_ID = "{{PROJECT_ID}}"
# Bucket prefix; the organization id is appended to form the result location.
TARGET_BUCKET = "{{TARGET_BUCKET}}"
# Suffix for the pipeline name, the pipeline root and the result location.
ORGANIZATION_ID = "{{ORGANIZATION_ID}}"
# Table queried for the feature columns.
DATAFOUNDATION_TABLE = "{{DATAFOUNDATION_TABLE}}"
# Staging bucket for Vertex AI and parent of the pipeline root.
BUCKET_PIPELINE = "{{BUCKET_PIPELINE}}"
# Feature columns selected from the table and used for the distance computation.
FEATURE_LIST = {{FEATURE_LIST}}
# Audience id written next to every msisdn and included in the Pub/Sub message.
AUDIENCE_ID = {{AUDIENCE_ID}}
# Pub/Sub topic that receives the path of the written result file.
TOPIC_ID = "{{TOPIC_ID}}"
# Maximum number of msisdns kept in the result.
MAX_MSISDN_COUNT = {{MAX_MSISDN_COUNT}}
# CSV file with an `msisdn` column holding the seed sample.
MSISDN_SAMPLE_PATH = "{{MSISDN_SAMPLE_PATH}}"

In [14]:
# # PARAMETERIZED VARIABLES (example values for a manual run; kept commented out)

# PROJECT_ID = "sbox-ext-collab-prd-50f1"
# TARGET_BUCKET = "gs://model-results-lookalike-model-sandbox"
# ORGANIZATION_ID = "CUSTOMER_A"
# # DATAFOUNDATION_TABLE = "owned_summary.df_customer_data_profile"
# DATAFOUNDATION_TABLE = "owned_summary.test_heuristic_lookalike"
# BUCKET_PIPELINE = "gs://bucket-collab-prd/Arifian/vertex_pipeline_files"
# FEATURE_LIST = ['age','tenure','total_arpu','data_usage_in_mb','data_usage_duration','total_topups',
#             'number_of_topups_1m','total_topups_1m','total_usage_GB_1m','daily_GB_consumption_rate_1m','number_of_topups_2m',
#              'total_topups_2m','total_usage_GB_2m','daily_GB_consumption_rate_2m','number_of_topups_3m','total_topups_3m','total_usage_GB_3m',
#              'daily_GB_consumption_rate_3m']
# AUDIENCE_ID = 1
# TOPIC_ID = "tempbucket-to-audience-table"
# MAX_MSISDN_COUNT = 100
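# MSISDN_SAMPLE_LIST = ['Yia/XQuEu4dKaKi3jf29xbZsqsRV0XCHw9QkYR1gk8k=']
# NOTE: the pipeline reads the seed msisdns from MSISDN_SAMPLE_PATH (a CSV with an
# `msisdn` column), not from an inline MSISDN_SAMPLE_LIST; define MSISDN_SAMPLE_PATH
# when running with these example values.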

In [2]:
# Per-organization folder under the target bucket where results are written.
TARGET_GCS_LOCATION = f"{TARGET_BUCKET}/{ORGANIZATION_ID}"

In [3]:
# Base name shared by the compiled pipeline file and the job display name.
pipeline_name = f"generic_lookalike_python_{ORGANIZATION_ID}"

In [4]:
from datetime import datetime

# Region used for the Vertex AI client and for the custom job step.
REGION = "asia-southeast2"
# Run timestamp (YYYYmmddHHMMSS) appended to the job display name.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"
# GCS prefix under which this organization's pipeline artifacts are stored.
PIPELINE_ROOT = BUCKET_PIPELINE + "/pipeline_root/heuristic_model_python_" + ORGANIZATION_ID

In [5]:
import google.cloud.aiplatform as aip

from typing import NamedTuple
from kfp.v2 import dsl
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath, component)

In [6]:
# Configure the Vertex AI SDK once for the whole notebook.
aip.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=BUCKET_PIPELINE,
)

In [7]:
import tempfile
# Local scratch directory that receives the generated component YAML files
# and the compiled pipeline JSON.
tmpdir = tempfile.gettempdir()

In [16]:
# get msisdn sample list

@component(
    base_image='python:3.9',
    packages_to_install=[
        "pandas==1.5.2", "gcsfs==2022.11.0", "pyarrow==10.0.1"
    ],
    output_component_file=tmpdir+"/get_msisdns_seed.yaml"
)
def get_msisdns_seed(
    msisdn_sample_path: str, project_id: str
) -> NamedTuple('Outputs', [('msisdn_sample', list)]):
    """Read the seed msisdns from a CSV file.

    Args:
        msisdn_sample_path: Location of a CSV file that has an ``msisdn`` column.
        project_id: Not used by this step; kept so the component interface
            stays unchanged for existing callers.

    Returns:
        A named tuple whose ``msisdn_sample`` field is the list of non-missing
        values found in the ``msisdn`` column.

    Raises:
        KeyError: If the CSV has no ``msisdn`` column.
        ValueError: If the column holds no usable value.
    """
    # All imports stay inside the function: the component body is executed on
    # its own, so it cannot rely on the notebook's imports.
    from typing import NamedTuple

    import gcsfs  # noqa: F401 - not referenced directly; presumably kept so a missing gcsfs install fails here (TODO confirm)
    import pandas as pd

    # Missing values would otherwise reach the output list as NaN.
    seed_series = pd.read_csv(msisdn_sample_path)['msisdn'].dropna()
    msisdn_list = seed_series.values.tolist()

    # Fail here with a clear message instead of later in the distance step.
    if not msisdn_list:
        raise ValueError(
            f"No msisdn values found in column 'msisdn' of {msisdn_sample_path}"
        )

    outputs = NamedTuple('Outputs', [('msisdn_sample', list)])
    return outputs(msisdn_list)

In [17]:
# script heuristic lookalike

@component(
    base_image='python:3.9',
    packages_to_install=[
        "pandas==1.3.5", "google-cloud-bigquery==2.34.4", "pyarrow==10.0.0",
        "numpy==1.21.6", 
        "scikit-learn==1.0.2",
        "scipy==1.7.3",
    ],
    output_component_file=tmpdir+"/heuristic_script_python.yaml"
)
def heuristic_script_python(
    lookalike_result: Output[Dataset], table_full_name: str,
    project_id: str, feature_list: list, msisdn_sample_list: list,
    max_msisdn_count: int
):
    """Select the population msisdns whose features are closest to the seed set.

    The features of every msisdn are read from BigQuery, rows are split into
    seed (msisdn in ``msisdn_sample_list``) and population (all others), and
    each population row is scored by its Euclidean distance to the nearest
    seed row. The ``max_msisdn_count`` rows with the smallest distance are
    written to ``lookalike_result`` as a one-column parquet file.

    Args:
        lookalike_result: Output artifact; the parquet file is written to its path.
        table_full_name: Table to query, interpolated into the FROM clause.
        project_id: Project passed to the BigQuery client.
        feature_list: Column names selected from the table and used as features.
        msisdn_sample_list: Seed msisdns.
        max_msisdn_count: Number of rows kept in the result.

    Raises:
        ValueError: If ``feature_list`` is empty or contains a name that is not
            a plain column identifier, or if no seed msisdn is found in the table.
    """
    import gc
    import re

    import numpy as np
    import pandas as pd
    from google.cloud import bigquery
    from sklearn.metrics import pairwise_distances_argmin_min

    # The column names are interpolated into the SQL text, so only plain
    # identifiers are accepted.
    if not feature_list:
        raise ValueError("feature_list must contain at least one column name")
    invalid_names = [
        name for name in feature_list
        if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", str(name))
    ]
    if invalid_names:
        raise ValueError(f"Invalid feature column names: {invalid_names}")

    client = bigquery.Client(project_id)

    feature_list_str = ', '.join(feature_list)

    # NOTE(review): the partition month is hard-coded; confirm whether it
    # should become a pipeline parameter.
    sql = f"""SELECT 
    msisdn, {feature_list_str}
FROM `{table_full_name}`
WHERE (partition_month = '2022-10-01')"""
    df = client.query(sql).to_dataframe()

    df = df.fillna(0)

    # Split into seed rows and the remaining population.
    is_seed = df['msisdn'].isin(msisdn_sample_list)
    seed_features = df.loc[is_seed].drop(columns='msisdn').to_numpy(dtype=float)
    population_features = df.loc[~is_seed].drop(columns='msisdn').to_numpy(dtype=float)
    df_pop_msisdn = df.loc[~is_seed, ['msisdn']].reset_index(drop=True)

    # The full frame is no longer needed; release it before the distance step.
    del df
    gc.collect()

    if seed_features.shape[0] == 0:
        raise ValueError(
            "None of the seed msisdns were found in the queried table; "
            "cannot compute lookalike distances"
        )

    if population_features.shape[0] == 0:
        # Nothing to rank: the result is an empty msisdn list.
        nearest_seed_distance = np.empty(0, dtype=float)
    else:
        # For each population row, the distance to its closest seed row.
        # One library call replaces the former per-row Python loop.
        _, nearest_seed_distance = pairwise_distances_argmin_min(
            population_features, seed_features, metric='euclidean'
        )

    df_pop_msisdn['Distance'] = nearest_seed_distance

    # Smallest distance first: lookalikes are the rows nearest to the seed.
    # (The previous descending sort returned the farthest rows.)
    df_population_dist_agg = df_pop_msisdn.sort_values('Distance', ascending=True)

    df_population_dist_agg = df_population_dist_agg[['msisdn']][0:max_msisdn_count]

    df_population_dist_agg.to_parquet(lookalike_result.path, index=False)

In [18]:
# write to gcs

@component(
    base_image='python:3.9',
    # "datetime" was removed from this list: the module used below is part of
    # the standard library, and installing that name from PyPI pulls an
    # unrelated third-party distribution.
    packages_to_install=[
        "pandas==1.5.2", "gcsfs==2022.11.0", "pyarrow==10.0.1"
    ],
    output_component_file=tmpdir+"/write_to_gcs.yaml"
)
def write_to_gcs(
    data: Input[Dataset], project_id: str, 
    audience_id: int, bucket: str
) -> NamedTuple('Outputs', [('data_gcs_path', str)]):
    """Write the lookalike msisdns as a header-less CSV under ``bucket``.

    Args:
        data: Input artifact; a parquet file with an ``msisdn`` column.
        project_id: Not used by this step; kept so the component interface
            stays unchanged for existing callers.
        audience_id: Value written in the first CSV column of every row.
        bucket: Destination prefix; a trailing ``/`` is tolerated.

    Returns:
        A named tuple whose ``data_gcs_path`` field is the path of the CSV
        file that was written.
    """
    from datetime import datetime
    from typing import NamedTuple

    import pandas as pd

    # The timestamp makes the file name unique per run.
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

    df_to_store = pd.read_parquet(data.path)
    df_to_store['audience_id'] = audience_id
    # Fix the column order of the header-less CSV: audience_id, msisdn.
    df_to_store = df_to_store[['audience_id', 'msisdn']]

    # Strip a trailing slash so the object path never contains "//".
    path = f"{bucket.rstrip('/')}/generic_python_model_{timestamp}.csv"

    df_to_store.to_csv(path, index=False, header=False)

    outputs = NamedTuple('Outputs', [('data_gcs_path', str)])
    return outputs(path)

In [19]:
# publish to pubsub

@component(
    base_image='python:3.9',
    # "datetime" was removed from this list: nothing in this component uses
    # it, and installing that name from PyPI pulls an unrelated third-party
    # distribution.
    packages_to_install=[
        "google-cloud-pubsub==2.13.4"
    ],
    output_component_file=tmpdir+"/aud_data_to_pubsub.yaml"
)
def aud_data_to_pubsub(
    data_gcs_path: str, topic_id: str,
    project_id: str, audience_id: int
):
    """Publish a JSON message announcing the written audience file.

    The message has two keys, ``audience_id`` and ``file_path``.

    Args:
        data_gcs_path: Path of the file to announce.
        topic_id: Pub/Sub topic to publish to.
        project_id: Project that owns the topic.
        audience_id: Audience id included in the message.
    """
    import json

    from google.cloud import pubsub_v1

    publisher = pubsub_v1.PublisherClient()
    topic_path = publisher.topic_path(project_id, topic_id)

    message = {
        'audience_id': audience_id,
        'file_path': data_gcs_path,
    }

    future = publisher.publish(topic_path, json.dumps(message).encode("utf-8"))
    # Block until the publish completes and print what result() returns.
    print(future.result())
    print(f"Published messages to {topic_path}.")


In [20]:
from google_cloud_pipeline_components.v1.custom_job import create_custom_training_job_from_component

# Wrap the lookalike component as a custom training job so the step
# runs on the machine type requested here.
heuristic_script_python_v2 = create_custom_training_job_from_component(
    heuristic_script_python,
    machine_type='n1-highmem-16',
    display_name='heuristic_script_python',
)


In [12]:
@dsl.pipeline(
    name=pipeline_name.replace("_","-").lower(),
    pipeline_root=PIPELINE_ROOT,
)
def pipeline(
    target_bucket: str,
    datafoundation_table_fullname: str,
    project_id: str,
    feature_list: list,
    msisdn_sample_path: str,
    max_msisdn_count: int,
    audience_id: int,
    topic_id: str
):
    # Step 1: load the seed msisdns from the sample file.
    seed_task = get_msisdns_seed(
        project_id=project_id,
        msisdn_sample_path=msisdn_sample_path,
    )

    # Step 2: score the population against the seed (runs as a custom job,
    # hence the extra project/location arguments).
    lookalike_task = heuristic_script_python_v2(
        project=project_id,
        location=REGION,
        project_id=project_id,
        table_full_name=datafoundation_table_fullname,
        feature_list=feature_list,
        msisdn_sample_list=seed_task.outputs['msisdn_sample'],
        max_msisdn_count=max_msisdn_count,
    )

    # Step 3: store the selected msisdns as a CSV under the target bucket.
    export_task = write_to_gcs(
        project_id=project_id,
        audience_id=audience_id,
        bucket=target_bucket,
        data=lookalike_task.outputs["lookalike_result"],
    )

    # Step 4: announce the written file on the Pub/Sub topic.
    aud_data_to_pubsub(
        project_id=project_id,
        topic_id=topic_id,
        audience_id=audience_id,
        data_gcs_path=export_task.outputs['data_gcs_path'],
    )


In [13]:
from kfp.v2 import compiler

# Compile the pipeline definition to a JSON file in the scratch directory.
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path=f"{tmpdir}/{pipeline_name}.json",
)



In [None]:
# The timestamp makes the display name unique per submission.
DISPLAY_NAME = pipeline_name + "_" + TIMESTAMP

# Submit the compiled pipeline. Each key in parameter_values matches a
# parameter of the `pipeline` function ("audience_id" was listed twice
# before; the duplicate entry is removed).
job = aip.PipelineJob(
    display_name=DISPLAY_NAME,
    template_path=tmpdir+f"/{pipeline_name}.json",
    pipeline_root=PIPELINE_ROOT,
    parameter_values={
        "project_id": PROJECT_ID,
        "datafoundation_table_fullname": DATAFOUNDATION_TABLE,
        "feature_list": FEATURE_LIST,
        "target_bucket": TARGET_GCS_LOCATION,
        "msisdn_sample_path": MSISDN_SAMPLE_PATH,
        "max_msisdn_count": MAX_MSISDN_COUNT,
        "audience_id": AUDIENCE_ID,
        "topic_id": TOPIC_ID
    },
    enable_caching=False
)
# sync=False: do not block the notebook while the job runs.
job.run(sync=False)

Creating PipelineJob
PipelineJob created. Resource name: projects/731696491468/locations/asia-southeast2/pipelineJobs/generic-lookalike-python-customer-a-20221221082715
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/731696491468/locations/asia-southeast2/pipelineJobs/generic-lookalike-python-customer-a-20221221082715')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/asia-southeast2/pipelines/runs/generic-lookalike-python-customer-a-20221221082715?project=731696491468
PipelineJob projects/731696491468/locations/asia-southeast2/pipelineJobs/generic-lookalike-python-customer-a-20221221082715 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/731696491468/locations/asia-southeast2/pipelineJobs/generic-lookalike-python-customer-a-20221221082715 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/731696491468/locations/asia-southeast2/pipelineJobs/generic-lookalike-python-custo