In [1]:
PROJECT_ID = 'churn-smu'
GCS_BUCKET = 'practice-smu-123'

## Docker Image for data ingestion

### PY file

In [15]:
%%bash

# Create folders if they don't exist.
mkdir -p tmp/components/ingest/

# Create the Python file that lists GCS blobs.
cat > ./tmp/components/ingest/data_ingest.py <<HERE
import argparse
import pandas as pd

parser = argparse.ArgumentParser()
parser.add_argument(
    '--datapath', type=str, required=True, help='Name of the model file.')
parser.add_argument(
    '--dataset', type=str, required=True, help='GCS bucket name.')

args = parser.parse_args()
datapath=args.datapath
dataset=args.dataset

df_churn = pd.read_csv(datapath)

FEATURE_NAMES = ["GENDER_CD", "EDUCATION_CD", "TOT_SRV_DROPPED_CNT", "TOT_OB_CALL_INTL_ROAM_CNT",
                'BARRING_REASON_CD',  "TOT_SRV_ADDED_CNT", "SUBS_TENURE", "TOT_DAY_LAST_COMPLAINT_CNT",
                "TOT_DAY_LAST_SUSPENDED_CNT", "MTH_TO_SUBS_END_CNT", 'REV_AMT_BASE_1', 'REV_AMT_BASE_2',
                'CUST_AGE','PCT_CHNG_IB_SMS_CNT','CHURN_FLG']

df_churn = df_churn[FEATURE_NAMES]
df_churn.to_csv(dataset + ".csv", index=False, encoding = 'utf-8-sig')

HERE

### Dockerise

In [16]:
%%bash
cat > ./tmp/components/ingest/Dockerfile <<EOF
FROM python:3.9
WORKDIR /digest
COPY data_ingest.py data_ingest.py
RUN pip install pandas
RUN pip install gcsfs
ENV PYTHONPATH="/digest:${PYTHONPATH}"
EOF

### Build image and push

In [17]:
%%bash -s "{PROJECT_ID}"
IMAGE_NAME="churn-data-digestion"
TAG="latest"
cat > ./tmp/components/ingest/build_image.sh <<HERE
PROJECT_ID="${1}"
IMAGE_NAME="${IMAGE_NAME}"
TAG="${TAG}"
GCR_IMAGE="gcr.io/\${PROJECT_ID}/\${IMAGE_NAME}:\${TAG}"
echo ${GCR_IMAGE}
docker build -t \${IMAGE_NAME} .
docker tag \${IMAGE_NAME} \${GCR_IMAGE}
docker push \${GCR_IMAGE}
HERE
cd tmp/components/ingest
bash build_image.sh

# Docker Image for Data Validation

In [113]:
%%bash

# Create folders if they don't exist.
mkdir -p tmp/components/tfdv/

# Create the Python file that lists GCS blobs.
cat > ./tmp/components/tfdv/tfdv.py <<HERE

import argparse
from google.cloud import storage
import tensorflow_data_validation as tfdv
import tensorflow_data_validation.statistics.stats_impl
import pandas as pd
from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions, StandardOptions, SetupOptions

parser = argparse.ArgumentParser()
parser.add_argument(
    '--input_data', type=str, required=True, help='input_data')
parser.add_argument(
    '--output_data', type=str, required=True, help='output_data')
parser.add_argument(
    '--project_id', type=str, required=True, help='project_id')
parser.add_argument(
    '--region', type=str, required=True, help='region')
parser.add_argument(
    '--gcs_temp_location', type=str, required=True, help='gcs_temp_location')
parser.add_argument(
    '--gcs_staging_location', type=str, required=True, help='gcs_staging_location')
parser.add_argument(
    '--bucket', type=str, required=True, help='bucket')

args = parser.parse_args()
input_data=args.input_data
project_id=args.project_id
region=args.region
output_data=args.output_data
gcs_temp_location=args.gcs_temp_location
gcs_staging_location=args.gcs_staging_location
bucket=args.bucket

job_name = 'dv4'
pre_data = pd.read_csv(input_data + '.csv')
schema_path = 'churn/metadata/schema/orig_stats.pb'

# Create and set your PipelineOptions.
options = PipelineOptions()

# For Cloud execution, set the Cloud Platform project, job_name,
# staging location, temp_location and specify DataflowRunner.
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = project_id
google_cloud_options.job_name = job_name
google_cloud_options.staging_location = gcs_staging_location
google_cloud_options.temp_location = gcs_temp_location
google_cloud_options.region = region
options.view_as(StandardOptions).runner = 'DataflowRunner'

setup_options = options.view_as(SetupOptions)
# PATH_TO_WHL_FILE should point to the downloaded tfdv wheel file.
setup_options.extra_packages = ['tensorflow_data_validation-1.8.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl']

storage_client = storage.Client()
storage_bucket = storage_client.bucket(bucket)
schema_exist = storage.Blob(bucket=storage_bucket, name=schema_path).exists(storage_client)

res = 'true'

NUMERICAL_FEATURE_NAMES = [
    "SUBS_TENURE",
    "TOT_DAY_LAST_COMPLAINT_CNT",
    "TOT_DAY_LAST_SUSPENDED_CNT",
    "MTH_TO_SUBS_END_CNT",
    'REV_AMT_BASE_1',
    'REV_AMT_BASE_2',
    'CUST_AGE',
    'PCT_CHNG_IB_SMS_CNT'
]

EMBEDDING_CATEGORICAL_FEATURES = [
    "GENDER_CD",
    "EDUCATION_CD"]

if schema_exist:

    new_stats = tfdv.generate_statistics_from_csv(input_data.path + '.csv',
                                                  output_path=f'gs://{bucket}/churn/tmp/temp.pb',
                                                 pipeline_options=options,
                                                       )

    old_stats = tfdv.load_statistics(f'gs://{bucket}/churn/metadata/schema/orig_stats.pb')

    schema1 = tfdv.infer_schema(statistics=old_stats)
    for feature in NUMERICAL_FEATURE_NAMES:
        tfdv.get_feature(schema1, feature).drift_comparator.jensen_shannon_divergence.threshold = 0.15

    for feature in EMBEDDING_CATEGORICAL_FEATURES:
        tfdv.get_feature(schema1, feature).drift_comparator.infinity_norm.threshold = 0.1

    drift_anomalies = tfdv.validate_statistics(
        statistics=new_stats, schema=schema1, previous_statistics=old_stats)

    from google.protobuf.json_format import MessageToDict
    d = MessageToDict(drift_anomalies)
    val = d['driftSkewInfo'][0]['driftMeasurements'][0]['value']
    thresh = d['driftSkewInfo'][0]['driftMeasurements'][0]['threshold']

    if val < thresh:
        res = 'false'

if not schema_exist:    

    tfdv.generate_statistics_from_csv(input_data + '.csv',
                                      output_path=f'gs://{bucket}/churn/metadata/schema/orig_stats.pb',
                                      pipeline_options=options,
                                           )

assert res == 'true', "Data Validation failed"

HERE

In [114]:
%%bash
cat > ./tmp/components/tfdv/Dockerfile <<EOF
FROM gcr.io/churn-smu/churn-tfdv:latest
WORKDIR /tfdv
COPY tfdv.py tfdv.py

RUN pip install pandas
RUN pip install gcsfs
RUN pip install google-cloud-storage

ENV PYTHONPATH="/digest:${PYTHONPATH}"
EOF

In [115]:
%%bash -s "{PROJECT_ID}"
IMAGE_NAME="churn-data-tfdv"
TAG="latest"
cat > ./tmp/components/tfdv/build_image.sh <<HERE
PROJECT_ID="${1}"
IMAGE_NAME="${IMAGE_NAME}"
TAG="${TAG}"
GCR_IMAGE="gcr.io/\${PROJECT_ID}/\${IMAGE_NAME}:\${TAG}"
echo ${GCR_IMAGE}
docker build -t \${IMAGE_NAME} .
docker tag \${IMAGE_NAME} \${GCR_IMAGE}
docker push \${GCR_IMAGE}
HERE
cd tmp/components/tfdv
bash build_image.sh


Sending build context to Docker daemon   7.68kB
Step 1/7 : FROM gcr.io/churn-smu/churn-tfdv:latest
latest: Pulling from churn-smu/churn-tfdv
f22ccc0b8772: Pulling fs layer
3cf8fb62ba5f: Pulling fs layer
e80c964ece6a: Pulling fs layer
b37f61c40172: Pulling fs layer
8c47335e6fbf: Pulling fs layer
b4130fb48840: Pulling fs layer
2d065d739ca2: Pulling fs layer
c785fe321ad3: Pulling fs layer
fa18799a91d9: Pulling fs layer
956e27097e62: Pulling fs layer
ab616973205e: Pulling fs layer
28adb37a4160: Pulling fs layer
f224a69aa011: Pulling fs layer
7f850b9da14b: Pulling fs layer
b4130fb48840: Waiting
d3078a090cbd: Pulling fs layer
2d065d739ca2: Waiting
5a481cbb57e8: Pulling fs layer
234007c5a90e: Pulling fs layer
a03c6fd6bc3e: Pulling fs layer
ee1287759c0d: Pulling fs layer
23e9a987485c: Pulling fs layer
f5d6f69c6295: Pulling fs layer
6d47547837e5: Pulling fs layer
11b712f0bf96: Pulling fs layer
bb003bf5d206: Pulling fs layer
9bf87f0a51a6: Pulling fs layer
f6f17e35ab72: Pulling fs layer
b37f61c4

# Docker Image for Data Imputation & Store

In [42]:
%%bash

# Create folders if they don't exist.
mkdir -p tmp/components/impute_and_store/

# Create the Python file that lists GCS blobs.
cat > ./tmp/components/impute_and_store/impute_and_store.py <<HERE
import argparse
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np
import pickle
import google.cloud.storage as storage

parser = argparse.ArgumentParser()
parser.add_argument(
    '--pre_impute_dataset', type=str, required=True, help='path of dataset input')
parser.add_argument(
    '--post_impute_dataset', type=str, required=True, help='path of dataset output')
parser.add_argument(
    '--bucket', type=str, required=True, help='GCS Bucket')

args = parser.parse_args()
pre_impute_dataset=args.pre_impute_dataset
post_impute_dataset=args.post_impute_dataset
bucket_name = args.bucket

def recode_TOT_OB_CALL_INTL_ROAM_CNT(data):
    if data <= 100:
        return("<=100")
    elif data <= 200:
        return("<=200")
    else:
        return(">200")

def recode_education(data):
    if data == " .":
        return("none")
    elif int(data) >= 3:
        return(">=3")
    else:
        return(data)

NUMERICAL_FEATURE_NAMES = [
    "SUBS_TENURE",
    "TOT_DAY_LAST_COMPLAINT_CNT",
    "TOT_DAY_LAST_SUSPENDED_CNT",
    "MTH_TO_SUBS_END_CNT",
    'REV_AMT_BASE_1',
    'REV_AMT_BASE_2',
    'CUST_AGE',
    'PCT_CHNG_IB_SMS_CNT'
]

EMBEDDING_CATEGORICAL_FEATURES = [
    "GENDER_CD",
    "EDUCATION_CD",
    "TOT_SRV_DROPPED_CNT",
    "TOT_OB_CALL_INTL_ROAM_CNT",
    'BARRING_REASON_CD', 
    "TOT_SRV_ADDED_CNT"]

TARGET_LABEL = ['CHURN_FLG']

pre_data = pd.read_csv(pre_impute_dataset + '.csv')
pre_data["EDUCATION_CD"] = pre_data["EDUCATION_CD"].apply(recode_education)
pre_data["TOT_OB_CALL_INTL_ROAM_CNT"] = pre_data["TOT_OB_CALL_INTL_ROAM_CNT"].apply(recode_TOT_OB_CALL_INTL_ROAM_CNT)
pre_data_cat = pre_data[EMBEDDING_CATEGORICAL_FEATURES]
pre_data_num = pre_data[NUMERICAL_FEATURE_NAMES]

bucket = storage.Client().bucket(bucket_name)

cat_imp = SimpleImputer(strategy = 'most_frequent')
num_imp = SimpleImputer(strategy = 'median')
imputed_data_cat = cat_imp.fit_transform(pre_data_cat)
imputed_data_num = num_imp.fit_transform(pre_data_num)

# Store Cat Imputer
cat_imp_name = "cat_imputer" + f".pkl"
with open(cat_imp_name, 'wb') as file:  
    pickle.dump(cat_imp, file)

blob = bucket.blob('{}/{}'.format("churn/artifact/preprocess",
                                cat_imp_name))
blob.upload_from_filename(cat_imp_name)

# Store Num Imputer
num_imp_name = "num_imputer" + f".pkl"
with open(num_imp_name, 'wb') as file:  
    pickle.dump(num_imp, file)

blob = bucket.blob('{}/{}'.format("churn/artifact/preprocess",
                                num_imp_name))
blob.upload_from_filename(num_imp_name)  

out_df = pd.DataFrame(np.concatenate((imputed_data_cat, imputed_data_num, pre_data[TARGET_LABEL].values),axis = 1),
                    columns = EMBEDDING_CATEGORICAL_FEATURES + NUMERICAL_FEATURE_NAMES + TARGET_LABEL)
out_df.to_csv(post_impute_dataset + ".csv" , index=False, encoding='utf-8-sig')

HERE

In [47]:
%%bash
cat > ./tmp/components/impute_and_store/Dockerfile <<EOF

FROM python:3.9

WORKDIR /impute_store
COPY impute_and_store.py impute_and_store.py

RUN pip install pandas
RUN pip install sklearn
RUN pip install numpy
RUN pip install --upgrade google-api-python-client
RUN pip install google-cloud-storage
RUN pip install gcsfs

ENV PYTHONPATH="/impute_store:${PYTHONPATH}"

EOF

In [48]:
%%bash -s "{PROJECT_ID}"

IMAGE_NAME="churn-data-impute-store"
TAG="latest"

cat > ./tmp/components/impute_and_store/build_image.sh <<HERE

PROJECT_ID="${1}"
IMAGE_NAME="${IMAGE_NAME}"
TAG="${TAG}"
GCR_IMAGE="gcr.io/\${PROJECT_ID}/\${IMAGE_NAME}:\${TAG}"
echo ${GCR_IMAGE}
docker build -t \${IMAGE_NAME} .
docker tag \${IMAGE_NAME} \${GCR_IMAGE}
docker push \${GCR_IMAGE}

HERE

cd tmp/components/impute_and_store
bash build_image.sh


Sending build context to Docker daemon  10.24kB
Step 1/10 : FROM python:3.9
 ---> 9ac24a438a75
Step 2/10 : WORKDIR /impute_store
 ---> Using cache
 ---> 15c572e53ea7
Step 3/10 : COPY impute_and_store.py impute_and_store.py
 ---> Using cache
 ---> 857e54936252
Step 4/10 : RUN pip install pandas
 ---> Using cache
 ---> bb9e84c86094
Step 5/10 : RUN pip install sklearn
 ---> Using cache
 ---> 22e07ef579e9
Step 6/10 : RUN pip install numpy
 ---> Using cache
 ---> d9470708f07b
Step 7/10 : RUN pip install --upgrade google-api-python-client
 ---> Using cache
 ---> 07f9b9244965
Step 8/10 : RUN pip install google-cloud-storage
 ---> Using cache
 ---> 9ef6551b74b8
Step 9/10 : RUN pip install gcsfs
 ---> Running in 231e2aed64d5
Collecting gcsfs
  Downloading gcsfs-2022.5.0-py2.py3-none-any.whl (25 kB)
Collecting aiohttp<4
  Downloading aiohttp-3.8.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.2 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Docker Image for Imputation/One-Hot Encode/Standard Scaling

In [50]:
%%bash

# Create folders if they don't exist.
mkdir -p tmp/components/enc_scl_store/

# Create the Python file that lists GCS blobs.
cat > ./tmp/components/enc_scl_store/enc_scl_store.py <<HERE
import argparse

import pandas as pd
import numpy as np
import pickle
import re
import google.cloud.storage as storage
from sklearn.preprocessing import OneHotEncoder, StandardScaler

parser = argparse.ArgumentParser()
parser.add_argument(
    '--pre_enc_dataset', type=str, required=True, help='path of dataset input')
parser.add_argument(
    '--post_enc_dataset', type=str, required=True, help='path of dataset output')
parser.add_argument(
    '--bucket_name', type=str, required=True, help='GCS Bucket')

args = parser.parse_args()
pre_enc_dataset=args.pre_enc_dataset
post_enc_dataset=args.post_enc_dataset
bucket_name = args.bucket_name

NUMERICAL_FEATURE_NAMES = [
    "SUBS_TENURE",
    "TOT_DAY_LAST_COMPLAINT_CNT",
    "TOT_DAY_LAST_SUSPENDED_CNT",
    "MTH_TO_SUBS_END_CNT",
    'REV_AMT_BASE_1',
    'REV_AMT_BASE_2',
    'CUST_AGE',
    'PCT_CHNG_IB_SMS_CNT']

EMBEDDING_CATEGORICAL_FEATURES ={
    "x0_" :"GENDER_CD",
    "x1_" :"EDUCATION_CD",
    "x2_" :"TOT_SRV_DROPPED_CNT",
    "x3_" :"TOT_OB_CALL_INTL_ROAM_CNT",
    "x4_" :'BARRING_REASON_CD', 
    "x5_" :"TOT_SRV_ADDED_CNT"}

TARGET_LABEL = ['CHURN_FLG']

pre_data = pd.read_csv(pre_enc_dataset + '.csv')
pre_data_cat = pre_data[EMBEDDING_CATEGORICAL_FEATURES.values()]
pre_data_num = pre_data[NUMERICAL_FEATURE_NAMES]

bucket = storage.Client().bucket(bucket_name)

enc = OneHotEncoder()
scl = StandardScaler()

enc_data = enc.fit_transform(pre_data_cat)
enc_file_name = 'encoder' + f'.pkl'
with open(enc_file_name, 'wb') as file:  
    pickle.dump(enc, file)

blob = bucket.blob('{}/{}'.format("churn/artifact/preprocess", enc_file_name))
blob.upload_from_filename(enc_file_name)  

scl_data = scl.fit_transform(pre_data_num)
scl_file_name = 'scaler' + f'.pkl'
with open(scl_file_name, 'wb') as file:  
    pickle.dump(scl, file)

blob = bucket.blob('{}/{}'.format("churn/artifact/preprocess", scl_file_name))   
blob.upload_from_filename(scl_file_name) 

column_labels = list(enc.get_feature_names_out()) + NUMERICAL_FEATURE_NAMES + TARGET_LABEL
out_df = pd.DataFrame(np.concatenate((enc_data.toarray(), scl_data, pre_data[TARGET_LABEL].values),axis = 1),
                      columns = column_labels)
out_df.to_csv(post_enc_dataset + ".csv" , index=False, encoding='utf-8-sig')
    
HERE

In [51]:
%%bash
cat > ./tmp/components/enc_scl_store/Dockerfile <<EOF

FROM python:3.9

WORKDIR /enc_scl_store
COPY enc_scl_store.py enc_scl_store.py

RUN pip install pandas
RUN pip install sklearn
RUN pip install numpy
RUN pip install --upgrade google-api-python-client
RUN pip install google-cloud-storage
RUN pip install gcsfs

ENV PYTHONPATH="/enc_scl_store:${PYTHONPATH}"

EOF

In [52]:
%%bash -s "{PROJECT_ID}"

IMAGE_NAME="churn-data-enc-scl-store"
TAG="latest"

cat > ./tmp/components/enc_scl_store/build_image.sh <<HERE

PROJECT_ID="${1}"
IMAGE_NAME="${IMAGE_NAME}"
TAG="${TAG}"
GCR_IMAGE="gcr.io/\${PROJECT_ID}/\${IMAGE_NAME}:\${TAG}"
echo ${GCR_IMAGE}
docker build -t \${IMAGE_NAME} .
docker tag \${IMAGE_NAME} \${GCR_IMAGE}
docker push \${GCR_IMAGE}

HERE

cd tmp/components/enc_scl_store
bash build_image.sh


Sending build context to Docker daemon  6.144kB
Step 1/10 : FROM python:3.9
 ---> 9ac24a438a75
Step 2/10 : WORKDIR /enc_scl_store
 ---> Running in 774de76aec01
Removing intermediate container 774de76aec01
 ---> ead1751f56bf
Step 3/10 : COPY enc_scl_store.py enc_scl_store.py
 ---> 19aff5176ddf
Step 4/10 : RUN pip install pandas
 ---> Running in 7202aab8cd3f
Collecting pandas
  Downloading pandas-1.4.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 11.7/11.7 MB 71.4 MB/s eta 0:00:00
Collecting pytz>=2020.1
  Downloading pytz-2022.1-py2.py3-none-any.whl (503 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 503.5/503.5 KB 39.7 MB/s eta 0:00:00
Collecting python-dateutil>=2.8.1
  Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 247.7/247.7 KB 28.5 MB/s eta 0:00:00
Collecting numpy>=1.18.5
  Downloading numpy-1.23.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.

# Docker Image for Hyperparameter Tuning

In [54]:
%%bash

# Create folders if they don't exist.
mkdir -p tmp/components/hyperparameter_tuning/

# Create the Python file that lists GCS blobs.
cat > ./tmp/components/hyperparameter_tuning/hyperparameter_tuning.py <<HERE
import argparse

import pandas as pd
import numpy as np
import google.cloud.storage as storage
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import RandomOverSampler
import pickle

parser = argparse.ArgumentParser()
parser.add_argument(
    '--dataset', type=str, required=True, help='path of dataset input')
parser.add_argument(
    '--bucket_name', type=str, required=True, help='GCS Bucket')

args = parser.parse_args()
dataset=args.dataset
bucket_name = args.bucket_name

param_grid = {'max_depth':[10,15,20],
               'min_samples_split':[10,15,20]}
base_model = RandomForestClassifier(n_estimators = 20, random_state = 2022)

df = pd.read_csv(dataset + '.csv').values
X = df[:,:-1]
y = df[:,-1].astype(int)

oversample = RandomOverSampler(sampling_strategy='minority')
X_train, y_train = oversample.fit_resample(X, y)

gcv = GridSearchCV(base_model, param_grid = param_grid, cv = 3, scoring = 'f1')
gcv.fit(X_train, y_train)

hyperparameters = gcv.best_params_

hyper_name = "hyper.pkl"
with open(hyper_name, 'wb') as file:  
    pickle.dump(hyperparameters, file)
bucket = storage.Client().bucket(bucket_name)
blob = bucket.blob('{}/{}'.format("churn/metadata", hyper_name))
blob.upload_from_filename(hyper_name)  

grid_name = "gcv.pkl"
with open(grid_name, 'wb') as file:
    pickle.dump(gcv, file)
blob = bucket.blob('{}/{}'.format("churn/metadata", grid_name))
blob.upload_from_filename(grid_name)

HERE

In [55]:
%%bash
cat > ./tmp/components/hyperparameter_tuning/Dockerfile <<EOF

FROM python:3.9

WORKDIR /hyperparameter_tuning
COPY hyperparameter_tuning.py hyperparameter_tuning.py

RUN pip install pandas
RUN pip install sklearn
RUN pip install numpy
RUN pip install --upgrade google-api-python-client
RUN pip install google-cloud-storage
RUN pip install gcsfs
RUN pip install -U imbalanced-learn

ENV PYTHONPATH="/enc_scl_store:${PYTHONPATH}"

EOF

In [56]:
%%bash -s "{PROJECT_ID}"

IMAGE_NAME="churn-data-hyperparameter_tuning"
TAG="latest"

cat > ./tmp/components/hyperparameter_tuning/build_image.sh <<HERE

PROJECT_ID="${1}"
IMAGE_NAME="${IMAGE_NAME}"
TAG="${TAG}"
GCR_IMAGE="gcr.io/\${PROJECT_ID}/\${IMAGE_NAME}:\${TAG}"
echo ${GCR_IMAGE}
docker build -t \${IMAGE_NAME} .
docker tag \${IMAGE_NAME} \${GCR_IMAGE}
docker push \${GCR_IMAGE}

HERE

cd tmp/components/hyperparameter_tuning
bash build_image.sh


Sending build context to Docker daemon   5.12kB
Step 1/11 : FROM python:3.9
 ---> 9ac24a438a75
Step 2/11 : WORKDIR /hyperparameter_tuning
 ---> Running in a1bdf3326682
Removing intermediate container a1bdf3326682
 ---> 6a68ed34c218
Step 3/11 : COPY hyperparameter_tuning.py hyperparameter_tuning.py
 ---> 4b8ec265fce4
Step 4/11 : RUN pip install pandas
 ---> Running in a6add70b9120
Collecting pandas
  Downloading pandas-1.4.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 11.7/11.7 MB 74.0 MB/s eta 0:00:00
Collecting python-dateutil>=2.8.1
  Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 247.7/247.7 KB 27.2 MB/s eta 0:00:00
Collecting pytz>=2020.1
  Downloading pytz-2022.1-py2.py3-none-any.whl (503 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 503.5/503.5 KB 40.4 MB/s eta 0:00:00
Collecting numpy>=1.18.5
  Downloading numpy-1.23.0-cp39-cp39-manylinux_2_17_x86_

# Create Docker Image for Model Training

In [86]:
%%bash

# Create folders if they don't exist.
mkdir -p tmp/components/training/

# Create the Python file that lists GCS blobs.
cat > ./tmp/components/training/training.py <<HERE
import argparse

import operator, pickle, json
import pandas as pd
import numpy as np
import google.cloud.storage as storage
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import roc_curve, confusion_matrix, f1_score
import gcsfs

parser = argparse.ArgumentParser()
parser.add_argument(
    '--dataset', type=str, required=True, help='path of dataset input')
parser.add_argument(
    '--bucket_name', type=str, required=True, help='GCS Bucket')
parser.add_argument(
    '--model', type=str, required=True, help='Model Artifact Path')
parser.add_argument(
    '--classification_metrics', type=str, required=True, help='Confusion Matrix')
parser.add_argument(
    '--base_metrics', type=str, required=True, help='Base Metrics')
parser.add_argument(
    '--feature_importance', type=str, required=True, help='Feature Importance Table')

args = parser.parse_args()
dataset=args.dataset
bucket_name = args.bucket_name
model=args.model
classification_metrics = args.classification_metrics
base_metrics=args.base_metrics
feature_importance = args.feature_importance

df = pd.read_csv(dataset + ".csv")
feature_label = df.columns[:-1]
data = df.values

X = data[:,:-1]
y = data[:,-1].astype(int)

X_train, X_test, y_train, y_test = tts(X, y, test_size = 0.2, random_state = 2022)
oversample = RandomOverSampler(sampling_strategy='minority')
X_train, y_train = oversample.fit_resample(X_train, y_train)

# fs = gcsfs.GCSFileSystem(project='practice-smu-123')
# fs.ls(bucket_name)

bucket = storage.Client().bucket(bucket_name)
blob = bucket.blob("churn/metadata/hyper.pkl")
hyper_name = "hyper.pkl"
blob.download_to_filename(hyper_name)
file = open(hyper_name, 'rb')
hyperparameters = pickle.load(file)
file.close()

model_rf = RandomForestClassifier(n_estimators = 20)
model_rf.set_params(**hyperparameters)
model_rf.fit(X_train, y_train)
feature_importances = model_rf.feature_importances_
rf_feature_importance = {feature_label[i] : model_rf.feature_importances_[i]
                            for i in range(len(model_rf.feature_importances_))}
rf_feature_importance = dict(sorted(rf_feature_importance.items(), reverse = True,
                                    key=operator.itemgetter(1)))
feature_importance_df = pd.DataFrame([rf_feature_importance.keys(),rf_feature_importance.values()],
                                    index = ["Feature","Importance"]).transpose()
feature_importance_df.to_csv(feature_importance + ".csv", index=False,
                             header = False, encoding='utf-8-sig')


# blob = bucket.blob('{}/{}'.format("churn/metadata", "feature_importance.csv"))
# blob.upload_from_filename(feature_importance + ".csv")

# file_name = model + f'.pkl'
# with fs.open(file_name, 'wb') as file:
#     pickle.dump(model_rf, file)
    
y_preds = model_rf.predict(X_test)

y_scores = model_rf.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_true = y_test, y_score = y_scores, pos_label = True)
classification_metrics.log_roc_curve(fpr.tolist(), tpr.tolist(),thresholds.tolist())

classification_metrics.log_confusion_matrix(["False","True"],confusion_matrix(y_test, y_preds).tolist())

f1_score = f1_score(y_test, y_preds)
#  thresholds_dict = json.loads(thresholds_dict_str)
model.metadata["f1_score"] = float(f1_score)
base_metrics.log_metric("f1_score",float(f1_score))

HERE

In [87]:
%%bash
cat > ./tmp/components/training/Dockerfile <<EOF

FROM python:3.9

WORKDIR /training
COPY training.py training.py

RUN pip install pandas
RUN pip install sklearn
RUN pip install numpy
RUN pip install --upgrade google-api-python-client
RUN pip install google-cloud-storage
RUN pip install gcsfs
RUN pip install -U imbalanced-learn

ENV PYTHONPATH="/training:${PYTHONPATH}"

EOF

In [88]:
%%bash -s "{PROJECT_ID}"

IMAGE_NAME="churn-data-training"
TAG="latest"

cat > ./tmp/components/training/build_image.sh <<HERE

PROJECT_ID="${1}"
IMAGE_NAME="${IMAGE_NAME}"
TAG="${TAG}"
GCR_IMAGE="gcr.io/\${PROJECT_ID}/\${IMAGE_NAME}:\${TAG}"
echo ${GCR_IMAGE}
docker build -t \${IMAGE_NAME} .
docker tag \${IMAGE_NAME} \${GCR_IMAGE}
docker push \${GCR_IMAGE}

HERE

cd tmp/components/training
bash build_image.sh


Sending build context to Docker daemon  7.168kB
Step 1/11 : FROM python:3.9
 ---> 9ac24a438a75
Step 2/11 : WORKDIR /training
 ---> Using cache
 ---> a998fb19b5df
Step 3/11 : COPY training.py training.py
 ---> 6c7791ec5540
Step 4/11 : RUN pip install pandas
 ---> Running in 502e0f3782f3
Collecting pandas
  Downloading pandas-1.4.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 11.7/11.7 MB 73.1 MB/s eta 0:00:00
Collecting numpy>=1.18.5
  Downloading numpy-1.23.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 17.1/17.1 MB 57.4 MB/s eta 0:00:00
Collecting pytz>=2020.1
  Downloading pytz-2022.1-py2.py3-none-any.whl (503 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 503.5/503.5 KB 41.2 MB/s eta 0:00:00
Collecting python-dateutil>=2.8.1
  Downloading python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 247.7/247.7 KB

In [109]:
%%bash

# Create folders if they don't exist.
mkdir -p tmp/components/check_model_metrics/

# Create the Python file that lists GCS blobs.
cat > ./tmp/components/check_model_metrics/check_model_metrics.py <<HERE

import argparse
import json

parser = argparse.ArgumentParser()

parser.add_argument(
    '--base_metrics', type=str, required=True, help='Base Metrics')
parser.add_argument(
    '--threshold_dict', type=str, required=True, help='Threshold F1-Score')
parser.add_argument(
    '--deploy_path', type=str, required=True, help='Deploy Status')

args = parser.parse_args()

base_metrics = args.base_metrics
threshold_dict=args.threshold_dict
deploy_path = args.deploy_path

def threshold_check(val1,val2):
    cond = "False"
    if val1 > val2:
        cond = "True"
    return cond

thresholds_dict = json.loads(threshold_dict)
deploy = threshold_check(float(base_metrics), thresholds_dict['f1_score'])

with open(deploy_path + '.txt', 'w') as file:
    file.write(deploy)
    
HERE

In [102]:
%%bash
cat > ./tmp/components/check_model_metrics/Dockerfile <<EOF

FROM python:3.9

WORKDIR /check_model_metrics
COPY check_model_metrics.py check_model_metrics.py

ENV PYTHONPATH="/check_model_metrics:${PYTHONPATH}"

EOF

In [103]:
%%bash -s "{PROJECT_ID}"

IMAGE_NAME="churn-data-check_model_metrics"
TAG="latest"

cat > ./tmp/components/check_model_metrics/build_image.sh <<HERE

PROJECT_ID="${1}"
IMAGE_NAME="${IMAGE_NAME}"
TAG="${TAG}"
GCR_IMAGE="gcr.io/\${PROJECT_ID}/\${IMAGE_NAME}:\${TAG}"
echo ${GCR_IMAGE}
docker build -t \${IMAGE_NAME} .
docker tag \${IMAGE_NAME} \${GCR_IMAGE}
docker push \${GCR_IMAGE}

HERE

cd tmp/components/check_model_metrics
bash build_image.sh


Sending build context to Docker daemon  4.608kB
Step 1/4 : FROM python:3.9
 ---> 9ac24a438a75
Step 2/4 : WORKDIR /check_model_metrics
 ---> Using cache
 ---> 88eed03186c8
Step 3/4 : COPY check_model_metrics.py check_model_metrics.py
 ---> 87c75f966485
Step 4/4 : ENV PYTHONPATH="/check_model_metrics:"
 ---> Running in c6979e1fa0c7
Removing intermediate container c6979e1fa0c7
 ---> bc881c2d549d
Successfully built bc881c2d549d
Successfully tagged churn-data-check_model_metrics:latest
The push refers to repository [gcr.io/churn-smu/churn-data-check_model_metrics]
bb2d107d1a4d: Preparing
9561a59afff2: Preparing
9d5f84e3acdc: Preparing
7ae141717ba6: Preparing
ba95fb9ea3e2: Preparing
ca5c6d5c3d01: Preparing
33a247b4fc52: Preparing
5afd661c6106: Preparing
66183893ba24: Preparing
6840c8ff46bd: Preparing
97d5fec864d8: Preparing
6840c8ff46bd: Waiting
97d5fec864d8: Waiting
ca5c6d5c3d01: Waiting
5afd661c6106: Waiting
33a247b4fc52: Waiting
66183893ba24: Waiting
9561a59afff2: Layer already exists
9d

# Docker Image for Deployment

In [104]:
%%bash

# Create folders if they don't exist.
mkdir -p tmp/components/deploy_model/

# Create the Python file that lists GCS blobs.
cat > ./tmp/components/deploy_model/deploy_model.py <<HERE

from google.cloud import aiplatform

parser = argparse.ArgumentParser()

parser.add_argument(
    '--model', type=str, required=True, help='Model Path')
parser.add_argument(
    '--project', type=str, required=True, help='Project')
parser.add_argument(
    '--region', type=str, required=True, help='Region')
parser.add_argument(
    '--serving_container_image_uri', type=str, required=True, help='serving_container_image_uri')

args = parser.parse_args()

model = args.model
project=args.project
region = args.region
serving_container_image_uri = args.serving_container_image_uri

aiplatform.init(project = project, location = region)

DISPLAY_NAME = "churn_prediction_v2"
MODEL_NAME = "churn_rf_v2"
ENDPOINT_NAME = "churn_endpoint_v2"

def create_endpoint():
    endpoints = aiplatform.Endpoint.list(
        filter = 'display_name = "{}"'.format(ENDPOINT_NAME),
        order_by = 'create_time desc',
        project = project,
        location = region,
    )

    if len(endpoints) > 0:
        endpoint = endpoints[0]
    else:
        endpoint = aiplatform.Endpoint.create(display_name = ENDPOINT_NAME,
                                              project = project,
                                              location = region)

    return endpoint

endpoint = create_endpoint()
endpoint_info = endpoint.resource_name.split('/')[-1]

model_upload = aiplatform.Model.upload(display_name = DISPLAY_NAME,
                                       artifact_uri = model.replace("model",""),
                                       serving_container_image_uri = serving_container_image_uri,
                                       serving_container_health_route=f'/v1/models/{MODEL_NAME}',
                                       serving_container_predict_route=f'/v1/models/{MODEL_NAME}:predict',
                                       serving_container_environment_variables = {"MODEL_NAME":MODEL_NAME,},
                                       )

model_deploy = model_upload.deploy(machine_type = "n1-standard-4",
                                  endpoint = endpoint,
                                  traffic_split = {"0":100},
                                  deployed_model_display_name = DISPLAY_NAME)
    
HERE

In [105]:
%%bash
cat > ./tmp/components/deploy_model/Dockerfile <<EOF

FROM python:3.9

WORKDIR /deploy_model
COPY deploy_model.py deploy_model.py

RUN pip install google-cloud-aiplatform
RUN pip install sklearn
RUN pip install kfp

ENV PYTHONPATH="/check_model_metrics:${PYTHONPATH}"

EOF

In [106]:
%%bash -s "{PROJECT_ID}"

IMAGE_NAME="churn-data-deploy_model"
TAG="latest"

cat > ./tmp/components/deploy_model/build_image.sh <<HERE

PROJECT_ID="${1}"
IMAGE_NAME="${IMAGE_NAME}"
TAG="${TAG}"
GCR_IMAGE="gcr.io/\${PROJECT_ID}/\${IMAGE_NAME}:\${TAG}"
echo ${GCR_IMAGE}
docker build -t \${IMAGE_NAME} .
docker tag \${IMAGE_NAME} \${GCR_IMAGE}
docker push \${GCR_IMAGE}

HERE

cd tmp/components/deploy_model
bash build_image.sh


Sending build context to Docker daemon  6.144kB
Step 1/7 : FROM python:3.9
 ---> 9ac24a438a75
Step 2/7 : WORKDIR /deploy_model
 ---> Running in bfa2ec4b6e03
Removing intermediate container bfa2ec4b6e03
 ---> 3e7f9b093071
Step 3/7 : COPY deploy_model.py deploy_model.py
 ---> 57312b1cd485
Step 4/7 : RUN pip install google-cloud-aiplatform
 ---> Running in d2ef1dd97690
Collecting google-cloud-aiplatform
  Downloading google_cloud_aiplatform-1.15.0-py2.py3-none-any.whl (2.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.1/2.1 MB 26.3 MB/s eta 0:00:00
Collecting google-cloud-bigquery<3.0.0dev,>=1.15.0
  Downloading google_cloud_bigquery-2.34.4-py2.py3-none-any.whl (206 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 206.6/206.6 KB 24.4 MB/s eta 0:00:00
Collecting protobuf<4.0.0dev,>=3.19.0
  Downloading protobuf-3.20.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.0/1.0 MB 55.0 MB/s eta 0:00:00
Collecting packaging<22.0.0

# Docker Image for Endpoint test

In [110]:
%%bash

# Create folders if they don't exist.
mkdir -p tmp/components/endpoint_test/

# Create the Python file that lists GCS blobs.
cat > ./tmp/components/endpoint_test/endpoint_test.py <<HERE

import argparse
from google.cloud import aiplatform

parser = argparse.ArgumentParser()

parser.add_argument(
    '--endpoint', type=str, required=True, help='End Point ID')
parser.add_argument(
    '--project', type=str, required=True, help='Project')
parser.add_argument(
    '--region', type=str, required=True, help='Region')

args = parser.parse_args()

model = args.endpoint
project=args.project
region = args.region

aiplatform.init(project=project, location=location)

instance = [[0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0,
             0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0,
             0.0, 0.0, -0.210, -0.695, 0.341, -0.528, 1.282, -1.632, 1.568, 1.309]]

endpoint = aiplatform.Endpoint(endpoint)

prediction = endpoint.predict(instances=instance)
assert int(prediction[0][0]) in [0, 1], "Prediction Error"
    
HERE

In [111]:
%%bash
cat > ./tmp/components/endpoint_test/Dockerfile <<EOF

FROM python:3.9

WORKDIR /endpoint_test
COPY endpoint_test.py check_model_metrics.py

RUN pip install google-cloud-aiplatform

ENV PYTHONPATH="/endpoint_test:${PYTHONPATH}"

EOF

In [112]:
%%bash -s "{PROJECT_ID}"

IMAGE_NAME="churn-data-endpoint_test"
TAG="latest"

cat > ./tmp/components/endpoint_test/build_image.sh <<HERE

PROJECT_ID="${1}"
IMAGE_NAME="${IMAGE_NAME}"
TAG="${TAG}"
GCR_IMAGE="gcr.io/\${PROJECT_ID}/\${IMAGE_NAME}:\${TAG}"
echo ${GCR_IMAGE}
docker build -t \${IMAGE_NAME} .
docker tag \${IMAGE_NAME} \${GCR_IMAGE}
docker push \${GCR_IMAGE}

HERE

cd tmp/components/endpoint_test
bash build_image.sh


Sending build context to Docker daemon  4.608kB
Step 1/5 : FROM python:3.9
 ---> 9ac24a438a75
Step 2/5 : WORKDIR /endpoint_test
 ---> Running in 45a03498717e
Removing intermediate container 45a03498717e
 ---> 8753b6fcf543
Step 3/5 : COPY endpoint_test.py check_model_metrics.py
 ---> 02148bb7e054
Step 4/5 : RUN pip install google-cloud-aiplatform
 ---> Running in 9b33942c62af
Collecting google-cloud-aiplatform
  Downloading google_cloud_aiplatform-1.15.0-py2.py3-none-any.whl (2.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.1/2.1 MB 27.0 MB/s eta 0:00:00
Collecting google-cloud-bigquery<3.0.0dev,>=1.15.0
  Downloading google_cloud_bigquery-2.34.4-py2.py3-none-any.whl (206 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 206.6/206.6 KB 23.7 MB/s eta 0:00:00
Collecting google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5
  Downloading google_api_core-2.8.2-py3-none-any.whl (114 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 114.6/114.6 KB 14.5 MB/s eta 0:00:00
C