In [2]:
import pandas as pd
import re
from os import system, listdir
from os.path import isfile, join
from random import shuffle


def create_data_frame(folder: str) -> pd.DataFrame:
    '''
    Builds a shuffled, labelled review dataset from a folder of text files.

    folder - the root folder of train or test dataset; it must contain a
             'pos' and a 'neg' sub-folder holding one review per file
    Returns: a DataFrame with the combined data from the input folder:
             column 'text' holds the file content with HTML line breaks
             replaced by spaces, column 'label' holds 1 for files under
             'pos' and 0 for files under 'neg'. Row order is random.
             An input without any review file yields an empty DataFrame
             that still has both columns.
    '''
    pos_folder = f'{folder}/pos' # positive reviews
    neg_folder = f'{folder}/neg' # negative reviews

    def get_files(fld: str) -> list:
        '''
        fld - positive or negative reviews folder
        Returns: a list with all files in input folder
        '''
        return [join(fld, f) for f in listdir(fld) if isfile(join(fld, f))]

    def append_files_data(data_list: list, files: list, label: int) -> None:
        '''
        Appends to 'data_list' tuples of form (file content, label)
        for each file in 'files' input list
        '''
        for file_path in files:
            # Explicit encoding: the platform default is not UTF-8 everywhere.
            # NOTE(review): assumes the review files are UTF-8 - confirm.
            with open(file_path, 'r', encoding='utf-8') as f:
                data_list.append((f.read(), label))

    data_list = []
    append_files_data(data_list, get_files(pos_folder), 1)
    append_files_data(data_list, get_files(neg_folder), 0)
    shuffle(data_list)

    # replacing line breaks with spaces; raw string so that '\s' is handed to
    # the regex engine instead of being treated as a (invalid) string escape
    line_break = re.compile(r'(<br\s*/?>)+')
    # Building the columns with comprehensions (instead of unpacking
    # zip(*data_list)) also works when no file was found.
    text = [line_break.sub(' ', txt) for txt, _ in data_list]
    label = [lbl for _, lbl in data_list]

    return pd.DataFrame({'text': text, 'label': label})

# Load the train split first, then the test split, from the extracted
# aclImdb archive located one directory above the notebook.
imdb_train, imdb_test = (
    create_data_frame(f'../aclImdb/{split}') for split in ('train', 'test')
)

In [3]:
# Persist both splits as CSV (without the index column) so they can be
# reloaded later without re-reading the individual review files.
import os

# os.makedirs(exist_ok=True) does not depend on a shell and is silent when the
# folder already exists, so the cell can be re-run safely.
os.makedirs('csv', exist_ok=True)
imdb_train.to_csv('csv/imdb_train.csv', index=False)
imdb_test.to_csv('csv/imdb_test.csv', index=False)

# imdb_train = pd.read_csv('csv/imdb_train.csv')
# imdb_test = pd.read_csv('csv/imdb_test.csv')

In [52]:
%%writefile ./pipeline/trainer_image/train.py

import numpy as np
import pandas as pd
import os
from os import system
import subprocess
import sys
import fire
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from joblib import dump, load # used for saving and loading sklearn objects
from scipy.sparse import save_npz, load_npz, csr_matrix # used for saving and loading sparse matrices
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

def train_model(training_dataset_file, validation_dataset_file, gcs_model_path):
    '''
    Trains a bigram Tf-Idf + SGD classifier and copies the model to GCS.

    training_dataset_file - CSV path/URI readable by pandas with a 'text'
                            and a 'label' column
    validation_dataset_file - CSV path/URI readable by pandas; it is loaded
                            (so a bad path fails early) but not used for
                            scoring
    gcs_model_path - destination handed to 'gsutil cp' for the model file

    Side effects: creates the local folders 'data_preprocessors',
    'vectorized_data' and 'model', writes the fitted vectorizer, the fitted
    Tf-Idf transformer, both training matrices and the best model into them,
    prints the search results and runs 'gsutil cp'.
    Raises: subprocess.CalledProcessError when 'gsutil' exits non-zero.

    NOTE(review): only the classifier is copied to 'gcs_model_path'; the
    fitted vectorizer/transformer stay on local disk - confirm that whoever
    loads the model can rebuild the same features.
    '''
    imdb_train = pd.read_csv(training_dataset_file)
    imdb_test = pd.read_csv(validation_dataset_file)

    # os.makedirs(exist_ok=True) replaces the shell 'mkdir' calls: no shell
    # dependency and no separate existence check needed.
    for folder in ('data_preprocessors', 'vectorized_data', 'model'):
        os.makedirs(folder, exist_ok=True)

    #preprocessing

    # Bigram Counts

    bigram_vectorizer = CountVectorizer(ngram_range=(1, 2))
    bigram_vectorizer.fit(imdb_train['text'].values)

    dump(bigram_vectorizer, 'data_preprocessors/bigram_vectorizer.joblib')

    # bigram_vectorizer = load('data_preprocessors/bigram_vectorizer.joblib')

    X_train_bigram = bigram_vectorizer.transform(imdb_train['text'].values)

    save_npz('vectorized_data/X_train_bigram.npz', X_train_bigram)

    # X_train_bigram = load_npz('vectorized_data/X_train_bigram.npz')

    # Bigram Tf-Idf

    bigram_tf_idf_transformer = TfidfTransformer()
    bigram_tf_idf_transformer.fit(X_train_bigram)

    dump(bigram_tf_idf_transformer, 'data_preprocessors/bigram_tf_idf_transformer.joblib')

    # bigram_tf_idf_transformer = load('data_preprocessors/bigram_tf_idf_transformer.joblib')

    X_train_bigram_tf_idf = bigram_tf_idf_transformer.transform(X_train_bigram)

    save_npz('vectorized_data/X_train_bigram_tf_idf.npz', X_train_bigram_tf_idf)

    # X_train_bigram_tf_idf = load_npz('vectorized_data/X_train_bigram_tf_idf.npz')

    y_train = imdb_train['label'].values

    def train_and_show_scores(X: csr_matrix, y: np.array, title: str) -> None:
        '''
        Runs a randomized hyper-parameter search for an SGDClassifier on a
        stratified 75% split of (X, y), prints the results under 'title',
        saves the best estimator locally and copies it to 'gcs_model_path'.
        '''
        #splitting data
        X_train, X_valid, y_train, y_valid = train_test_split(
            X, y, train_size=0.75, stratify=y
        )

        clf = SGDClassifier()

        distributions = dict(
            # NOTE(review): 'log' is the loss name in the scikit-learn 0.24
            # series; newer releases call it 'log_loss' - confirm the version
            # installed in the trainer image.
            loss=['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
            learning_rate=['optimal', 'invscaling', 'adaptive'],
            eta0=uniform(loc=1e-7, scale=1e-2)
        )

        random_search_cv = RandomizedSearchCV(
            estimator=clf,
            param_distributions=distributions,
            cv=5,
            n_iter=20
        )
        random_search_cv.fit(X_train, y_train)
        best_model = random_search_cv.best_estimator_

        print(title)
        print(f'Best params: {random_search_cv.best_params_}')
        print(f'Best score: {random_search_cv.best_score_}')
        # The 25% hold-out was previously split off but never looked at.
        print(f'Validation score: {round(best_model.score(X_valid, y_valid), 2)}')

        dump(best_model, 'model/model.joblib.pkl')
        # stderr is redirected so gsutil's progress lines land in the job log.
        subprocess.check_call(['gsutil', 'cp', 'model/model.joblib.pkl', gcs_model_path],
                        stderr=sys.stdout)
        print('Saved model in: {}'.format(gcs_model_path))

    train_and_show_scores(X_train_bigram_tf_idf, y_train, 'Bigram Tf-Idf')
    
if __name__ == "__main__":
    # fire exposes train_model's three parameters as command-line arguments.
    fire.Fire(train_model)

Overwriting ./pipeline/trainer_image/train.py


In [7]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
# The import list was left empty (a SyntaxError); csr_matrix is the only
# scipy.sparse name this cell refers to.
from scipy.sparse import csr_matrix
import numpy as np

def train_and_show_scores(X: csr_matrix, y: np.array, title: str, random_state=None) -> None:
    '''
    Fits a default SGDClassifier on a stratified 75% split of (X, y) and
    prints its score on that split and on the remaining 25%.

    X - feature matrix, one row per sample
    y - labels aligned with the rows of X
    title - heading printed above the two scores
    random_state - optional seed forwarded to both the split and the
                   classifier so a run can be reproduced; the default (None)
                   keeps the previous unseeded behaviour
    '''
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, train_size=0.75, stratify=y, random_state=random_state
    )

    clf = SGDClassifier(random_state=random_state)
    clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    valid_score = clf.score(X_valid, y_valid)
    print(f'{title}\nTrain score: {round(train_score, 2)} ; Validation score: {round(valid_score, 2)}\n')

# Labels of the training frame as a NumPy array.
y_train = imdb_train['label'].values


# NOTE(review): X_train_bigram_tf_idf is not assigned at notebook scope in any
# cell visible here (only inside functions) - presumably it came from a cell
# that was removed; confirm this still works after Restart & Run All.
train_and_show_scores(X_train_bigram_tf_idf, y_train, 'Bigram Tf-Idf')

Bigram Tf-Idf
Train score: 0.98 ; Validation score: 0.9



In [68]:
# Copy every object under custommodel/data/ into the data/ folder below the pipeline root.
!gsutil cp gs://verexai_automl_text_data/custommodel/data/* gs://verexai_automl_text_data/pipeline_root/custommodel/imdb_reviews/data/


Copying gs://verexai_automl_text_data/custommodel/data/imdb_test.csv [Content-Type=text/csv]...
Copying gs://verexai_automl_text_data/custommodel/data/imdb_train.csv [Content-Type=text/csv]...
/ [2 files][ 61.8 MiB/ 61.8 MiB]                                                
Operation completed over 2 objects/61.8 MiB.                                     


In [31]:
!gsutil cp /home/jupyter/Vertexai-Custom-Model-Covertype/Imdb_reviews_custom/csv/imdb_train.csv gs://verexai_automl_text_data/custommodel/data/

Copying file:///home/jupyter/Vertexai-Custom-Model-Covertype/Imdb_reviews_custom/csv/imdb_train.csv [Content-Type=text/csv]...
- [1 files][ 31.2 MiB/ 31.2 MiB]                                                
Operation completed over 1 objects/31.2 MiB.                                     


In [53]:
# Run the generated trainer script from the notebook; fire maps the three
# positional arguments to train_model(training_dataset_file,
# validation_dataset_file, gcs_model_path).
!python3 ./pipeline/trainer_image/train.py "gs://verexai_automl_text_data/custommodel/data/imdb_train.csv" "gs://verexai_automl_text_data/custommodel/data/imdb_test.csv" "gs://verexai_automl_text_data/custommodel/model/"

Bigram Tf-Idf
Train score: 0.98 ; Validation score: 0.91

Copying file://model/model.joblib.pkl [Content-Type=application/octet-stream]...
/ [1 files][ 11.6 MiB/ 11.6 MiB]                                                
Operation completed over 1 objects/11.6 MiB.                                     
Saved model in: gs://verexai_automl_text_data/custommodel/model/


In [65]:
# Registry coordinates of the trainer image: gcr.io/<project>/<image>:<tag>.
PROJECT_ID = 'dna-verizonpoc'
IMAGE_NAME = 'movie_reviews_trainer_image'
TAG = 'latest'
TRAINER_IMAGE = f'gcr.io/{PROJECT_ID}/{IMAGE_NAME}:{TAG}'

In [67]:
# Build ./pipeline/trainer_image with Cloud Build and tag the result as $TRAINER_IMAGE.
!gcloud builds submit --timeout 15m --tag $TRAINER_IMAGE ./pipeline/trainer_image

Creating temporary tarball archive of 4 file(s) totalling 5.7 KiB before compression.
Uploading tarball of [./pipeline/trainer_image] to [gs://dna-verizonpoc_cloudbuild/source/1629299872.666692-b9b32fac77754307a238dddad692a6cd.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/dna-verizonpoc/locations/global/builds/28eb4a67-4d95-49b6-9316-71cd73720ac8].
Logs are available at [https://console.cloud.google.com/cloud-build/builds/28eb4a67-4d95-49b6-9316-71cd73720ac8?project=885855441164].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "28eb4a67-4d95-49b6-9316-71cd73720ac8"

FETCHSOURCE
Fetching storage object: gs://dna-verizonpoc_cloudbuild/source/1629299872.666692-b9b32fac77754307a238dddad692a6cd.tgz#1629299872908982
Copying gs://dna-verizonpoc_cloudbuild/source/1629299872.666692-b9b32fac77754307a238dddad692a6cd.tgz#1629299872908982...
/ [1 files][  1.3 KiB/  1.3 KiB]                                                
Operation compl

In [26]:
# Registry coordinates of the component base image: gcr.io/<project>/<image>:<tag>.
# IMAGE_NAME and TAG are reassigned here, so they no longer describe the
# trainer image after this cell has run.
PROJECT_ID = 'dna-verizonpoc'
IMAGE_NAME = 'movie_reviews_base_image'
TAG = 'latest'
BASE_IMAGE = f'gcr.io/{PROJECT_ID}/{IMAGE_NAME}:{TAG}'

In [27]:
# Build ./pipeline/base_image with Cloud Build and tag the result as $BASE_IMAGE.
!gcloud builds submit --timeout 15m --tag $BASE_IMAGE ./pipeline/base_image

Creating temporary tarball archive of 2 file(s) totalling 300 bytes before compression.
Uploading tarball of [./pipeline/base_image] to [gs://dna-verizonpoc_cloudbuild/source/1629374755.653616-afc9c1788fff43eaae27079d7ce69e12.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/dna-verizonpoc/locations/global/builds/6c3644c1-bb39-4bf1-a820-774254f45baa].
Logs are available at [https://console.cloud.google.com/cloud-build/builds/6c3644c1-bb39-4bf1-a820-774254f45baa?project=885855441164].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "6c3644c1-bb39-4bf1-a820-774254f45baa"

FETCHSOURCE
Fetching storage object: gs://dna-verizonpoc_cloudbuild/source/1629374755.653616-afc9c1788fff43eaae27079d7ce69e12.tgz#1629374755880201
Copying gs://dna-verizonpoc_cloudbuild/source/1629374755.653616-afc9c1788fff43eaae27079d7ce69e12.tgz#1629374755880201...
/ [1 files][  305.0 B/  305.0 B]                                                
Operation comple

In [64]:
# Import scikit-learn under its own name: aliasing it as `np` rebinds the
# numpy alias that other cells of this notebook rely on.
import sklearn
print(sklearn.__version__)

0.24.2


In [None]:
import kfp
from google.cloud import aiplatform
from google_cloud_pipeline_components import aiplatform as gcc_aip
from kfp.v2 import compiler
from kfp.v2.dsl import component
from kfp.v2.google import experimental
from kfp.v2.google.client import AIPlatformClient

In [None]:
# create args list for trainer
import time

PIPELINE_ROOT = "gs://verexai_automl_text_data/pipeline_root/custommodel/imdb_reviews/"
# Positional arguments for train.py: training CSV, validation CSV, model
# destination. Derived from PIPELINE_ROOT so the root is spelled only once.
TRAINER_ARGS = [
    f"{PIPELINE_ROOT}data/imdb_train.csv",
    f"{PIPELINE_ROOT}data/imdb_test.csv",
    f"{PIPELINE_ROOT}model/",
]

# create working dir to pass to job spec
ts = int(time.time())
# PIPELINE_ROOT already ends with '/', so strip it before joining; otherwise
# the URI contains an empty '//' path segment.
WORKING_DIR = f"{PIPELINE_ROOT.rstrip('/')}/{ts}"

MODEL_DISPLAY_NAME = f"movie-reviews-sentiment-anlaysis{ts}"
print(TRAINER_ARGS, WORKING_DIR, MODEL_DISPLAY_NAME)

In [None]:

@component
def training_op(input1: str):
    """Print a 'training task' line containing the given text."""
    message = f"training task: {input1}"
    print(message)

In [None]:
# TIMESTAMP is not defined in any visible cell; MODEL_DISPLAY_NAME is built
# from the same prefix plus the `ts` timestamp, so it is used as the name.
@kfp.dsl.pipeline(name=MODEL_DISPLAY_NAME)
def pipeline(
    project: str = PROJECT_ID,
    model_display_name: str = MODEL_DISPLAY_NAME,
    serving_container_image_uri: str = "us-docker.pkg.dev/cloud-aiplatform/prediction/tf2-cpu.2-3:latest",
):
    """Run the trainer image as a custom job, then upload and deploy the model.

    Args:
        project: project passed to the upload, endpoint and deploy steps.
        model_display_name: display name of the uploaded and deployed model.
        serving_container_image_uri: prediction container for the uploaded
            model. NOTE(review): the default is a TensorFlow 2 image while
            the trainer saves a scikit-learn estimator with joblib - confirm
            that this container can serve it.
    """
    # The placeholder component is replaced by a custom job that runs the
    # trainer image with TRAINER_ARGS.
    train_task = training_op("model training")
    experimental.run_as_aiplatform_custom_job(
        train_task,
        worker_pool_specs=[
            {
                "containerSpec": {
                    "args": TRAINER_ARGS,
                    "env": [{"name": "AIP_MODEL_DIR", "value": WORKING_DIR}],
                    "imageUri": "gcr.io/dna-verizonpoc/movie_reviews_trainer_image@sha256:a07b0c1ee718cb864fbf7c356a252110ca897b4d6b2c70705dde019fd850578b",
                },
                "replicaCount": "1",
                "machineSpec": {
                    "machineType": "n1-standard-4",
                },
            }
        ],
    )

    # NOTE(review): the model is uploaded from WORKING_DIR, while the last
    # entry of TRAINER_ARGS (the path the trainer copies its model to) points
    # at a different folder - confirm the artifact really ends up here.
    model_upload_op = gcc_aip.ModelUploadOp(
        project=project,
        display_name=model_display_name,
        artifact_uri=WORKING_DIR,
        serving_container_image_uri=serving_container_image_uri,
        serving_container_environment_variables={"NOT_USED": "NO_VALUE"},
    )
    # The upload has no data dependency on the training task, so the order is
    # enforced explicitly.
    model_upload_op.after(train_task)

    endpoint_create_op = gcc_aip.EndpointCreateOp(
        project=project,
        display_name="pipelines-created-endpoint",
    )

    model_deploy_op = gcc_aip.ModelDeployOp(  # noqa: F841
        project=project,
        endpoint=endpoint_create_op.outputs["endpoint"],
        model=model_upload_op.outputs["model"],
        deployed_model_display_name=model_display_name,
        machine_type="n1-standard-4",
    )

In [51]:
# Project and region used by the component-based pipeline cells.
project_id = "dna-verizonpoc"
region = "us-central1"

In [53]:
from kfp.v2 import dsl
from kfp.v2.dsl import (Artifact,
                        Dataset,
                        Input,
                        Model,
                        Output,
                        Metrics,
                        ClassificationMetrics,
                        component)

from kfp.v2 import compiler
from google_cloud_pipeline_components import aiplatform as gcc_aip
from joblib import load

In [54]:
@component(base_image="gcr.io/dna-verizonpoc/movie_reviews_base_image:latest")
def get_data(
    dataset_train: Output[Dataset],
    dataset_test: Output[Dataset]
):
    """Read the combined reviews CSV and write a 70/30 train/test split.

    Args:
        dataset_train: output artifact; the training rows are written as CSV
            to its path.
        dataset_test: output artifact; the test rows are written as CSV to
            its path.
    """
    # Imports are kept inside the function body, next to the code that uses
    # them.
    from sklearn.model_selection import train_test_split as tts
    import pandas as pd

    data_raw = pd.read_csv("gs://verexai_automl_text_data/pipeline_root/custommodel/imdb_reviews/data/imdb_movie_reviews_data.csv")

    # The source object is produced by concatenating two CSV files that each
    # start with a header line, so the second header can show up as a data
    # row. Keep only rows whose label parses as a number and store the labels
    # as integers.
    label_values = pd.to_numeric(data_raw["label"], errors="coerce")
    valid_rows = label_values.notna()
    data_clean = data_raw.loc[valid_rows].assign(
        label=label_values[valid_rows].astype(int)
    )

    train, test = tts(data_clean, test_size=0.3)

    # index=False: the shuffled row index carries no information and would
    # otherwise come back as an extra unnamed column when the CSV is re-read.
    train.to_csv(dataset_train.path, index=False)
    test.to_csv(dataset_test.path, index=False)

In [55]:
@component(
    packages_to_install = [
        "pandas",
        # 'sklearn' is only an alias package on PyPI; install the real
        # distribution, pinned to the version printed earlier in this
        # notebook so the 'log' loss name below stays valid.
        # NOTE(review): confirm the pin matches the intended runtime.
        "scikit-learn==0.24.2",
        "joblib",
        "scipy"
    ],
)
def train_movie_reviews_model(
    dataset: Input[Dataset],
    model_artifact: Output[Model]
):
    """Train a bigram Tf-Idf + SGD sentiment model and store it as an artifact.

    Args:
        dataset: input artifact whose path is a CSV with a 'text' and a
            'label' column.
        model_artifact: output artifact. A scikit-learn Pipeline (fitted
            CountVectorizer -> fitted TfidfTransformer -> best SGDClassifier)
            is written to its path with joblib, so the stored object can
            score and predict directly from raw review text. The search
            results are recorded in its metadata.
    """
    # Only what the component uses is imported; 'fire' in particular is not
    # in packages_to_install, so importing it could fail in the container.
    import pandas as pd
    from joblib import dump
    from scipy.stats import uniform
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    from sklearn.linear_model import SGDClassifier
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.pipeline import Pipeline

    imdb_train = pd.read_csv(dataset.path)

    #preprocessing

    # Bigram Counts
    bigram_vectorizer = CountVectorizer(ngram_range=(1, 2))
    X_train_bigram = bigram_vectorizer.fit_transform(imdb_train['text'].values)

    # Bigram Tf-Idf
    bigram_tf_idf_transformer = TfidfTransformer()
    X_train_bigram_tf_idf = bigram_tf_idf_transformer.fit_transform(X_train_bigram)
    y_train = imdb_train['label'].values

    distributions = dict(
        loss=['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
        learning_rate=['optimal', 'invscaling', 'adaptive'],
        eta0=uniform(loc=1e-7, scale=1e-2)
    )

    random_search_cv = RandomizedSearchCV(
        estimator=SGDClassifier(),
        param_distributions=distributions,
        cv=5,
        n_iter=20
    )
    random_search_cv.fit(X_train_bigram_tf_idf, y_train)
    print(f'Best params: {random_search_cv.best_params_}')
    print(f'Best score: {random_search_cv.best_score_}')

    # Bundle the already fitted preprocessing steps with the best classifier.
    # Previously the *path string* of a local copy was pickled into the
    # artifact instead of the estimator, and the fitted vectorizer and
    # transformer were thrown away.
    sentiment_pipeline = Pipeline([
        ('bigram_vectorizer', bigram_vectorizer),
        ('bigram_tf_idf', bigram_tf_idf_transformer),
        ('classifier', random_search_cv.best_estimator_),
    ])
    dump(sentiment_pipeline, model_artifact.path)

    model_artifact.metadata["best_params"] = random_search_cv.best_params_
    model_artifact.metadata["train_score"] = float(random_search_cv.best_score_)
    model_artifact.metadata["framework"] = "Scikit-learn"

In [56]:
@component(
    packages_to_install = [
        "pandas",
        # 'sklearn' is only an alias package on PyPI; install the real
        # distribution, pinned to the version printed earlier in this
        # notebook. NOTE(review): it should match the version that pickled
        # the model - confirm.
        "scikit-learn==0.24.2",
        "joblib"
    ],
)
def eval_model(
    test_set: Input[Dataset],
    movie_review_model: Input[Model],
    metrics: Output[ClassificationMetrics],
    smetrics: Output[Metrics]
):
    """Score the trained model on the test split and log the results.

    Args:
        test_set: input artifact whose path is a CSV with a 'text' and a
            'label' column.
        movie_review_model: input artifact whose path is a joblib file. The
            loaded object must accept raw review text in score() and
            predict().
        metrics: receives the confusion matrix.
        smetrics: receives the scalar metric 'score'.
    """
    import pandas as pd
    from joblib import load
    from sklearn.metrics import confusion_matrix

    imdb_test = pd.read_csv(test_set.path)
    reviews = imdb_test['text'].values
    y_test = imdb_test['label'].values

    model = load(movie_review_model.path)

    # Both calls get the raw text. The previous code created a *new*
    # CountVectorizer/TfidfTransformer and called transform() on them without
    # fitting, which cannot reproduce the training features.
    score = model.score(reviews, y_test)
    y_pred = model.predict(reviews)

    metrics.log_confusion_matrix(
       ["False", "True"],
       confusion_matrix(
           y_test, y_pred
       ).tolist(),  # .tolist() to convert np array to list.
    )

    movie_review_model.metadata["test_score"] = float(score)
    smetrics.log_metric("score", float(score))

In [57]:
# NOTE(review): this cell defines a second function named `pipeline`; once it
# runs, the name no longer refers to the earlier custom-job pipeline.
@dsl.pipeline(
    # Default pipeline root. You can override it when submitting the pipeline.
    pipeline_root="gs://verexai_automl_text_data/pipeline_root/custommodel/imdb_reviews/",
    # A name for the pipeline. Use to determine the pipeline Context.
    name="custom-pipeline-text-sentiment-anlaysis",
)
def pipeline():
    # Split the data, train on the training part, evaluate on the test part.
    dataset_op = get_data()
    train_op = train_movie_reviews_model(dataset_op.outputs["dataset_train"])
    eval_op = eval_model(
        test_set=dataset_op.outputs["dataset_test"],
        movie_review_model=train_op.outputs["model_artifact"]
    )
    # project_id is read from the notebook scope when the pipeline is compiled.
    endpoint_create_op = gcc_aip.EndpointCreateOp(
        project=project_id,
        display_name="pipelines-created-endpoint-text-sa",
    )

    # NOTE(review): the deploy step receives the training component's output
    # artifact directly, whereas the earlier pipeline in this notebook passes
    # the output of a ModelUploadOp - confirm whether an upload step is needed
    # here. The deploy step also does not depend on eval_op, so it is not
    # gated on the evaluation result.
    model_deploy_op = gcc_aip.ModelDeployOp(         
        project=project_id,
        endpoint=endpoint_create_op.outputs["endpoint"],
        model=train_op.outputs["model_artifact"],
        deployed_model_display_name="movie_reviews_model",
        machine_type="n1-standard-4",
    )
    
    
# Compile the pipeline function into a job spec file next to the notebook.
compiler.Compiler().compile(pipeline_func=pipeline,
        package_path='movie_review_pipeline.json')

In [None]:
def deploy_model_with_dedicated_resources_sample(
    project,
    location,
    movie_review_model: Input[Model],
    machine_type: str,
    # Quoted because no `Endpoint` name is imported in the visible cells; an
    # unquoted annotation is evaluated when the function is defined.
    endpoint: "Input[Endpoint]",
    min_replica_count: int = 1,
    max_replica_count: int = 1,
    sync: bool = True,
):
    """Deploy the 'movie_reviews_model' model to an endpoint.

    Args:
        project: project used to initialise the Vertex AI SDK.
        location: region used to initialise the Vertex AI SDK.
        movie_review_model: currently unused. NOTE(review): the model is
            looked up by the fixed name below instead - confirm intent.
        machine_type: machine type of the deployed replicas.
        endpoint: endpoint handed to Model.deploy().
            NOTE(review): presumably an aiplatform.Endpoint - confirm.
        min_replica_count: minimum number of replicas.
        max_replica_count: maximum number of replicas.
        sync: forwarded to Model.deploy().

    Returns:
        The aiplatform.Model that was deployed.
    """
    # The SDK is initialised through `aiplatform` with the caller's project
    # and location (previously `gcc_aip.init` with notebook globals).
    aiplatform.init(project=project, location=location)

    # NOTE(review): confirm that "movie_reviews_model" is a value that
    # aiplatform.Model accepts as model_name in this project.
    model = aiplatform.Model(model_name="movie_reviews_model")

    # The explanation_metadata and explanation_parameters should only be
    # provided for a custom trained model and not an AutoML model.
    # machine_type and the replica counts now come from the arguments; they
    # were hard-coded before, so the parameters had no effect.
    model.deploy(
        endpoint=endpoint,
        deployed_model_display_name="movie_review_model",
        machine_type=machine_type,
        min_replica_count=min_replica_count,
        max_replica_count=max_replica_count,
        sync=sync,
    )

    model.wait()

    print(model.display_name)
    print(model.resource_name)
    return model


In [58]:
from kfp.v2.google.client import AIPlatformClient

# Client bound to the project and region the pipeline run is created in.
api_client = AIPlatformClient(project_id="dna-verizonpoc", region="us-central1")

# Submit the compiled job spec with step caching switched off.
response = api_client.create_run_from_job_spec(
    'movie_review_pipeline.json',
    enable_caching=False,
)

In [71]:
# Concatenate the train and test CSV objects into imdb_movie_reviews_data.csv.
# NOTE(review): both inputs are written by DataFrame.to_csv with a header
# line, so the composed object presumably contains a second 'text,label' line
# in the middle of the data - verify before training on it.
!gsutil compose gs://verexai_automl_text_data/pipeline_root/custommodel/imdb_reviews/data/imdb_train.csv gs://verexai_automl_text_data/pipeline_root/custommodel/imdb_reviews/data/imdb_test.csv gs://verexai_automl_text_data/pipeline_root/custommodel/imdb_reviews/data/imdb_movie_reviews_data.csv 

Composing gs://verexai_automl_text_data/pipeline_root/custommodel/imdb_reviews/data/imdb_movie_reviews_data.csv from 2 component object(s).


In [12]:
# Sanity check: read the composed CSV straight from the bucket and show its
# first rows.
import pandas as pd
data_raw = pd.read_csv("gs://verexai_automl_text_data/pipeline_root/custommodel/imdb_reviews/data/imdb_movie_reviews_data.csv")
data_raw.head()

Unnamed: 0,text,label
0,Throw this lame dog a bone. Sooo bad...you may...,0
1,This film limps from self indulgent moment to ...,0
2,The story is very trustworthy and powerful. Th...,0
3,"I enjoy movies like this for their spirit, no ...",1
4,I wondered why John Wood was not playing Dr. F...,0


In [7]:
# Print the kernel's working directory (relative paths such as 'csv/' and './pipeline/' resolve against it).
!pwd

/home/jupyter/Vertexai-Custom-Model-Covertype/Imdb_reviews_custom


In [19]:
import fsspec