In [1]:
%load_ext lab_black

In [2]:
import kfp
from kfp import dsl
from kfp.components import InputPath, OutputPath
from kfp.components import create_component_from_func

In [3]:
BASE_IMAGE = "python:3.8-slim"

In [4]:
def s3_adapter(bucket_name: str, labels: list, output_dir_path: OutputPath()):

    import io
    import os
    import dill
    import boto3
    import torch
    import pandas as pd
    import transformers
    from io import BytesIO
    from contextlib import contextmanager
    from tempfile import NamedTemporaryFile
    from transformers import PretrainedConfig, PreTrainedModel

    class S3Adapter:
        def __init__(self, labels):
            self.labels = labels

        def folder_exists_and_not_empty(self, bucket_name: str, key: str) -> bool:

            if not key.endswith("/"):
                key = key + "/"
            resp = self.s3_client.list_objects(
                Bucket=bucket_name, Prefix=key, Delimiter="/", MaxKeys=1
            )
            return "Contents" in resp

        @contextmanager
        def s3_fileobj(self, bucket_name, key):

            """
            Yields a file object from the filename at {bucket}/{key}

            Args:
                bucket (str): Name of the S3 bucket where you model is stored
                key (str): Relative path from the base of your bucket, including the filename and extension of the object to be retrieved.
            """

            obj = self.s3_client.get_object(Bucket=bucket_name, Key=key)
            yield BytesIO(obj["Body"].read())

        def get_data_from_s3(self, bucket_name, prefix, data_file):

            data_object = self.s3_client.get_object(
                Bucket=bucket_name, Key=os.path.join(prefix, data_file)
            )["Body"]
            return data_object

        def get_model_tokenizer_from_s3(self, bucket_name, prefix, model_name="model"):

            if not self.folder_exists_and_not_empty(
                bucket_name=bucket_name, key=prefix
            ):
                model = transformers.BertForSequenceClassification.from_pretrained(
                    "bert-base-uncased", num_labels=len(self.labels)
                )
                tokenizer = transformers.BertTokenizer.from_pretrained(
                    "bert-base-uncased"
                )
                self.save_model_tokenizer_into_s3(
                    bucket_name=bucket_name,
                    prefix=prefix,
                    model_name=model_name,
                    model=model,
                    tokenizer=tokenizer,
                )
                return model, tokenizer

            else:
                tempfile = NamedTemporaryFile()
                with self.s3_fileobj(
                    bucket_name=bucket_name, key=f"{prefix}/{model_name}.bin"
                ) as f:
                    tempfile.write(f.read())

                with self.s3_fileobj(
                    bucket_name=bucket_name, key=f"{prefix}/config.json"
                ) as f:
                    dict_data = json.load(f)
                    print(type(dict_data))
                    config = PretrainedConfig.from_dict(dict_data)

                #                 model = PreTrainedModel.from_pretrained(tempfile.name, config=config)
                model = transformers.BertForSequenceClassification.from_pretrained(
                    tempfile.name, config=config
                )
                tokenizer = self.get_tokenizer_from_s3(bucket_name, prefix)
                return model, tokenizer

        def get_tokenizer_from_s3(self, bucket_name, prefix):

            tempfile = NamedTemporaryFile()
            with self.s3_fileobj(
                bucket_name=bucket_name, key=f"{prefix}/vocab.txt"
            ) as f:
                tempfile.write(f.read())
            tokenizer = transformers.BertTokenizer.from_pretrained(tempfile.name)
            return tokenizer

        def save_model_tokenizer_into_s3(
            self, bucket_name, prefix, model, model_name, tokenizer=None
        ):
            buffer = io.BytesIO()
            config_string = model.config.to_json_string()
            self.s3_client.put_object(
                Bucket=bucket_name, Key=f"{prefix}/config.json", Body=config_string
            )
            torch.save(model.state_dict(), buffer)
            self.s3_client.put_object(
                Bucket=bucket_name,
                Key=f"{prefix}/{model_name}.bin",
                Body=buffer.getvalue(),
            )
            if tokenizer:
                vocab_string = ""
                for token, token_id in tokenizer.vocab.items():
                    vocab_string = vocab_string + token + "\n"

                vocab_string = vocab_string.strip()
                self.s3_client.put_object(
                    Bucket=bucket_name, Key=f"{prefix}/vocab.txt", Body=vocab_string
                )

    os.makedirs(output_dir_path, exist_ok=True)
    s3_adpater_obj = S3Adapter(labels=labels)
    s3_pickle_path = os.path.join(output_dir_path, "s3_adapter.pkl")
    with open(s3_pickle_path, "wb") as f:
        dill.dump(s3_adpater_obj, f)

In [5]:
def create_data_loader(
    input_dir_path: InputPath(), labels: list, mode: str, output_dir_path: OutputPath()
):

    import os
    import os
    import dill
    import torch
    import transformers
    from torch.utils.data import Dataset

    class LabelDataset(Dataset):
        """

        LabelDataset class - to load the dataset used the __getitem__ fashion supported by the Pytorch.
        The loader supports the JSON and the csv format for parsing the input to the network.
            :param mode: mode of Label classifier model (train, eval or serve)
            :param text: input text for train, eval and serve components
            :param label: output label for train and eval components
            :param func_test: True for functional testing of package
            :param tokenizer: tokenizer from huggingface library


        """

        def __init__(self, labels, mode="serve", max_length=512, text=None, label=None):
            self.mode = mode
            self.labels = labels
            self.max_length = max_length

        def __getitem__(self, item):
            """

            Returns tokenized tensors for text and label(if mode is train or eval) for the given index.
                :param: item: index to fetch the data.
                :returns dict: dictionary of tensors containing input_ids, attention_mask,
                               token_type_ids and label(output if mode is train or eval)

            """
            if self.mode in ["train_eval", "train", "eval"]:
                text = str(self.text[item])
                processed_text = " ".join(text.split())
                label = self.label[item]
                inputs = self.tokenizer.encode_plus(
                    processed_text,
                    None,
                    add_special_tokens=True,
                    max_length=self.max_length,
                    truncation=True,
                )
                padding_length = self.max_length - len(inputs.get("input_ids"))
                input_ids = inputs.get("input_ids") + ([0] * padding_length)
                attention_mask = inputs.get("attention_mask") + ([0] * padding_length)
                token_type_ids = inputs.get("token_type_ids") + ([0] * padding_length)
                return {
                    "input_ids": torch.tensor(input_ids, dtype=torch.long),
                    "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
                    "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
                    "label": torch.tensor(
                        self.labels.index(label.strip()), dtype=torch.long
                    ),
                }
            elif self.mode == "serve":
                text = str(self.text[item])
                text = " ".join(text.split())
                inputs = self.tokenizer.encode_plus(
                    text,
                    None,
                    add_special_tokens=True,
                    max_length=self.max_length,
                    truncation=True,
                )
                padding_length = self.max_length - len(inputs.get("input_ids"))
                input_ids = inputs.get("input_ids") + ([0] * padding_length)
                attention_mask = inputs.get("attention_mask") + ([0] * padding_length)
                token_type_ids = inputs.get("token_type_ids") + ([0] * padding_length)
                return {
                    "text": text,
                    "input_ids": torch.tensor(input_ids, dtype=torch.long),
                    "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
                    "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
                }

        def __len__(self):
            """

            Returns length of dataset
                :returns int: length of dataset

            """
            return len(self.text)

    os.makedirs(output_dir_path, exist_ok=True)
    with open(os.path.join(input_dir_path, "s3_adapter.pkl"), "rb") as f:
        s3_adapter_obj = dill.load(f)
    with open(os.path.join(output_dir_path, "s3_adapter.pkl"), "wb") as f:
        dill.dump(s3_adapter_obj, f)

    dataset_object = LabelDataset(mode=mode, labels=labels)
    # Writing LabelBackbone class into model.pkl file
    with open(os.path.join(output_dir_path, "dataset.pkl"), "wb") as f:
        dill.dump(dataset_object, f)

In [6]:
def train(
    input_dir_path: InputPath(),
    bucket_name: str,
    prefix: str,
    data_file: str,
    labels: list,
    num_epochs: int,
    batch_size: int,
    device: str,
    output_dir_path: OutputPath(),
):

    """

    Training method for Label Classifier. Saves the model after training of model is completed
        :param num_epochs: Number of epochs for training purpose


    """

    import io
    import os
    import dill
    import json
    import boto3
    import torch
    import numpy as np
    import pandas as pd
    import transformers
    from tqdm import tqdm
    import torch.nn as nn
    from transformers import AdamW
    import torch.nn.functional as f
    from torch.utils.data import DataLoader

    buffer = io.BytesIO()
    s3_client = boto3.client("s3")
    os.makedirs(output_dir_path, exist_ok=True)

    class LabelBackbone(nn.Module):
        """

        LabelBackbone - Backbone model class for Label Classifier
            :param model: Transformer model name from config.TRANSFORMER_MODEL_LIST or saved transformer model name
            :param tokenizer: tokenizer from huggingface library
            :returns Object of LabelBackbone Model

        """

        def __init__(self, model):
            super().__init__()
            self.model = model
            self.drop_out = nn.Dropout(p=0.3)

        def forward(self, **kwargs):
            return self.model(**kwargs)
            #             final_output = self.linear(self.drop_out(po))
            return final_output

    with open(os.path.join(input_dir_path, "s3_adapter.pkl"), "rb") as f:
        s3_adapter_obj = dill.load(f)

    setattr(s3_adapter_obj, "s3_client", s3_client)
    pretrained_model, tokenizer = s3_adapter_obj.get_model_tokenizer_from_s3(
        bucket_name=bucket_name, prefix=f"{prefix}/pretrained"
    )

    #     pretrained_model = s3_adapter_obj.get_model_from_s3(
    #         bucket_name=bucket_name, prefix=f"{prefix}/pretrained_model"
    #     )
    backbone_model = LabelBackbone(model=pretrained_model)

    loss_function = nn.CrossEntropyLoss().to(device)
    optimizer = AdamW(backbone_model.parameters(), lr=3e-5)

    dataframe = pd.read_csv(
        s3_adapter_obj.get_data_from_s3(
            bucket_name=bucket_name, prefix=prefix, data_file=data_file
        )
    )
    print(dataframe.head())
    text = dataframe.loc[:, "text"]
    label = dataframe.loc[:, "label"]

    with open(os.path.join(input_dir_path, "dataset.pkl"), "rb") as f:
        train_dataset = dill.load(f)
    setattr(train_dataset, "text", text)
    setattr(train_dataset, "label", label)
    setattr(train_dataset, "mode", "train")
    setattr(train_dataset, "tokenizer", tokenizer)
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    try:
        backbone_model.to(device)
        backbone_model.train()  # Call the train method from the nn.Module base class
        print("Starting the Training Loop ..")  # Training loop start
        for epoch in range(num_epochs):
            train_loss = 0
            train_accuracy = 0
            print(f"[INFO] Epoch {epoch + 1} Started..")
            for index, batch in tqdm(enumerate(train_data_loader)):
                print(
                    f"[INFO] [TRAINING] Epoch {epoch + 1} Iteration {index + 1} Running.."
                )
                optimizer.zero_grad()
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                token_type_ids = batch["token_type_ids"].to(device)
                label = batch["label"].to(device).to(device)
                output = backbone_model(
                    input_ids=input_ids, attention_mask=attention_mask, labels=label
                )
                #                 loss = loss_function(output, label)
                loss = output[0]
                print(output[1])
                print("loss", loss, type(loss))
                loss.backward()
                optimizer.step()
                train_loss = train_loss + loss.item()
                _, hypothesis = torch.max(output[1], dim=1)
                train_accuracy = (
                    train_accuracy
                    + torch.sum(torch.tensor(hypothesis == label)).item()
                )
            train_accuracy = train_accuracy / (
                len(self.data_loader[0]) * batch_size
            )
            accuracy.append(train_accuracy)
            train_loss = train_loss / (
                len(self.data_loader[0]) * batch_size
            )
            train_st = f"Training Loss: {train_loss} Train Accuracy: {train_accuracy}"
            print(f"Epoch: {epoch+1} {train_st}")

        print("Model has been successfully built..")
        # utils_tools.save_model_bin(model_name=config.MARKER_CLASSIFIER, model=self.model)
        #         accuracy = sum(accuracy) / len(accuracy)
        s3_adapter_obj.save_model_tokenizer_into_s3(
            bucket_name=bucket_name,
            prefix=f"{prefix}/finetuned",
            model=backbone_model.model,
        )

    except (RuntimeError, MemoryError, ValueError, TypeError) as e:
        print("Training Exception Occurred")
        raise RuntimeError("Training Exception Occurred")

In [7]:
s3_adapter_comp = create_component_from_func(
    s3_adapter,
    base_image=BASE_IMAGE,
    packages_to_install=["boto3", "dill", "pandas", "transformers", "torch"],
)
dataset_comp = create_component_from_func(
    create_data_loader,
    base_image=BASE_IMAGE,
    packages_to_install=["torch", "dill", "transformers"],
)
train_comp = create_component_from_func(
    train,
    base_image=BASE_IMAGE,
    packages_to_install=["torch", "pandas", "transformers", "dill", "boto3", "numpy"],
)

In [8]:
@kfp.dsl.pipeline(name="label-classifier-training-pipeline")
def model_pipeline(labels: list):
    mode = "train"
    prefix = "kf-label-classifier"
    bucket_name = "mlops-kubeflow"
    labels = [
        "Action with Deadline",
        "Announcement",
        "Appreciation",
        "Action",
        "Others",
    ]
    s3_adapter_task = s3_adapter_comp(bucket_name=bucket_name, labels=labels)
    dataset_task = dataset_comp(
        input_dir=s3_adapter_task.output, labels=labels, mode=mode
    )
    train_task = train_comp(
        input_dir=dataset_task.output,
        bucket_name=bucket_name,
        prefix=prefix,
        data_file="label.csv",
        labels=labels,
        num_epochs=10,
        batch_size=2,
        device="cpu",
    )

In [9]:
EXPERIMENT_NAME = "label_classifier"
HOST = "https://kubeflow-workos-slvr.anthem.com"
namespace = "chaluvadi-avinash"
session_cookie = "MTY0NTQyNTI5M3xOd3dBTkRkVVRUTkxTa3RVVkVnM1MxTkhUVTFKVmxCQ05WQXpVVTFFVUVSSE0wdE9RVXRGTlROSVRsQkhRVU5NV2xkWU4wRktSRkU9fDyBx8kiey-TPSimHemde3ySqxXCVyP_8OhlH06bBMu3"
client = kfp.Client(
    host=f"{HOST}/pipeline",
    cookies=f"authservice_session={session_cookie}",
    namespace=namespace,
    ssl_ca_cert="./root.pem",
)
experiment = client.create_experiment(name=EXPERIMENT_NAME, namespace=namespace)
client.create_run_from_pipeline_func(
    pipeline_func=model_pipeline,
    arguments={},
    experiment_name=EXPERIMENT_NAME,
    namespace=namespace,
)

RunPipelineResult(run_id=7a807322-4a85-4181-9a9c-03c1e57ff307)

In [11]:
def train(input_dir_path: InputPath(), data_file:str, num_epochs:int, batch_size:int, device:str, label_string:str, model_path:str, tokenizer_path:str, output_dir_path: OutputPath()):
    """
    
        Training method for Label Classifier. Saves the model after training of model is completed
            :param num_epochs: Number of epochs for training purpose
            

    """
    import dill
    import json
    import torch
    import numpy as np
    import pandas as pd
    from tqdm import tqdm
    import torch.nn as nn
    from transformers import AdamW
    import torch.nn.functional as f
    
    
    dataframe = df.read_csv(os.path.join(input_dir_path, data_file_name))
    model_pcikle_path = os.path.join(input_dir_path, "model.pkl")
    with open(model_pcikle_path, "rb") as f:
        model = dill.load(f)
    optimizer = AdamW(self.model.parameters(), lr=3e-5)
    loss_function = nn.CrossEntropyLoss().to(device)
    setattr(model, "model", transformers.BertForSequenceClassification(model_path))
    
#     def create_batches(dataframe):
#         tokenizer = transformers.BertTokenizer.from_pretrained(tokenizer_path)
#         labels = json.loads(label_string)
#         train_data = []
#         for index in range(0, len(df), batch_size):
#             batch_dict = {}
#             end_index = index + batch_size
#             text = df.loc[index: end_index, "text"].values
#             marker = df.loc[index: end_index, "marker"].values.to_list()
#             inputs = tokenizer.batch_encode_plus(text, max_length=512, padding="max_length", truncation=True)
#             batch_dict["input_ids"] = torch.tensor(inputs.get("input_ids"), dtype=torch.long)
#             batch_dict["attention_mask"] = torch.tensor(inputs.get("attention_mask"), dtype=torch.long)
#             batch_dict["token_type_ids"] = torch.tensor(inputs.get("token_type_ids"), dtype=torch.long)
#             batch_data["marker"] = torch.tensor(map(lambda x: labels.index(x.title()),marker), dtype=torch.long)
#             train_data.append(batch_dict)
#         return train_data
    
    train_data = create_batches(dataframe=dataframe)
    
    try:
        model.to(device)
        model.train()  # Call the train method from the nn.Module base class
        print("Starting the Training Loop ..")  # Training loop start
        for epoch in range(num_epochs):
            train_loss = 0
            train_accuracy = 0
            print(f"[INFO] Epoch {epoch + 1} Started..")
            for index, batch in tqdm(enumerate(train_data)):
                print(
                    f"[INFO] [TRAINING] Epoch {epoch + 1} Iteration {index + 1} Running.."
                )
                optimizer.zero_grad()
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                token_type_ids = batch["token_type_ids"].to(device)
                marker = batch["marker"].to(self.device).to(device)
                output = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids,
                )
                loss = loss_function(output, marker)
                loss.backward()
                optimizer.step()
                train_loss = train_loss + loss.item()
                _, hypothesis = torch.max(output, dim=1)
                train_accuracy = (
                    train_accuracy
                    + torch.sum(torch.tensor(hypothesis == marker)).item()
                )
            train_accuracy = train_accuracy / (
                len(self.data_loader[0])
                * config.params.get("train").get("batch_size")
            )
            accuracy.append(train_accuracy)
            train_loss = train_loss / (
                len(self.data_loader[0])
                * config.params.get("train").get("batch_size")
            )
            train_st = (
                f"Training Loss: {train_loss} Train Accuracy: {train_accuracy}"
            )
            print(f"Epoch: {epoch+1} {train_st}")

        print("Model has been successfully built..")
        # utils_tools.save_model_bin(model_name=config.MARKER_CLASSIFIER, model=self.model)
        accuracy = sum(accuracy) / len(accuracy)

        ## save the model chackpoint to minio
            
    except (RuntimeError, MemoryError, ValueError, TypeError) as e:
        print("Training Exception Occurred")
        raise e

In [None]:
"mlops-kubeflow"

In [None]:
s3client.download_file(Bucket='mlops-kubeflow',Key='pipeline_parameters_and_metrics.ipynb',Filename='C:\\Users\\AG98087\\Downloads\\lakshmi_narayana.ipynb')

In [9]:
import boto3

s3 = boto3.resource("s3")
bucket = s3.Bucket("mlops-kubeflow")
bucket.objects.filter(Prefix="kf-label-classifier").delete()

[{'ResponseMetadata': {'RequestId': 'F0FK6N4FRHDK68T2',
   'HostId': 'XxQMbZOzA2p8NDIZuUrnqGOpwkinYXfWq7Z6SUkJ9BmYs7ftxqyzorWNM3WbwKJnn6eqIYm6KEw=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'XxQMbZOzA2p8NDIZuUrnqGOpwkinYXfWq7Z6SUkJ9BmYs7ftxqyzorWNM3WbwKJnn6eqIYm6KEw=',
    'x-amz-request-id': 'F0FK6N4FRHDK68T2',
    'date': 'Mon, 21 Feb 2022 10:05:25 GMT',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3',
    'connection': 'close'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'kf-label-classifier/label.csv',
    'DeleteMarker': True,
    'DeleteMarkerVersionId': '20fd8vej4dPRuV5VzkO81F4h3BpnspuX'},
   {'Key': 'kf-label-classifier/pretrained/model.bin',
    'DeleteMarker': True,
    'DeleteMarkerVersionId': '5euWEIxLxnYY4vYoqskk7IeaQO56yPxE'},
   {'Key': 'kf-label-classifier/pretrained/vocab.txt',
    'DeleteMarker': True,
    'DeleteMarkerVersionId': 'hkpYSPaxjHivAQaEKxNlKNVDCusTDdru'},
   {'Key': 'kf-label-classifi

In [10]:
import boto3

s3.Bucket("mlops-kubeflow").upload_file("label.csv", "kf-label-classifier/label.csv")

In [7]:
import json
import boto3

s3 = boto3.client("s3")
st = "Avinash"
st = st.encode()
s3.put_object(Bucket="mlops-kubeflow", Key="kf-label-classifier/test.bin", Body=st)

{'ResponseMetadata': {'RequestId': 'ZZ3PHH5EF9N96EJN',
  'HostId': 'oJEwzUZLewzk1AvafDAHVojRK9YCpqO/psJrPXzR/ju6kNVPlWs6zP5337PsQb52Y6P2l3DvYS0=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'oJEwzUZLewzk1AvafDAHVojRK9YCpqO/psJrPXzR/ju6kNVPlWs6zP5337PsQb52Y6P2l3DvYS0=',
   'x-amz-request-id': 'ZZ3PHH5EF9N96EJN',
   'date': 'Fri, 18 Feb 2022 15:32:43 GMT',
   'x-amz-version-id': 'dWv30nE68SVSxt1XrRiNkaLvMlF_AY4X',
   'x-amz-expiration': 'expiry-date="Sun, 19 Feb 2023 00:00:00 GMT", rule-id="tf-s3-lifecycle-20220210124446674200000002"',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"97cddd635cef02b3ceaf25641f9b2eee"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 1},
 'Expiration': 'expiry-date="Sun, 19 Feb 2023 00:00:00 GMT", rule-id="tf-s3-lifecycle-20220210124446674200000002"',
 'ETag': '"97cddd635cef02b3ceaf25641f9b2eee"',
 'ServerSideEncryption': 'AES256',
 'VersionId': 'dWv30nE68SVSxt1XrRiNkaLvMlF_AY4X'}

In [2]:
import boto3
s3 = boto3.client("s3")
obj = s3.get_object(Bucket="mlops-kubeflow", Key="test-label-classifier/test.csv")[
    "Body"
]

ModuleNotFoundError: No module named 'boto3'

In [12]:
import pandas as pd

pd.read_csv(obj)
# ByetsIO(obj)

Unnamed: 0,text,label
0,Legato takes to the field on 27th and 28th of ...,Announcement
1,Your great work has earned you a vacation,Appreciation
2,I guess Kajari told you to watch those videos...,Action
3,what no confusion for per week 2 weeks scripts...,Others
4,You are full of constructive suggestions,Appreciation
...,...,...
627,Move it to tested then only I will move it to ...,Action
628,So this particular thing I think this should b...,Others
629,I am busy with UI and UX design and review as ...,Action with Deadline
630,Uh I am putting it in in-review but. Okay you ...,Action


In [70]:
import os
import torch.nn as nn
def test():
    class LabelBackbone(nn.Module):
        """

        LabelBackbone - Backbone model class for Label Classifier
            :param model: Transformer model name from config.TRANSFORMER_MODEL_LIST or saved transformer model name
            :param tokenizer: tokenizer from huggingface library
            :returns Object of LabelBackbone Model

        """

        def __init__(self):
            super().__init__()
    #             self.model = transformers.BertForSequenceClassification(model_path)
            self.drop_out = nn.Dropout(p=0.3)
            self.linear = nn.Linear(768, 3)

        def forward(self, input_ids, attention_mask, token_type_ids):
            _, po = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                return_dict=False,
            )
            final_output = self.linear(self.drop_out(po))
            return final_output

    backbone_model = LabelBackbone()
    print("**********", LabelBackbone.__name__)
    # Writing LabelBackbone class into model.pkl file
#     with open("test.pkl", "wb") as f:
#         dill.dump(backbone_model, f)

b'Avinash'

In [42]:
with open("test.pkl", "rb") as f:
    ob = dill.load(f)

In [43]:
dill.source.getsource(ob)

OSError: could not extract source code

In [73]:
import pandas as pd
from io import BytesIO
pd.read_csv(obj)

Unnamed: 0,text,marker
0,Yeah so this particular login screen code you ...,Others
1,Yeah I have shared with you and divesh both of...,Others
2,Yeah divesh please go ahead. This piece of cod...,Others
3,Hello yes divesh your voice is not audible. He...,Others
4,I think. Still some noise is happening. This p...,Others
...,...,...
275,Okay yeah yeah suresh anything pending from yo...,Others
276,"Um, uh, I don't think so himanshu so actually ...",Others
277,Wednesday is our demo so today by end of the d...,Others
278,Thank you guys so uh whatever you guys have sh...,Others


In [55]:
from io import BytesIO
import torch
by = BytesIO(obj["Body"].read())
type(by)

_io.BytesIO

In [58]:
torch.jit.load(by)

RuntimeError: PytorchStreamReader failed reading zip archive: not a ZIP archive

In [28]:
import transformers
import numpy as np
# tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")
# torch.load(".bin")

In [10]:
inputs = tokenizer.batch_encode_plus(np.array(["Hi How are you", "Can we connect ?"]))

In [11]:
inputs["input_ids"]

[[101, 7632, 2129, 2024, 2017, 102], [101, 2064, 2057, 7532, 1029, 102]]

In [16]:
import torch
torch.tensor(inputs["input_ids"], dtype=torch.long)

tensor([[ 101, 7632, 2129,  ...,    0,    0,    0],
        [ 101, 2064, 2057,  ...,    0,    0,    0]])

In [21]:
from contextlib import contextmanager 
from io import BytesIO 
import torch
from tempfile import NamedTemporaryFile 

In [22]:
@contextmanager 
def s3_fileobj(bucket, key): 
    """
    Yields a file object from the filename at {bucket}/{key}

    Args:
        bucket (str): Name of the S3 bucket where you model is stored
        key (str): Relative path from the base of your bucket, including the filename and extension of the object to be retrieved.
    """
    s3 = boto3.client("s3") 
    obj = s3.get_object(Bucket=bucket, Key=key) 
    yield BytesIO(obj["Body"].read()) 

In [25]:
def load_model(bucket, path_to_model, model_name='pytorch_model'):
    """
    Load a model at the given S3 path. It is assumed that your model is stored at the key:

        '{path_to_model}/{model_name}.bin'

    and that a config has also been generated at the same path named:

        f'{path_to_model}/config.json'

    """
    tempfile = NamedTemporaryFile() 
    with s3_fileobj(bucket, path_to_model) as f: 
        tempfile.write(f.read()) 
 
#     with s3_fileobj(bucket, f'{path_to_model}/config.json') as f: 
#         dict_data = json.load(f) 
#         config = PretrainedConfig.from_dict(dict_data) 
 
    tokenizer = transformers.BertTokenizer.from_pretrained(tempfile.name)
    return tokenizer

In [26]:
model = load_model('mlops-kubeflow', 'kf-label-classifier/vocab.txt')

Exception ignored in: <function tqdm.__del__ at 0x7f6f5e5a3670>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/tqdm/std.py", line 1147, in __del__
    self.close()
  File "/opt/conda/lib/python3.8/site-packages/tqdm/notebook.py", line 286, in close
    self.disp(bar_style='danger', check_delay=False)
AttributeError: 'tqdm' object has no attribute 'disp'
Exception ignored in: <function tqdm.__del__ at 0x7f6f5e5a3670>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/tqdm/std.py", line 1147, in __del__
    self.close()
  File "/opt/conda/lib/python3.8/site-packages/tqdm/notebook.py", line 286, in close
    self.disp(bar_style='danger', check_delay=False)
AttributeError: 'tqdm' object has no attribute 'disp'


In [71]:
def train(
    bucket_name: str,
    prefix: str,
    data_file: str,
    labels: list,
    num_epochs: int,
    batch_size: int,
    device: str,
):

    """

    Training method for Label Classifier. Saves the model after training of model is completed
        :param num_epochs: Number of epochs for training purpose


    """

    import io
    import os
    import dill
    import json
    import boto3
    import torch
    import numpy as np
    import pandas as pd
    import transformers
    from tqdm import tqdm
    import torch.nn as nn
    from transformers import AdamW
    import torch.nn.functional as f
    from torch.utils.data import DataLoader
    from contextlib import contextmanager
    from tempfile import NamedTemporaryFile
    from transformers import PretrainedConfig, PreTrainedModel

    buffer = io.BytesIO()
    s3_client = boto3.client("s3")
    #     os.makedirs(output_dir_path, exist_ok=True)

    class LabelBackbone(nn.Module):
        """

        LabelBackbone - Backbone model class for Label Classifier
            :param model: Transformer model name from config.TRANSFORMER_MODEL_LIST or saved transformer model name
            :param tokenizer: tokenizer from huggingface library
            :returns Object of LabelBackbone Model

        """

        def __init__(self, model):
            super().__init__()
            self.model = model
            self.drop_out = nn.Dropout(p=0.3)

        def forward(self, **kwargs):
            return self.model(**kwargs)
            #             final_output = self.linear(self.drop_out(po))
            return final_output

    class S3Adapter:
        def __init__(self, labels, client):
            self.s3_client = client
            self.labels = labels

        def folder_exists_and_not_empty(self, bucket_name: str, key: str) -> bool:

            if not key.endswith("/"):
                key = key + "/"
            resp = self.s3_client.list_objects(
                Bucket=bucket_name, Prefix=key, Delimiter="/", MaxKeys=1
            )
            return "Contents" in resp

        @contextmanager
        def s3_fileobj(self, bucket_name, key):

            """
            Yields a file object from the filename at {bucket}/{key}

            Args:
                bucket (str): Name of the S3 bucket where you model is stored
                key (str): Relative path from the base of your bucket, including the filename and extension of the object to be retrieved.
            """

            obj = self.s3_client.get_object(Bucket=bucket_name, Key=key)
            yield io.BytesIO(obj["Body"].read())

        def get_data_from_s3(self, bucket_name, prefix, data_file):

            data_object = self.s3_client.get_object(
                Bucket=bucket_name, Key=os.path.join(prefix, data_file)
            )["Body"]
            return data_object

        def get_model_tokenizer_from_s3(self, bucket_name, prefix, model_name="model"):

            if not self.folder_exists_and_not_empty(
                bucket_name=bucket_name, key=prefix
            ):
                model = transformers.BertForSequenceClassification.from_pretrained(
                    "bert-base-uncased", num_labels=5
                )
                tokenizer = transformers.BertTokenizer.from_pretrained(
                    "bert-base-uncased"
                )
                self.save_model_tokenizer_into_s3(
                    bucket_name=bucket_name,
                    prefix=prefix,
                    model_name=model_name,
                    model=model,
                    tokenizer=tokenizer,
                )
                return model, tokenizer

            else:
                tempfile = NamedTemporaryFile()
                with self.s3_fileobj(
                    bucket_name=bucket_name, key=f"{prefix}/{model_name}.bin"
                ) as f:

                    print("....")
                    tempfile.write(f.read())

                with self.s3_fileobj(
                    bucket_name=bucket_name, key=f"{prefix}/config.json"
                ) as f:
                    dict_data = json.load(f)
                    print(type(dict_data))
                    config = PretrainedConfig.from_dict(dict_data)
                print(tempfile.name)
                #                 model = PreTrainedModel.from_pretrained(
                #                     tempfile.name, config=config, num_labels=5
                #                 )
                model2 = transformers.BertForSequenceClassification.from_pretrained(
                    tempfile.name, config=config
                )
                print("------>model_done")
                tokenizer = self.get_tokenizer_from_s3(bucket_name, prefix)
                return model, tokenizer

        def get_tokenizer_from_s3(self, bucket_name, prefix):

            tempfile = NamedTemporaryFile()
            with self.s3_fileobj(
                bucket_name=bucket_name, key=f"{prefix}/vocab.txt"
            ) as f:
                tempfile.write(f.read())
            tokenizer = transformers.BertTokenizer.from_pretrained(tempfile.name)
            return tokenizer

        def save_model_tokenizer_into_s3(
            self, bucket_name, prefix, model, model_name, tokenizer=None
        ):
            buffer = io.BytesIO()
            config_string = model.config.to_json_string()
            self.s3_client.put_object(
                Bucket=bucket_name, Key=f"{prefix}/config.json", Body=config_string
            )
            torch.save(model.state_dict(), buffer)
            self.s3_client.put_object(
                Bucket=bucket_name,
                Key=f"{prefix}/{model_name}.bin",
                Body=buffer.getvalue(),
            )
            if tokenizer:
                vocab_string = ""
                for token, token_id in tokenizer.vocab.items():
                    vocab_string = vocab_string + token + "\n"

                vocab_string = vocab_string.strip()
                self.s3_client.put_object(
                    Bucket=bucket_name, Key=f"{prefix}/vocab.txt", Body=vocab_string
                )

    s3_adapter_obj = S3Adapter(labels=labels, client=s3_client)
    pretrained_model, tokenizer = s3_adapter_obj.get_model_tokenizer_from_s3(
        bucket_name=bucket_name, prefix=f"{prefix}/pretrained"
    )

    #     pretrained_model = s3_adapter_obj.get_model_from_s3(
    #         bucket_name=bucket_name, prefix=f"{prefix}/pretrained_model"
    #     )
    backbone_model = LabelBackbone(model=pretrained_model)

    loss_function = nn.CrossEntropyLoss().to(device)
    optimizer = AdamW(backbone_model.parameters(), lr=3e-5)

    dataframe = pd.read_csv(
        s3_adapter_obj.get_data_from_s3(
            bucket_name=bucket_name, prefix=prefix, data_file=data_file
        )
    )
    print(dataframe.head())
    text = dataframe.loc[:, "text"]
    label = dataframe.loc[:, "label"]

    with open(os.path.join(input_dir_path, "dataset.pkl"), "rb") as f:
        train_dataset = dill.load(f)
    setattr(train_dataset, "text", text)
    setattr(train_dataset, "label", label)
    setattr(train_dataset, "mode", "train")
    setattr(train_dataset, "tokenizer", tokenizer)
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    try:
        backbone_model.to(device)
        backbone_model.train()  # Call the train method from the nn.Module base class
        print("Starting the Training Loop ..")  # Training loop start
        for epoch in range(num_epochs):
            train_loss = 0
            train_accuracy = 0
            print(f"[INFO] Epoch {epoch + 1} Started..")
            for index, batch in tqdm(enumerate(train_data_loader)):
                print(
                    f"[INFO] [TRAINING] Epoch {epoch + 1} Iteration {index + 1} Running.."
                )
                optimizer.zero_grad()
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                token_type_ids = batch["token_type_ids"].to(device)
                label = batch["label"].to(device).to(device)
                output = backbone_model(
                    input_ids=input_ids, attention_mask=attention_mask, labels=label
                )
                loss = loss_function(output, marker)
                test_loss = output[0]
                print("loss", test_loss, test_loss.item(), loss)
                loss.backward()
                optimizer.step()
                train_loss = train_loss + loss.item()
                _, hypothesis = torch.max(output, dim=1)
                train_accuracy = (
                    train_accuracy
                    + torch.sum(torch.tensor(hypothesis == marker)).item()
                )
            train_accuracy = train_accuracy / (
                len(self.data_loader[0]) * config.params.get("train").get("batch_size")
            )
            accuracy.append(train_accuracy)
            train_loss = train_loss / (
                len(self.data_loader[0]) * config.params.get("train").get("batch_size")
            )
            train_st = f"Training Loss: {train_loss} Train Accuracy: {train_accuracy}"
            print(f"Epoch: {epoch+1} {train_st}")

        print("Model has been successfully built..")
        # utils_tools.save_model_bin(model_name=config.MARKER_CLASSIFIER, model=self.model)
        accuracy = sum(accuracy) / len(accuracy)
        s3_adapter_obj.save_model_tokenizer_into_s3(
            bucket_name=bucket_name,
            prefix=f"{prefix}/finetuned",
            model=backbone_model.model,
        )

    except (RuntimeError, MemoryError, ValueError, TypeError) as e:
        print("Training Exception Occurred")
        raise RuntimeError("Training Exception Occurred")

In [72]:
mode = "train"
prefix = "kf-label-classifier"
bucket_name = "mlops-kubeflow"
labels = [
    "Action with Deadline",
    "Announcement",
    "Appreciation",
    "Action",
    "Others",
]
    
train(
    bucket_name=bucket_name,
    prefix=prefix,
    data_file="label.csv",
    labels=labels,
    num_epochs=10,
    batch_size=2,
    device="cpu",
)

....
<class 'dict'>
/tmp/tmpa0dy6u_e
------>model_done




UnboundLocalError: local variable 'model' referenced before assignment

In [10]:
import torch

In [11]:
a = torch.rand(4, 4)

In [12]:
a

tensor([[0.7953, 0.3103, 0.1301, 0.9803],
        [0.1159, 0.4120, 0.9926, 0.8895],
        [0.1988, 0.9261, 0.9853, 0.2093],
        [0.2053, 0.1303, 0.6152, 0.2943]])

In [14]:
_, index = torch.max(a, dim=1)

In [16]:
index

tensor([3, 2, 2, 2])

In [21]:
torch.sum(index == torch.tensor([1, 2, 3, 4])).item()

1