# In [1]:
import kfp
from kfp import dsl
import kfp.components as components

# In [14]:
from typing import NamedTuple

def download_and_load_data():
    """Make sure the toxic-comments CSV is present in the MinIO bucket.

    Tries to open ``dataset/toxic_comments.csv`` in the ``mlpipeline``
    bucket. If MinIO answers with an S3 error (e.g. the object does not
    exist), the CSV is downloaded from GitHub, written to the working
    directory and uploaded under that object name.

    Raises:
        requests.HTTPError: the fallback download returned an error status.
    """
    # Imports are local so the function body is self-contained.
    from minio import Minio
    from minio.error import S3Error
    import requests

    minio_client = Minio(
        "192.168.1.10:30950",
        access_key="minio",
        secret_key="minio123",
        secure=False
    )
    minio_bucket = "mlpipeline"

    filename = "toxic_comments.csv"
    object_name = "dataset/" + filename

    try:
        response = minio_client.get_object(minio_bucket, object_name)
    except S3Error as err:
        # Only S3-level failures trigger the fallback download; anything else
        # (connection errors, Ctrl-C, ...) propagates instead of being hidden.
        print("Dataset not available in MinIO (" + str(err.code) + "), downloading")

        url = "https://raw.githubusercontent.com/agungfazrulhaq/toxicdetection/main/data/train.csv"
        r = requests.get(url, allow_redirects=True, timeout=60)
        # Never store an HTTP error page as if it were the dataset.
        r.raise_for_status()
        with open(filename, 'wb') as out_file:
            out_file.write(r.content)

        print("Downloaded file "+filename)
        minio_client.fput_object(minio_bucket, object_name, filename)
        print("Stored downloaded dataset")
    else:
        # The object exists; its content is not needed here, so just give the
        # connection back to the pool.
        response.close()
        response.release_conn()

def transform_and_split_train(split_size: float = 0.9):
    """Clean the comment text and store train/validation arrays in MinIO.

    Downloads ``dataset/toxic_comments.csv`` from the ``mlpipeline`` bucket,
    lowercases each comment, drops stopwords, replaces every non-word
    character with a space, then splits the rows in file order. The four
    resulting arrays are saved as ``.npy`` files and uploaded as
    ``commentoxic/x_train``, ``commentoxic/y_train``, ``commentoxic/x_val``
    and ``commentoxic/y_val``.

    Args:
        split_size: fraction of rows (taken from the top of the file) that
            goes to the training set; the rest becomes the validation set.
    """
    # Imports are local so the function body is self-contained.
    import math
    import re

    from minio import Minio
    import numpy as np
    import pandas as pd

    minio_client = Minio(
        "192.168.1.10:30950",
        access_key="minio",
        secret_key="minio123",
        secure=False
    )
    minio_bucket = "mlpipeline"

    local_csv = "/tmp/toxic_comments.csv"
    label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    # Built once as a set: membership is tested for every word of every row.
    stopwords = frozenset([
        "a", "about", "above", "after", "again", "against", "all", "am", "an",
        "and", "any", "are", "as", "at", "be", "because", "been", "before",
        "being", "below", "between", "both", "but", "by", "could", "did", "do",
        "does", "doing", "down", "during", "each", "few", "for", "from",
        "further", "had", "has", "have", "having", "he", "he'd", "he'll",
        "he's", "her", "here", "here's", "hers", "herself", "him", "himself",
        "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in",
        "into", "is", "it", "it's", "its", "itself", "let's", "me", "more",
        "most", "my", "myself", "nor", "of", "on", "once", "only", "or",
        "other", "ought", "our", "ours", "ourselves", "out", "over", "own",
        "same", "she", "she'd", "she'll", "she's", "should", "so", "some",
        "such", "than", "that", "that's", "the", "their", "theirs", "them",
        "themselves", "then", "there", "there's", "these", "they", "they'd",
        "they'll", "they're", "they've", "this", "those", "through", "to",
        "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll",
        "we're", "we've", "were", "what", "what's", "when", "when's", "where",
        "where's", "which", "while", "who", "who's", "whom", "why", "why's",
        "with", "would", "you", "you'd", "you'll", "you're", "you've", "your",
        "yours", "yourself", "yourselves",
    ])
    non_word = re.compile(r'[^\w]')

    def remove_stopwords(sentence):
        """
        Removes a list of stopwords

        Args:
            sentence (string): sentence to remove the stopwords from

        Returns:
            sentence (string): lowercase sentence without the stopwords
        """
        words = sentence.lower().split()
        return " ".join(w for w in words if w not in stopwords)

    def remove_symbols(sentence):
        """Replace every non-word character in ``sentence`` with a space."""
        return non_word.sub(' ', sentence)

    # Read the CSV from the same path it was downloaded to.
    minio_client.fget_object(minio_bucket, "dataset/toxic_comments.csv", local_csv)
    df_train = pd.read_csv(local_csv)
    # Explicit copy: new columns are added below, which must not target a
    # view of df_train.
    train = df_train[['comment_text'] + label_columns].copy()

    train["text_no_stopwords"] = train["comment_text"].apply(remove_stopwords)
    train["text_final"] = train["text_no_stopwords"].apply(remove_symbols)

    def train_val_split(data, split):
        """Split ``data`` in row order into train and validation parts.

        The first ``ceil(len(data) * split)`` rows form the training set,
        the same boundary a row-by-row ``len(train) < len(data) * split``
        test produces.

        Returns:
            (train_texts, train_labels, val_texts, val_labels)
        """
        n_rows = len(data)
        n_train = min(n_rows, max(0, math.ceil(n_rows * split)))

        # Go through a Python list so the text arrays get a unicode dtype
        # (not object), keeping them loadable with a plain np.load.
        texts = data['text_final'].tolist()
        labels = data[label_columns].to_numpy()

        return (
            np.array(texts[:n_train]),
            labels[:n_train],
            np.array(texts[n_train:]),
            labels[n_train:],
        )

    x_train, y_train, x_val, y_val = train_val_split(train, split_size)

    y_train = np.asarray(y_train).astype(np.float32)
    y_val = np.asarray(y_val).astype(np.float32)

    np.save("/tmp/x_train.npy", x_train)
    minio_client.fput_object(minio_bucket, "commentoxic/x_train", "/tmp/x_train.npy")

    np.save("/tmp/y_train.npy", y_train)
    minio_client.fput_object(minio_bucket, "commentoxic/y_train", "/tmp/y_train.npy")

    np.save("/tmp/x_val.npy", x_val)
    minio_client.fput_object(minio_bucket, "commentoxic/x_val", "/tmp/x_val.npy")

    np.save("/tmp/y_val.npy", y_val)
    minio_client.fput_object(minio_bucket, "commentoxic/y_val", "/tmp/y_val.npy")

    print("Train shape :", x_train.shape)
    print("Validation shape :", x_val.shape)

def building_tokenizer(oov_token: str = "<OOV>"):
    """Fit a Keras ``Tokenizer`` on the training texts and store it in MinIO.

    Downloads ``commentoxic/x_train`` from the ``mlpipeline`` bucket, fits a
    tokenizer on it, pickles the tokenizer to ``/tmp/tokenizer.pickle`` and
    uploads that file as ``commentoxic/tokenizer.pickle``.

    Args:
        oov_token: token passed to ``Tokenizer(oov_token=...)``.
    """
    # Imports are local so the function body is self-contained.
    import pickle

    from minio import Minio
    from tensorflow.keras.preprocessing.text import Tokenizer
    import numpy as np

    minio_client = Minio(
        "192.168.1.10:30950",
        access_key="minio",
        secret_key="minio123",
        secure=False
    )
    minio_bucket = "mlpipeline"

    tokenizer_path = "/tmp/tokenizer.pickle"

    def fit_tokenizer(train_sentences, oov_token):
        """Return a new ``Tokenizer`` fitted on ``train_sentences``."""
        tokenizer = Tokenizer(oov_token=oov_token)
        tokenizer.fit_on_texts(train_sentences)

        return tokenizer

    minio_client.fget_object(minio_bucket, "commentoxic/x_train", "/tmp/x_train.npy")
    x_train = np.load("/tmp/x_train.npy")

    tokenizer = fit_tokenizer(x_train, oov_token)

    # Write to the same absolute path that is uploaded below.
    with open(tokenizer_path, 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    minio_client.fput_object(minio_bucket, "commentoxic/tokenizer.pickle", tokenizer_path)
    
    
def building_model(n_epochs:int = 10,
                   optimizer:str = "adam",
                   max_len:int = 120) -> NamedTuple('Output', [('mlpipeline_metrics', 'Metrics')]):
    """Train the toxic-comment classifier and publish model and metrics.

    Downloads the pickled tokenizer and the train/validation arrays from the
    ``mlpipeline`` bucket, pads the tokenized texts to ``max_len``, trains an
    embedding + BiLSTM + Conv1D network with six sigmoid outputs, and uploads
    the result both as ``commentoxic/toxic_model.h5`` and as a model directory
    under ``commentoxic/model/toxicmodel/1``.

    Args:
        n_epochs: number of training epochs.
        optimizer: optimizer name passed to ``model.compile``.
        max_len: length every sequence is padded/truncated to.

    Returns:
        A one-field named tuple whose ``mlpipeline_metrics`` is a JSON string
        with the last-epoch accuracy, loss, val_accuracy and val_loss.
    """
    # Imports are local so the function body is self-contained.
    import json
    import os
    import pickle
    from collections import namedtuple

    from minio import Minio
    import tensorflow as tf
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    import numpy as np

    minio_client = Minio(
        "192.168.1.10:30950",
        access_key="minio",
        secret_key="minio123",
        secure=False
    )
    minio_bucket = "mlpipeline"

    EMBEDDING_DIM = 16
    PADDING = 'post'
    TRUNCATING = 'post'

    # The tokenizer must be loaded before the model is built: the vocabulary
    # size determines the embedding's input dimension.
    minio_client.fget_object(minio_bucket, "commentoxic/tokenizer.pickle", "/tmp/tokenizer.pickle")
    # NOTE(security): pickle.load executes code embedded in the file; only
    # safe while the bucket contents are trusted.
    with open('/tmp/tokenizer.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)

    minio_client.fget_object(minio_bucket, "commentoxic/x_train", "/tmp/x_train.npy")
    x_train = np.load("/tmp/x_train.npy")

    minio_client.fget_object(minio_bucket, "commentoxic/y_train", "/tmp/y_train.npy")
    y_train = np.load("/tmp/y_train.npy")

    minio_client.fget_object(minio_bucket, "commentoxic/x_val", "/tmp/x_val.npy")
    x_val = np.load("/tmp/x_val.npy")

    minio_client.fget_object(minio_bucket, "commentoxic/y_val", "/tmp/y_val.npy")
    y_val = np.load("/tmp/y_val.npy")

    word_index = tokenizer.word_index
    # Tokenizer indices start at 1 (0 is the padding value), so the largest
    # index is len(word_index) and the embedding needs one more row.
    NUM_WORDS = len(word_index) + 1

    # Build the model
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(NUM_WORDS, EMBEDDING_DIM, input_length=max_len),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16, return_sequences=True)),
        tf.keras.layers.Conv1D(filters=64, kernel_size=5, activation='relu'),
        tf.keras.layers.MaxPooling1D(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Conv1D(filters=32, kernel_size=5, activation='relu'),
        tf.keras.layers.GlobalMaxPooling1D(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(6, activation='sigmoid')
    ])

    # summary() prints by itself; wrapping it in print() only adds "None".
    model.summary()

    # Compile the model
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    training_sequences = tokenizer.texts_to_sequences(x_train)
    training_padded = pad_sequences(training_sequences, maxlen=max_len, padding=PADDING, truncating=TRUNCATING)

    validation_sequences = tokenizer.texts_to_sequences(x_val)
    validation_padded = pad_sequences(validation_sequences, maxlen=max_len, padding=PADDING, truncating=TRUNCATING)

    # Train the model
    history = model.fit(training_padded, y_train, epochs=n_epochs, validation_data=(validation_padded, y_val))

    model.save("/tmp/toxicmodel")
    model.save("/tmp/toxic_model.h5")

    minio_client.fput_object(minio_bucket, "commentoxic/toxic_model.h5", "/tmp/toxic_model.h5")

    def upload_local_directory_to_minio(local_path, bucket_name, minio_path):
        """Upload every file below ``local_path`` to ``bucket_name``.

        Object names are ``minio_path`` plus the file's path relative to
        ``local_path``, always joined with single forward slashes.
        """
        if not os.path.isdir(local_path):
            raise NotADirectoryError(local_path)

        # Strip a trailing slash so joined keys never contain "//".
        prefix = minio_path.rstrip("/")
        for root, _dirs, files in os.walk(local_path):
            rel_dir = os.path.relpath(root, local_path)
            for name in files:
                rel_file = name if rel_dir == "." else os.path.join(rel_dir, name)
                remote_path = prefix + "/" + rel_file.replace(os.sep, "/")
                minio_client.fput_object(bucket_name, remote_path, os.path.join(root, name))

    upload_local_directory_to_minio("/tmp/toxicmodel", minio_bucket, "commentoxic/model/toxicmodel/1/")

    # Accuracies are fractions in [0, 1] and shown as percentages; losses are
    # unbounded and therefore reported as raw numbers.
    metrics = {
      'metrics': [{
          'name': 'model_accuracy',
          'numberValue':  float(history.history["accuracy"][-1]),
          'format' : "PERCENTAGE"
        },{
          'name': 'model_loss',
          'numberValue':  float(history.history["loss"][-1]),
          'format' : "RAW"
        },{
          'name': 'model_val_accuracy',
          'numberValue':  float(history.history["val_accuracy"][-1]),
          'format' : "PERCENTAGE"
        },{
          'name': 'model_val_loss',
          'numberValue':  float(history.history["val_loss"][-1]),
          'format' : "RAW"
        }]}

    output = namedtuple('output', ['mlpipeline_metrics'])
    return output(json.dumps(metrics))
    
    
    