In [1]:
# Name of the Kubeflow Pipelines experiment; it is passed as `experiment_name`
# when the pipeline run is submitted at the end of the notebook.
experiment_name ='demo-kfp-model-training'
import random
import kfp
from kfp import dsl
from kfp.components import create_component_from_func
from kfp.components import InputPath, OutputPath

In [2]:
# Container image passed as `base_image` to every component created from the
# Python functions defined in this notebook.
BASE_IMAGE = "python:3.8-slim"

In [3]:
# Download the data from minio
def download_data(datset: str, output_dir_path: OutputPath()):
    """Fetch one CSV object from the MinIO ``dataset`` bucket and save it as ``data.csv``.

    All imports live inside the function because it is turned into a Kubeflow
    Pipelines component with ``create_component_from_func`` and therefore has
    to be self-contained.

    Args:
        datset: Name of the object (a CSV file) to read from the ``dataset`` bucket.
        output_dir_path: Output directory for this component. It is created
            if it does not exist and ``data.csv`` is written inside it.

    Environment:
        ``MINIO_ENDPOINT``, ``MINIO_ACCESS_KEY`` and ``MINIO_SECRET_KEY`` may be
        set to override the connection settings. When unset, the values that
        were previously hard-coded are used, so existing runs behave the same.
    """
    from minio import Minio
    import pandas as pd
    import os
    import urllib3

    os.makedirs(output_dir_path, exist_ok=True)
    file_path = os.path.join(output_dir_path, "data.csv")
    print("file_path:", file_path)

    # Connection settings: overridable through the environment so credentials
    # do not have to be edited in the notebook; defaults keep the old behavior.
    endpoint = os.environ.get("MINIO_ENDPOINT", "10.110.71.235:9000")
    access_key = os.environ.get("MINIO_ACCESS_KEY", "minio")
    secret_key = os.environ.get("MINIO_SECRET_KEY", "minio123")

    # NOTE(review): the HTTP client is a ProxyManager whose proxy URL is the
    # MinIO endpoint itself. Kept as-is to preserve behavior; confirm whether a
    # proxy is actually intended here.
    minio_client = Minio(
        endpoint,
        access_key=access_key,
        secret_key=secret_key,
        secure=False,
        http_client=urllib3.ProxyManager(
            f"http://{endpoint}/",
            timeout=urllib3.Timeout.DEFAULT_TIMEOUT,
            cert_reqs="CERT_REQUIRED",
            retries=urllib3.Retry(
                total=5,
                backoff_factor=0.2,
                status_forcelist=[500, 502, 503, 504],
            ),
        ),
    )

    obj = minio_client.get_object("dataset", datset)
    try:
        df = pd.read_csv(obj)
    finally:
        # Always hand the HTTP connection back to the pool, even if parsing fails.
        obj.close()
        obj.release_conn()
    print(df.head())

    # index=False: otherwise the row index is written as an extra unnamed
    # column that downstream steps would read back as a regular data column.
    df.to_csv(file_path, index=False)


In [4]:
# Exploratory Data Analysis 

def explore_data_analysis(input_dir_path: InputPath(), output_dir_path: OutputPath()):
    """One-hot encode the categorical columns of the input CSV and write the result.

    Reads the first file (in sorted name order) found in ``input_dir_path``,
    replaces the ``cp``, ``slope``, ``thal`` and ``restecg`` columns with
    indicator columns, and writes the resulting table to ``accuracy.csv``
    inside ``output_dir_path``.

    All imports live inside the function because it is turned into a Kubeflow
    Pipelines component with ``create_component_from_func`` and therefore has
    to be self-contained.

    Args:
        input_dir_path: Directory containing the CSV file to process.
        output_dir_path: Directory the encoded CSV is written to; created if
            it does not exist.

    Raises:
        FileNotFoundError: If ``input_dir_path`` contains no entries.
    """
    import os
    import pandas as pd

    print("input_dir_path: ", input_dir_path)
    # Sorted so the chosen file does not depend on directory listing order.
    dir_items = sorted(os.listdir(input_dir_path))
    print("dir_items: ", dir_items)
    if not dir_items:
        raise FileNotFoundError(f"no input file found in {input_dir_path!r}")

    os.makedirs(output_dir_path, exist_ok=True)
    file_path = os.path.join(output_dir_path, "accuracy.csv")
    input_file_path = os.path.join(input_dir_path, dir_items[0])

    print("file_path:", file_path)
    df = pd.read_csv(input_file_path)
    print(df.head())

    # A CSV written together with its row index comes back with "Unnamed: N"
    # columns; drop them so a row counter never ends up among the features.
    df = df.loc[:, ~df.columns.str.startswith("Unnamed:")]

    # `cp` drops its first level; the remaining categoricals keep every level.
    chest_pain = pd.get_dummies(df['cp'], prefix='cp', drop_first=True)
    df = pd.concat([df.drop(columns=['cp']), chest_pain], axis=1)

    categorical = ['slope', 'thal', 'restecg']
    indicators = [pd.get_dummies(df[col], prefix=col) for col in categorical]
    df = pd.concat([df, *indicators], axis=1).drop(columns=categorical)

    # index=False keeps the row index out of the file read by the next step.
    df.to_csv(file_path, index=False)
    print(df.head())


In [5]:
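# NOTE: disabled (commented-out) Keras variant of the training step, kept for reference only.
# NOTE: `classifier.save_weights("classifier.h5")` below writes to the working directory, not to output_dir_path.
# def classification_model(input_dir_path: InputPath(), output_dir_path: OutputPath()):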
#     import pandas as pd  
#     from tensorflow.keras.models import Sequential
#     from tensorflow.keras.layers import Dense
# #     import keras
#     from sklearn.model_selection import train_test_split
#     from sklearn.preprocessing import StandardScaler
#     import os

#     print("input_dir_path: ", input_dir_path)
#     dir_items = os.listdir(input_dir_path)
#     print("dir_items: ", dir_items)
    
#     os.makedirs(output_dir_path, exist_ok=True)
    
#     file_path = os.path.join(output_dir_path, "classifier" + '.json')
    
#     input_file_path = os.path.join(input_dir_path ,dir_items[0])
    
#     df = pd.read_csv(input_file_path)

#     X = df.drop(['target'], axis = 1)
#     y = df.target.values
#     sc = StandardScaler()
#     x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.3,  random_state = 20)
#     x_train = sc.fit_transform(x_train)
#     x_test = sc.transform(x_test)

#     classifier = Sequential()
#     classifier.add(Dense(units = 11,  activation = 'relu', input_dim = 22))
#     classifier.add(Dense(units = 11, activation = 'relu'))
#     classifier.add(Dense(units = 1,  activation = 'sigmoid'))
#     classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
    
#     classifier.fit(x_train, y_train, batch_size = 10, epochs = 1)
  
#     # serialize model to JSON
#     model_json = classifier.to_json()
    
#     with open(file_path, "w") as json_file:
#         json_file.write(model_json)

#     # serialize weights to HDF5
#     classifier.save_weights("classifier.h5")
#     print("Saved model to disk")
    

In [6]:
def classifier_model(input_dir_path: InputPath(), output_dir_path: OutputPath()):
    """Fit a logistic-regression classifier on the input CSV and pickle it.

    Reads the first file (in sorted name order) found in ``input_dir_path``,
    uses every column except ``target`` as a feature, fits on a 70% training
    split, and writes the fitted model to ``lrmodel.pkl`` inside
    ``output_dir_path``.

    All imports live inside the function because it is meant to be turned into
    a Kubeflow Pipelines component with ``create_component_from_func`` and
    therefore has to be self-contained.

    Args:
        input_dir_path: Directory containing the prepared CSV file.
        output_dir_path: Directory the pickled model is written to; created if
            it does not exist.

    Raises:
        FileNotFoundError: If ``input_dir_path`` contains no entries.
    """
    import os
    import pickle

    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    print("input_dir_path: ", input_dir_path)
    # Sorted so the chosen file does not depend on directory listing order.
    dir_items = sorted(os.listdir(input_dir_path))
    print("dir_items: ", dir_items)
    if not dir_items:
        raise FileNotFoundError(f"no input file found in {input_dir_path!r}")

    os.makedirs(output_dir_path, exist_ok=True)
    file_path = os.path.join(output_dir_path, "lrmodel.pkl")
    input_file_path = os.path.join(input_dir_path, dir_items[0])

    df = pd.read_csv(input_file_path)

    X = df.drop(columns=['target'])
    y = df['target'].values
    # Only the training part of the split is used; the 30% hold-out is not
    # evaluated in this step.
    x_train, _, y_train, _ = train_test_split(X, y, test_size=0.3, random_state=20)

    classifier = LogisticRegression()
    classifier.fit(x_train, y_train)

    # Context manager so the model file is flushed and closed deterministically.
    with open(file_path, 'wb') as model_file:
        pickle.dump(classifier, model_file)

    

In [7]:
def classifier_knn_model(input_dir_path: InputPath(), output_dir_path: OutputPath()):
    """Fit a k-nearest-neighbours classifier on the input CSV and pickle it.

    Reads the first file (in sorted name order) found in ``input_dir_path``,
    uses every column except ``target`` as a feature, fits on a 70% training
    split, and writes the fitted model to ``knnmodel.pkl`` inside
    ``output_dir_path``.

    All imports live inside the function because it is meant to be turned into
    a Kubeflow Pipelines component with ``create_component_from_func`` and
    therefore has to be self-contained.

    Args:
        input_dir_path: Directory containing the prepared CSV file.
        output_dir_path: Directory the pickled model is written to; created if
            it does not exist.

    Raises:
        FileNotFoundError: If ``input_dir_path`` contains no entries.
    """
    import os
    import pickle

    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier

    print("input_dir_path: ", input_dir_path)
    # Sorted so the chosen file does not depend on directory listing order.
    dir_items = sorted(os.listdir(input_dir_path))
    print("dir_items: ", dir_items)
    if not dir_items:
        raise FileNotFoundError(f"no input file found in {input_dir_path!r}")

    os.makedirs(output_dir_path, exist_ok=True)
    file_path = os.path.join(output_dir_path, "knnmodel.pkl")
    input_file_path = os.path.join(input_dir_path, dir_items[0])

    df = pd.read_csv(input_file_path)

    X = df.drop(columns=['target'])
    y = df['target'].values
    # Only the training part of the split is used; the 30% hold-out is not
    # evaluated in this step.
    x_train, _, y_train, _ = train_test_split(X, y, test_size=0.3, random_state=20)

    knn = KNeighborsClassifier()
    knn.fit(x_train, y_train)

    # Context manager so the model file is flushed and closed deterministically.
    with open(file_path, 'wb') as model_file:
        pickle.dump(knn, model_file)

In [8]:
# Wrap the plain Python functions as pipeline components. Each component runs
# on BASE_IMAGE with the listed packages installed on top of it.
download_data_opp = create_component_from_func(
    download_data,
    base_image=BASE_IMAGE,
    packages_to_install=['minio', 'pandas'],
)

eda_opp = create_component_from_func(
    explore_data_analysis,
    base_image=BASE_IMAGE,
    packages_to_install=['pandas'],
)

# Disabled training components, kept for reference.
# NOTE(review): the PyPI distribution is named 'scikit-learn'; installing the
# deprecated 'sklearn' alias is expected to fail - confirm before re-enabling.
# classification_model_opp = create_component_from_func(classifier_model,base_image=BASE_IMAGE, packages_to_install=['pandas','sklearn'] )

# classifier_model_knn_opp = create_component_from_func(classifier_knn_model,base_image=BASE_IMAGE, packages_to_install=['pandas','sklearn'] )

In [17]:
# Pipeline definition: download the dataset, then run the EDA / encoding step
# on the downloaded output. `data` is the object name handed to the download
# component. (Comments are used instead of a docstring so the compiled
# pipeline metadata stays exactly as before.)
@dsl.pipeline(name='model-training-pipeline')
def model_pipeline(data="heart.csv"):
    # Step 1: fetch the dataset.
    download_task = download_data_opp(data)
    # Step 2: feed the download output into the EDA component.
    eda_task = eda_opp(input_dir=download_task.output)

    # Disabled training steps, kept for reference:
#     classification_model_task = classification_model_opp(input_dir=eda_task.output)
#     classifier_model_knn_opp(input_dir=eda_task.output)

In [18]:
# Compile the pipeline function into a YAML package file.
pipeline_compiler = kfp.compiler.Compiler()
pipeline_compiler.compile(model_pipeline, 'model-training-pipeline.yaml')

# Submit a pipeline run under the configured experiment.
# NOTE(review): host=None leaves endpoint selection to kfp.Client's defaults -
# presumably this only works from inside the cluster; confirm.
kfp_endpoint = None
kfp_client = kfp.Client(host=kfp_endpoint)
kfp_client.create_run_from_pipeline_func(
    model_pipeline,
    arguments={},
    experiment_name=experiment_name,
)

RunPipelineResult(run_id=166c56a1-a696-4cc6-8eb7-fdcee9bf137d)