In [1]:
import pandas as pd
import numpy as np

import pickle
import os
import mlflow

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

#### Load the model by its run ID from the AWS S3 bucket

In [3]:
# Run identifier of the logged model; the RUN_ID environment variable
# overrides the default run used in this notebook.
RUN_ID = os.environ.get("RUN_ID", "6f4e767405144ea0abe4bdac7bdcbb43")

# Location of that run's model artifact in the S3 bucket
logged_model = f"s3://mlops-remote-bucket/1/{RUN_ID}/artifacts/model-SVC"

# Load the artifact through MLflow's generic pyfunc interface
model = mlflow.pyfunc.load_model(logged_model)

In [4]:
# Display the loaded model's metadata to confirm the expected run and artifact were loaded

model

mlflow.pyfunc.loaded_model:
  artifact_path: model-SVC
  flavor: mlflow.sklearn
  run_id: 6f4e767405144ea0abe4bdac7bdcbb43

In [5]:
# data preparation


def get_data(data):
    """Load the land-mine CSV and return a cleaned DataFrame.

    The short column headers ``V``, ``H``, ``S`` and ``M`` are renamed to
    ``voltage``, ``height``, ``soil_types`` and ``mine_types``; the mine-type
    labels 1..5 are shifted to 0..4; rows holding an IQR outlier in any
    column are dropped; ``soil_types`` is cast to string so it can be treated
    as a categorical feature.

    Parameters
    ----------
    data : str, path object or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with a fresh ``0..n-1`` index.
    """
    df = pd.read_csv(data).rename(
        columns={"V": "voltage", "H": "height", "S": "soil_types", "M": "mine_types"}
    )

    # Re-label the target so the classes start at 0
    df["mine_types"] = df["mine_types"].replace({1: 0, 2: 1, 3: 2, 4: 3, 5: 4})

    # Tukey fences: anything beyond 1.5 * IQR from the quartiles is an outlier
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)
    iqr = q3 - q1
    is_outlier = (df < (q1 - 1.5 * iqr)) | (df > (q3 + 1.5 * iqr))

    # reset_index returns a new frame, so its result has to be kept; this also
    # gives an independent frame that is safe to assign columns on.
    df = df[~is_outlier.any(axis=1)].reset_index(drop=True)

    df["soil_types"] = df["soil_types"].astype(str)

    return df

In [8]:
# Load and clean the raw CSV (relative path: the file must be in the working directory)
dataset = get_data("Mine_Dataset.csv")

# Display the cleaned frame
dataset

Unnamed: 0,voltage,height,soil_types,mine_types
0,0.338157,0.000000,0.0,0
1,0.320241,0.181818,0.0,0
2,0.287009,0.272727,0.0,0
3,0.256284,0.454545,0.0,0
4,0.262840,0.545455,0.0,0
...,...,...,...,...
333,0.323262,0.909091,0.4,4
334,0.444108,0.181818,1.0,4
335,0.353474,0.454545,1.0,4
336,0.362537,0.727273,1.0,4


In [9]:
# Splitting dataset


def split_dataset(df):
    """Split ``df`` into full-train, train, validation and test frames.

    20% of the rows are held out as the test set; 25% of the remaining
    full-train rows become the validation set. Both splits use
    ``random_state=1``. Every returned frame gets a fresh ``0..n-1`` index.

    Returns
    -------
    tuple
        ``(df_full_train, df_train, df_val, df_test)``
    """
    full_train_part, test_part = train_test_split(df, test_size=0.2, random_state=1)
    train_part, val_part = train_test_split(
        full_train_part, test_size=0.25, random_state=1
    )

    parts = (full_train_part, train_part, val_part, test_part)
    return tuple(part.reset_index(drop=True) for part in parts)

In [10]:
# Split the cleaned dataset; the target column "mine_types" is still present in every part here
df_full_train, df_train, df_val, df_test = split_dataset(dataset)

In [11]:
# Function to get the target value


def get_y(df):
    """Return the values of the ``mine_types`` target column of ``df``."""
    return df["mine_types"].values

In [12]:
# Target arrays for the train / validation / test splits

y_train, y_val, y_test = (get_y(frame) for frame in (df_train, df_val, df_test))

In [13]:
# Remove the target column from the feature frames (the targets were extracted into y_* above).
# This mutates the frames in place, so running this cell a second time raises KeyError.
del df_train["mine_types"]
del df_val["mine_types"]
del df_test["mine_types"]

In [14]:
# Prepare dictionaries


def prepare_dictionaries(df):
    """Convert the feature columns of ``df`` into a list of per-row dicts.

    Only ``soil_types``, ``voltage`` and ``height`` are kept, in that key
    order; any other column of ``df`` is ignored.
    """
    feature_columns = ["soil_types", "voltage", "height"]
    return df[feature_columns].to_dict(orient="records")

In [15]:
# Feature dictionaries for the train / validation / test splits

train_dicts, val_dicts, test_dicts = (
    prepare_dictionaries(frame) for frame in (df_train, df_val, df_test)
)

In [65]:
# Evaluation function of auc_score for individual target


def roc_auc_score_multiclass(actual_class, pred_class, average="macro"):
    """Compute a one-vs-rest ROC AUC for every class found in ``actual_class``.

    For each class the true and predicted labels are binarised (1 for the
    current class, 0 for everything else) and passed to ``roc_auc_score``.
    The scores are therefore based on hard class predictions, not on
    probabilities or decision values.

    Parameters
    ----------
    actual_class : iterable
        True class labels.
    pred_class : iterable
        Predicted class labels, aligned with ``actual_class``.
    average : str, default "macro"
        Forwarded unchanged to ``roc_auc_score``.

    Returns
    -------
    dict
        Maps each class label present in ``actual_class`` to its AUC.
    """
    roc_auc_dict = {}
    for per_class in set(actual_class):
        # Compare against the current class directly. Testing membership in
        # "the other true classes" would count a predicted label that never
        # occurs in actual_class as a positive for every class.
        new_actual_class = [1 if label == per_class else 0 for label in actual_class]
        new_pred_class = [1 if label == per_class else 0 for label in pred_class]

        roc_auc_dict[per_class] = roc_auc_score(
            new_actual_class, new_pred_class, average=average
        )

    return roc_auc_dict

#### Applying the model to a single record from the test dataset (df_test)

In [17]:
df_test

Unnamed: 0,voltage,height,soil_types
0,0.356495,0.363636,0.2
1,0.504531,0.363636,0.2
2,0.549848,0.818182,1.0
3,0.314199,1.000000,1.0
4,0.425981,0.545455,1.0
...,...,...,...
57,0.422960,0.272727,1.0
58,0.338157,0.000000,0.0
59,0.280966,0.818182,0.2
60,0.335347,0.818182,0.0


In [18]:
# length of df_test dataset

len(df_test)

62

In [37]:
# Take the row at positional index 40 of the test set as a single feature dict

land_mine = df_test.iloc[40].to_dict()
land_mine

{'voltage': 0.341389342, 'height': 0.818181818, 'soil_types': '0.4'}

In [32]:
# Predict the class for the single record; element 0 of the result is its label

y_pred = model.predict(land_mine)
y_pred[0]

4

#### Applying the model to the whole test dataset (test_dicts)

In [33]:
test_dicts

[{'soil_types': '0.2', 'voltage': 0.356495062, 'height': 0.363636364},
 {'soil_types': '0.2', 'voltage': 0.504531116, 'height': 0.363636364},
 {'soil_types': '1.0', 'voltage': 0.549848276, 'height': 0.818181818},
 {'soil_types': '1.0', 'voltage': 0.314199046, 'height': 1.0},
 {'soil_types': '1.0', 'voltage': 0.425981373, 'height': 0.545454545},
 {'soil_types': '0.2', 'voltage': 0.438065949, 'height': 0.545454545},
 {'soil_types': '0.0', 'voltage': 0.335347054, 'height': 0.454545455},
 {'soil_types': '0.2', 'voltage': 0.296072182, 'height': 0.909090909},
 {'soil_types': '0.0', 'voltage': 0.471298533, 'height': 0.181818182},
 {'soil_types': '0.8', 'voltage': 0.477340821, 'height': 0.181818182},
 {'soil_types': '1.0', 'voltage': 0.353473918, 'height': 0.454545455},
 {'soil_types': '0.6', 'voltage': 0.558911708, 'height': 0.0},
 {'soil_types': '0.2', 'voltage': 0.326283622, 'height': 0.909090909},
 {'soil_types': '0.0', 'voltage': 0.356495062, 'height': 0.636363636},
 {'soil_types': '0.8',

In [34]:
len(test_dicts)

62

In [38]:
# Predictions for the whole test dataset (one label per record in test_dicts)

y_pred = model.predict(test_dicts)
y_pred

array([4, 1, 1, 2, 3, 1, 2, 4, 3, 2, 4, 4, 4, 3, 4, 0, 4, 2, 1, 0, 0, 0,
       3, 4, 3, 1, 4, 2, 3, 0, 2, 0, 1, 0, 4, 2, 1, 4, 0, 3, 4, 4, 4, 4,
       1, 3, 2, 3, 0, 3, 2, 0, 3, 0, 4, 0, 1, 3, 0, 0, 3, 3])

In [39]:
# Prediction for the record at index 40; it should match the single-record prediction above.
# The batch prediction is recomputed here so the cell does not depend on which
# earlier cell last assigned y_pred.
y_pred = model.predict(test_dicts)
y_pred[40]

4

#### Putting all of the above into a pipeline

In [42]:
# data preparation


def get_data(data):
    """Load the land-mine CSV and return a cleaned DataFrame.

    The short column headers ``V``, ``H``, ``S`` and ``M`` are renamed to
    ``voltage``, ``height``, ``soil_types`` and ``mine_types``; the mine-type
    labels 1..5 are shifted to 0..4; rows holding an IQR outlier in any
    column are dropped; ``soil_types`` is cast to string so it can be treated
    as a categorical feature.

    Parameters
    ----------
    data : str, path object or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with a fresh ``0..n-1`` index.
    """
    df = pd.read_csv(data).rename(
        columns={"V": "voltage", "H": "height", "S": "soil_types", "M": "mine_types"}
    )

    # Re-label the target so the classes start at 0
    df["mine_types"] = df["mine_types"].replace({1: 0, 2: 1, 3: 2, 4: 3, 5: 4})

    # Tukey fences: anything beyond 1.5 * IQR from the quartiles is an outlier
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)
    iqr = q3 - q1
    is_outlier = (df < (q1 - 1.5 * iqr)) | (df > (q3 + 1.5 * iqr))

    # reset_index returns a new frame, so its result has to be kept; this also
    # gives an independent frame that is safe to assign columns on.
    df = df[~is_outlier.any(axis=1)].reset_index(drop=True)

    df["soil_types"] = df["soil_types"].astype(str)

    return df


# Splitting dataset


def split_dataset(df):
    """Split ``df`` into full-train, train, validation and test frames.

    20% of the rows are held out as the test set; 25% of the remaining
    full-train rows become the validation set. Both splits use
    ``random_state=1``. Every returned frame gets a fresh ``0..n-1`` index.

    Returns
    -------
    tuple
        ``(df_full_train, df_train, df_val, df_test)``
    """
    full_train_part, test_part = train_test_split(df, test_size=0.2, random_state=1)
    train_part, val_part = train_test_split(
        full_train_part, test_size=0.25, random_state=1
    )

    parts = (full_train_part, train_part, val_part, test_part)
    return tuple(part.reset_index(drop=True) for part in parts)


# Re-split the cleaned `dataset` (created by an earlier cell); every part still contains "mine_types"
df_full_train, df_train, df_val, df_test = split_dataset(dataset)

In [43]:
# Prepare dictionaries


def prepare_dictionaries(df):
    """Convert the feature columns of ``df`` into a list of per-row dicts.

    Only ``soil_types``, ``voltage`` and ``height`` are kept, in that key
    order; any other column of ``df`` is ignored.
    """
    feature_columns = ["soil_types", "voltage", "height"]
    return df[feature_columns].to_dict(orient="records")


# Feature dictionaries for the train / validation / test splits

train_dicts, val_dicts, test_dicts = (
    prepare_dictionaries(frame) for frame in (df_train, df_val, df_test)
)

In [44]:
# Function to get the target value


def get_y(df):
    """Return the values of the ``mine_types`` target column of ``df``."""
    return df["mine_types"].values


# Target arrays for the train / validation / test splits

y_train, y_val, y_test = (get_y(frame) for frame in (df_train, df_val, df_test))


# Delete the target from each feature frame (the targets were extracted into y_* above).
# This mutates the frames in place, so running this cell a second time raises KeyError.

del df_train["mine_types"]
del df_val["mine_types"]
del df_test["mine_types"]

In [45]:
# function of the load model


def load_model(run_id):
    """Load the SVC model logged under ``run_id`` from the S3 bucket.

    Parameters
    ----------
    run_id : str
        MLflow run id whose ``model-SVC`` artifact should be loaded.

    Returns
    -------
    The object returned by ``mlflow.pyfunc.load_model`` for that artifact.
    """
    # Build the URI from the argument, not from the notebook-level RUN_ID,
    # so the caller decides which run is loaded.
    logged_model = f"s3://mlops-remote-bucket/1/{run_id}/artifacts/model-SVC"
    return mlflow.pyfunc.load_model(logged_model)

#### Applying the full model pipeline to the test data and checking index 40

In [71]:
def apply_model(input_data, run_id):
    """Run the end-to-end scoring pipeline on the held-out test split.

    Loads and cleans the CSV, splits it, predicts the test split with the
    model logged under ``run_id`` and scores those predictions per class.

    Parameters
    ----------
    input_data : str, path object or file-like
        Passed to ``get_data``.
    run_id : str
        MLflow run id passed to ``load_model``.

    Returns
    -------
    tuple
        ``(y_pred, roc_auc_dict)``: the model's predictions for the test
        split and the per-class AUC dict from ``roc_auc_score_multiclass``.
    """
    df = get_data(input_data)

    # Only the test split is scored here; the other splits are not needed.
    _, _, _, df_test = split_dataset(df)

    # prepare_dictionaries selects only the feature columns, so the target
    # column does not have to be removed from df_test first.
    test_dicts = prepare_dictionaries(df_test)
    y_test = get_y(df_test)

    model = load_model(run_id)
    y_pred = model.predict(test_dicts)

    roc_auc_dict = roc_auc_score_multiclass(y_test, y_pred)

    return y_pred, roc_auc_dict

In [72]:
# Call the full pipeline on the CSV; it returns the test-set predictions and the per-class AUC scores

predictions, auc_score = apply_model("Mine_Dataset.csv", run_id=RUN_ID)

In [73]:
predictions

array([4, 1, 1, 2, 3, 1, 2, 4, 3, 2, 4, 4, 4, 3, 4, 0, 4, 2, 1, 0, 0, 0,
       3, 4, 3, 1, 4, 2, 3, 0, 2, 0, 1, 0, 4, 2, 1, 4, 0, 3, 4, 4, 4, 4,
       1, 3, 2, 3, 0, 3, 2, 0, 3, 0, 4, 0, 1, 3, 0, 0, 3, 3])

In [74]:
auc_score

{0: 0.98,
 1: 1.0,
 2: 0.6970108695652174,
 3: 0.8228511530398323,
 4: 0.7051630434782609}

In [75]:
predictions[40]

4