In [1]:
import pandas as pd
import numpy as np

import pickle
import mlflow

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

In [2]:
# Send all tracking calls in this notebook to the MLflow server on localhost:5000.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
# Select the experiment that runs are recorded under; per the log output below,
# MLflow creates it when it does not exist yet.
mlflow.set_experiment("landmine-detector-experiment")

2024/07/31 15:04:28 INFO mlflow.tracking.fluent: Experiment with name 'landmine-detector-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://mlops-remote-bucket/2', experiment_id='2', lifecycle_stage='active', name='landmine-detector-experiment', tags={}>

In [3]:
# data preparation


def get_data(data):
    """Load the mine dataset and return a cleaned DataFrame.

    Steps, in order:
      1. Read the CSV and rename the columns ``V``/``H``/``S``/``M`` to
         ``voltage``/``height``/``soil_types``/``mine_types``.
      2. Shift the ``mine_types`` labels from 1..5 down to 0..4.
      3. Drop every row in which any column lies outside the
         ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` fences of that column.
      4. Renumber the index 0..n-1 after the rows were dropped.
      5. Cast ``soil_types`` to ``str`` so it is handled as a categorical value.

    Parameters
    ----------
    data : str or file-like
        Anything ``pandas.read_csv`` accepts (path, URL, open file, buffer).

    Returns
    -------
    pandas.DataFrame
        Cleaned frame with a contiguous ``RangeIndex``.
    """
    df = pd.read_csv(data)

    df = df.rename(
        columns={"V": "voltage", "H": "height", "S": "soil_types", "M": "mine_types"}
    )

    df["mine_types"] = df["mine_types"].replace({1: 0, 2: 1, 3: 2, 4: 3, 5: 4})

    # The IQR fences are computed per column over every column of the frame,
    # the target included.
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1
    is_outlier = ((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)

    # reset_index returns a new frame, so its result has to be assigned;
    # otherwise the filtered frame keeps the gaps left by the removed rows.
    # Working on that new frame also avoids assigning into a filtered slice below.
    df = df[~is_outlier].reset_index(drop=True)

    cat_variable = ["soil_types"]
    df[cat_variable] = df[cat_variable].astype(str)

    return df

In [6]:
# Load the raw CSV and run it through the cleaning steps in get_data().
dataset = get_data("Mine_Dataset.csv")

# Bare last expression so the notebook renders the cleaned frame.
dataset

Unnamed: 0,voltage,height,soil_types,mine_types
0,0.338157,0.000000,0.0,0
1,0.320241,0.181818,0.0,0
2,0.287009,0.272727,0.0,0
3,0.256284,0.454545,0.0,0
4,0.262840,0.545455,0.0,0
...,...,...,...,...
333,0.323262,0.909091,0.4,4
334,0.444108,0.181818,1.0,4
335,0.353474,0.454545,1.0,4
336,0.362537,0.727273,1.0,4


In [7]:
# Splitting dataset


def split_dataset(df):
    """Partition ``df`` into full-train, train, validation and test frames.

    20% of the rows are held out as the test set; the remaining 80% (the
    "full train" set) is split again 75/25 into train and validation. Both
    splits use ``random_state=1``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_full_train, df_train, df_val, df_test)``, each with its index
        renumbered from 0.
    """
    full_train_part, test_part = train_test_split(df, test_size=0.2, random_state=1)
    train_part, val_part = train_test_split(
        full_train_part, test_size=0.25, random_state=1
    )

    # Renumber every partition only after both splits have been drawn.
    return tuple(
        part.reset_index(drop=True)
        for part in (full_train_part, train_part, val_part, test_part)
    )

In [8]:
# split_dataset holds out 20% for test, then 25% of the remainder for validation,
# i.e. a 60/20/20 train/validation/test partition of `dataset`.
df_full_train, df_train, df_val, df_test = split_dataset(dataset)

In [9]:
# Function to get the target value


def get_y(df):
    """Return the ``mine_types`` column of ``df`` as an array of target values."""
    return df["mine_types"].values

In [10]:
# Individual targets for the split dataset

# Pull the target array out of each partition in one pass.
y_train, y_val, y_test = (
    get_y(partition) for partition in (df_train, df_val, df_test)
)

In [11]:
# Remove the target from the feature frames so it cannot leak into the model
# inputs. drop(..., errors="ignore") keeps this cell re-runnable: a second
# execution is a no-op, whereas `del df["mine_types"]` raises KeyError.
df_train = df_train.drop(columns=["mine_types"], errors="ignore")
df_val = df_val.drop(columns=["mine_types"], errors="ignore")
df_test = df_test.drop(columns=["mine_types"], errors="ignore")

In [12]:
# Prepare dictionaries


def prepare_dictionaries(df):
    """Convert the feature columns of ``df`` into a list of per-row dicts.

    Only ``soil_types``, ``voltage`` and ``height`` are kept (in that order);
    any other column in ``df`` is ignored. The result is one dict per row.
    """
    feature_columns = ["soil_types", "voltage", "height"]
    return df[feature_columns].to_dict(orient="records")

In [13]:
# individual dictionaries of the split dataset

# Build the per-row feature dicts for each partition.
train_dicts, val_dicts, test_dicts = map(
    prepare_dictionaries, (df_train, df_val, df_test)
)

In [14]:
# Evaluation function of auc_score for individual target


def roc_auc_score_multiclass(actual_class, pred_class, average="macro"):
    """Compute a one-vs-rest ROC AUC score for every class in ``actual_class``.

    For each distinct label in ``actual_class`` the true and predicted labels
    are binarized (1 = this class, 0 = anything else) and passed to
    scikit-learn's ``roc_auc_score``.

    Parameters
    ----------
    actual_class : iterable
        True labels. Only the labels that occur here get a score.
    pred_class : iterable
        Predicted labels, aligned with ``actual_class``.
    average : str, default "macro"
        Forwarded unchanged to ``roc_auc_score``.

    Returns
    -------
    dict
        ``{class_label: roc_auc}`` with one entry per distinct true label.

    Notes
    -----
    The inputs are hard labels, not scores, so each ROC curve has a single
    operating point.
    """
    roc_auc_dict = {}
    for per_class in set(actual_class):
        # Binarize with an equality test. Testing "not in the other true
        # classes" would count a predicted label that never occurs in
        # actual_class as a positive for every class.
        new_actual_class = [1 if x == per_class else 0 for x in actual_class]
        new_pred_class = [1 if x == per_class else 0 for x in pred_class]

        # using scikit-learn method to calculate the roc_auc_score
        roc_auc_dict[per_class] = roc_auc_score(
            new_actual_class, new_pred_class, average=average
        )

    return roc_auc_dict

In [15]:
# logging into MLflow

with mlflow.start_run():
    # Everything logged inside this block is attached to one new MLflow run.

    mlflow.set_tag("developer", "Emmanuel")

    # Record where the data came from. Passing the DataFrames themselves would
    # only store their string repr as the parameter value. Train and validation
    # are both partitions of the same file that was given to get_data().
    data_path = "Mine_Dataset.csv"
    mlflow.log_param("train-data", data_path)
    mlflow.log_param("val-data-path", data_path)

    SVC_best_params = {
        "C": 239.7017845360123,
        "kernel": "poly",
        "degree": 3,
        "gamma": "scale",
        "coef0": 9.717139430035742,
        "tol": 0.6958712596862648,
        "cache_size": 139,
        "decision_function_shape": "ovo",
    }

    mlflow.log_params(SVC_best_params)

    dv = DictVectorizer(sparse=False)
    model = SVC(**SVC_best_params)

    X_train = dv.fit_transform(train_dicts)
    model.fit(X_train, y_train)

    # Log the model only after fit(); logging it earlier stores an unfitted
    # estimator that cannot predict once it is loaded back.
    mlflow.sklearn.log_model(model, artifact_path="model")

    X_val = dv.transform(val_dicts)
    y_pred = model.predict(X_val)

    accuracy = (y_val == y_pred).mean()

    # Do not name this `roc_auc_score`: that would rebind the scikit-learn
    # function used inside roc_auc_score_multiclass and break a re-run of this cell.
    roc_auc_by_class = roc_auc_score_multiclass(y_val, y_pred)

    print(SVC_best_params, accuracy, roc_auc_by_class)

    mlflow.log_metric("accuracy", accuracy)
    # One metric per class actually present in y_val, so a class missing from
    # the validation split does not raise KeyError.
    for mine_class, class_auc in roc_auc_by_class.items():
        mlflow.log_metric(f"roc_auc_score_{mine_class}", class_auc)

    # The fitted vectorizer is stored as its own artifact so it can be
    # downloaded and reused for preprocessing at prediction time.
    with open("dict_vectorizer.bin", "wb") as f_out:
        pickle.dump(dv, f_out)
    mlflow.log_artifact("dict_vectorizer.bin", artifact_path="dict-preprocessor")



{'C': 239.7017845360123, 'kernel': 'poly', 'degree': 3, 'gamma': 'scale', 'coef0': 9.717139430035742, 'tol': 0.6958712596862648, 'cache_size': 139, 'decision_function_shape': 'ovo'} 0.7049180327868853 {0: 0.8977272727272727, 1: 0.9375, 2: 0.6163636363636363, 3: 0.8325320512820512, 4: 0.7295918367346939}


#### Using the MLflow client to download the DictVectorizer

In [16]:
from mlflow.tracking import MlflowClient

In [18]:
# Same tracking server address that was passed to mlflow.set_tracking_uri above.
tracking_uri = "http://127.0.0.1:5000"
# NOTE(review): hard-coded ID of one specific earlier run; it has to be updated
# by hand to fetch the artifact of a different run.
RUN_ID = "2441a3e55caa4e02b962150a04fa47fd"

# Client used below to fetch artifacts from that run.
client = MlflowClient(tracking_uri=tracking_uri)

In [20]:
# Download the pickled DictVectorizer that the training run logged under
# "dict-preprocessor"; `path` is then opened as a local file in the next cell.
# NOTE(review): check whether the installed MLflow version recommends
# mlflow.artifacts.download_artifacts for this instead - confirm against its docs.
path = client.download_artifacts(
    run_id=RUN_ID, path="dict-preprocessor/dict_vectorizer.bin"
)

In [21]:
# Restore the vectorizer from the downloaded artifact.
# NOTE: pickle.load can execute arbitrary code, so only load artifacts that
# come from a tracking server and run you trust.
with open(path, "rb") as f_in:
    dv = pickle.load(f_in)

In [22]:
# Display the restored vectorizer to confirm the artifact unpickled correctly.
dv