In [12]:
# example
import mlflow
from mlflow.models import infer_signature
import numpy as np
from sklearn.linear_model import LogisticRegression

# Train a toy logistic-regression model and log it to MLflow inside one run.
with mlflow.start_run():
    # Six single-feature samples, written directly as a (6, 1) column.
    X = np.array([[-2], [-1], [0], [1], [2], [1]])
    y = np.array([0, 0, 1, 1, 1, 0])
    lr = LogisticRegression().fit(X, y)  # fit() returns the estimator itself

    # The signature stores the model's input/output schema with the artifact.
    signature = infer_signature(X, lr.predict(X))
    model_info = mlflow.sklearn.log_model(
        sk_model=lr,
        artifact_path="model",
        signature=signature,
    )

# Reload the logged model through the generic pyfunc flavor and score new data.
sklearn_pyfunc = mlflow.pyfunc.load_model(model_uri=model_info.model_uri)
data = np.array([[-4], [1], [0], [10], [-2], [1]])
predictions = sklearn_pyfunc.predict(data)

In [13]:
# pandas is imported here because this cell runs before the notebook's main
# import cell; without it `pd` is undefined on a fresh kernel (NameError).
import pandas as pd

# Load the diabetes dataset straight from its raw GitHub URL.
df = pd.read_csv('https://raw.githubusercontent.com/digipodium/Datasets/main/classfication/diabetes.csv')

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [15]:
# Load the diabetes classification dataset from its raw GitHub URL.
url ='https://raw.githubusercontent.com/digipodium/Datasets/main/classfication/diabetes.csv'
df = pd.read_csv(url)
# Bare last expression: the notebook renders the DataFrame as the cell output.
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [16]:
# Baseline: train one decision-tree pipeline and record it as a single MLflow run.

with mlflow.start_run():
    # 'Outcome' is the target column; every other column is used as a feature.
    X = df.drop('Outcome', axis=1)
    y = df['Outcome']
    # Hold out 30% of the rows; the fixed seed keeps the split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42)
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        # Seed the tree as well: an unseeded DecisionTreeClassifier breaks ties
        # between equally good splits at random, so the logged model and its
        # scores would change from one execution of this cell to the next.
        ('model', DecisionTreeClassifier(random_state=42))
    ])
    pipe.fit(X_train, y_train)
    predictions = pipe.predict(X_test)
    print(classification_report(y_test, predictions, output_dict=True))
    # Store the input/output schema with the model so it can be validated
    # when the model is loaded for inference.
    signature = infer_signature(X_test, predictions)
    mlflow.sklearn.log_model(pipe, "model", signature=signature)
    mlflow.log_param("model", "DecisionTreeClassifier")

{'0': {'precision': 0.8074074074074075, 'recall': 0.7218543046357616, 'f1-score': 0.7622377622377622, 'support': 151.0}, '1': {'precision': 0.5625, 'recall': 0.675, 'f1-score': 0.6136363636363636, 'support': 80.0}, 'accuracy': 0.7056277056277056, 'macro avg': {'precision': 0.6849537037037037, 'recall': 0.6984271523178809, 'f1-score': 0.6879370629370629, 'support': 231.0}, 'weighted avg': {'precision': 0.7225909892576559, 'recall': 0.7056277056277056, 'f1-score': 0.710774074410438, 'support': 231.0}}




## Comparing models with MLflow

In [17]:
# 'Outcome' is the target column; every other column is used as a feature.
X = df.drop('Outcome', axis=1)
y = df['Outcome']
# Hold out 30% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Candidate pipelines: identical scaling step, different classifier.
# The tree and the forest are randomized estimators, so they are seeded;
# otherwise the metrics being compared would change on every re-run.
pipe1 = Pipeline([('scaler', StandardScaler()), ('model', DecisionTreeClassifier(random_state=42))])
pipe2 = Pipeline([('scaler', StandardScaler()), ('model', KNeighborsClassifier())])
pipe3 = Pipeline([('scaler', StandardScaler()), ('model', RandomForestClassifier(random_state=42))])
pipe4 = Pipeline([('scaler', StandardScaler()), ('model', SVC())])

In [18]:
def train_n_log_model(model, model_name):
    """Fit ``model`` and record the result as one MLflow run.

    The run is created in the "Diabetes Classification" experiment. The model
    is fitted on the notebook-level ``X_train``/``y_train`` and scored on
    ``X_test``/``y_test``. The run logs the model name as a param, the overall
    accuracy plus weighted-average recall, precision and f1-score as metrics,
    and the fitted model itself together with an inferred signature.

    Parameters
    ----------
    model
        Object exposing ``fit`` and ``predict`` (here: a scikit-learn pipeline).
    model_name
        Value logged for the ``model`` param; also used as the artifact path.
    """
    mlflow.set_experiment("Diabetes Classification")
    with mlflow.start_run():
        model.fit(X_train, y_train)
        test_preds = model.predict(X_test)
        model_signature = infer_signature(X_test, test_preds)
        report = classification_report(y_test, test_preds, output_dict=True)

        mlflow.log_param("model", model_name)
        mlflow.log_metric("accuracy", report['accuracy'])
        # The remaining metrics all come from the report's weighted average.
        for metric_key in ("recall", "precision", "f1-score"):
            mlflow.log_metric(metric_key, report['weighted avg'][metric_key])

        mlflow.sklearn.log_model(model, model_name, signature=model_signature)

In [19]:
# Train and log every candidate pipeline, one MLflow run per model.
for candidate, candidate_name in (
    (pipe1, "DecisionTreeClassifier"),
    (pipe2, "KNeighborsClassifier"),
    (pipe3, "RandomForestClassifier"),
    (pipe4, "SVC"),
):
    train_n_log_model(candidate, candidate_name)

2024/10/06 12:09:02 INFO mlflow.tracking.fluent: Experiment with name 'Diabetes Classification' does not exist. Creating a new experiment.
