In [31]:
import mlflow
import mlflow.sklearn
import mlflow.tensorflow
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

In [32]:
# Address of the MLflow Tracking Server that every run in this notebook logs to.
TRACKING_URI = "http://127.0.0.1:5001"
mlflow.set_tracking_uri(TRACKING_URI)

In [33]:
def load_and_preprocess_data(dataset_path):
    """Read the labelled message CSV and convert it to numeric features and labels.

    Args:
        dataset_path: Location of the CSV file, passed straight to
            ``pd.read_csv``. The file must provide a ``Message`` column
            (the text) and a ``Category`` column (the label).

    Returns:
        A ``(X, y)`` tuple where ``X`` is the TF-IDF matrix built from
        ``Message`` and ``y`` holds the integer codes that ``LabelEncoder``
        assigns to ``Category``.

    Note:
        The vectorizer and the encoder are both fitted on every row of the
        file, and neither fitted object is returned to the caller.
    """
    frame = pd.read_csv(dataset_path)

    # Text -> numeric features; common English stop words are dropped.
    tfidf = TfidfVectorizer(stop_words='english')
    features = tfidf.fit_transform(frame['Message'])

    # Labels -> integers ('ham' -> 0, 'spam' -> 1 when those are the only classes).
    labels = LabelEncoder().fit_transform(frame['Category'])

    return features, labels

In [34]:
def train_and_log_model(data_path, model_type):
    """Train one classifier on the dataset and record the run in MLflow.

    Args:
        data_path: CSV path forwarded to ``load_and_preprocess_data``.
        model_type: One of ``'LogisticRegression'``, ``'RandomForest'`` or
            ``'NeuralNetwork'``.

    Returns:
        The MLflow run id of the run that was created.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.

    Note:
        The features come from ``load_and_preprocess_data``, which fits the
        TF-IDF vectorizer on the whole file before the train/test split done
        here, so the held-out rows influence the vocabulary and IDF weights.
    """
    supported_types = ('LogisticRegression', 'RandomForest', 'NeuralNetwork')
    # Reject unknown names up front, before any data is loaded or a run is opened.
    if model_type not in supported_types:
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected one of {supported_types}"
        )

    # Load and preprocess the dataset
    X, y = load_and_preprocess_data(data_path)

    # Start the MLflow run
    with mlflow.start_run(run_name=f"model_{model_type}") as run:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        if model_type == 'LogisticRegression':
            model = LogisticRegression(max_iter=1000)
            params = {'max_iter': 1000}

        elif model_type == 'RandomForest':
            model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
            params = {'n_estimators': 100, 'max_depth': 5, 'random_state': 42}

        else:  # 'NeuralNetwork'
            epochs, batch_size = 10, 32
            model = Sequential([
                # Explicit Input object instead of the deprecated `input_dim` argument.
                tf.keras.Input(shape=(X_train.shape[1],)),
                Dense(64, activation='relu'),
                Dense(32, activation='relu'),
                Dense(1, activation='sigmoid')
            ])
            model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
            # Log the training settings too, so the run can be reproduced from its params.
            params = {
                'optimizer': 'Adam',
                'loss': 'binary_crossentropy',
                'epochs': epochs,
                'batch_size': batch_size,
            }

        # Train the model and collect both hard labels and positive-class scores.
        if model_type == 'NeuralNetwork':
            # Dense layers need dense input, so the sparse TF-IDF matrices are densified.
            model.fit(X_train.toarray(), y_train, epochs=epochs, batch_size=batch_size, verbose=0)
            # Keep the raw sigmoid outputs: AUC must be computed from scores,
            # not from labels that were already thresholded.
            y_prob = model.predict(X_test.toarray()).ravel()
            y_pred = (y_prob > 0.5).astype(int)  # probabilities -> binary labels (0 or 1)
        else:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            y_prob = model.predict_proba(X_test)[:, 1]

        # Compute the metrics
        metrics = {
            'accuracy': accuracy_score(y_test, y_pred),
            'auc_score': roc_auc_score(y_test, y_prob)
        }

        # Log parameters, metrics and the model itself
        mlflow.log_params(params)
        mlflow.log_metrics(metrics)
        mlflow.set_tags({'model_type': model_type})

        if model_type == 'NeuralNetwork':
            mlflow.tensorflow.log_model(model, f"model_{model_type}")
        else:
            mlflow.sklearn.log_model(model, f"model_{model_type}")

        print(f"Model {model_type} berhasil dicatat dengan run_id: {run.info.run_id}")
        return run.info.run_id


In [35]:
if __name__ == "__main__":
    # Change this path to wherever your copy of the dataset lives.
    dataset_path = "data/spam.csv"

    # Train the three model families in order, one MLflow run each, and keep
    # every run id under its own name.
    run_id_lr, run_id_rf, run_id_nn = [
        train_and_log_model(dataset_path, model_type)
        for model_type in ('LogisticRegression', 'RandomForest', 'NeuralNetwork')
    ]

    # Summary of the run ids, one per line.
    print(
        f"Logistic Regression run_id: {run_id_lr}",
        f"Random Forest run_id: {run_id_rf}",
        f"Neural Network run_id: {run_id_nn}",
        sep="\n",
    )



Model LogisticRegression berhasil dicatat dengan run_id: 6a93dba5065f4ccc863777c9c0b84730
🏃 View run model_LogisticRegression at: http://127.0.0.1:5001/#/experiments/0/runs/6a93dba5065f4ccc863777c9c0b84730
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/0


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Model RandomForest berhasil dicatat dengan run_id: 3c81f89db0b340a1aa37142fddb25382
🏃 View run model_RandomForest at: http://127.0.0.1:5001/#/experiments/0/runs/3c81f89db0b340a1aa37142fddb25382
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/0
35/35 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step




Model NeuralNetwork berhasil dicatat dengan run_id: a4b902dfb7944d6bb93e71753dcca39b
🏃 View run model_NeuralNetwork at: http://127.0.0.1:5001/#/experiments/0/runs/a4b902dfb7944d6bb93e71753dcca39b
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/0
Logistic Regression run_id: 6a93dba5065f4ccc863777c9c0b84730
Random Forest run_id: 3c81f89db0b340a1aa37142fddb25382
Neural Network run_id: a4b902dfb7944d6bb93e71753dcca39b
