In [1]:
# ! pip install sentence-transformers
# ! pip install mlflow

In [2]:
# Standard library
import logging  # Python logging module for logging messages
import pickle  # Serialising embedding vectors to disk
from urllib.parse import urlparse  # Parsing URLs

# Data handling
import numpy as np
import pandas as pd

# Joblib for saving and loading models
import joblib

# MLflow for experiment tracking
import mlflow

# SentenceTransformer for text embeddings
from sentence_transformers import SentenceTransformer

# Scikit-learn for various ML models
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# Evaluation metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_curve, auc

# Model selection and resampling utilities
from sklearn.model_selection import GridSearchCV
from sklearn.utils import resample



In [3]:
def convert_text_to_vectors(data, filename):
    """
    Embed the ``text`` column of ``data`` with SentenceTransformer and pickle the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``text`` column. Missing values in that column are
        embedded as the empty string. The caller's frame is not modified.
    filename : str or path-like
        Destination file; overwritten with the pickled vectors.

    Returns
    -------
    The object returned by ``model.encode`` for the list of texts
    (presumably one embedding row per input row — confirm against the
    sentence-transformers version in use).
    """
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Hand the encoder a plain list rather than a pandas Series: the encoder
    # indexes its input positionally, which on a Series with a non-default
    # index (e.g. after resampling or a train/test split) is a label lookup
    # and either raises KeyError or picks the wrong rows.
    texts = data['text'].fillna('').tolist()
    vectors = model.encode(texts)

    # Store vectors in a file
    with open(filename, 'wb') as file:
        pickle.dump(vectors, file)

    return vectors

In [4]:
def _positive_class_scores(clf, features, hard_predictions):
    """Return continuous positive-class scores for ``features`` when ``clf`` offers them.

    Tries ``predict_proba`` (second column of a two-column output), then a
    one-dimensional ``decision_function``; falls back to ``hard_predictions``
    when neither is available in binary form.
    """
    if hasattr(clf, "predict_proba"):
        proba = np.asarray(clf.predict_proba(features))
        if proba.ndim == 2 and proba.shape[1] == 2:
            return proba[:, 1]
    if hasattr(clf, "decision_function"):
        scores = np.asarray(clf.decision_function(features))
        if scores.ndim == 1:
            return scores
    return hard_predictions


def train_model(clf, param_grid, train_emb, val_emb, test_emb, y_train, y_val, y_test):
    """
    Tune ``clf`` with a grid search, refit it on the training data and score it on the test set.

    Parameters
    ----------
    clf : estimator
        Estimator exposing ``set_params``, ``fit`` and ``predict``. It is
        mutated in place (parameters set, then fitted) and also returned.
    param_grid : dict or list of dict
        Grid handed to ``GridSearchCV``.
    train_emb, y_train
        Features/labels used for the final fit.
    val_emb, y_val
        Features/labels on which the 5-fold, accuracy-scored grid search runs.
    test_emb, y_test
        Held-out features/labels used for the reported metrics.

    Returns
    -------
    tuple
        ``(clf, acc, aucpr)``: the fitted estimator, test accuracy, and the
        area under the test precision-recall curve.
    """
    # Parameter tuning: 5-fold CV over the validation split only.
    # NOTE(review): the search never sees the training split — confirm this
    # is intended rather than tuning on train (or train + validation).
    grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy')
    grid_search.fit(val_emb, y_val)

    # Get best parameters
    best_params = grid_search.best_params_

    # Refit on the training data (validation data is not included) with the best parameters
    clf.set_params(**best_params)
    clf.fit(train_emb, y_train)

    # Make predictions on the test set
    y_pred = clf.predict(test_emb)

    # Calculate accuracy
    acc = accuracy_score(y_test, y_pred)

    # The PR curve needs continuous scores; hard labels collapse it to a
    # single threshold and make the resulting area largely meaningless.
    y_score = _positive_class_scores(clf, test_emb, y_pred)
    precision, recall, _ = precision_recall_curve(y_test, y_score)
    aucpr = auc(recall, precision)

    return clf, acc, aucpr

In [5]:
def log_model_metrics(model, acc, aucpr, model_name):
    """
    Persist ``model`` with joblib and record it, with its metrics, in a new MLflow run.

    Parameters
    ----------
    model
        Fitted scikit-learn model to save and log.
    acc
        Accuracy value, logged as the ``accuracy`` metric.
    aucpr
        Precision-recall AUC value, logged as the ``AUCPR`` metric.
    model_name : str
        Base name: the joblib file is ``<model_name>.joblib`` (written to the
        current working directory) and the model is registered as
        ``<model_name>_Model``.

    Side effects
    ------------
    Writes the joblib file, prints both metrics, calls
    ``mlflow.sklearn.autolog()`` and creates one MLflow run.
    """
    filename = model_name + ".joblib"
    joblib.dump(model, filename)

    print(f"Accuracy: {acc}")
    print(f"AUCPR: {aucpr}")

    # NOTE(review): this is called after the model has already been trained,
    # so it presumably only affects later fits — confirm it is still wanted here.
    mlflow.sklearn.autolog()

    with mlflow.start_run():
        mlflow.log_param("model", filename)
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("AUCPR", aucpr)

        # Log artifact
        mlflow.log_artifact(filename)

        # The model is logged under two artifact paths: once under
        # ``model_name`` and once under "sklearn-model", the latter also
        # registering it. NOTE(review): the first call looks redundant, but is
        # kept because other code may load the model from either path.
        mlflow.sklearn.log_model(model, model_name)

        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path="sklearn-model",
            registered_model_name=model_name + "_Model"
        )

In [6]:
def get_model_version(model_name):
    """
    Retrieve the latest version of a registered model in the "None" stage from MLflow.

    Parameters
    ----------
    model_name : str
        Name the model is registered under. Note that ``log_model_metrics``
        registers models as ``<model_name>_Model``, so that suffixed name is
        what must be passed here.

    Returns
    -------
    The ``version`` attribute of the first entry returned by
    ``MlflowClient.get_latest_versions`` for the "None" stage.

    Raises
    ------
    IndexError
        If the registry returns no version in the "None" stage for ``model_name``.
    """
    client = mlflow.tracking.MlflowClient()

    latest_versions = client.get_latest_versions(model_name, stages=["None"])
    if not latest_versions:
        # Same exception type as indexing the empty list would raise, but with
        # a message that says which model was missing.
        raise IndexError(
            f"No versions in stage 'None' found for registered model {model_name!r}"
        )
    return latest_versions[0].version