For a SIFT + Bag of Visual Words + Classifier pipeline, we follow these steps:
1. Generate SIFT descriptors for each image.
2. Cluster the SIFT descriptors to get the visual vocabulary. The cluster centers are the visual words of the vocabulary.
3. Use the clustering model to predict a cluster label for every descriptor of each image.
4. Get a normalized histogram of the cluster labels for each image. This gives us a normalized count of how often each visual word occurs in the image.
5. Use the histogram and the labels to build a classifier.

In [1]:
import os
import dotenv
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.pipeline import Pipeline
import mlflow
import mlflow.sklearn
from functools import wraps
from src.data_utils.dataset import Dataset

In [None]:
# Load settings from a .env file (python-dotenv) so the os.environ lookups
# below can find them.
dotenv.load_dotenv()

# All three variables are required: a missing one raises KeyError here.
# RANDOM_STATE is the seed passed to the sampling and KMeans calls below.
RANDOM_STATE = int(os.environ["RANDOM_STATE"])
# Base directory under which each experiment's artifacts are stored.
MLFLOW_DATA_DIR = os.environ["MLFLOW_DATA_DIR"]
MLFLOW_TRACKING_URI = os.environ["MLFLOW_TRACKING_URI"]

# Point the MLflow client at the configured tracking backend.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# Shell command for launching the MLflow UI against the local store
# (presumably matching MLFLOW_TRACKING_URI -- confirm against the .env file):
# mlflow ui --backend-store-uri "sqlite:///mlflow_data/mlruns.db"

In [13]:
def get_experiment_id(experiment_name: str) -> str:
    """
    Retrieve the experiment ID for the experiment name. Create
    a new experiment if it does not exist.

    Parameters:
        - experiment_name (str): The MLFlow experiment name.

    Returns:
        - str: The ID of the existing experiment, or of the experiment
          that was just created under MLFLOW_DATA_DIR/<experiment_name>.
    """
    # get_experiment_by_name returns None for an unknown experiment; check for
    # that explicitly rather than catching AttributeError, which would also
    # hide unrelated attribute errors raised inside the lookup.
    experiment = mlflow.get_experiment_by_name(experiment_name)
    if experiment is not None:
        return experiment.experiment_id

    artifact_location = os.path.join(MLFLOW_DATA_DIR, experiment_name)
    return mlflow.create_experiment(
        experiment_name,
        artifact_location=artifact_location,
    )


def mlflow_log_clustering(func):
    """
    Decorator for logging model parameters, metrics, and the model artifact to MLflow.

    The wrapped function must accept an ``experiment_name`` argument and
    return a ``(model, metrics)`` tuple, where ``model`` exposes
    ``get_params()`` and ``metrics`` is a dict accepted by
    ``mlflow.log_metrics``.

    Parameters:
        - func (callable): The training function to wrap.

    Returns:
        - callable: A wrapper that runs ``func`` inside a new MLflow run of
          the named experiment and returns the same ``(model, metrics)`` tuple.

    Raises:
        - KeyError: If no ``experiment_name`` can be resolved for the call.
    """
    # Local import keeps this block self-contained; only the positional
    # fallback below needs it.
    import inspect

    @wraps(func)
    def wrapper(*args, **kwargs):
        # Resolve the experiment name whether it was passed by keyword or
        # positionally (or left to a default declared on ``func``).
        if "experiment_name" in kwargs:
            experiment_name = kwargs["experiment_name"]
        else:
            bound = inspect.signature(func).bind_partial(*args, **kwargs)
            bound.apply_defaults()
            experiment_name = bound.arguments["experiment_name"]

        # Set the experiment
        experiment_id = get_experiment_id(experiment_name)
        mlflow.set_experiment(experiment_id=experiment_id)

        with mlflow.start_run():
            model, metrics = func(*args, **kwargs)

            # Hyperparameters of the fitted estimator.
            mlflow.log_params(model.get_params())

            # Extra keyword arguments of the call are logged as run
            # parameters; positional arguments are not logged.
            for key, value in kwargs.items():
                if key != "experiment_name":
                    mlflow.log_param(key, value)

            mlflow.log_metrics(metrics)

            # The model artifact is stored under a path named after the
            # experiment.
            mlflow.sklearn.log_model(
                model,
                artifact_path=experiment_name,
                serialization_format=mlflow.sklearn.SERIALIZATION_FORMAT_PICKLE
            )

        return model, metrics
    return wrapper


@mlflow_log_clustering
def run_clustering_pipeline(X_train, y_train, pipeline: Pipeline, experiment_name: str):
    """
    Fit a clustering estimator and score it on its own training data.

    Parameters:
        - X_train: Feature matrix the estimator is fitted on and scored against.
        - y_train: Labels forwarded unchanged to ``pipeline.fit``.
        - pipeline (Pipeline): Estimator providing ``fit`` and ``predict``.
        - experiment_name (str): The MLFlow experiment name. Not used in the
          body; it is read by the ``mlflow_log_clustering`` decorator.

    Returns:
        - tuple: ``(pipeline, metrics)`` where ``metrics`` maps
          ``"silhouette_score"`` to the silhouette score of the predicted
          cluster labels on ``X_train``.
    """
    pipeline.fit(X_train, y_train)

    # Cluster assignments for the same samples the estimator was fitted on.
    cluster_assignments = pipeline.predict(X_train)

    return pipeline, {
        "silhouette_score": silhouette_score(X_train, cluster_assignments),
    }

# Clustering

In [5]:
# Load the training split through the project's Dataset helper.
# NOTE(review): n=10 presumably limits the load to 10 images (the dataframe
# built below has 10 rows) -- confirm against Dataset.load_descriptors.
train_ds = Dataset("train")
train_descriptors, train_suits, train_nums = train_ds.load_descriptors(n=10)

In [6]:
# Using dataframes to leverage groupbys
train_df = pd.DataFrame(
    {
        "suits": [arr[0, 0].astype("str") for arr in train_suits], 
        "numbers": [arr[0, 0].astype("str") for arr in train_nums]
    }
)

train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   suits    10 non-null     object
 1   numbers  10 non-null     object
dtypes: object(2)
memory usage: 288.0+ bytes


## Modelling suits

In [7]:
# Class balance: number of rows per suit.
train_df["suits"].value_counts()

suits
diamonds    10
Name: count, dtype: int64

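Since clubs have the lowest count, we take 1806 samples from each class. (These figures refer to the full training set. The run shown here loads only 10 images, all of them diamonds, so `num_samples_per_class` is set to 10 below.)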

In [8]:
# Draw the same number of rows from every suit so the classes are balanced.
num_samples_per_class = 10
# Despite its name, train_mask is an array of the sampled rows' index labels,
# not a boolean mask. It is used for positional list indexing afterwards,
# which relies on train_df keeping its default RangeIndex.
train_mask = train_df.groupby(
    "suits"
).sample(
    num_samples_per_class, 
    random_state=RANDOM_STATE
).index.values

In [9]:
# Keep only the entries selected by the per-suit sampling, in sampled order,
# so descriptors and suit labels stay aligned with each other.
train_descriptors_filtered = [train_descriptors[i] for i in train_mask]
train_suits_filtered = [train_suits[i] for i in train_mask]

In [14]:
# Stack the per-image descriptor arrays into one matrix and cluster it with
# k=10; the decorator on run_clustering_pipeline logs the run to the
# "clustering" MLflow experiment.
# A bare KMeans is passed where the signature annotates a Pipeline; the callee
# only uses fit/predict/get_params.
# NOTE(review): the stacked suit labels are forwarded to fit() as y; KMeans is
# expected to ignore y -- confirm, since their row count may differ from X.
model, metrics =  run_clustering_pipeline(
    np.vstack(train_descriptors_filtered), 
    np.vstack(train_suits_filtered), 
    KMeans(10, random_state=RANDOM_STATE), 
    experiment_name="clustering"
)



In [75]:
# Manual variant of the decorated pipeline: fit KMeans and log it to the
# "clustering" experiment by hand.
clustering_exp_id = get_experiment_id("clustering")

# To sweep the vocabulary size, wrap the run below in a loop such as
# `for k in range(100, 501, 50):` instead of using the fixed k.

mlflow.set_experiment(experiment_id=clustering_exp_id)
k = 10
with mlflow.start_run() as mlflow_run:
    kmeans_model = KMeans(
        n_clusters=k, 
        random_state=RANDOM_STATE
    )

    # Fit on the stacked descriptors of at most the first 10 filtered images.
    kmeans_model.fit(np.vstack(train_descriptors_filtered[:10]))

    # Store the model under an artifact path that embeds the run ID.
    mlflow.sklearn.log_model(
        kmeans_model, 
        f"clustering/{mlflow_run.info.run_id}", 
        serialization_format=mlflow.sklearn.SERIALIZATION_FORMAT_PICKLE
    )

    # Record the number of clusters used for this run.
    mlflow.log_params({'k': k})






