In [4]:
# Third party imports.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, homogeneity_score, completeness_score, v_measure_score

In [5]:
class ProductClustering:
    """
    Class for clustering similar or identical products.

    Text and image feature vectors are concatenated per product and grouped
    with a KMeans model. Methods that depend on state which has not been set
    yet print an error message and return None instead of raising.
    """

    def __init__(self):
        # Populated by fit_kmeans(); None until a model has been fitted.
        self.kmeans_model = None
        self.cluster_labels = None

        # Populated by load_feature_vectors(); None until features are loaded.
        self.text_features = None
        self.image_features = None

    def load_feature_vectors(self, text_features, image_features):
        """
        Load text and image feature vectors.

        Parameters:
        text_features (numpy.ndarray): Text feature vectors.
        image_features (numpy.ndarray): Image feature vectors.
        """
        self.text_features = text_features
        self.image_features = image_features

    def prepare_input_data(self):
        """
        Prepare input data by concatenating text and image features.

        The two arrays are joined along axis 1, so each row of the result is
        the text features of a product followed by its image features.

        Returns:
        numpy.ndarray: Concatenated feature vectors, or None (after printing
        an error) when either feature array has not been loaded.
        """
        if self.text_features is None or self.image_features is None:
            print("Error: Load feature vectors first.")
            return None
        return np.concatenate((self.text_features, self.image_features), axis=1)

    def split_data(self, input_data, test_size=0.2, validation_size=0.25, random_state=42):
        """
        Split data into training, validation, and test sets.

        The test set is carved off first; the validation set is then taken
        from what remains.

        Parameters:
        input_data (numpy.ndarray): Input feature vectors.
        test_size (float): Proportion of the dataset to include in the test split.
        validation_size (float): Proportion of the data remaining after the
            test split (not of the full dataset) to include in the validation
            split. With the defaults, 0.25 of the remaining 80% is 20% of the
            full dataset.
        random_state (int): Random seed for reproducibility (used for both splits).

        Returns:
        tuple: Arrays of train, validation, and test sets.
        """
        X_train, X_test = train_test_split(input_data, test_size=test_size, random_state=random_state)
        X_train, X_val = train_test_split(X_train, test_size=validation_size, random_state=random_state)
        return X_train, X_val, X_test

    def fit_kmeans(self, input_data, num_clusters=5, random_state=42):
        """
        Fit KMeans clustering model to the input data.

        Stores the fitted model and the cluster label of every row of
        input_data on the instance.

        Parameters:
        input_data (numpy.ndarray): Input feature vectors.
        num_clusters (int): Number of clusters to form.
        random_state (int): Random seed for reproducibility.
        """
        self.kmeans_model = KMeans(n_clusters=num_clusters, random_state=random_state)
        self.cluster_labels = self.kmeans_model.fit_predict(input_data)

    def get_cluster_centers(self):
        """
        Get the cluster centers.

        Returns:
        numpy.ndarray: Cluster centers, or None (after printing an error)
        when no model has been fitted.
        """
        if self.kmeans_model is None:
            print("Error: Fit KMeans model first.")
            return None
        return self.kmeans_model.cluster_centers_

    def get_cluster_labels(self):
        """
        Get the cluster labels assigned to each product.

        The labels correspond to the rows of the data passed to fit_kmeans().

        Returns:
        numpy.ndarray: Cluster labels, or None when no model has been fitted.
        """
        return self.cluster_labels

    def predict_cluster(self, feature_vector):
        """
        Predict the cluster for a given feature vector.

        Parameters:
        feature_vector (numpy.ndarray): Feature vector of a product (1-D), or
            a 2-D array holding one feature vector per row.

        Returns:
        int: Predicted cluster label for a single feature vector. For a 2-D
        input, a numpy.ndarray with one label per row is returned instead.
        None (after printing an error) when no model has been fitted.
        """
        if self.kmeans_model is None:
            print("Error: Fit KMeans model first.")
            return None

        features = np.asarray(feature_vector)
        if features.ndim == 1:
            # The model predicts on batches: wrap the single vector in a
            # one-row batch and unwrap the single label.
            return self.kmeans_model.predict(features.reshape(1, -1))[0]
        # Already a batch; adding another dimension would produce a 3-D array
        # that the model cannot consume.
        return self.kmeans_model.predict(features)


In [None]:
# Read the precomputed feature arrays from disk.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python
# objects, which can execute code if the .npy files come from an untrusted
# source. If the arrays are plain numeric arrays, the flag can likely be
# dropped -- confirm how the files were saved before changing it.
dataset_path = "../data/processed"
text_features = np.load(f"{dataset_path}/text_features.npy", allow_pickle=True)
image_features = np.load(f"{dataset_path}/image_features.npy", allow_pickle=True)
print("text_features:", type(text_features), text_features.shape)
print("image_features:", type(image_features), image_features.shape)

# Initialize ProductClustering instance
clustering_model = ProductClustering()

# Hand the feature arrays to the clustering model
clustering_model.load_feature_vectors(text_features, image_features)

# Prepare input data (text and image features concatenated per product)
input_data = clustering_model.prepare_input_data()

# Split data into train, validation, and test sets
X_train, X_val, X_test = clustering_model.split_data(input_data)

# Fit KMeans model on the training split only
clustering_model.fit_kmeans(X_train)

# Get cluster centers
cluster_centers = clustering_model.get_cluster_centers()

# Get cluster labels (one per row of X_train, the data the model was fitted on)
cluster_labels = clustering_model.get_cluster_labels()

# Predict cluster for a feature vector. predict_cluster is documented to take
# the feature vector of a single product, so pass one validation row rather
# than the whole validation matrix.
predicted_cluster = clustering_model.predict_cluster(X_val[0])