### Train a K-Means model on vector embeddings generated with the all-MiniLM-L6-v2 model

In [None]:
import pandas as pd
import joblib
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import numpy as np
import os
from dotenv import load_dotenv
load_dotenv()

In [2]:
# Load the abstracts: read the CSV whose path is given by the FINAL_DATA_PATH
# environment variable and keep only the 'AbstractNarration' column.
abstracts = pd.read_csv(os.getenv("FINAL_DATA_PATH"))['AbstractNarration']

In [3]:
# Convert the pandas Series of abstracts into a plain Python list
data = abstracts.to_list()

In [None]:
# Number of abstracts loaded
len(data)

In [5]:
# Generate embeddings from desired model
def generate_embeddings(
        data: list[str],
        model_name: str
):
    """
    Load a SentenceTransformer model and encode a list of texts with it.

    Args:
        data (list[str]): The abstract texts to embed.
        model_name (str): Name of the SentenceTransformer model to load,
            e.g. 'all-MiniLM-L6-v2'.

    Returns:
        tuple: ``(model, embeddings)`` - the loaded SentenceTransformer
        instance and the result of ``model.encode(data)``, i.e. the
        embeddings corresponding to the input texts.
    """
    encoder = SentenceTransformer(model_name)

    # The encoder is returned alongside the vectors so the caller can keep it.
    return encoder, encoder.encode(data)

In [6]:
# Load the all-MiniLM-L6-v2 model and embed every abstract with it
model, embeddings = generate_embeddings(data=data, model_name='all-MiniLM-L6-v2')

In [None]:
# Number of embeddings produced (expected to equal len(data))
len(embeddings)

In [8]:
# Clustering simple approach defining arbitrary cluster number
def create_kmeans_model(
        embeddings: np.ndarray,
        num_clusters: int = 5
):
    """
    Creates a KMeans clustering model with the specified number of clusters,
    fits it to the given embeddings and returns it with the cluster labels.

    Args:
        embeddings (numpy.ndarray): The embeddings to cluster.
        num_clusters (int): The number of clusters to create.

    Returns:
        tuple: ``(kmeans, labels)`` - the fitted sklearn.cluster.KMeans model
        and a numpy.ndarray holding the cluster index assigned to each
        embedding.
    """
    # random_state is fixed so repeated runs produce the same clustering.
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)

    # fit_predict() fits the model and returns the labels in a single pass;
    # calling fit() first would only run the whole clustering a second time.
    labels = kmeans.fit_predict(embeddings)

    return kmeans, labels

In [None]:
# Cluster the embeddings with the default number of clusters (num_clusters=5)
kmeans, labels = create_kmeans_model(embeddings)

In [10]:
# Save the model and the embeddings
from dotenv import load_dotenv
load_dotenv()
def save_models(kmeans, model):
    """
    Persist the KMeans model and the sentence-embedding model with joblib.

    The files are written as ``kmeans_model.pkl`` and ``sentence_model.pkl``
    inside the directory named by the ``MODELS_PATH`` environment variable.

    Args:
        kmeans: The fitted KMeans model to save.
        model: The sentence-embedding model to save.

    Raises:
        ValueError: If ``MODELS_PATH`` is not set or is empty.
    """
    models_dir = os.getenv("MODELS_PATH")
    if not models_dir:
        # Without this check the files would be written under a literal
        # "None" prefix instead of the intended directory.
        raise ValueError("MODELS_PATH environment variable is not set")

    # os.path.join uses the platform's separator and avoids the invalid
    # '\k' / '\s' escape sequences of a hand-written backslash path.
    joblib.dump(kmeans, os.path.join(models_dir, 'kmeans_model.pkl'))
    joblib.dump(model, os.path.join(models_dir, 'sentence_model.pkl'))

In [11]:
# Write the KMeans model and the sentence model to the MODELS_PATH directory
save_models(kmeans, model)

In [18]:
# Save the abstracts and their cluster labels to a CSV file using environment variable
from dotenv import load_dotenv
load_dotenv()
def save_clustered_abstracts(data, labels):
    """
    Write each abstract together with its cluster label to a CSV file.

    The output path is read from the ``CLUSTERED_ABSTRACTS_PATH`` environment
    variable. The file has two columns, ``AbstractNarration`` and ``label``,
    and no index column.

    Args:
        data: The abstract texts.
        labels: The cluster label of each abstract, same length as ``data``.

    Raises:
        ValueError: If ``CLUSTERED_ABSTRACTS_PATH`` is not set or is empty.
    """
    output_path = os.getenv("CLUSTERED_ABSTRACTS_PATH")
    if not output_path:
        # DataFrame.to_csv(None) returns the CSV text instead of writing a
        # file, so a missing variable would silently save nothing.
        raise ValueError(
            "CLUSTERED_ABSTRACTS_PATH environment variable is not set"
        )

    clustered_abstracts = pd.DataFrame({
        'AbstractNarration': data,
        'label': labels,
    })
    clustered_abstracts.to_csv(output_path, index=False)

In [19]:
# Write the abstracts and their cluster labels to CLUSTERED_ABSTRACTS_PATH
save_clustered_abstracts(data, labels)

In [None]:
# Sanity check: print the number of abstracts and the number of labels,
# which should be equal (one label per abstract).
print(len(data))
print(len(labels))

In [21]:
from sklearn.metrics import (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score
)

def evaluate_kmeans_model(
        data: pd.DataFrame,
        labels: pd.Series,
        model: KMeans
) -> dict:
    """
    Computes several clustering-quality metrics for a fitted K-Means model.

    Args:
    - data: pd.DataFrame - The feature matrix that was clustered (no labels)
    - labels: pd.Series - The cluster label assigned to each row of `data`
    - model: KMeans - The fitted KMeans model; only `inertia_` is read from it

    Returns:
    - dict: Metric values keyed by 'inertia', 'silhouette_score',
      'calinski_harabasz_score' and 'davies_bouldin_score'
    """
    return {
        # Within-cluster sum of squares, taken from the fitted model
        'inertia': model.inertia_,
        # The remaining metrics are computed from the data and its labels
        'silhouette_score': silhouette_score(data, labels),
        'calinski_harabasz_score': calinski_harabasz_score(data, labels),
        'davies_bouldin_score': davies_bouldin_score(data, labels),
    }


In [22]:
# Evaluate the fitted model on the embeddings it was trained on
results = evaluate_kmeans_model(embeddings, labels, kmeans)

In [None]:
# Display the metrics dictionary
results

### Analysis
These results show that the clustering is not good. Let's try to improve it by varying the number of clusters (`num_clusters`).

In [None]:
# Sweep several cluster counts and record the evaluation metrics of each run.
metric_names = [
    "inertia",
    "silhouette_score",
    "calinski_harabasz_score",
    "davies_bouldin_score"
]
metric_rows = []
for num_clusters in (3, 5, 10, 15, 20):
    # kmeans / labels / results are rebound on every iteration, so after the
    # loop they refer to the last cluster count tried.
    kmeans, labels = create_kmeans_model(embeddings, num_clusters=num_clusters)
    results = evaluate_kmeans_model(embeddings, labels, kmeans)
    metric_rows.append({
        "num_clusters": num_clusters,
        **{name: results[name] for name in metric_names}
    })

# Build the frame once from the collected rows: concatenating onto an empty
# frame inside the loop copies the data repeatedly and loses numeric dtypes.
analysisDataFrame = pd.DataFrame(
    metric_rows,
    columns=["num_clusters", *metric_names]
)

In [None]:
# Display the metrics collected for each cluster count
analysisDataFrame

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

def plot_kmeans_clusters(data, n_clusters=5, use_pca=False):
    """
    Fits K-Means and plots the resulting clusters and centroids in 2-D.

    Args:
    - data: array-like, shape (n_samples, n_features)
        The data to cluster. It is converted with np.asarray, so NumPy
        arrays, DataFrames and nested lists are all accepted.
    - n_clusters: int, default=5
        The number of clusters for K-Means.
    - use_pca: bool, default=False
        If True and data has more than 2 features, the data is reduced to 2
        dimensions with PCA and K-Means is fitted on the reduced data.
        If False, K-Means is fitted on all features and only the first two
        features (and the first two centroid coordinates) are plotted.

    Returns:
    - None: Shows the plot of clusters and centroids.
    """
    # The column slicing below needs NumPy-style indexing, which a DataFrame
    # or a plain list does not support.
    data = np.asarray(data)

    # Reduce to 2 dimensions only when PCA is requested and actually needed
    if use_pca and data.shape[1] > 2:
        print(f"Data has {data.shape[1]} dimensions. Reducing to 2 dimensions using PCA...")
        data_2d = PCA(n_components=2).fit_transform(data)
    else:
        data_2d = data

    # Fit the K-Means model (fixed seed for reproducible plots)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(data_2d)
    labels = kmeans.labels_

    # Plot the points, coloured by their cluster label
    plt.figure(figsize=(8, 6))
    sns.scatterplot(
        x=data_2d[:, 0],
        y=data_2d[:, 1],
        hue=labels,
        palette='viridis',
        s=100,
        alpha=0.7
    )

    # Overlay the centroids in the same two coordinates
    centroids = kmeans.cluster_centers_
    plt.scatter(
        centroids[:, 0],
        centroids[:, 1],
        s=300,
        c='red',
        marker='X',
        label='Centroids'
    )

    # Add plot title and labels
    plt.title(f'K-Means Clustering Results (n_clusters={n_clusters})')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.legend()
    plt.show()

In [None]:
# The sentence embeddings are high-dimensional; without PCA the plot would
# only show their first two raw coordinates, so project to 2-D first.
plot_kmeans_clusters(embeddings, use_pca=True)