# Adam vs EM

In [16]:
import importlib
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from matplotlib.colors import ListedColormap
from matplotlib.patches import Ellipse
from tqdm import tqdm

import utils.gmm
import utils.gmm_adam
import utils.metrics
import utils.priors

# Reload the project modules so edits to them are picked up without restarting
# the kernel; the from-imports must come after the reloads.
importlib.reload(utils.gmm)
importlib.reload(utils.gmm_adam)
importlib.reload(utils.metrics)
importlib.reload(utils.priors)
from utils.metrics import ClusteringMetrics
from utils.gmm import GaussianMixture
from utils.gmm_adam import GaussianMixtureAdam

In [2]:
# Run on the GPU when one is visible to torch, otherwise on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Model hyperparameters shared by the cells below.
init_params = 'kmeans'
n_features = 4
n_components = 4
max_iter = 1001
covariance_type = 'full'
reg_covar = 1e-4
tol = 1e-4

# Number of points drawn for each of the four mixture components.
n_samples_1, n_samples_2, n_samples_3, n_samples_4 = 1000, 800, 400, 600

# Component means in 4-D space.
center_1 = np.array([0, 0, 0, 0])
center_2 = np.array([-4, 4, -4, 4])
center_3 = np.array([4, -4, 4, -4])
center_4 = np.array([4, 4, 4, 4])

# Everything below consumes the global NumPy RNG in a fixed order; keeping the
# order of these draws unchanged is what keeps the dataset reproducible.
np.random.seed(0)
C_1 = np.random.rand(4, 4)  # linear map that correlates the features of component 1
C_2 = np.random.rand(4, 4)  # linear map that correlates the features of component 4

# Components 1 and 4 are linearly transformed normals; 2 and 3 are isotropic.
component_1 = np.random.randn(n_samples_1, 4).dot(C_1) + center_1
component_2 = 0.7 * np.random.randn(n_samples_2, 4) + center_2
component_3 = .5 * np.random.randn(n_samples_3, 4) + center_3
component_4 = np.random.randn(n_samples_4, 4).dot(C_2) + center_4

# Stack the components; labels are contiguous runs 0, 1, 2, 3 in the same order.
X = np.concatenate((component_1, component_2, component_3, component_4))
X_tensor = torch.from_numpy(X).to(torch.float32)
y_tensor = torch.cat([
    torch.full((count,), label, dtype=torch.long)
    for label, count in enumerate((n_samples_1, n_samples_2, n_samples_3, n_samples_4))
])

# Same four means again, as a list, for the data generator used by the experiments.
centers_4d = [center.copy() for center in (center_1, center_2, center_3, center_4)]

random_state = 0

In [3]:
# Take the feature count from the data rather than from the hard-coded setting.
n_features = X_tensor.shape[1]

# Constructor arguments that both estimators receive unchanged.
shared_kwargs = dict(
    n_features=n_features,
    n_components=n_components,
    covariance_type=covariance_type,
    max_iter=max_iter,
    init_params=init_params,
    reg_covar=reg_covar,
)

# Gradient-based GMM (Adam); built first, fitted second, as in the original run.
gmm_adam = GaussianMixtureAdam(learning_rate=1e-1, **shared_kwargs)

# Classical EM-based GMM.
gmm_em = GaussianMixture(**shared_kwargs)

# --- EM: fit, assign clusters, score against the ground-truth labels ---
gmm_em.fit(X_tensor)
labels_em = gmm_em.predict(X_tensor)
results_em = gmm_em.evaluate_clustering(X_tensor, true_labels=y_tensor)
print("EM-based GMM Results:", results_em)

# --- Adam: same three steps on the same data ---
gmm_adam.fit(X_tensor)
labels_adam = gmm_adam.predict(X_tensor)
results_adam = gmm_adam.evaluate_clustering(X_tensor, true_labels=y_tensor)
print("Adam-optimized GMM Results:", results_adam)


EM-based GMM Results: {'rand_score': 0.9963458776473999, 'adjusted_rand_score': 0.990842342376709, 'mutual_info_score': 1.3139123916625977, 'adjusted_mutual_info_score': 0.9851972460746765, 'normalized_mutual_info_score': 0.98520427942276, 'fowlkes_mallows_score': 0.9933634996414185, 'homogeneity_score': 0.9851346015930176, 'completeness_score': 0.9852737784385681, 'v_measure_score': 0.9852041851005188, 'purity_score': 0.9967857003211975, 'classification_report': {0: {'precision': 0.00667779632721202, 'recall': 0.004, 'f1-score': 0.0050031269543464665, 'support': 1000, 'jaccard': 0.0025078369905956114, 'roc_auc': 0.1655922532081604}, 1: {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 800, 'jaccard': 0.0, 'roc_auc': 0.03547938913106918}, 2: {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 400, 'jaccard': 1.0, 'roc_auc': 1.0}, 3: {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 600, 'jaccard': 0.0, 'roc_auc': 0.5632045269012451}}, 'confusion_matrix

In [4]:
# Function to generate synthetic data
def generate_data(n_samples, n_features, centers, random_state):
    """Draw unit-variance Gaussian blobs around ``centers`` as a float32 tensor.

    The global NumPy RNG is re-seeded with ``random_state`` on every call, so
    the output is reproducible and later ``np.random`` draws are affected.

    Args:
        n_samples: Total sample budget; each center receives
            ``n_samples // len(centers)`` rows, so any remainder is dropped.
        n_features: Number of columns in the output.
        centers: Sequence of 1-D centers. A center shorter than ``n_features``
            is zero-padded on the right; a longer one is truncated.
        random_state: Seed handed to ``np.random.seed``.

    Returns:
        torch.Tensor of dtype float32 with the blobs stacked in the order of
        ``centers``.
    """
    np.random.seed(random_state)

    def _blob(center):
        # Fit the center to exactly n_features coordinates (pad, then cut).
        pad_width = max(0, n_features - len(center))
        mean = np.pad(center, (0, pad_width), mode='constant')[:n_features]
        rows = n_samples // len(centers)  # equal share per center
        return np.random.randn(rows, n_features) + mean

    stacked = np.concatenate([_blob(center) for center in centers])
    return torch.tensor(stacked, dtype=torch.float32)


In [5]:
# Function to run a single experiment (comparing EM and Adam-based GMM with different learning rates)
def run_experiment(X_tensor, y_tensor, dim, size, n_components, experiment_name, learning_rates=(1e-1, 1e-2, 1e-3)):
    """Fit one EM GMM and one Adam GMM per learning rate on the same data.

    The shared hyperparameters ``covariance_type``, ``max_iter``,
    ``init_params``, ``reg_covar`` and ``random_state`` are read from the
    notebook's globals at call time.

    Args:
        X_tensor: Data passed to every model's ``fit`` and ``evaluate_clustering``.
        y_tensor: Ground-truth labels, forwarded as ``true_labels``.
        dim: Passed to the models as ``n_features`` and recorded in the results.
        size: Sample count; used only in the log line and the ``data_size`` column.
        n_components: Number of mixture components for every model.
        experiment_name: Label stored in the ``experiment`` column.
        learning_rates: Iterable of Adam learning rates; one model is fitted
            per value. The default is a tuple so it cannot be mutated between
            calls.

    Returns:
        pandas.DataFrame with one row for EM followed by one row per learning
        rate. Each row holds the run metadata, iteration count, final lower
        bound, fit time in seconds and the keys returned by
        ``evaluate_clustering``.
    """
    results = []
    print(f"\n Running {experiment_name} with {dim} dimensions, with {n_components} clusters and {size} samples")

    # Constructor arguments common to both estimators.
    # NOTE(review): `tol` is not forwarded here, unlike in
    # run_experiment_with_init, so the models fall back to their own default.
    # Confirm this difference is intended.
    model_kwargs = dict(
        n_features=dim,
        n_components=n_components,
        covariance_type=covariance_type,
        max_iter=max_iter,
        init_params=init_params,
        reg_covar=reg_covar,
        random_state=random_state,
    )

    # Time the EM-based GMM (construction + fit). perf_counter is monotonic and
    # high-resolution, so short fits are not distorted by wall-clock adjustments.
    start_time = time.perf_counter()
    gmm_em = GaussianMixture(**model_kwargs)
    gmm_em.fit(X_tensor)
    time_taken_em = time.perf_counter() - start_time

    results_em = gmm_em.evaluate_clustering(X_tensor, true_labels=y_tensor)
    print(f"EM converged in {gmm_em.n_iter_} iterations with log-likelihood: {gmm_em.lower_bound_}, Time taken: {time_taken_em:.4f} seconds")

    # Collect EM results; the evaluation metrics are merged in last.
    results.append({
        'experiment': experiment_name,
        'method': 'EM',
        'n_features': dim,
        'n_clusters': n_components,
        'data_size': size,
        'learning_rate': None,
        'iterations': gmm_em.n_iter_,
        'log_likelihood': gmm_em.lower_bound_,
        'time_taken': time_taken_em,
        **results_em
    })

    # Run the Adam-based GMM once per learning rate.
    for lr in learning_rates:
        start_time = time.perf_counter()
        gmm_adam = GaussianMixtureAdam(learning_rate=lr, **model_kwargs)
        gmm_adam.fit(X_tensor)
        time_taken_adam = time.perf_counter() - start_time

        results_adam = gmm_adam.evaluate_clustering(X_tensor, true_labels=y_tensor)
        print(f"Adam (lr={lr}) converged in {gmm_adam.n_iter_} iterations with log-likelihood: {gmm_adam.lower_bound_}, Time taken: {time_taken_adam:.4f} seconds")

        # Collect Adam results for this learning rate.
        results.append({
            'experiment': experiment_name,
            'method': 'Adam',
            'n_features': dim,
            'n_clusters': n_components,
            'data_size': size,
            'learning_rate': lr,
            'iterations': gmm_adam.n_iter_,
            'log_likelihood': gmm_adam.lower_bound_,
            'time_taken': time_taken_adam,
            **results_adam
        })

    return pd.DataFrame(results)


## 1. Experiment 1: Well-separated Clusters, Increasing Dimensions


In [6]:
# Store results in a list of DataFrames
experiment_results = []

# Experiment 1: well-separated clusters with increasing dimensions.
# The cluster count follows centers_4d so data and labels cannot drift apart.
n_samples = 1000
n_clusters = len(centers_4d)
n_per_cluster = n_samples // n_clusters  # same per-center share generate_data uses

for dim in [2, 4, 8, 16, 32, 64, 128]:
    X_tensor = generate_data(n_samples, dim, centers_4d, random_state).to(device)

    # generate_data stacks the clusters in order, so the labels are
    # contiguous runs 0, 1, ..., n_clusters - 1 (int64).
    y_tensor = torch.arange(n_clusters).repeat_interleave(n_per_cluster)

    df = run_experiment(X_tensor, y_tensor, dim, n_samples, n_clusters, "Experiment 1")
    experiment_results.append(df)

# Combine all results into a single DataFrame
final_results = pd.concat(experiment_results)

file_name = 'gmm_adam_vs_em_varying_dimensions.xlsx'
folder = 'results'
# Create the output folder up front: to_excel raises if it is missing, which
# would discard the whole sweep's results.
os.makedirs(folder, exist_ok=True)
final_results.to_excel(f'{folder}/{file_name}', index=False)

print(final_results)



 Running Experiment 1 with 2 dimensions, with 4 clusters and 1000 samples
EM converged in 8 iterations with log-likelihood: -4.162824630737305, Time taken: 0.0547 seconds
Adam (lr=0.1) converged in 54 iterations with log-likelihood: -4.162992477416992, Time taken: 1.1293 seconds
Adam (lr=0.01) converged in 20 iterations with log-likelihood: -4.163053035736084, Time taken: 0.4838 seconds
Adam (lr=0.001) converged in 184 iterations with log-likelihood: -4.163110733032227, Time taken: 2.9411 seconds

 Running Experiment 1 with 4 dimensions, with 4 clusters and 1000 samples
EM converged in 2 iterations with log-likelihood: -6.963935375213623, Time taken: 0.0239 seconds
Adam (lr=0.1) converged in 28 iterations with log-likelihood: -6.96526575088501, Time taken: 0.6280 seconds
Adam (lr=0.01) converged in 14 iterations with log-likelihood: -6.964781761169434, Time taken: 0.3228 seconds
Adam (lr=0.001) converged in 152 iterations with log-likelihood: -6.964130401611328, Time taken: 2.2223 sec

## 2. Experiment 2: Increasing Number of Clusters (Fixed Dimension)


In [7]:
# Store results in a list of DataFrames
experiment_results = []

# Function to generate random cluster centers
def generate_random_centers(n_clusters, n_features, scale=10, random_state=None):
    """Generate random cluster centers.

    Each center is a standard-normal vector of length ``n_features``
    multiplied by ``scale``.

    Args:
        n_clusters: Number of centers to return.
        n_features: Length of each center vector.
        scale: Multiplier applied to every standard-normal draw.
        random_state: Optional seed. When ``None`` (the default) the draws come
            from the global NumPy RNG, so the result depends on whatever state
            earlier cells left behind. When given, a private ``RandomState`` is
            used and the global RNG is left untouched.

    Returns:
        list of ``n_clusters`` 1-D NumPy arrays.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    return [rng.randn(n_features) * scale for _ in range(n_clusters)]

# Experiment 2: increasing number of clusters with fixed dimensions
n_samples = 1000
for n_clusters in [2, 4, 8, 16, 32, 64, 128]:
    centers = generate_random_centers(n_clusters, 4)  # fresh random centers per setting
    X_tensor = generate_data(n_samples, 4, centers, random_state).to(device)
    # One contiguous run of labels per cluster, matching generate_data's layout.
    y_tensor = torch.arange(n_clusters).repeat_interleave(n_samples // n_clusters)
    # generate_data gives each center n_samples // n_clusters rows, so for
    # 32/64/128 clusters fewer than n_samples rows exist; report the real count.
    df = run_experiment(X_tensor, y_tensor, 4, X_tensor.shape[0], n_clusters, "Experiment 2")
    experiment_results.append(df)

# Combine all results into a single DataFrame
final_results = pd.concat(experiment_results)

file_name = 'gmm_adam_vs_em_varying_clusters.xlsx'
folder = 'results'
# Create the output folder up front: to_excel raises if it is missing, which
# would discard the whole sweep's results.
os.makedirs(folder, exist_ok=True)
final_results.to_excel(f'{folder}/{file_name}', index=False)

print(final_results)



 Running Experiment 2 with 4 dimensions, with 2 clusters and 1000 samples
EM converged in 2 iterations with log-likelihood: -6.28389835357666, Time taken: 0.0120 seconds
Adam (lr=0.1) converged in 19 iterations with log-likelihood: -6.286192893981934, Time taken: 0.1008 seconds
Adam (lr=0.01) converged in 36 iterations with log-likelihood: -6.2839460372924805, Time taken: 0.1796 seconds
Adam (lr=0.001) converged in 118 iterations with log-likelihood: -6.284086227416992, Time taken: 0.6731 seconds

 Running Experiment 2 with 4 dimensions, with 4 clusters and 1000 samples
EM converged in 2 iterations with log-likelihood: -6.963937282562256, Time taken: 0.0118 seconds
Adam (lr=0.1) converged in 44 iterations with log-likelihood: -6.964127063751221, Time taken: 0.4809 seconds
Adam (lr=0.01) converged in 14 iterations with log-likelihood: -6.964780807495117, Time taken: 0.1788 seconds
Adam (lr=0.001) converged in 152 iterations with log-likelihood: -6.964132308959961, Time taken: 1.7366 se

## 3. Experiment 3: Fixed Number of Clusters and Dimensions, Increasing Data Sizes

In [8]:
# Store results in a list of DataFrames
experiment_results = []

# Experiment 3: fixed clusters and dimensions, increasing data sizes.
# The cluster count follows centers_4d so data and labels cannot drift apart.
n_clusters = len(centers_4d)
for size in [100, 500, 1000, 5000, 10000]:
    X_tensor = generate_data(size, 4, centers_4d, random_state).to(device)
    # generate_data stacks size // n_clusters rows per center in order, so the
    # labels are contiguous runs 0, 1, ..., n_clusters - 1 (int64).
    y_tensor = torch.arange(n_clusters).repeat_interleave(size // n_clusters)
    df = run_experiment(X_tensor, y_tensor, 4, size, n_clusters, "Experiment 3")
    experiment_results.append(df)

# Combine all results into a single DataFrame
final_results = pd.concat(experiment_results)

file_name = 'gmm_adam_vs_em_varying_datasizes.xlsx'
folder = 'results'
# Create the output folder up front: to_excel raises if it is missing, which
# would discard the whole sweep's results.
os.makedirs(folder, exist_ok=True)
final_results.to_excel(f'{folder}/{file_name}', index=False)

print(final_results)



 Running Experiment 3 with 4 dimensions, with 4 clusters and 100 samples
EM converged in 2 iterations with log-likelihood: -6.779717922210693, Time taken: 0.0167 seconds
Adam (lr=0.1) converged in 46 iterations with log-likelihood: -6.7802653312683105, Time taken: 0.5551 seconds
Adam (lr=0.01) converged in 82 iterations with log-likelihood: -6.779794692993164, Time taken: 0.9292 seconds
Adam (lr=0.001) converged in 526 iterations with log-likelihood: -6.780829906463623, Time taken: 3.3411 seconds

 Running Experiment 3 with 4 dimensions, with 4 clusters and 500 samples
EM converged in 2 iterations with log-likelihood: -6.931983470916748, Time taken: 0.0060 seconds
Adam (lr=0.1) converged in 44 iterations with log-likelihood: -6.932250499725342, Time taken: 0.2743 seconds
Adam (lr=0.01) converged in 51 iterations with log-likelihood: -6.932024002075195, Time taken: 0.3136 seconds
Adam (lr=0.001) converged in 210 iterations with log-likelihood: -6.9322710037231445, Time taken: 1.2294 se

## 4. Experiment 4: Fixed Number of Clusters, Dimensions, and Data Size, Varying Initialization

In [21]:
# Settings for Experiment 4. `max_iter` and `tol` rebind the notebook-wide
# names, so any cell executed after this one sees these values too.
tol = 1e-4
max_iter = 10001
learning_rates = [10, 1.0, 0.1, 0.01]

# Seeds 0 through 9.
random_states = np.arange(0, 10)

# Restrict the comparison to the 'random' and 'points' initialization schemes.
initialization_strategies = ['random', 'points']

# Function to run a single experiment (comparing EM and Adam-based GMM with different learning rates and initializations)
def run_experiment_with_init(X_tensor, y_tensor, dim, size, n_components, experiment_name, init_method, random_state, learning_rates=learning_rates):
    """Compare EM and Adam GMMs for one initialization method and one seed.

    ``covariance_type``, ``max_iter``, ``reg_covar`` and ``tol`` are read from
    the notebook's globals at call time.

    Args:
        X_tensor: Data passed to every model's ``fit`` and ``evaluate_clustering``.
        y_tensor: Ground-truth labels, forwarded as ``true_labels``.
        dim: Passed to the models as ``n_features`` and recorded in the results.
        size: Sample count; used only in the log line and the ``data_size`` column.
        n_components: Number of mixture components for every model.
        experiment_name: Label stored in the ``experiment`` column.
        init_method: Passed to the models as ``init_params``.
        random_state: Seed passed to every model and recorded in the results.
        learning_rates: Iterable of Adam learning rates. The default is the
            module-level ``learning_rates`` object as it existed when this
            function was defined.

    Returns:
        pandas.DataFrame with one row for EM followed by one row per learning
        rate. Each row carries ``init_method`` and ``random_state`` so runs
        from different seeds stay distinguishable after concatenation.
    """
    results = []
    print(f"\n Running {experiment_name} with {dim} dimensions, {n_components} clusters, {size} samples, and initialization: {init_method} with random state: {random_state}")

    # Constructor arguments common to both estimators.
    model_kwargs = dict(
        n_features=dim,
        n_components=n_components,
        covariance_type=covariance_type,
        max_iter=max_iter,
        init_params=init_method,
        reg_covar=reg_covar,
        random_state=random_state,
        tol=tol,
    )

    # Time the EM-based GMM (construction + fit). perf_counter is monotonic and
    # high-resolution, so short fits are not distorted by wall-clock adjustments.
    start_time = time.perf_counter()
    gmm_em = GaussianMixture(**model_kwargs)
    gmm_em.fit(X_tensor)
    time_taken_em = time.perf_counter() - start_time

    results_em = gmm_em.evaluate_clustering(X_tensor, true_labels=y_tensor)
    print(f"EM converged in {gmm_em.n_iter_} iterations with log-likelihood: {gmm_em.lower_bound_}, Time taken: {time_taken_em:.4f} seconds")

    # Collect EM results; the evaluation metrics are merged in last.
    results.append({
        'experiment': experiment_name,
        'method': 'EM',
        'n_features': dim,
        'n_clusters': n_components,
        'data_size': size,
        'learning_rate': None,
        'iterations': gmm_em.n_iter_,
        'log_likelihood': gmm_em.lower_bound_,
        'time_taken': time_taken_em,
        'init_method': init_method,
        'random_state': random_state,
        **results_em
    })

    # Run the Adam-based GMM once per learning rate.
    for lr in learning_rates:
        start_time = time.perf_counter()
        gmm_adam = GaussianMixtureAdam(learning_rate=lr, **model_kwargs)
        gmm_adam.fit(X_tensor)
        time_taken_adam = time.perf_counter() - start_time

        results_adam = gmm_adam.evaluate_clustering(X_tensor, true_labels=y_tensor)
        print(f"Adam (lr={lr}) converged in {gmm_adam.n_iter_} iterations with log-likelihood: {gmm_adam.lower_bound_}, Time taken: {time_taken_adam:.4f} seconds")

        # Collect Adam results for this learning rate.
        results.append({
            'experiment': experiment_name,
            'method': 'Adam',
            'n_features': dim,
            'n_clusters': n_components,
            'data_size': size,
            'learning_rate': lr,
            'iterations': gmm_adam.n_iter_,
            'log_likelihood': gmm_adam.lower_bound_,
            'time_taken': time_taken_adam,
            'init_method': init_method,
            'random_state': random_state,
            **results_adam
        })

    return pd.DataFrame(results)


# Experiment 4: fixed clusters, fixed dimensions, and varying initializations
experiment_results = []
n_samples = 1000
n_clusters = len(centers_4d)
for init_method in initialization_strategies:
    # The loop variable is named `seed` on purpose: calling it `random_state`
    # would overwrite the notebook-level `random_state` that run_experiment
    # reads, silently changing the seed of Experiments 1-3 on a re-run.
    for seed in random_states:
        X_tensor = generate_data(n_samples, 4, centers_4d, seed).to(device)
        # Contiguous label runs 0, 1, ..., n_clusters - 1, matching generate_data's layout.
        y_tensor = torch.arange(n_clusters).repeat_interleave(n_samples // n_clusters)
        df = run_experiment_with_init(X_tensor, y_tensor, 4, n_samples, n_clusters, "Experiment 4", init_method, seed)
        experiment_results.append(df)

# Combine all results into a single DataFrame
final_results = pd.concat(experiment_results)

# Save the results to an Excel file
file_name = 'gmm_experiment_random_vs_points_initialization.xlsx'
folder = 'results'
# Create the output folder up front: to_excel raises if it is missing, which
# would discard the whole sweep's results.
os.makedirs(folder, exist_ok=True)
final_results.to_excel(f'{folder}/{file_name}', index=False)

print(f"Results saved to {file_name}")


 Running Experiment 4 with 4 dimensions, 4 clusters, 1000 samples, and initialization: random with random state: 0
EM converged in 17 iterations with log-likelihood: -7.31906795501709, Time taken: 0.0377 seconds
Adam (lr=10) converged in 386 iterations with log-likelihood: -19.558486938476562, Time taken: 3.1530 seconds
Adam (lr=1.0) converged in 104 iterations with log-likelihood: -8.479379653930664, Time taken: 1.2602 seconds
Adam (lr=0.1) converged in 51 iterations with log-likelihood: -8.124852180480957, Time taken: 0.6178 seconds
Adam (lr=0.01) converged in 1148 iterations with log-likelihood: -7.318942070007324, Time taken: 13.2956 seconds

 Running Experiment 4 with 4 dimensions, 4 clusters, 1000 samples, and initialization: random with random state: 1
EM converged in 13 iterations with log-likelihood: -7.033401012420654, Time taken: 0.0253 seconds
Adam (lr=10) converged in 30 iterations with log-likelihood: -18.858457565307617, Time taken: 0.3543 seconds
Adam (lr=1.0) converge