version DEC

In [98]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.model_selection import train_test_split


In [99]:
class Autoencoder(nn.Module):
    def __init__(self, input_dim, embedding_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 512), 
            nn.ReLU(),
            nn.Linear(512, 256),       
            nn.ReLU(),
            nn.Linear(256, embedding_dim) 
        )
        
        self.decoder = nn.Sequential(
            nn.Linear(embedding_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Linear(512, input_dim) 
            #sigmoid ?
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded, encoded

In [100]:

embeddings_np = np.load("../embeddings/merged_embeddings.npy")
data = torch.tensor(embeddings_np, dtype=torch.float)
dataset = TensorDataset(data)
dataloader = DataLoader(dataset, batch_size=256, shuffle=True)

train_embeddings, test_embeddings = train_test_split(embeddings_np, test_size=0.2, random_state=42)
train_data = torch.tensor(train_embeddings, dtype=torch.float)
test_data = torch.tensor(test_embeddings, dtype=torch.float)

train_dataset = TensorDataset(train_data)
test_dataset = TensorDataset(test_data)

train_dataloader = DataLoader(train_dataset, batch_size=256, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=256, shuffle=False)

input_dim = embeddings_np.shape[1]
embedding_dim = 256

In [101]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

autoencoder = Autoencoder(input_dim, embedding_dim)

autoencoder.to(device)
criterion_pretrain = nn.MSELoss() #tester d'autres maybe
optimizer_pretrain = optim.AdamW(autoencoder.parameters(), lr=0.001, weight_decay=1e-2)

In [102]:
from tqdm import tqdm

epochs = 1000

for epoch in range(epochs):
    total_loss = 0
    for batch_idx, (data,) in enumerate(loop):
        data = data.to(device)  # <-- la ligne cruciale !

        optimizer_pretrain.zero_grad()
        reconstructed_data, _ = autoencoder(data)
        loss = criterion_pretrain(reconstructed_data, data)
        loss.backward()
        optimizer_pretrain.step()

        total_loss += loss.item() * data.size(0)
        loop.set_postfix(batch_loss=loss.item())

    avg_loss = total_loss / len(train_dataloader.dataset)
    if (epoch) % 50 == 0:
        tqdm.write(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}")


Epoch [1/1000], Loss: 196.0834
Epoch [51/1000], Loss: 11.8243
Epoch [101/1000], Loss: 6.2941
Epoch [151/1000], Loss: 4.8958
Epoch [201/1000], Loss: 4.1334
Epoch [251/1000], Loss: 3.7617
Epoch [301/1000], Loss: 3.1265
Epoch [351/1000], Loss: 2.7008
Epoch [401/1000], Loss: 2.8219
Epoch [451/1000], Loss: 2.5849
Epoch [501/1000], Loss: 2.2049
Epoch [551/1000], Loss: 2.2421
Epoch [601/1000], Loss: 2.1664
Epoch [651/1000], Loss: 2.2481
Epoch [701/1000], Loss: 2.2795
Epoch [751/1000], Loss: 2.3155
Epoch [801/1000], Loss: 2.0140
Epoch [851/1000], Loss: 1.6907
Epoch [901/1000], Loss: 1.8684
Epoch [951/1000], Loss: 2.0264


In [103]:
autoencoder.eval()
with torch.no_grad():
    test_data = test_data.to(device)
    reconstructed_test, encoded_test = autoencoder(test_data)
    test_loss = criterion_pretrain(reconstructed_test, test_data)
print(f"Test Loss (MSE): {test_loss.item():.4f}")

Test Loss (MSE): 2.9350


In [104]:
# Use the encoder to generate embeddings for the entire dataset
embeddings_np = np.load("../embeddings/merged_embeddings.npy")
data = torch.tensor(embeddings_np, dtype=torch.float)

autoencoder.eval()
with torch.no_grad():
    data = data.to(device)
    _, combined_embeddings = autoencoder(data)
    combined_embeddings_np = combined_embeddings.cpu().numpy()
print(f"Combined embeddings shape: {combined_embeddings_np.shape}")
# Save the embeddings to a .npy file
np.save("../embeddings/combined_embeddings.npy", combined_embeddings_np)

Combined embeddings shape: (8739, 256)


In [106]:
import torch.nn.functional as F
from sklearn.cluster import KMeans

class DEC(nn.Module):
    def __init__(self, input_dim, embedding_dim, num_clusters, pretrained_ae=None):
        super(DEC, self).__init__()
        self.encoder = pretrained_ae.encoder if pretrained_ae else Autoencoder(input_dim, embedding_dim).encoder
        
        self.num_clusters = num_clusters
        self.embedding_dim = embedding_dim
        self.cluster_layer = nn.Parameter(torch.Tensor(num_clusters, embedding_dim))
    
        
    def forward(self, x):
        embeddings = self.encoder(x)
        self.cluster_layer.data = self.cluster_layer.data.to(embeddings.device)
        #on utilise la distrib student t distrib 
        diff = embeddings.unsqueeze(1) - self.cluster_layer  # (B, K, D)
        dist_sq = torch.sum(diff ** 2, dim=2)                # (B, K)
        q = 1.0 / (1.0 + dist_sq)
        q = q ** ((1 + 1) / 2.0)
        q = q / torch.sum(q, dim=1, keepdim=True)
        
        return embeddings, q

def target_distribution(q):
    weight = q**2 / torch.sum(q, 0)
    return (weight.T / torch.sum(weight, 1)).T

k = 6 # jsp lol faut tester

dec_model = DEC(input_dim, embedding_dim, k, pretrained_ae=autoencoder).to(device)

autoencoder.eval() 
all_embeddings_pretrain = []
with torch.no_grad():
    for batch_idx, (data,) in enumerate(dataloader):
        data = data[0].to(device)
        _, encoded = autoencoder(data)
        if encoded.dim() == 1:
            encoded = encoded.unsqueeze(0)
        all_embeddings_pretrain.append(encoded.cpu().numpy())
        
        
all_embeddings_pretrain = np.concatenate(all_embeddings_pretrain, axis=0)

kmeans = KMeans(n_clusters=k, n_init=20, random_state=0)
kmeans.fit(all_embeddings_pretrain)
initial_cluster_centroids = torch.tensor(kmeans.cluster_centers_, dtype=torch.float)
initial_cluster_centroids = initial_cluster_centroids.to(device)

dec_model.cluster_layer.data = initial_cluster_centroids


optimizer_dec = optim.Adam(dec_model.parameters(), lr=0.001)
dataloader_dec = DataLoader(dataset, batch_size=256, shuffle=True)

num_epochs_dec = 100 # Plus d'epochs pour l'affinement
update_interval = 140 
previous_cluster_assignments = None

for epoch in range(num_epochs_dec):
    total_loss_dec = 0
    num_batches = 0
    
    if epoch % 1 == 0: 
        dec_model.eval() 
        all_q = []
        with torch.no_grad():
            for batch_idx, (data,) in enumerate(dataloader_dec):
                data = data.to(device)
                _, q_batch = dec_model(data)
                all_q.append(q_batch.cpu())
        all_q = torch.cat(all_q, dim=0)
        p_target = target_distribution(all_q).to(data.device) 
        current_cluster_assignments = torch.argmax(all_q, dim=1).numpy()
        if previous_cluster_assignments is not None:
            n_changes = np.sum(current_cluster_assignments != previous_cluster_assignments)
            print(f"Epoch {epoch}: {n_changes} assignations ont changé.")
            if n_changes < 0.001 * len(dataset): 
                break
        previous_cluster_assignments = current_cluster_assignments
        dec_model.train() 

    for batch_idx, (data,) in enumerate(dataloader_dec):
        data = data.to(device)
        optimizer_dec.zero_grad()
        
        _, q_batch = dec_model(data)

        if len(dataloader_dec.dataset) < 10000: 
            loss_kl = F.kl_div(q_batch.log(), p_target[batch_idx*data.size(0):(batch_idx+1)*data.size(0)], reduction='batchmean')
        else:
            loss_kl = F.kl_div(q_batch.log(), target_distribution(q_batch), reduction='batchmean')


        loss_kl.backward()
        optimizer_dec.step()
        total_loss_dec += loss_kl.item() * data.size(0)
        num_batches += 1
        
    avg_loss_dec = total_loss_dec / len(dataloader_dec.dataset)
    print(f"Epoch DEC [{epoch+1}/{num_epochs_dec}], Perte KL: {avg_loss_dec:.4f}")

dec_model.eval()
final_embeddings_dec = []
final_cluster_assignments = []
with torch.no_grad():
    for batch_idx, (data,) in enumerate(dataloader_dec):
        data = data.to(device)
        embeddings, q_batch = dec_model(data)
        final_embeddings_dec.append(embeddings.cpu().numpy())
        final_cluster_assignments.append(torch.argmax(q_batch, dim=1).cpu().numpy())

final_embeddings_dec = np.concatenate(final_embeddings_dec, axis=0)
final_cluster_assignments = np.concatenate(final_cluster_assignments, axis=0)

print(f"Exemple {final_cluster_assignments[:20]}")



Epoch DEC [1/100], Perte KL: 0.1685
Epoch 1: 3459 assignations ont changé.
Epoch DEC [2/100], Perte KL: 0.0062
Epoch 2: 265 assignations ont changé.
Epoch DEC [3/100], Perte KL: 0.0035
Epoch 3: 99 assignations ont changé.
Epoch DEC [4/100], Perte KL: 0.0019
Epoch 4: 29 assignations ont changé.
Epoch DEC [5/100], Perte KL: 0.0011
Epoch 5: 8 assignations ont changé.
Exemple [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


VERsion GMM : c'est plus simple mais en fait c'est peut etre moins adapté 