In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
import time
import re
import shutil
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data.sampler import SequentialSampler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score, jaccard_score, f1_score, homogeneity_completeness_v_measure, adjusted_rand_score
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, pairwise_distances
import numpy as np
from scipy.optimize import linear_sum_assignment
import torch.nn.functional as F

In [2]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Download the STS Benchmark archive and unpack it into the current directory.
!wget -O Stsbenchmark.tar.gz http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz
shutil.unpack_archive('./Stsbenchmark.tar.gz', extract_dir='./', format='gztar')

--2024-01-29 23:53:12--  http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz
Resolving ixa2.si.ehu.es (ixa2.si.ehu.es)... 158.227.106.100
Connecting to ixa2.si.ehu.es (ixa2.si.ehu.es)|158.227.106.100|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: http://ixa2.si.ehu.eus/stswiki/images/4/48/Stsbenchmark.tar.gz [following]
--2024-01-29 23:53:13--  http://ixa2.si.ehu.eus/stswiki/images/4/48/Stsbenchmark.tar.gz
Resolving ixa2.si.ehu.eus (ixa2.si.ehu.eus)... 158.227.106.100
Connecting to ixa2.si.ehu.eus (ixa2.si.ehu.eus)|158.227.106.100|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 409630 (400K) [application/x-gzip]
Saving to: ‘Stsbenchmark.tar.gz’


2024-01-29 23:53:16 (302 KB/s) - ‘Stsbenchmark.tar.gz’ saved [409630/409630]



In [4]:
def getSTSBenchmarkSents(filename='sts-train.csv', root='stsbenchmark/', encoding='utf-8'):
  """Read one tab-separated STS Benchmark split.

  Each non-blank line is split on runs of tabs. Rows with more than seven
  fields have their last two fields dropped; the sentence pair and the score
  are then taken from the last three remaining fields.

  Args:
    filename: name of the split file inside ``root``.
    root: directory prefix, concatenated directly with ``filename``.
    encoding: text encoding used to open the file.

  Returns:
    A tuple ``(s1, s2, target)`` of three parallel lists: first sentences,
    second sentences and float similarity scores.
  """
  s1, s2, target = [], [], []
  # The context manager closes the file even if a row fails to parse.
  with open(root+filename, 'r', encoding=encoding) as f:
    for line in f:
      # Drop the line terminator so it does not end up inside the last field.
      line = line.rstrip('\r\n')
      if not line.strip():
        continue  # blank lines carry no example
      example = re.split(r'\t+', line)
      if len(example) > 7:
        example = example[:-2]
      s2.append(example[-1])
      s1.append(example[-2])
      target.append(float(example[-3]))
  print("{} samples: {}".format(filename, len(target)))
  return s1, s2, target

In [5]:
# Load the sentence pairs and similarity scores from the STS-B test file.
s1_test,s2_test,target_test= getSTSBenchmarkSents(filename='sts-test.csv')

sts-test.csv samples: 1379


In [6]:
# BERT checkpoint identifier; also used as a prefix in the file names saved and loaded under root_drive.
BERT_PATH = "bert-base-uncased"
# Google Drive folder this notebook reads its inputs from and writes its .pth outputs to.
root_drive = '/content/drive/MyDrive/Tesis/STS_Benchmark/transformer_tunned_BERT/uncase_base/'

In [7]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Device: {device}')

Device: cuda


In [8]:
# Dictionary keyed by (sentence index, layer index); its tensors are mapped onto `device` while loading.
CLS_vectors = torch.load(root_drive + 'bert-base-uncased_CLS_outputs_test_STS-B.pth', map_location=torch.device(device))

In [9]:
# Shape of the vector stored for sentence 0, layer 0.
CLS_vectors[(0,0)]['vectors'].shape

torch.Size([1, 768])

In [10]:
layers = 12
heads = 12
# Derive the sentence count from the largest sentence index among the keys,
# so the result does not depend on which key happened to be inserted last.
num_sentences = max(sentence_idx for sentence_idx, _ in CLS_vectors) + 1
CLS_vectors_list = []

# Flatten the dictionary into a list ordered by sentence and then by layer, so
# that consecutive runs of `layers` items belong to the same sentence pair.
# Each item is (key, sequence, label, dimension, vectors).
for i in range(num_sentences):
  for j in range(layers):
    entry = CLS_vectors[(i,j)]
    CLS_vectors_list.append(((i,j), entry['sequence'], entry['label'], entry['dimension'], entry['vectors']))

In [11]:
# Inspect the first flattened entry: (key, sequence, label, dimension, vectors).
CLS_vectors_list[0]

((0, 0),
 's1: A girl is styling her hair. s2: A girl is brushing her hair.',
 2.5,
 17,
 tensor([[ 2.1241e-03,  8.7672e-02, -2.0873e-01, -3.1814e-01,  2.7714e-01,
           6.1523e-02, -1.8077e-01, -3.2233e-02, -1.0158e-01, -4.0595e-01,
           3.4837e-02, -3.4986e-02,  1.5076e-01, -1.4688e-01, -1.2356e-01,
           4.7144e-01,  4.5217e-01,  2.8083e-01, -1.1098e-01, -5.7833e-01,
          -5.3669e-01, -3.2599e-01,  3.8374e-02, -2.1438e-01, -1.6631e-01,
          -2.5951e-01, -3.9027e-01,  3.3140e-02,  5.1995e-02, -1.2945e-01,
          -1.0947e-01, -1.7551e-01,  2.0924e-01,  1.3261e-01, -3.7521e-01,
           7.5862e-02, -3.9207e-01,  1.6900e-01, -2.4508e-01,  2.2689e-01,
           3.1682e-01, -1.9301e-01, -1.1316e-01,  1.7409e-01,  9.5839e-02,
           1.2638e-01, -1.5249e+00, -1.8080e-02,  5.6530e-01, -2.1335e-01,
           1.8438e-01,  4.1332e-02, -2.7097e-01,  8.9426e-01,  3.6618e-02,
          -1.1993e-01, -2.5171e-01,  9.9663e-02,  2.7470e-01, -2.6993e-01,
          -

In [12]:
# Shape of the vector held by the first flattened entry.
CLS_vectors_list[0][4].shape

torch.Size([1, 768])

Dataloader

In [13]:
# Create a sequential sampler over CLS_vectors_list.
# SequentialSampler does not perform any shuffling or random selection of items.
sampler = SequentialSampler(CLS_vectors_list)

# Batch size
batch_size = 12 # always 12, because it is the number of attention layers, 12 layers for the same sentence

# Create the DataLoader with the sequential sampler (no BatchSampler)
dataloader = DataLoader(CLS_vectors_list, batch_size=batch_size, sampler=sampler)

In [14]:
len(dataloader) # 1379 batches: one per sentence pair, each batch holding that pair's 12 layer entries

1379

In [15]:
# Pull one batch; its fields follow the tuple layout of CLS_vectors_list.
sr = next(iter(dataloader))
sr[0] # ids: (sentence index, layer index)
sr[1] # sentence-pair text
sr[2].shape # similarity labels
sr[3].shape # 'dimension' values
sr[4].shape # vectors: [batch_size, 1, 768]

torch.Size([12, 1, 768])

In [16]:
def epoch_time(start_time, end_time):
    """Break the span between two timestamps into whole minutes and seconds.

    Args:
        start_time: timestamp in seconds taken before the work.
        end_time: timestamp in seconds taken after the work.

    Returns:
        ``(minutes, seconds)`` as ints; ``seconds`` is what is left of the
        span once the whole minutes have been subtracted, truncated.
    """
    span = end_time - start_time
    whole_minutes = int(span / 60)
    leftover = span - (whole_minutes * 60)
    return whole_minutes, int(leftover)

In [28]:
def train_loop(model, iterator, optimizer, criterion, device=device, clip = 1.0):
    """Train an autoencoder for one pass over ``iterator``.

    Args:
        model: module whose forward pass returns ``(reconstruction, latent)``.
        iterator: sized iterable yielding 5-tuples whose last element is the
            input tensor to reconstruct.
        optimizer: optimizer that updates ``model``'s parameters.
        criterion: loss called as ``criterion(reconstruction, batch)``.
        device: device every batch is moved to before the forward pass.
        clip: maximum gradient norm applied before each optimizer step.

    Returns:
        The sum of the per-batch losses, multiplied by the batch size and
        divided by the number of batches.
    """
    model.train()
    loss_sum = 0

    # NOTE(review): the RNGs are re-seeded on every call, so every epoch starts
    # from the same random state; kept as-is to preserve existing results.
    seed = 42
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)

    for _, _, _, _, batch in iterator:
        # Move the batch once and reuse it, so the reconstruction and its
        # target are guaranteed to live on the same device.
        batch = batch.to(device)
        optimizer.zero_grad()
        output, _ = model(batch)
        loss = criterion(output, batch)
        loss.backward()
        # Prevent gradients from exploding
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        # Update params
        optimizer.step()
        loss_sum += loss.item()

    # Prefer the batch size carried by the iterator itself; fall back to the
    # notebook-level `batch_size` when the iterator does not expose one.
    scale = getattr(iterator, 'batch_size', None)
    if scale is None:
        scale = batch_size
    epoch_train_loss = loss_sum * scale / len(iterator)

    return epoch_train_loss

In [34]:
# Extract the fixed latent vector of every element in each batch
def extract_latent_vectors(model, dataloader, device, model_type='RNN'):
    """Encode every batch with ``model.encoder`` and collect the latent vectors.

    Args:
        model: module exposing an ``encoder`` attribute. With
            ``model_type == 'RNN'`` the encoder must return
            ``(output, (hidden, cell))``; otherwise it must return the latent
            tensor directly.
        dataloader: iterable yielding ``(ids, sequence, label, dimension,
            input)`` batches, where ``ids`` is a pair of index tensors.
        device: device each input batch is moved to before encoding.
        model_type: ``'RNN'`` selects the recurrent branch; any other value
            selects the feed-forward branch.

    Returns:
        Dict mapping each ``(ids[0][k], ids[1][k])`` pair to a dict with keys
        ``'vector'`` (numpy array for that element) and ``'sequence'``,
        ``'label'``, ``'dimension'``. The last three hold the values of the
        whole batch, not of the single element.
    """
    model.eval()
    vector_representations = {}

    with torch.no_grad():
        for (ids, sequences, labels, dims, batch) in dataloader:
            # The encoder's weights live on `device`; the batch must follow.
            batch = batch.to(device)
            if model_type == 'RNN':
                _, (latent_representation, _) = model.encoder(batch)
                latent = latent_representation.squeeze(0)
            else:
                latent_representation = model.encoder(batch)
                latent = latent_representation.squeeze(0).squeeze(1)
            keys = list(zip(ids[0].tolist(), ids[1].tolist()))
            for k in range(latent.size(0)):
                vector_representations[keys[k]] = {
                    'vector': latent[k].cpu().numpy(),
                    # Batch-level values are stored unchanged for every element.
                    'sequence': sequences,
                    'label': labels,
                    'dimension': dims,
                }

    return vector_representations

In [29]:
class LinearAutoencoder(nn.Module):
    """Fully connected autoencoder with a mirrored encoder and decoder.

    The layer widths go from ``input_dim`` through every width of the ladder
    512, 256, 128, 32, 8, 2 that lies strictly between ``latent_dim`` and
    ``input_dim``, down to ``latent_dim``. The encoder is a sequence of
    ``Linear -> GELU`` pairs; the decoder is the mirror image, a sequence of
    ``GELU -> Linear`` pairs, so its output comes straight from a linear layer.

    Args:
        input_dim: size of the vectors to reconstruct.
        latent_dim: size of the bottleneck; must satisfy
            ``0 < latent_dim < input_dim``.

    Raises:
        ValueError: if ``latent_dim`` is not strictly between 0 and
            ``input_dim``, which would leave the model without any layer.
    """

    def __init__(self, input_dim, latent_dim):
        super(LinearAutoencoder, self).__init__()
        if not 0 < latent_dim < input_dim:
            raise ValueError(
                "latent_dim must satisfy 0 < latent_dim < input_dim, got "
                "input_dim={} and latent_dim={}".format(input_dim, latent_dim)
            )

        # Start from the real input size instead of a hard-coded 768.
        ladder = [512, 256, 128, 32, 8, 2]
        layer_dims = [input_dim]
        layer_dims.extend(d for d in ladder if latent_dim < d < input_dim)
        layer_dims.append(latent_dim)

        encoder_layers = []
        mirrored_layers = []

        # Each encoder Linear is created right before its decoder counterpart,
        # so a seeded run draws the initial weights in a fixed order.
        for wide, narrow in zip(layer_dims[:-1], layer_dims[1:]):
            encoder_layers.append(nn.Linear(wide, narrow))
            encoder_layers.append(nn.GELU())
            mirrored_layers.append(nn.Linear(narrow, wide))
            mirrored_layers.append(nn.GELU())

        self.encoder = nn.Sequential(*encoder_layers)
        # Reversing yields GELU, Linear(latent -> ...), ..., GELU, Linear(... -> input).
        self.decoder = nn.Sequential(*reversed(mirrored_layers))

    def forward(self, x):
        """Return ``(reconstruction, latent)`` for the input ``x``."""
        # Encoding
        encoded = self.encoder(x)
        # Decoding
        decoded = self.decoder(encoded)
        return decoded, encoded


In [31]:
# Training configuration
NUM_EPOCHS = 10 #500
best_valid_loss = float('inf')
model_name = 'Autoencoder_Lineal_CLS'
train_loss_values = []
history = {"train": {"loss": []}}

input_size = 768  # dimension of the BERT vectors
hidden_size = 64  # size of the fixed latent vector
learning_rate = 0.00025 #0.00025 #0.0007

# Seed every RNG before the model is created so the initial weights are reproducible.
seed = 42
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)


# Move the model to its device once, before the optimizer is constructed, so
# the optimizer is built over the parameters it will actually update.
model = LinearAutoencoder(input_size, hidden_size).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(NUM_EPOCHS):

    start_time = time.time()
    epoch_train_loss = train_loop(model,dataloader,optimizer,criterion,device)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # The same value is recorded in both containers.
    train_loss_values.append(epoch_train_loss)

    history["train"]["loss"].append(epoch_train_loss)

    print('-' * 80)
    print(f'Epoch: {epoch+1:03}/{NUM_EPOCHS} | Epoch Time: {epoch_mins}m {epoch_secs}s | Train loss: {epoch_train_loss:.4f}')


--------------------------------------------------------------------------------
Epoch: 001/10 | Epoch Time: 0m 6s | Train loss: 1.0422
--------------------------------------------------------------------------------
Epoch: 002/10 | Epoch Time: 0m 4s | Train loss: 0.5931
--------------------------------------------------------------------------------
Epoch: 003/10 | Epoch Time: 0m 4s | Train loss: 0.5575
--------------------------------------------------------------------------------
Epoch: 004/10 | Epoch Time: 0m 5s | Train loss: 0.5390
--------------------------------------------------------------------------------
Epoch: 005/10 | Epoch Time: 0m 4s | Train loss: 0.5206
--------------------------------------------------------------------------------
Epoch: 006/10 | Epoch Time: 0m 4s | Train loss: 0.5046
--------------------------------------------------------------------------------
Epoch: 007/10 | Epoch Time: 0m 5s | Train loss: 0.4968
------------------------------------------------

In [35]:
# Encode every batch with the model's encoder (non-RNN branch) and collect the latent vectors per (sentence, layer) key.
vector_representations = extract_latent_vectors(model, dataloader, device, model_type='Linear')

In [39]:
# Shape of the latent vector for sentence 0, layer 0.
vector_representations[(0,0)]['vector'].shape

(64,)

In [40]:
# Save the dictionary to a file
torch.save(vector_representations, root_drive + BERT_PATH + '_vector_representations_CLS_Linear64.pth')

In [None]:
# Reload the dictionary from the file written by the save cell of this notebook.
# The previous path ('..._CLS_Linear.pth', without the "64" suffix) pointed at a
# file this notebook never writes.
vector_representations = torch.load(root_drive + BERT_PATH + '_vector_representations_CLS_Linear64.pth')

In [None]:
len(vector_representations) # one entry per (sentence index, layer index) key

16548

In [None]:
def get_representations_per_layer(num_sentences, vector_representations, layers = 12):
  """Regroup the per-(sentence, layer) vectors by layer.

  Args:
    num_sentences: number of sentence indices to read, starting at 0.
    vector_representations: dict keyed by ``(sentence, layer)`` whose values
      hold a ``'vector'`` and a ``'label'`` entry.
    layers: number of layer indices to read, starting at 0.

  Returns:
    ``(vectors_per_layer, labels)``: the first maps each layer index to an
    array stacking that layer's vectors in sentence order; the second maps
    each sentence index to the first element of the label stored under
    layer 0, converted with ``.item()``.
  """
  sentence_ids = range(num_sentences)

  vectors_per_layer = {
      layer: np.array([vector_representations[(sent, layer)]['vector'] for sent in sentence_ids])
      for layer in range(layers)
  }

  labels = {}
  for sent in sentence_ids:
    stored_label = vector_representations[(sent, 0)]['label']
    labels[sent] = stored_label[0].item()

  return vectors_per_layer, labels

In [None]:
# Group the latent vectors by layer and collect one label per sentence.
vectors_per_layer, labels = get_representations_per_layer(num_sentences, vector_representations)

In [None]:
# Save the dictionary to a file
# NOTE(review): the file name ends in "_CLS_LSTM" although the model built in this
# notebook is LinearAutoencoder — confirm the name before relying on this file.
torch.save(vectors_per_layer, root_drive + BERT_PATH + str(hidden_size) + '_vectors_per_layer_CLS_LSTM.pth')