In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
import time
import re
import shutil
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data.sampler import SequentialSampler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score, jaccard_score, f1_score, homogeneity_completeness_v_measure, adjusted_rand_score
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, pairwise_distances
import numpy as np
from scipy.optimize import linear_sum_assignment
import torch.nn.functional as F
import h5py
import ast
import json

In [2]:
# Mount Google Drive at /content/drive (the google.colab module is only available on Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Download the STS Benchmark archive and unpack it into the current working directory.
!wget -O Stsbenchmark.tar.gz http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz
shutil.unpack_archive('./Stsbenchmark.tar.gz', extract_dir='./', format='gztar')

--2024-01-26 04:42:55--  http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz
Resolving ixa2.si.ehu.es (ixa2.si.ehu.es)... 158.227.106.100
Connecting to ixa2.si.ehu.es (ixa2.si.ehu.es)|158.227.106.100|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: http://ixa2.si.ehu.eus/stswiki/images/4/48/Stsbenchmark.tar.gz [following]
--2024-01-26 04:42:55--  http://ixa2.si.ehu.eus/stswiki/images/4/48/Stsbenchmark.tar.gz
Resolving ixa2.si.ehu.eus (ixa2.si.ehu.eus)... 158.227.106.100
Connecting to ixa2.si.ehu.eus (ixa2.si.ehu.eus)|158.227.106.100|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 409630 (400K) [application/x-gzip]
Saving to: ‘Stsbenchmark.tar.gz’


2024-01-26 04:42:56 (465 KB/s) - ‘Stsbenchmark.tar.gz’ saved [409630/409630]



In [4]:
def getSTSBenchmarkSents(filename='sts-train.csv', root='stsbenchmark/', encoding='utf-8'):
  """Read sentence pairs and similarity scores from an STS Benchmark split.

  Each line is split on runs of tab characters. When a line has more than
  seven fields the last two are dropped; the sentence pair and the score are
  then taken from the end of the remaining fields.

  Args:
    filename: Name of the split file inside ``root``.
    root: Directory prefix. It is concatenated directly with ``filename``,
      so it must end with a path separator.
    encoding: Text encoding used to open the file.

  Returns:
    A tuple ``(s1, s2, target)`` of parallel lists: first sentences, second
    sentences and similarity scores as floats.

  Raises:
    ValueError: If the score field of a line cannot be parsed as a float.
  """
  s1, s2, target = [], [], []
  # The context manager closes the handle even when a line fails to parse.
  with open(root+filename, 'r', encoding=encoding) as f:
    for line in f:
      # Drop the line terminator so a sentence sitting in the last column
      # does not keep a trailing newline.
      line = line.rstrip('\n')
      if not line.strip():
        continue  # tolerate blank lines instead of failing on the indexing below
      example = re.split(r'\t+', line)
      if len(example) > 7:
        example = example[:-2]
      s2.append(example[-1])
      s1.append(example[-2])
      target.append(float(example[-3]))
  print("{} samples: {}".format(filename, len(target)))
  return s1, s2, target

In [5]:
# Read the sentence pairs and similarity scores of the STS-B test split.
s1_test,s2_test,target_test= getSTSBenchmarkSents(filename='sts-test.csv')

sts-test.csv samples: 1379


In [6]:
# Model identifier and the Google Drive folder used for this notebook's input/output files.
# NOTE(review): BERT_PATH is not referenced by any cell visible here — confirm it is still needed.
BERT_PATH = "bert-base-uncased"
root_drive = '/content/drive/MyDrive/Tesis/STS_Benchmark/transformer_tunned_BERT/uncase_base/'

In [7]:
# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): no visible cell moves the model or the batches to `device` — confirm whether that is intended.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Device: {device}')

Device: cpu


In [8]:
def save_with_hdf5(file_path, data_dict):
    """Write a (possibly nested) dictionary to an HDF5 file.

    Nested dictionaries become groups, NumPy arrays become datasets, and
    every other value is stored as an attribute of the group that contains
    it. Keys are converted with ``str`` before being used as HDF5 names.
    The file is opened in ``'w'`` mode.

    Args:
        file_path: Destination path of the HDF5 file.
        data_dict: Dictionary to serialise.
    """
    def _write(node, mapping):
        # Mirror the structure of ``mapping`` underneath ``node``.
        for raw_key, payload in mapping.items():
            name = str(raw_key)
            if isinstance(payload, dict):
                # Dictionaries recurse into a freshly created sub-group.
                _write(node.create_group(name), payload)
            elif isinstance(payload, np.ndarray):
                node.create_dataset(name, data=payload)
            else:
                # Anything that is neither a dict nor an array goes into the attributes.
                node.attrs[name] = payload

    with h5py.File(file_path, 'w') as hf:
        _write(hf, data_dict)

In [9]:
def is_tuple(string):
    """Tell whether ``string`` is the literal representation of a tuple.

    The text is parsed with ``ast.literal_eval``; anything that fails to
    parse (``SyntaxError`` or ``ValueError``) counts as "not a tuple".

    Returns:
        ``True`` if the parsed literal is a tuple, ``False`` otherwise.
    """
    try:
        parsed = ast.literal_eval(string)
    except (SyntaxError, ValueError):
        return False
    else:
        return isinstance(parsed, tuple)

In [10]:
def load_with_hdf5(file_path):
    """Load an HDF5 file into a nested dictionary.

    Groups become dictionaries, datasets become ``torch`` tensors, and the
    attributes of a group are added as plain entries of the dictionary built
    for that group. Item names that parse as Python tuple literals are
    turned back into tuples; every other item name, and every attribute
    name, is kept as a string.

    Attributes stored directly on the file root are loaded as well, so
    scalar values that ``save_with_hdf5`` writes at the top level survive
    the round trip.

    Args:
        file_path: Path of the HDF5 file to read.

    Returns:
        The nested dictionary described above.
    """
    def load_dict(group):
        result = {}
        for key, item in group.items():
            # Restore tuple keys that were stringified when the file was written.
            key_obj = ast.literal_eval(key) if is_tuple(key) else key
            if isinstance(item, h5py.Group):
                result[key_obj] = load_dict(item)
            else:
                result[key_obj] = torch.tensor(np.array(item))
        # Attributes are read for every group, the root included. They are
        # applied after the items, so an attribute overrides an item with the same name.
        for attr_key, attr_value in group.attrs.items():
            result[attr_key] = attr_value
        return result

    with h5py.File(file_path, 'r') as hf:
        return load_dict(hf)

In [11]:
# Name of the HDF5 file that is copied from root_drive and then loaded with load_with_hdf5.
file_name = 'embeddings_att_sts_b.h5'

In [12]:
# Copy the HDF5 file from the Drive folder to /content/.
!cp {root_drive + file_name} /content/

In [13]:
# Load the file through its bare name, i.e. relative to the working directory
# (presumably /content, where it was just copied — confirm).
# Alternative that reads straight from Drive:
#all_attentions_matrix = load_with_hdf5(root_drive + file_name)
all_attentions_matrix = load_with_hdf5(file_name)

In [14]:
# Number of top-level entries; the next cell indexes them with (sentence, layer, head) keys.
len(all_attentions_matrix)

198576

In [15]:
layers = 12
heads = 12
#num_sentences = list(all_attentions_matrix.keys())[-1][0]+1
# The entries are indexed by (sentence, layer, head), so floor division by
# layers * heads gives the number of sentences.
num_sentences = len(all_attentions_matrix) // (layers * heads)
attentions_concat_heads = {}   # (sentence, layer) -> stacked heads, one row per head
attentions_matrix_list = []    # the same stacked tensors, in iteration order
attentions_list = []           # ((sentence, layer), sequence, label, dimension, tensor)

for i in range(num_sentences):
  for j in range(layers):
    tensor_list = []
    for k in range(heads):
      # as_tensor returns an existing tensor unchanged (no copy, no UserWarning
      # as with torch.tensor(tensor)); torch.stack below copies the data anyway.
      tensor_list.append(torch.as_tensor(all_attentions_matrix[(i,j,k)]['vectors']).flatten())
    stack = torch.stack(tensor_list)
    attentions_concat_heads[(i,j)] = stack
    attentions_matrix_list.append(stack)
    # Take the metadata from the last head explicitly instead of relying on the
    # inner loop variable still holding its final value.
    meta = all_attentions_matrix[(i, j, heads - 1)]
    # Store the tensor transposed, with one column per head, for the DataLoader.
    attentions_list.append(((i,j), meta['sequence'], meta['label'], meta['dimension'], stack.permute(1, 0)))

  tensor_list.append(torch.tensor(all_attentions_matrix[(i,j,k)]['vectors']).flatten())


In [16]:
# One entry per (sentence, layer) pair.
len(attentions_list)

16548

In [17]:
# Inspect one entry: ((sentence, layer), sequence, label, dimension, tensor).
attentions_list[24]

((2, 0),
 "s1: One woman is measuring another woman's ankle. s2: A woman measures another woman's ankle.",
 '5.0',
 '22',
 tensor([[0.0408, 0.6959, 0.4869,  ..., 0.0665, 0.6562, 0.8600],
         [0.0776, 0.0043, 0.0227,  ..., 0.1529, 0.0165, 0.0114],
         [0.0221, 0.0020, 0.0099,  ..., 0.0690, 0.0122, 0.0037],
         ...,
         [0.0360, 0.0218, 0.0171,  ..., 0.0340, 0.0367, 0.0661],
         [0.1160, 0.4015, 0.3665,  ..., 0.0342, 0.1426, 0.0011],
         [0.1761, 0.0575, 0.1498,  ..., 0.0353, 0.5425, 0.1911]]))

In [18]:
# Shape of the stored tensor after the permute: (flattened attention values, heads).
attentions_list[0][4].shape

torch.Size([289, 12])

Dataloader

In [19]:
# Create a sequential sampler
# attentions_list = dataset
# SequentialSampler does not perform any shuffling or random selection of items.
sampler = SequentialSampler(attentions_list)

# Define the batch size
# NOTE: `batch_size` is also referenced inside train_loop.
batch_size = 12 # always 12, because it is the number of attention layers, 12 layers for the same sentence

# Create the DataLoader without a BatchSampler
dataloader = DataLoader(attentions_list, batch_size=batch_size, sampler=sampler)

In [20]:
len(dataloader) # 1379 batches: one batch per sentence, each holding the 12 layers of that sentence

1379

In [21]:
# Pull the first batch and look at its five fields; only the last expression is displayed.
sr = next(iter(dataloader))
sr[0] # ids (num_sent, num_layer)
sr[1] # sequence
sr[2] # similarity label
sr[3] # dimension
sr[4].shape #[batch_size, len_sentence, input_size] with attentions ([12,289,12])

torch.Size([12, 289, 12])

In [22]:
def epoch_time(start_time, end_time):
    """Split the time between two timestamps into whole minutes and seconds.

    Args:
        start_time: Timestamp in seconds taken before the work started.
        end_time: Timestamp in seconds taken after the work finished.

    Returns:
        A tuple ``(minutes, seconds)``; both parts are truncated with ``int``.
    """
    duration = end_time - start_time
    minutes = int(duration / 60)
    # Whatever is left once the whole minutes are removed, truncated to an int.
    seconds = int(duration - (minutes * 60))
    return minutes, seconds

In [23]:
def train_loop(model, iterator, optimizer, criterion, device=device, clip = 1.0):
    """Run one reconstruction-training epoch and return the scaled mean loss.

    For every batch the model output is compared against the batch itself
    (``criterion(output, batch)``), gradients are clipped to ``clip`` and the
    optimizer takes one step.

    Args:
        model: Module whose forward pass returns ``(output, latent)``.
        iterator: Iterable of 5-tuples whose last element is the input tensor;
            must support ``len``.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Loss taking ``(output, target)``.
        device: Accepted for interface compatibility; this function does not
            move the model or the batches.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        ``sum(batch losses) * batch size / number of batches``. The batch
        size is read from ``iterator.batch_size`` when available, so the
        value no longer depends on a notebook-level variable that may be out
        of sync with the loader.
    """
    model.train()

    # The RNGs are re-seeded on every call, so every epoch starts from the same RNG state.
    seed = 42
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)

    loss_sum = 0
    for _, _, _, _, batch in iterator:
        optimizer.zero_grad()
        output, _ = model(batch)
        loss = criterion(output, batch)
        loss.backward()
        # Prevent gradients from exploding.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        loss_sum += loss.item()

    # Prefer the loader's own batch size; fall back to the notebook-level
    # `batch_size` only when the iterator does not expose one.
    scale = getattr(iterator, 'batch_size', None)
    if scale is None:
        scale = batch_size

    return loss_sum * scale / len(iterator)

In [24]:
# Extract the fixed-size latent vector of every element of every batch
def extract_latent_vectors(model, dataloader, device, model_type='RNN'):
    """Encode every batch and collect one latent vector per batch element.

    Args:
        model: Module exposing an ``encoder`` attribute.
        dataloader: Iterable of ``(ids, sequence, label, dimension, input)``
            batches, where ``ids`` holds two tensors (first and second id
            component of every element).
        device: Accepted for interface compatibility; this function does not
            move the model or the batches.
        model_type: ``'RNN'`` when the encoder returns ``(output, (h_n, c_n))``;
            any other value treats the encoder output itself as the latent.

    Returns:
        Dictionary mapping each ``(id0, id1)`` pair to a dict with keys
        ``'vector'`` (NumPy array), ``'sequence'``, ``'label'`` and
        ``'dimension'``. The three metadata entries hold the values of the
        whole batch, not of the single element; ``get_representations_per_layer``
        reads element ``[0]`` of each.
    """
    model.eval()
    vector_representations = {}

    with torch.no_grad():
        for ids, s, label, dim, batch in dataloader:
            if model_type == 'RNN':
                # Use the final hidden state and drop its leading axis.
                _, (latent_representation, _) = model.encoder(batch)
                latent = latent_representation.squeeze(0)
            else:
                latent_representation = model.encoder(batch)
                latent = latent_representation.squeeze(0).squeeze(1)
            tuples = list(zip(ids[0].tolist(), ids[1].tolist()))
            for i in range(latent.size(0)):
                vector_representations[tuples[i]] = { 'vector' : latent[i].numpy(), 'sequence': s, 'label': label, 'dimension':dim}

    return vector_representations

In [25]:
# LSTM-based autoencoder architecture
class AutoencoderLSTM(nn.Module):
    """Sequence autoencoder made of two single-layer, batch-first LSTMs.

    The encoder maps ``input_size`` features to ``hidden_size`` per time
    step; the decoder maps the encoder's per-step outputs back to
    ``input_size`` features.

    Args:
        input_size: Number of features per time step.
        hidden_size: Encoder hidden size, which is also the latent size.
        latent_size: Accepted but not used by this implementation.
    """
    def __init__(self, input_size, hidden_size, latent_size = 128):
        super().__init__()
        self.encoder = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.decoder = nn.LSTM(hidden_size, input_size, batch_first=True)

    def forward(self, x):
        """Encode ``x`` and reconstruct it.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_size)``.

        Returns:
            ``(reconstruction, latent)``. ``reconstruction`` has the shape of
            ``x``; ``latent`` is the encoder's final hidden state with its
            leading axis squeezed away.
        """
        encoded_seq, state = self.encoder(x)
        # state == (h_n, c_n); h_n has a leading axis of size 1 for this single-layer LSTM.
        latent = state[0].squeeze(0)
        # The decoder consumes the full sequence of encoder outputs, not the latent.
        reconstruction, _ = self.decoder(encoded_seq)
        return reconstruction, latent


In [26]:
# Training configuration and loop for the LSTM autoencoder.
NUM_EPOCHS = 4
best_valid_loss = float('inf')
model_name = 'Autoencoder_LSTM'
train_loss_values = []
history = {"train": {"loss": []}}

input_size = 12  # features per time step: the last dimension of the batches ([12, 289, 12]), i.e. one value per attention head
hidden_size = 2  # size of the latent vector produced by the encoder
learning_rate = 0.001 #0.00025 #0.0007

# Seed every RNG before the model is built so that weight initialisation is reproducible.
seed = 42
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)


model = AutoencoderLSTM(input_size, hidden_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(NUM_EPOCHS):

    start_time = time.time()
    epoch_train_loss = train_loop(model,dataloader,optimizer,criterion,device)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # The epoch loss is recorded both in the flat list and in the history dictionary.
    train_loss_values.append(epoch_train_loss)

    history["train"]["loss"].append(epoch_train_loss)

    print('-' * 80)
    #print(f'Epoch: {epoch+1:03}/{NUM_EPOCHS} | Epoch Time: {epoch_mins}m {epoch_secs}s | Train loss: {epoch_train_loss:.4f} | Train acc: {epoch_train_acc:.4f} | Dev loss: {epoch_dev_loss:.4f} | Dev acc: {epoch_dev_acc:.4f}')
    print(f'Epoch: {epoch+1:03}/{NUM_EPOCHS} | Epoch Time: {epoch_mins}m {epoch_secs}s | Train loss: {epoch_train_loss:.4f}')


--------------------------------------------------------------------------------
Epoch: 001/4 | Epoch Time: 1m 10s | Train loss: 0.0875
--------------------------------------------------------------------------------
Epoch: 002/4 | Epoch Time: 1m 9s | Train loss: 0.0503
--------------------------------------------------------------------------------
Epoch: 003/4 | Epoch Time: 1m 13s | Train loss: 0.0462
--------------------------------------------------------------------------------
Epoch: 004/4 | Epoch Time: 1m 13s | Train loss: 0.0452


In [27]:
# Encode every batch with the trained encoder; the result is keyed by (sentence, layer).
vector_representations = extract_latent_vectors(model, dataloader, device)

In [28]:
# Latent vector stored under key (0, 0).
vector_representations[(0,0)]['vector']

array([ 0.29809073, -0.13135618], dtype=float32)

In [42]:
# Write the latent vectors together with their metadata to an HDF5 file.
save_with_hdf5('reduced_vectors_sts_b_att_2D.h5', vector_representations)

In [30]:
# Display the Drive folder that the output files are copied to.
root_drive

'/content/drive/MyDrive/Tesis/STS_Benchmark/transformer_tunned_BERT/uncase_base/'

In [43]:
# Copy the saved HDF5 file to the Drive folder.
!cp 'reduced_vectors_sts_b_att_2D.h5' {root_drive}

In [32]:
len(vector_representations) # one entry per (num_sentence, layer) key

16548

In [39]:
#save with PyTorch
#torch.save(vector_representations, 'reduced_vectors_sts_b_att_2D.pth')

In [41]:
#load with PyTorch
#data_loaded = torch.load('reduced_vectors_sts_b_att_2D.pth')

In [33]:
def convert_to_num(text):
    """Convert ``text`` to a number when possible.

    Args:
        text: Value to convert, typically a string.

    Returns:
        An ``int`` when the value is an integer (``"22"`` or ``"5.0"``), a
        ``float`` for other numeric values, and ``text`` unchanged when it
        cannot be parsed as a number.
    """
    if isinstance(text, str):
        try:
            # Parse integer literals directly: a round trip through float
            # would silently round integers beyond 2**53.
            return int(text)
        except ValueError:
            pass  # not a plain integer literal; fall through to float parsing
    try:
        number = float(text)
    except ValueError:
        # Not numeric at all: hand the original value back.
        return text
    # Whole-valued floats such as "5.0" are returned as int.
    if number.is_integer():
        return int(number)
    return number

In [34]:
def get_representations_per_layer(num_sentences, vector_representations, layers = 12, main_key = 'vector', add_keys = ['sequence','label','dimension']):
  """Regroup per-(sentence, layer) entries into per-layer arrays plus per-sentence metadata.

  Args:
    num_sentences: Number of sentences; ids ``0 .. num_sentences-1`` are read.
    vector_representations: Mapping ``(sentence, layer) -> dict``.
    layers: Number of layers to collect.
    main_key: Entry key holding the vector.
    add_keys: Entry keys whose first element is collected per sentence,
      always read from layer 0.

  Returns:
    A tuple whose first item maps each layer to an array stacking the
    ``main_key`` vectors of all sentences, followed by one
    ``{sentence: value}`` dict per key in ``add_keys``. Tensor values are
    converted with ``.item()``, everything else with ``convert_to_num``.
  """
  def _first_as_scalar(entry):
    # Only the first element of the stored sequence is used.
    head = entry[0]
    if isinstance(head, torch.Tensor):
      return head.item()
    return convert_to_num(head)

  sentence_ids = range(num_sentences)

  by_layer = {}
  for layer in range(layers):
    by_layer[layer] = np.array([vector_representations[(sent, layer)][main_key] for sent in sentence_ids])

  extras = [
    {sent: _first_as_scalar(vector_representations[(sent, 0)][key]) for sent in sentence_ids}
    for key in add_keys
  ]
  return (by_layer, *extras)

In [35]:
# 'label' holds the label sequence of the whole batch; element 0 is a score stored as a string.
vector_representations[(0,0)]['label'][0]

'2.5'

In [36]:
# Regroup the latent vectors by layer and pull out the per-sentence metadata.
vectors_per_layer, sequences, labels, dimensions = get_representations_per_layer(num_sentences, vector_representations)

In [37]:
# Write the per-layer arrays to HDF5; save_with_hdf5 stringifies the integer layer keys.
save_with_hdf5('vectors_per_layer_ATT_LSTM_sts_b_h5.h5', vectors_per_layer)

In [38]:
# Copy the per-layer HDF5 file to the Drive folder.
!cp 'vectors_per_layer_ATT_LSTM_sts_b_h5.h5' {root_drive}