# Spanish QA asymmetric

Fine-tuning a model to produce sentence embeddings for asymmetric semantic search (short queries matched against longer passages) in Spanish.

### References:

* https://www.sbert.net/docs/training/overview.html
* https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/ms_marco/train_bi-encoder_mnrl.py
* https://www.pinecone.io/learn/nlp/
* https://www.pinecone.io/learn/fine-tune-sentence-transformers-mnr/
* https://huggingface.co/datasets/unicamp-dl/mmarco/viewer/spanish/train
* https://huggingface.co/datasets/unicamp-dl/mrobust/viewer/queries-spanish

# Config

In [None]:
# Single source of truth for every tunable of this run. The trailing comments
# list alternative values for the corresponding setting.
CONFIG = dict(
    # --- Base model ---
    model_name='PlanTL-GOB-ES/roberta-base-bne',  # TODO: check large?
    max_seq_length=512,

    # --- Training ---
    epochs=10,  # 4, 10, 30
    warmup_steps=1,  # 100, 1000
    batch_size=16,  # 32
    optimizer_params=dict(lr=2e-5),
    loss='tl',  # one of {'mnrl', 'mse', 'tl'}; selects the loss class further down

    # --- Dataset ---
    dataset_train_size=512,  # 481_335 # 500_000 # bottleneck: GPU memory limits
    # 'multiple_negatives': False,
    dataset_name="dariolopez/ms-marco-es-500k",  # {"dariolopez/ms-marco-es-500k", "IIC/ms_marco_es"}

    # --- General ---
    seed=42,
)

In [None]:
# Echo the configuration so the settings of this run are recorded in the notebook output.
print(CONFIG)

{'model_name': 'PlanTL-GOB-ES/roberta-base-bne', 'max_seq_length': 512, 'epochs': 10, 'warmup_steps': 1, 'batch_size': 16, 'optimizer_params': {'lr': 2e-05}, 'loss': 'tl', 'dataset_train_size': 512, 'dataset_name': 'dariolopez/ms-marco-es-500k', 'seed': 42}


# Install libraries

In [None]:
!pip install sentence-transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Import libraries

In [None]:
import os
from datetime import datetime

from sentence_transformers import InputExample, SentenceTransformer, models, losses
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# Check GPU

In [None]:
# Shell out to nvidia-smi to record which GPU (if any) this runtime has attached.
!nvidia-smi

Wed May  3 07:18:25 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   73C    P8    11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Show which device was selected above.
print(device)

cuda


# Seed

In [None]:
import numpy as np


def set_seed(seed):
    '''Seed every random number generator the notebook relies on, for REPRODUCIBILITY.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and every visible CUDA device), switches cuDNN to deterministic mode
    and exports ``PYTHONHASHSEED``.

    Args:
        seed: integer seed applied to all generators.
    '''
    # Local import: `random` is not part of the notebook-level imports.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all CUDA devices, not only the current one; this is silently
    # ignored when CUDA is not available.
    torch.cuda.manual_seed_all(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed. The interpreter reads this variable
    # at start-up, so it only takes effect in Python processes launched after
    # this call, not in the already-running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    
# Apply the configured seed before the model and the data loader are created.
set_seed(CONFIG['seed'])

# Model

In [None]:
# Encoder: the pretrained transformer named in CONFIG, loaded with its own
# tokenizer and the configured maximum sequence length.
word_embedding_model = models.Transformer(
    CONFIG['model_name'],
    max_seq_length=CONFIG['max_seq_length'],
    tokenizer_name_or_path=CONFIG['model_name'],
)

# Pooling layer sized to the encoder's token-embedding dimension; it turns the
# token embeddings into a single sentence embedding (library-default pooling mode).
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

# Chain encoder and pooling into one SentenceTransformer on the selected device.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model], device=device)

Some weights of the model checkpoint at PlanTL-GOB-ES/roberta-base-bne were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Record the encoder's embedding dimension in CONFIG so it is included when the
# configuration is written to train_config.json.
CONFIG['length_embedding'] = word_embedding_model.get_word_embedding_dimension()

In [None]:
# Show the configuration again, now including 'length_embedding'.
print(CONFIG)

{'model_name': 'PlanTL-GOB-ES/roberta-base-bne', 'max_seq_length': 512, 'epochs': 10, 'warmup_steps': 1, 'batch_size': 16, 'optimizer_params': {'lr': 2e-05}, 'loss': 'tl', 'dataset_train_size': 512, 'dataset_name': 'dariolopez/ms-marco-es-500k', 'seed': 42, 'length_embedding': 768}


In [None]:
# Display the module stack (transformer + pooling) of the assembled model.
print(model)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: RobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)


# Load Dataset

In [None]:
%%time
import datasets

# Load the dataset named in CONFIG. No split is requested here; the splits are
# picked out of the result afterwards.
marco_es = datasets.load_dataset(CONFIG['dataset_name'])



  0%|          | 0/1 [00:00<?, ?it/s]

CPU times: user 922 ms, sys: 172 ms, total: 1.09 s
Wall time: 4.78 s


In [None]:
from datasets import concatenate_datasets

# Reduce the loaded splits to one flat Dataset.
# The isinstance guard keeps this cell idempotent: after the first run marco_es
# is already a flat Dataset, and indexing it by split name again would fail.
if isinstance(marco_es, datasets.DatasetDict):
    if CONFIG['dataset_name'] == 'dariolopez/ms-marco-es-500k':
        # Only the 'train' split is used for this dataset.
        marco_es = marco_es['train']
    else:  # "IIC/ms_marco_es"
        # Merge all three splits into a single training pool.
        marco_es = concatenate_datasets([marco_es['train'], marco_es['validation'], marco_es['test']])

In [None]:
# Show the columns and row count of the flattened dataset.
print(marco_es)

Dataset({
    features: ['query', 'positive', 'negative'],
    num_rows: 500000
})


# Prepare for training

In [None]:
# Both dataset layouts train on the first CONFIG['dataset_train_size'] rows.
train_rows = marco_es.select(range(CONFIG['dataset_train_size']))
uses_triplets = CONFIG['dataset_name'] == 'dariolopez/ms-marco-es-500k'

train_samples = []
for row in train_rows:
    if uses_triplets:
        # query - positive - negative  https://huggingface.co/datasets/dariolopez/ms-marco-es
        example = InputExample(texts=[row['query'], row['positive'], row['negative']])
    else:
        # query - passage - label https://huggingface.co/datasets/IIC/ms_marco_es
        example = InputExample(texts=[row['query'], row['passages']], label=row['labels'])
    train_samples.append(example)

In [None]:
# Sanity check: the count should equal CONFIG['dataset_train_size'].
print(f"length train samples: {len(train_samples)}")

length train samples: 512


In [None]:
# Serve the training examples in shuffled mini-batches of the configured size.
train_dataloader = DataLoader(
    dataset=train_samples,
    batch_size=CONFIG['batch_size'],
    shuffle=True,
)

In [None]:
import json

# Every run gets its own timestamped directory under output/.
now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_save_path = os.path.join('output', now)
os.makedirs(model_save_path, exist_ok=True)

# Store the exact configuration alongside the model artefacts of this run.
config_path = os.path.join(model_save_path, 'train_config.json')
with open(config_path, 'w') as config_file:
    json.dump(CONFIG, config_file, indent=4)

In [None]:
import gc

def free_memory(score, epoch, steps):
    '''Release Python garbage and PyTorch's cached CUDA memory.

    The three parameters exist only so the function matches the
    ``callback(score, epoch, steps)`` signature it is passed under in
    ``model.fit``; none of them is used.

    Args:
        score: unused.
        epoch: unused.
        steps: unused.
    '''
    # Collect first: tensors that are only kept alive by unreachable Python
    # objects must be freed before the allocator can hand their blocks back.
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Map each supported CONFIG['loss'] code to its sentence-transformers loss class.
loss_classes = {
    'mnrl': losses.MultipleNegativesRankingLoss,
    'mse': losses.MarginMSELoss,
    'tl': losses.TripletLoss,
}

# Any value not listed above falls back to CosineSimilarityLoss.
loss_class = loss_classes.get(CONFIG['loss'], losses.CosineSimilarityLoss)
train_loss = loss_class(model=model)

In [None]:
# Confirm which loss module was instantiated for this run.
print(train_loss)

TripletLoss(
  (model): SentenceTransformer(
    (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: RobertaModel 
    (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  )
)


# Train

In [None]:
# Fine-tune the model on the single (dataloader, loss) objective.
# NOTE(review): no `evaluator` is passed. In sentence-transformers,
# `save_best_model` and `callback` are tied to evaluation - confirm they have
# any effect in this call, or add an evaluator.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=CONFIG['epochs'],
    warmup_steps=CONFIG['warmup_steps'],
    # Previously CONFIG['optimizer_params'] was saved to train_config.json but
    # never handed to fit(), so editing the learning rate had no effect.
    optimizer_params=CONFIG['optimizer_params'],
    save_best_model=True,
    show_progress_bar=True,
    use_amp=True,  # If your GPU does not have FP16 cores, set use_amp=False
    callback=free_memory,
    # One checkpoint per epoch: len(train_dataloader) is the number of batches per epoch.
    checkpoint_save_steps=len(train_dataloader),
    checkpoint_path=model_save_path,
)

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

Iteration:   0%|          | 0/32 [00:00<?, ?it/s]

# Save and push to HuggingFace

In [None]:
# Save the latest (final) state of the trained model into this run's output directory.
model.save(model_save_path)

In [None]:
# Show where the model and train_config.json of this run were written.
print(model_save_path)

output/2023-05-03_07-18-35
