### Imports

In [1]:
from IPython.display import display_html, clear_output
from itertools import chain,cycle
import plotly.express as px
from copy import deepcopy
import urllib.request
import transformers
import numpy as np
import json
import time
import os
import torch
import random 
import warnings
import pandas as pd
from tqdm import tqdm

from sklearn.model_selection import GroupShuffleSplit
from datasets import *
from transformers import AutoTokenizer, PreTrainedTokenizerFast, EncoderDecoderModel, Seq2SeqTrainingArguments, Seq2SeqTrainer, AdamW, DataCollatorForSeq2Seq

# warnings.filterwarnings(action='once')    
warnings.filterwarnings(action='ignore')

# IMPORTANTE
# QUESTA NELL'ULTIMA RUN NON ERA SETTATA VA TESTATA
os.environ['TOKENIZERS_PARALLELISM'] = 'true'
# os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Display dataframes
def display(*args,titles=cycle([''])):
    html_str=''
    for df,title in zip(args, chain(titles,cycle(['</br>'])) ):
        html_str+='<th style="text-align:left"><td style="vertical-align:top">'
        html_str+=f'<h4 style="text-align: left;">{title}</h2>'
        html_str+=df.to_html().replace('table','table style="display:inline"')
        html_str+='</td></th>'
    display_html(html_str,raw=True)
    
# Setting seeds for reproducibility
def set_reproducibility(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    transformers.set_seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    os.environ['TF_DETERMINISTIC_OPS'] = '1'
    
def plot_histogram(feature, figsize = (10,6)):
    fig = px.histogram(train_df_to_ds.iloc[:train_samples], x=train_df_to_ds.iloc[:train_samples][feature].apply(len), nbins=20, 
                       title=f'{feature.capitalize()}s Length Histogram', width=figsize[0]*100, height=figsize[1]*100)
    fig.update_layout(showlegend=False, xaxis_title="Length", yaxis_title="Count", 
                      plot_bgcolor="rgba(0,0,0,0)", paper_bgcolor="rgba(0,0,0,0)", 
                      font=dict(color="#333"))
    fig.update_xaxes(gridcolor="#ddd")
    fig.show()

2023-03-09 15:46:37.146090: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-09 15:46:42.910178: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2023-03-09 15:46:42.911418: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")     

print("Using device:", device)

Using device: cuda


In [None]:
!nvidia-smi

### Dataset Download

In [3]:
class DownloadProgressBar(tqdm):
    def update_to(self, b=1, bsize=1, tsize=None):
        if tsize is not None:
            self.total = tsize
        self.update(b * bsize - self.n)
        
def download_url(url, output_path):
    with DownloadProgressBar(unit='B', unit_scale=True,
                             miniters=1, desc=url.split('/')[-1]) as t:
        urllib.request.urlretrieve(url, filename=output_path, reporthook=t.update_to)

def download_data(data_path, url_path, suffix):    
    if not os.path.exists(data_path):
        os.makedirs(data_path)
        
    data_path = os.path.join(data_path, f'{suffix}.json')

    if not os.path.exists(data_path):
        print(f"Downloading CoQA {suffix} data split... (it may take a while)")
        download_url(url=url_path, output_path=data_path)
        urllib.request.urlretrieve(url_path, filename=data_path)
        print("Download completed!")

In [4]:
# Train data
train_url = "https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json"
download_data(data_path='coqa', url_path=train_url, suffix='train')

# Test data
test_url = "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json"
download_data(data_path='coqa', url_path=test_url, suffix='test') 

### Preprocessing

In [5]:
# Creating Dataframes and removing unanswerable questions
train_data = json.load((open('coqa/train.json')))
test_data = json.load((open('coqa/test.json')))

qas = pd.json_normalize(train_data['data'], ['questions'], ['source', 'id', 'story'])
ans = pd.json_normalize(train_data['data'], ['answers'],['id'])
train_val_df = pd.merge(qas,ans, left_on=['id','turn_id'], right_on=['id','turn_id'])
train_val_df = train_val_df.loc[train_val_df['input_text_y']!='unknown']

qas = pd.json_normalize(test_data['data'], ['questions'], ['source', 'id', 'story'])
ans = pd.json_normalize(test_data['data'], ['answers'],['id'])
test_df = pd.merge(qas,ans, left_on=['id','turn_id'], right_on=['id','turn_id'])
test_df = test_df.loc[test_df['input_text_y']!='unknown']

In [6]:
# Removing bad turns
train_val_df = train_val_df.loc[(train_val_df['bad_turn_x'] != 'True') & (train_val_df['bad_turn_y'] != 'True')]

# Removing equal text/answer entries
train_val_df = train_val_df[train_val_df.story != train_val_df.input_text_y]
test_df = test_df[test_df.story != test_df.input_text_y]

# Removing enties with empty answers
train_val_df = train_val_df[train_val_df['input_text_y'].str.len()>0]
test_df = test_df[test_df['input_text_y'].str.len()>0]

In [7]:
# Text preprocess
def preprocess(ds,columns):
    ds = ds.replace(r'\n',' ', regex=True)
#     ds = ds.replace(r'[^\w\s]+', ' ', regex=True)
#     for feature in columns:
#         ds[feature] = ds[feature].str.lower().str.strip()
        
    return ds

columns = ['story', 'input_text_x', 'span_text', 'input_text_y']

train_val_df = preprocess(train_val_df,columns)
test_df = preprocess(test_df,columns)

In [8]:
# Train/Validation Split
set_reproducibility(42)

train_inds, val_inds = next(GroupShuffleSplit(test_size=.20, n_splits=2, random_state = 42).split(train_val_df, groups=train_val_df['id']))

train_df = train_val_df.iloc[train_inds]
val_df = train_val_df.iloc[val_inds].reset_index()

print(train_df.columns)

Index(['input_text_x', 'turn_id', 'bad_turn_x', 'source', 'id', 'story',
       'span_start', 'span_end', 'span_text', 'input_text_y', 'bad_turn_y'],
      dtype='object')


In [9]:
# Checking the Dataframes
print(f'Training set [{train_df.shape}]')
print(f'\tFeatures: {list(train_df.columns)}')
display(train_df.loc[11:15,['id', 'input_text_x', 'input_text_y', 'span_text']])

print(f'Validation set [{val_df.shape}]')
print(f'\tFeatures: {list(val_df.columns)}')
display(val_df.loc[11:15,['id', 'input_text_x', 'input_text_y', 'span_text']])

print(f'Test set [{test_df.shape}]')
print(f'\tFeatures: {list(test_df.columns)}')
display(test_df.loc[11:15,['id', 'input_text_x', 'input_text_y', 'span_text']])

Training set [(85823, 11)]
	Features: ['input_text_x', 'turn_id', 'bad_turn_x', 'source', 'id', 'story', 'span_start', 'span_end', 'span_text', 'input_text_y', 'bad_turn_y']


Unnamed: 0,id,input_text_x,input_text_y,span_text
11,3zotghdk5ibi9cex97fepx7jetpso7,how many items are in this secret collection?,150000,"Vatican Secret Archives were separated from the library at the beginning of the 17th century; they contain another 150,000 items."
12,3zotghdk5ibi9cex97fepx7jetpso7,Can anyone use this library?,anyone who can document their qualifications and research needs.,The Vatican Library is open to anyone who can document their qualifications and research needs.
14,3zotghdk5ibi9cex97fepx7jetpso7,what must be requested in person or by mail?,Photocopies,Photocopies for private study of pages from books published between 1801 and 1990 can be requested in person or by mail.
15,3zotghdk5ibi9cex97fepx7jetpso7,of what books?,only books published between 1801 and 1990,hotocopies for private study of pages from books published between 1801 and 1990


Validation set [(21452, 12)]
	Features: ['index', 'input_text_x', 'turn_id', 'bad_turn_x', 'source', 'id', 'story', 'span_start', 'span_end', 'span_text', 'input_text_y', 'bad_turn_y']


Unnamed: 0,id,input_text_x,input_text_y,span_text
11,3bdcf01ogxu7zdn9vlrbf2rqzwplyf,Where was Milly led to?,Cottonwoods,led Milly Erne to Cottonwoods
12,3bdcf01ogxu7zdn9vlrbf2rqzwplyf,Who took her there?,A man,the man who had led Milly Erne to Cottonwoods
13,3bdcf01ogxu7zdn9vlrbf2rqzwplyf,Whose name would Jane not speak?,this Mormon's name,this Mormon's name
14,3bdcf01ogxu7zdn9vlrbf2rqzwplyf,Did she allow herself to even think it?,No,she did not even think it.
15,3bdcf01ogxu7zdn9vlrbf2rqzwplyf,What was Jane hoping Lassiter would become to her?,"a helper, of a friend, of a champion","the need of a helper, of a friend, of a champio"


Test set [(7917, 9)]
	Features: ['input_text_x', 'turn_id', 'source', 'id', 'story', 'span_start', 'span_end', 'span_text', 'input_text_y']


Unnamed: 0,id,input_text_x,input_text_y,span_text
11,3dr23u6we5exclen4th8uq9rb42tel,Did they want Cotton to change the color of her fur?,no,We would never want you to be any other way
12,3azhrg4cu4ktme1zh7c2ro3pn2430d,what was the name of the fish,Asta.,Asta.
13,3azhrg4cu4ktme1zh7c2ro3pn2430d,What looked like a birds belly,a bottle,a bottle
14,3azhrg4cu4ktme1zh7c2ro3pn2430d,who said that,Asta.,"""It looks like a bird's belly,"" said Asta."
15,3azhrg4cu4ktme1zh7c2ro3pn2430d,Was Sharkie a friend?,Yes,Asta's friend Sharkie


In [10]:
# Overlap Check
set_train = set(train_df['id'])
set_val = set(val_df['id'])

overlap = False
for i in set_train:
    if i in set_val:
        overlap = True
        break

print('Overlap' if overlap else 'No overlap')

No overlap


In [11]:
# Dataframes to Datasets
train_df_to_ds = train_df[columns]
val_df_to_ds = val_df[columns]
test_df_to_ds = test_df[columns]

train_df_to_ds = train_df_to_ds.rename(columns={'input_text_x': 'question', 'story': 'context',\
                                               'input_text_y': 'answer', 'span_text': 'text'})
val_df_to_ds = val_df_to_ds.rename(columns={'input_text_x': 'question', 'story': 'context',\
                                               'input_text_y': 'answer', 'span_text': 'text'})
test_df_to_ds = test_df_to_ds.rename(columns={'input_text_x': 'question', 'story': 'context',\
                                               'input_text_y': 'answer', 'span_text': 'text'})

In [12]:
# Datasets Batch split
batch_size = 32
ratio = 80

train_samples = (round(train_df_to_ds.shape[0] * ratio / 100) // batch_size) * batch_size
val_samples = (round(val_df_to_ds.shape[0] * ratio / 100) // batch_size) * batch_size
test_samples = (round(test_df_to_ds.shape[0] * ratio / 100) // batch_size) * batch_size

train_dataset = Dataset.from_dict(train_df_to_ds.iloc[:train_samples])
val_dataset = Dataset.from_dict(val_df_to_ds.iloc[:val_samples])
test_dataset = Dataset.from_dict(test_df_to_ds.iloc[:test_samples])

dataset_COQA = DatasetDict({'train':train_dataset,'validation':val_dataset,'test':test_dataset})
print(dataset_COQA)

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'text', 'answer'],
        num_rows: 68640
    })
    validation: Dataset({
        features: ['context', 'question', 'text', 'answer'],
        num_rows: 17152
    })
    test: Dataset({
        features: ['context', 'question', 'text', 'answer'],
        num_rows: 6304
    })
})


In [None]:
plot_histogram('context')

In [None]:
plot_histogram('question')

In [None]:
plot_histogram('answer')

In [13]:
max_length_input = 512
max_length_answer = 50

In [14]:
def prepare_features(batch, tokenizer, max_length_input, max_length_answer):
    # Tokenize the Question and Context columns
    encoded_batch_inputs = tokenizer(
        batch['question'],
        batch['context'],
        max_length=max_length_input,
        truncation='only_second',
        padding='max_length',
        return_tensors='pt'        
    )

    # Tokenize the Answer column
    encoded_batch_labels = tokenizer(
        batch['answer'],
        max_length=max_length_answer,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    
    encoded_batch_inputs['labels'] = encoded_batch_labels.input_ids
#   encoded_batch_inputs['decoder_input_ids'] = deepcopy(encoded_batch_inputs['labels'])
#   encoded_batch_inputs['labels'] = [[-100 if token == tokenizer.pad_token_id else token\
#                                    for token in labels]\
#                                    for labels in encoded_batch_inputs['labels']]
    
    encoded_batch_inputs['labels_mask'] = encoded_batch_labels.attention_mask


    return encoded_batch_inputs

### Tokenization

In [15]:
model_checkpoint_M1 = 'distilroberta-base'

# Tokenizer
tokenizer_M1 = AutoTokenizer.from_pretrained(model_checkpoint_M1)

ttokenizer_M1.bos_token = tokenizer_M1.cls_token
tokenizer_M1.eos_token = tokenizer_M1.sep_token

# Get the special tokens and their corresponding IDs
special_tokens = tokenizer_M1.special_tokens_map
special_ids = tokenizer_M1.convert_tokens_to_ids(list(special_tokens.values()))
print("Special tokens:")
for token_type, token_list in special_tokens.items():
    print(f"{token_type}: {token_list}")
# Print the special tokens and their corresponding IDs
for token, id in zip(special_tokens.keys(), special_ids):
    print(f"{token}: {id}")

Special tokens:
bos_token: <s>
eos_token: </s>
unk_token: <unk>
sep_token: </s>
pad_token: <pad>
cls_token: <s>
mask_token: <mask>
bos_token: 0
eos_token: 2
unk_token: 3
sep_token: 2
pad_token: 1
cls_token: 0
mask_token: 50264


In [16]:
# Tokenizing the Dataset
tokenized_datasets_M1 = DatasetDict()

# Use the `prepare_features` functions
tokenized_datasets_M1 = dataset_COQA.map(
    lambda batch: prepare_features(batch, tokenizer_M1, max_length_input, max_length_answer),
    batched=True,
    batch_size=batch_size,
    remove_columns=dataset_COQA['train'].column_names
)

print(tokenized_datasets_M1)

Map:   0%|          | 0/68640 [00:00<?, ? examples/s]

Map:   0%|          | 0/17152 [00:00<?, ? examples/s]

Map:   0%|          | 0/6304 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'labels_mask'],
        num_rows: 68640
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'labels_mask'],
        num_rows: 17152
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'labels_mask'],
        num_rows: 6304
    })
})


### Model definition

In [18]:
# Load Model
model_M1 = EncoderDecoderModel.from_encoder_decoder_pretrained(model_checkpoint_M1, model_checkpoint_M1, tie_encoder_decoder=False)

# Model special tokens
model_M1.config.decoder_start_token_id = tokenizer_M1.cls_token_id
model_M1.config_eos_token_id = tokenizer_M1.sep_token_id
model_M1.config.pad_token_id = tokenizer_M1.pad_token_id
model_M1.config.vocab_size = model_M1.config.encoder.vocab_size

# Model hyperparams
model_M1.config.max_length = max_length_answer
model_M1.config.min_length = 1
model_M1.config.no_repeat_ngram_size = 1
model_M1.config.early_stopping = True
model_M1.config.repetition_penalty= 3.
model_M1.config.num_beams = 8

print(f"Parameters #: {model_M1.num_parameters()}")

model_M1.to(device)

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForCausalLM were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['roberta.encoder.layer.0.crossattention.self.value.weight', 'roberta.encoder.layer.5.crossattention.output.dense.bias', 'roberta.encoder.layer.0.crossattention

Parameters #: 178472025


EncoderDecoderModel(
  (encoder): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNo

### Training

In [19]:
epochs = 3

training_args_M1 = Seq2SeqTrainingArguments(
    output_dir='./M1_Checkpoints',
    evaluation_strategy="epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    overwrite_output_dir=True,
    #save_total_limit=2,
    fp16=True, 
    num_train_epochs = epochs,
    weight_decay=0.01,
    logging_steps=10
    #resume_from_checkpoint = True
)

# Optimizer and scheduler
optimizer_M1 = AdamW(model_M1.parameters(),lr= 5e-5)
train_steps  = epochs*len(tokenized_datasets_M1['train'])/batch_size
scheduler_M1 = transformers.get_cosine_schedule_with_warmup(optimizer=optimizer_M1,num_warmup_steps=50,num_training_steps=train_steps)
optimizers_M1 = optimizer_M1, scheduler_M1

trainer_M1 = Seq2SeqTrainer(
    model=model_M1,
    tokenizer=tokenizer_M1,
    args=training_args_M1,
    #compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets_M1['train'],
    eval_dataset=tokenized_datasets_M1['validation'],
    optimizers=optimizers_M1,
    data_collator=DataCollatorForSeq2Seq(tokenizer_M1,model=model_M1)
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Using cuda_amp half precision backend


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [20]:
os.environ["WANDB_DISABLED"] = "true"

trainer_M1.train()

The following columns in the training set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: labels_mask. If labels_mask are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 68640
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 6435
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss
1,0.445,0.410276
2,0.3088,0.309846
3,0.2777,0.29712


Saving model checkpoint to ./M1_Checkpoints/checkpoint-500
Configuration saved in ./M1_Checkpoints/checkpoint-500/config.json
Model weights saved in ./M1_Checkpoints/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./M1_Checkpoints/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./M1_Checkpoints/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./M1_Checkpoints/checkpoint-1000
Configuration saved in ./M1_Checkpoints/checkpoint-1000/config.json
Model weights saved in ./M1_Checkpoints/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./M1_Checkpoints/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./M1_Checkpoints/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./M1_Checkpoints/checkpoint-1500
Configuration saved in ./M1_Checkpoints/checkpoint-1500/config.json
Model weights saved in ./M1_Checkpoints/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ./M1_Checkpoints/checkpo

TrainOutput(global_step=6435, training_loss=0.43006308399250825, metrics={'train_runtime': 8059.3218, 'train_samples_per_second': 25.551, 'train_steps_per_second': 0.798, 'total_flos': 6.35584350855168e+16, 'train_loss': 0.43006308399250825, 'epoch': 3.0})

In [22]:
if not os.path.exists('distilRoberta_42'):
    os.makedirs('distilRoberta_42')
trainer_M1.save_model('distilRoberta_42')

Saving model checkpoint to distilRoberta_42
Configuration saved in distilRoberta_42/config.json
Model weights saved in distilRoberta_42/pytorch_model.bin
tokenizer config file saved in distilRoberta_42/tokenizer_config.json
Special tokens file saved in distilRoberta_42/special_tokens_map.json


In [29]:
# Initialize the data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer_M1, model=model_M1)

# Create a DataLoader for the dataset using the data collator
test_loader = torch.utils.data.DataLoader(tokenized_datasets_M1['test'], 
                                          batch_size=batch_size, 
                                          collate_fn=data_collator)
torch.cuda.empty_cache()

# Generate answers
cont=0
for batch in tqdm(test_loader):
    
    example = batch['input_ids'].to(device)
    att_mask = batch['attention_mask'].to(device)
    generated_ids = model_M1.generate(input_ids=example, 
                                      attention_mask=att_mask,
                                      max_length=max_length_answer
                                     )
    ex = tokenizer_M1.batch_decode(example, skip_special_tokens=True)

    # print(dataset_COQA['test']['question'][i:i+batch_size])
    
    generated_answers = tokenizer_M1.batch_decode(generated_ids, skip_special_tokens=True)
    ground_truth = tokenizer_M1.batch_decode(batch["labels"], skip_special_tokens=True)
    
    # for idx in range(batch_size):
    for idx in range(batch_size/8):
        print(f'Question: [{dataset_COQA["test"]["question"][cont+idx]}]')
        print(f'\tGT: {ground_truth[idx]}\t-\tGENERATED: {generated_answers[idx]}')
    cont+=batch_size

  0%|          | 0/197 [00:00<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 384.00 MiB (GPU 0; 15.90 GiB total capacity; 13.42 GiB already allocated; 237.81 MiB free; 14.90 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF