<a href="https://colab.research.google.com/github/Zamachi/hugging-face-model-finetuning/blob/main/Diplomski_projekat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Uvod

Ovde instaliramo i importujemo neophodne stvari za rad

In [None]:
# WARNING: run this cell only on Google Colab; skip it locally, where these packages should already be installed.
%pip install transformers 
%pip install datasets
%pip install sentencepiece
%pip install accelerate

In [1]:
import torch
import numpy as np
from transformers.utils.dummy_tokenizers_objects import BertTokenizerFast, DistilBertTokenizerFast, AlbertTokenizerFast, BartTokenizerFast, T5TokenizerFast
from accelerate import Accelerator, notebook_launcher
from transformers import pipeline, AutoTokenizer,AdamW, get_scheduler, BertForSequenceClassification, BertConfig, DistilBertForSequenceClassification, DistilBertConfig, AlbertForSequenceClassification, AlbertConfig, PreTrainedModel, BertForQuestionAnswering, DistilBertForQuestionAnswering, AlbertForQuestionAnswering, BartForConditionalGeneration, BartConfig, T5ForConditionalGeneration, T5Config, AutoModel, DataCollatorWithPadding
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers
from torch.utils.data.dataloader import DataLoader
from datasets import load_dataset, interleave_datasets, load_metric, Features, ClassLabel, Value
from tqdm import tqdm
from collections import defaultdict
from datasets.arrow_dataset import Dataset
from datasets.dataset_dict import DatasetDict
from html import unescape
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
from torch.optim.optimizer import Optimizer
import unicodedata
from random import randint
from huggingface_hub import notebook_login, Repository, get_full_repo_name

# Razrada

## Ucitavanje datasetova

### Sentiment analysis dataset

In [None]:
# Stream the tweet_eval "emotion" configuration so every split can be cleaned row by row.
data_sentiment_analysis_stream = load_dataset("tweet_eval", "emotion",streaming=True)

data_sentiment_analysis = {}

# Explicit schema: the splits are rebuilt from plain Python lists, so the
# ClassLabel/string column types have to be stated again.
features = Features({"label" : ClassLabel(num_classes=4,names=["anger","joy","optimism", "sadness"]), "text" : Value("string")})

for dataset_split, dataset in data_sentiment_analysis_stream.items():
  # Both columns always exist, even if a split turns out to be empty.
  data_sentiment_analysis_dict = {"text": [], "label": []}
  for row in tqdm(dataset, desc=dataset_split):
    text, label = row.get("text"), row.get("label")
    # Drop the whole row when either field is missing. Filtering the two keys
    # independently would leave the "text" and "label" lists with different
    # lengths and silently pair texts with the wrong labels.
    if text is None or label is None:
      continue
    # Normalise unicode (NFKC) and decode HTML entities such as "&amp;".
    data_sentiment_analysis_dict["text"].append(unescape(unicodedata.normalize("NFKC", text)))
    data_sentiment_analysis_dict["label"].append(label)
  data_sentiment_analysis[dataset_split] = Dataset.from_dict(data_sentiment_analysis_dict, features=features)

data_sentiment_analysis = DatasetDict(data_sentiment_analysis)

### Question answering dataset

In [2]:
def _has_answer_and_context(example):
    """Return True when question, context and answer annotations are all present and non-empty."""
    required = (example["question"], example["text"], example["start_positions"], example["context"])
    if any(field is None for field in required):
        return False
    return len(example["text"]) > 0 and len(example["start_positions"]) > 0 and len(example["context"]) > 0

# Streaming alternative kept for reference:
# data_question_answering_stream = load_dataset("squad_v2", "squad_v2",streaming=True)
data_question_answering = (
    load_dataset("squad_v2", "squad_v2")
    .flatten()  # turns the nested "answers" column into "answers.text" / "answers.answer_start"
    .rename_column("answers.text", "text")
    .rename_column("answers.answer_start", "start_positions")
    .remove_columns("title")
    .filter(_has_answer_and_context)
)
data_question_answering

Reusing dataset squad_v2 (C:\Users\Zamachi\.cache\huggingface\datasets\squad_v2\squad_v2\2.0.0\09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d)


  0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/131 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'context', 'question', 'text', 'start_positions'],
        num_rows: 86821
    })
    validation: Dataset({
        features: ['id', 'context', 'question', 'text', 'start_positions'],
        num_rows: 5928
    })
})

### Text summarization dataset

In [None]:
# Load the "3.0.0" configuration of cnn_dailymail and display the resulting object.
data_text_summarization = load_dataset("cnn_dailymail", "3.0.0")
data_text_summarization

### Text transduction dataset

In [None]:
# Load the "de-en" configuration of wmt16 and display the resulting object.
data_text_translation = load_dataset("wmt16", "de-en")
data_text_translation

## Tokenizacija - setup

In [3]:
# Emoticons that are added as extra tokens to every tokenizer created in the next cell.
emoticons =["😍","😂","💕","🔥","😊","😎","✨","💙","😘","📷","🇺🇸","☀","💜","😉","💯","😁","🎄","📸","😜"]

In [4]:
# Encoder models: load the pretrained tokenizers for BERT, DistilBERT and ALBERT.
#============================================================================================================================
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
distillbert_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
albert_tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")

# Register the emoticons as additional tokens; background in https://github.com/huggingface/transformers/issues/7648
bert_tokenizer.add_tokens(emoticons) # https://github.com/huggingface/transformers/issues/7648 see why
distillbert_tokenizer.add_tokens(emoticons)
albert_tokenizer.add_tokens(emoticons)
# Decoder models (no tokenizers set up yet)
#============================================================================================================================


# Seq2Seq models (no tokenizers set up yet)
#============================================================================================================================


19

In [5]:
def tokenize_sentence_classification(element):
  """Tokenize a batch of texts and replace the integer labels with one-hot float vectors.

  Relies on the notebook-level globals `tokenizer` and `model`; both must be
  assigned before this function is passed to `Dataset.map`.

  Args:
    element: batch with a "text" list and a "label" list of class ids.

  Returns:
    The tokenizer output with an extra "label" entry: one list of
    `model.config.num_labels` floats per example.
  """
  encoded = tokenizer(element["text"])
  class_ids = torch.tensor(element["label"])
  one_hot = torch.nn.functional.one_hot(class_ids, num_classes=model.config.num_labels)
  encoded["label"] = one_hot.to(torch.float32).tolist()
  return encoded

def tokenize_question_answering(element):
  '''
  Tokenize a batch of extractive-QA training examples and label every feature
  with the token indices of its answer span.

  Relies on the notebook-level global `tokenizer`.

  Background:
    offset_mapping, e.g. [ [(0, 0), (0, 4), (5, 8), ...], [...] ]: one
      (start_char, end_char) pair per token, end exclusive, relative to the
      original string (question or context). (0, 0) marks special tokens.
    sequence_ids(i): tells which tokens of feature i belong to the question (0),
      to the context (1) or are special tokens (None).
    overflow_to_sample_mapping: a long context is split into several features;
      this list maps each feature back to the example it came from, e.g.
      (0,0,0,1,1,2) means the first three features belong to example 0.

  Args:
    element: batch with "question", "context", "text" (list of answer strings
      per example) and "start_positions" (list of answer start characters per
      example). Only the first answer of each example is used.

  Returns:
    The tokenizer output without "offset_mapping" and
    "overflow_to_sample_mapping", plus "start_positions" and "end_positions":
    one token index per feature. Both are 0 when the example has no answer or
    the answer is not fully contained in that feature's part of the context.
  '''
  questions = [question.strip() for question in element['question']]

  tokenized_input = tokenizer(
      questions,
      element["context"],
      max_length=None,
      truncation="only_second",
      return_offsets_mapping=True,
      return_overflowing_tokens=True,
      # Number of tokens shared between a truncated feature and the overflow
      # feature that follows it, so an answer near the cut is seen whole once.
      stride=128,
      )

  offset_mappings = tokenized_input.pop("offset_mapping")
  sample_mapping = tokenized_input.pop("overflow_to_sample_mapping")

  answer_start_positions = []
  answer_end_positions = []
  for index, offset in enumerate(offset_mappings):
    # Which tokens of this feature are question / context / special tokens.
    seq_ids = tokenized_input.sequence_ids(index)
    # One example can produce several features; look up the source example.
    sample_index = sample_mapping[index]

    sample_starts = element["start_positions"][sample_index]
    sample_answers = element["text"][sample_index]
    context_indices = [token_index for token_index, value in enumerate(seq_ids) if value == 1]

    # No annotated answer (or no context tokens at all): nothing to point at.
    if len(sample_starts) == 0 or len(sample_answers) == 0 or not context_indices:
      answer_start_positions.append(0)
      answer_end_positions.append(0)
      continue

    # Character span of the first annotated answer, end exclusive.
    answer_start_index = sample_starts[0]
    answer_end_index = answer_start_index + len(sample_answers[0])

    context_start = context_indices[0]
    context_end = context_indices[-1]

    # The answer must lie completely inside this feature's context window.
    # Testing only for "entirely outside" would let partly truncated answers
    # through, and the searches below would then return the index of a
    # separator token next to the context instead of a real answer token.
    if offset[context_start][0] > answer_start_index or offset[context_end][1] < answer_end_index:
      answer_start_positions.append(0)
      answer_end_positions.append(0)
    else:
      # Walk forward to the last context token that starts at or before the answer start.
      idx = context_start
      while idx <= context_end and offset[idx][0] <= answer_start_index:
          idx += 1
      answer_start_positions.append(idx - 1)

      # Walk backward to the first context token that ends at or after the answer end.
      idx = context_end
      while idx >= context_start and offset[idx][1] >= answer_end_index:
          idx -= 1
      answer_end_positions.append(idx + 1)
  # Reference implementation: https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb#scrollTo=tqg1q9oD3_il
  tokenized_input["start_positions"] = answer_start_positions
  tokenized_input["end_positions"] = answer_end_positions

  return tokenized_input

def tokenize_question_answering_validation(element):
  """Tokenize a batch of QA validation examples, keeping what is needed to map predictions back.

  Relies on the notebook-level global `tokenizer`.

  Args:
    element: batch with "question", "context" and "id" lists.

  Returns:
    The tokenizer output (without "overflow_to_sample_mapping") where
    - "example_id" holds, per feature, the id of the example it was cut from;
    - "offset_mapping" keeps the character offsets of context tokens only;
      every non-context position is replaced by the placeholder [0].
  """
  stripped_questions = [text.strip() for text in element["question"]]
  inputs = tokenizer(
      stripped_questions,
      element["context"],
      max_length=None,
      truncation="only_second",
      stride=128,
      return_overflowing_tokens=True,
      return_offsets_mapping=True,
  )
  # e.g. [0,0,0,1,1,2]: which example each feature belongs to (a long context yields several features)
  sample_map = inputs.pop("overflow_to_sample_mapping")
  feature_count = len(inputs["input_ids"])
  assert len(sample_map) == feature_count

  example_ids = []
  for feature_idx in range(feature_count):
      example_ids.append(element["id"][sample_map[feature_idx]])
      sequence_ids = inputs.sequence_ids(feature_idx)
      masked_offsets = []
      for token_idx, span in enumerate(inputs["offset_mapping"][feature_idx]):
          # sequence id 1 marks context tokens; everything else gets the placeholder
          masked_offsets.append(span if sequence_ids[token_idx] == 1 else [0])
      inputs["offset_mapping"][feature_idx] = masked_offsets
  inputs["example_id"] = example_ids
  return inputs

In [None]:
#Encoder-models
#============================================================================================================================
#Model configs
# Every config used in this cell is defined here so the cell does not depend on
# cells further down the notebook. vocab_size follows the tokenizers because the
# emoticon tokens added to them enlarge the vocabulary.
bert_large_config = BertConfig(vocab_size=len(bert_tokenizer), hidden_size=1024, num_hidden_layers=24, num_attention_heads=16)
distillbert_config = DistilBertConfig(vocab_size=len(distillbert_tokenizer))
albert_config = AlbertConfig(vocab_size=len(albert_tokenizer))
#============================================================================================================================
#Sentiment analysis

albert_model_sentiment = AlbertForSequenceClassification(albert_config)
#============================================================================================================================
#Extractive question answering
# Question answering needs the *ForQuestionAnswering heads (start/end logits),
# not the sequence-classification heads.
bert_large_model_question_answering = BertForQuestionAnswering(bert_large_config)
distillbert_model_question_answering = DistilBertForQuestionAnswering(distillbert_config)
albert_model_question_answering = AlbertForQuestionAnswering(albert_config)

#Decoder-model
gpt_neo_125m_model = AutoModel.from_pretrained("EleutherAI/gpt-neo-125M")
gpt_neo_13b_model = AutoModel.from_pretrained("EleutherAI/gpt-neo-1.3B")
gpt2_model = AutoModel.from_pretrained("gpt2")

#Seq2Seq model
#============================================================================================================================
#Model configs
bart_config = BartConfig()
t5_config = T5Config()
#============================================================================================================================
#Both for summarization and transduction
bart_model = BartForConditionalGeneration(bart_config)
t5_model = T5ForConditionalGeneration(t5_config)

## Training 

### Training functions

In [None]:
def evaluate(model : PreTrainedModel, eval_dataloader:DataLoader, accelerator, metric=None, isCalledFromTraining=False):
    """Evaluate `model` with the routine that matches its task head.

    Args:
        model: model to evaluate; the task is derived from its class name
            ("...ForQuestionAnswering" or "...ForSequenceClassification").
        eval_dataloader: validation dataloader.
        accelerator: `Accelerator` used for device placement and gathering.
        metric: metric object passed on to the task routine. When None,
            "squad_v2" is loaded for question answering and "accuracy" for
            sequence classification.
        isCalledFromTraining: True when model and dataloader were already
            passed through `accelerator.prepare` by the training loop.

    Returns:
        Whatever the task routine returns; None for any other model class.
    """
    model.eval()
    if(not isCalledFromTraining):
      # Standalone call: the prepared dataloader yields batches on the
      # accelerator device, so the model has to live there too.
      model = model.to(accelerator.device)
      eval_dataloader = accelerator.prepare( eval_dataloader )

    # During distributed training the model may be wrapped; look at the class underneath.
    model_class_name = accelerator.unwrap_model(model).__class__.__name__.lower()

    if("questionanswering" in model_class_name):
      if metric is None:
        metric = load_metric("squad_v2")
      return evaluate_question_answering(model, eval_dataloader, accelerator, metric) # TODO:
    elif("sequenceclassification" in model_class_name):
      if metric is None:
        metric = load_metric("accuracy")
      return evaluate_sentiment(model, eval_dataloader, accelerator, metric)

def evaluate_sentiment(model : PreTrainedModel, eval_dataloader:DataLoader, accelerator, metric=None):
  """Compute mean loss, exp(mean loss) and accuracy of a sequence classifier.

  Args:
    model: classifier already switched to eval mode and placed on the right device.
    eval_dataloader: yields dict batches with the model inputs and one-hot "labels".
    accelerator: `Accelerator` used to gather tensors across processes.
    metric: metric with `add_batch` / `compute`; "accuracy" is loaded when None.

  Returns:
    Tuple (loss, perplexity, accuracy) of Python floats, where loss is the mean
    of the per-batch losses and perplexity is exp(loss).

  Raises:
    ValueError: if the dataloader yields no batches.
  """
  if metric is None:
    # Loaded lazily instead of as a default argument, so defining the function
    # does not trigger a metric download and calls do not share one object.
    metric = load_metric("accuracy")

  losses = []
  for batch in eval_dataloader:
      with torch.no_grad():
          # Forward the whole batch (as the training loop does) so that the
          # attention mask is applied to the padded positions.
          outputs = model(**batch)
      # The loss is a 0-dim tensor; give it one dimension so it can be concatenated.
      losses.append(accelerator.gather(outputs.loss.reshape(1)))
      metric.add_batch(
          predictions=accelerator.gather(torch.argmax(outputs.logits, dim=-1)),
          # labels are one-hot vectors, argmax recovers the class id
          references=accelerator.gather(torch.argmax(batch["labels"], dim=-1)),
      )
  if not losses:
    raise ValueError("eval_dataloader yielded no batches")

  loss = torch.cat(losses).mean()
  # torch.exp saturates to inf instead of raising, so no overflow handling is needed.
  perplexity = torch.exp(loss)
  return loss.item(), perplexity.item(), metric.compute()["accuracy"]

def evaluate_question_answering(model : PreTrainedModel, eval_dataloader:DataLoader, accelerator, metric):
  """Unfinished (see the TODO markers): score a question-answering model with `metric`.

  Runs the model over `eval_dataloader`, collects the gathered start/end
  logits and finally calls `metric.compute(predictions=..., references=...)`.

  NOTE(review): as written this function cannot run; it uses names that are
  not defined inside it (`validation_dataset`, `final_predictions`, `ex`) and
  the step that turns logits into answer texts is missing. The individual
  problems are marked inline below.
  """
  start_logits = []
  end_logits = []
  # NOTE(review): enumerate() yields (index, batch) tuples, so `model(**batch)`
  # below receives a tuple; presumably the loop should iterate over the dataloader directly.
  for batch in enumerate(eval_dataloader):
      with torch.no_grad():
          outputs = model(**batch)
      # start_logits.append(accelerator.gather(outputs.start_logits).cpu().numpy()) # might not be used if on GPU
      start_logits.append(accelerator.gather(outputs.start_logits))
      # end_logits.append(accelerator.gather(outputs.end_logits).cpu().numpy()) # might not be used if on GPU
      end_logits.append(accelerator.gather(outputs.end_logits))
  # NOTE(review): the lists hold torch tensors here; np.concatenate presumably
  # needs the `.cpu().numpy()` conversion from the commented-out lines above. Confirm.
  start_logits = np.concatenate(start_logits)
  end_logits = np.concatenate(end_logits)
  # NOTE(review): `validation_dataset` is not defined in this function.
  start_logits = start_logits[: len(validation_dataset)]# TODO:
  end_logits = end_logits[: len(validation_dataset)]# TODO:

  # NOTE(review): `final_predictions` is not defined in this function; the
  # logits-to-text post-processing that should produce it is missing.
  formatted_predictions = [{"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in final_predictions.items()]
  # NOTE(review): `ex` is not defined; the loop variable is `row`, which is an
  # (index, batch) tuple because of enumerate().
  references = [{"id": ex["id"], "answers": ex["answers"]} for row in enumerate(eval_dataloader)]
  return metric.compute(predictions=formatted_predictions, references=references)# TODO:
   

def training_procedure(model : PreTrainedModel, train_dataloader : DataLoader, eval_dataloader:DataLoader, output_dir=None, repo:Repository=None, optimizer:Optimizer=torch.optim.AdamW, scheduler="linear", num_warmup_steps=1_000, num_epochs=3, learning_rate=5e-5, gradient_accumulation_steps=8, eval_steps=5_00, metric=None):
  """Train `model` with Accelerate using gradient accumulation and periodic evaluation.

  Args:
    model: model to train; its forward pass must return an object with `.loss`.
    train_dataloader: training batches (dicts passed to the model as keyword arguments).
    eval_dataloader: validation batches, handed to `evaluate`.
    output_dir: directory checkpoints are saved to at every evaluation; nothing is saved when None.
    repo: `Repository` pushed to the Hub after each checkpoint; skipped when None.
    optimizer: optimizer class, instantiated as `optimizer(model.parameters(), lr=learning_rate)`.
    scheduler: scheduler name understood by `get_scheduler`.
    num_warmup_steps: number of warm-up optimizer updates.
    num_epochs: number of passes over `train_dataloader`.
    learning_rate: peak learning rate.
    gradient_accumulation_steps: batches accumulated per optimizer update.
    eval_steps: evaluate every `eval_steps` optimizer updates.
    metric: metric handed to `evaluate`; "accuracy" is loaded when None.

  Notes:
    Relies on the notebook-level global `tokenizer` when saving checkpoints, and
    expects `evaluate` to return a (loss, perplexity, accuracy) triple.
  """
  if metric is None:
    # Loaded lazily instead of as a default argument, so defining the function
    # does not trigger a metric download.
    metric = load_metric("accuracy")

  accelerator = Accelerator(fp16=True)

  optimizer = optimizer(model.parameters(), lr=learning_rate)

  train_dl, eval_dl, model, optimizer = accelerator.prepare(
    train_dataloader, eval_dataloader, model, optimizer
  )

  batches_per_epoch = len(train_dl)
  # The scheduler is stepped once per optimizer update, i.e. once every
  # `gradient_accumulation_steps` batches, so its horizon is counted in updates.
  updates_per_epoch = max(batches_per_epoch // gradient_accumulation_steps, 1)
  num_training_steps = num_epochs * updates_per_epoch

  #optimizer scheduler
  lr_scheduler = get_scheduler(
      scheduler,
      optimizer=optimizer,
      num_warmup_steps=num_warmup_steps,
      num_training_steps=num_training_steps,
  )

  #training loop
  model.train() # turn on training mode
  completed_steps=0
  for epoch in range(num_epochs):
      # One progress bar per epoch, so its total is the number of batches in an epoch.
      for step, batch in tqdm( enumerate(train_dl, start=1), total=batches_per_epoch, desc=f"epoch {epoch + 1}/{num_epochs}" ):
          outputs = model(**batch)
          loss = outputs.loss 
          if step % 100 == 0:
            # len(batch) would count the keys of the batch dict; take the
            # batch size from the first tensor instead.
            batch_size = len(next(iter(batch.values())))
            accelerator.print(
                {
                    "lr": lr_scheduler.get_last_lr(),
                    "samples": step * batch_size,
                    "steps": completed_steps,
                    # the loss has not been divided for accumulation yet, report it as is
                    "loss/train": loss.item(),
                }
            )
          loss = loss / gradient_accumulation_steps
          accelerator.backward(loss) # accelerated hugging face backpropagation
          if step % gradient_accumulation_steps == 0:
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            completed_steps += 1
          if (step % (eval_steps * gradient_accumulation_steps)) == 0:
            eval_loss, perplexity, accuracy = evaluate(model,eval_dl, accelerator, metric, isCalledFromTraining=True)
            accelerator.print({"loss/eval": eval_loss, "perplexity": perplexity, "accuracy":accuracy})
            model.train() # evaluate() switched the model to eval mode
            accelerator.wait_for_everyone()
            if output_dir is not None:
              unwrapped_model = accelerator.unwrap_model(model)
              unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
              if accelerator.is_main_process:
                  tokenizer.save_pretrained(output_dir)
                  # repo is optional; without it the checkpoint is only kept locally
                  if repo is not None:
                    repo.push_to_hub(
                        commit_message=f"Training in progress step {step}", blocking=False
                    )

Konekcija ka Hugging Face-u da uploadujemo modele

In [None]:
# Log in to the Hugging Face Hub so the trained models can be uploaded.
notebook_login()

### Training Encoders

#### Sentence classification

##### BERT
Inicijalizacija repozitorijuma za ovaj model

In [None]:
# model_name doubles as the local clone directory; repo_name is the full Hub repository name it is cloned from.
model_name = "bert-base-for-multilabel-sentence-classification"
repo_name = get_full_repo_name(model_name)
repo = Repository(model_name, clone_from=repo_name)

In [None]:
# BERT-base classifier built from a config (not from pretrained weights);
# vocab_size follows the tokenizer because emoticon tokens were added to it.
bert_base_config = BertConfig(
    vocab_size=len(bert_tokenizer),
    problem_type="multi_label_classification",
    num_labels=4,
)
bert_base_model_sentiment = BertForSequenceClassification(bert_base_config)
bert_base_model_sentiment.config.id2label = dict(enumerate(["anger", "joy", "optimism", "sadness"]))
bert_base_model_sentiment.config.label2id = {
    name: class_id for class_id, name in bert_base_model_sentiment.config.id2label.items()
}
bert_base_model_sentiment.resize_token_embeddings(len(bert_tokenizer))

# The tokenize function reads these two globals, so they must be set before .map() is called.
model = bert_base_model_sentiment
tokenizer = bert_tokenizer

data_collator = DataCollatorWithPadding(bert_tokenizer, return_tensors="pt")

tokenized_dataset = (
    data_sentiment_analysis
    .map(tokenize_sentence_classification, batched=True)
    .remove_columns("text")
    .rename_column("label", "labels")
)
tokenized_dataset.set_format("torch")

train_dataloader = DataLoader(tokenized_dataset["train"], collate_fn=data_collator, batch_size=16, shuffle=True)
eval_dataloader = DataLoader(tokenized_dataset["validation"], collate_fn=data_collator, batch_size=16)
tokenized_dataset["train"][0:1]

Inference pre treninga

In [None]:
# Baseline: evaluate the untrained model on the validation dataloader.
evaluate(model=bert_base_model_sentiment.to("cuda"), eval_dataloader=eval_dataloader, accelerator=Accelerator(fp16=True))

In [None]:
# Positional arguments of training_procedure, in order: model, train_dataloader, eval_dataloader,
# output_dir, repo, optimizer, scheduler, num_warmup_steps, num_epochs, learning_rate,
# gradient_accumulation_steps, eval_steps, metric.
# output_dir is model_name, the local directory the Hub repo was cloned into, so the saved
# checkpoints land inside the git working tree that repo.push_to_hub() uploads.
args= (model, train_dataloader, eval_dataloader, model_name, repo, torch.optim.AdamW, "linear", 1_500, 50, 8.3913e-06, 8, 5_00, load_metric("accuracy"))
notebook_launcher(training_procedure, args=args, use_fp16=True)

Inference nakon treninga

In [None]:
# Evaluate the same model object again after training.
evaluate(model=bert_base_model_sentiment, eval_dataloader=eval_dataloader, accelerator=Accelerator(fp16=True))

##### BERT Large

Inicijalizacija repozitorijuma za ovaj model

In [None]:
# model_name doubles as the local clone directory; repo_name is the full Hub repository name it is cloned from.
model_name = "bert-large-for-multilabel-sentence-classification"
repo_name = get_full_repo_name(model_name)
repo = Repository(model_name, clone_from=repo_name)

In [None]:
# Class id <-> name mappings stored in the model config.
id2label = dict(enumerate(["anger", "joy", "optimism", "sadness"]))
label2id = {name: class_id for class_id, name in id2label.items()}

# Larger BERT variant built from a config (not from pretrained weights);
# vocab_size follows the tokenizer because emoticon tokens were added to it.
bert_large_config = BertConfig(
    vocab_size=len(bert_tokenizer),
    problem_type="multi_label_classification",
    num_labels=4,
    id2label=id2label,
    label2id=label2id,
    hidden_size=1024,
    num_hidden_layers=24,
    num_attention_heads=16,
)
bert_large_model_sentiment = BertForSequenceClassification(bert_large_config)
bert_large_model_sentiment.resize_token_embeddings(len(bert_tokenizer))

# The tokenize function reads these two globals, so they must be set before .map() is called.
model = bert_large_model_sentiment
tokenizer = bert_tokenizer

data_collator = DataCollatorWithPadding(bert_tokenizer, return_tensors="pt")

tokenized_dataset = (
    data_sentiment_analysis
    .map(tokenize_sentence_classification, batched=True)
    .remove_columns("text")
    .rename_column("label", "labels")
)
tokenized_dataset.set_format("torch")

train_dataloader = DataLoader(tokenized_dataset["train"], collate_fn=data_collator, batch_size=16, shuffle=True)
eval_dataloader = DataLoader(tokenized_dataset["validation"], collate_fn=data_collator, batch_size=16)
tokenized_dataset["train"][0:1]

Inference pre treninga

In [None]:
# Baseline: evaluate the untrained model on the validation dataloader.
# The freshly built model is still on the CPU while the prepared dataloader yields
# batches on the accelerator device, so move it to the GPU like the sibling cells do.
evaluate(model=bert_large_model_sentiment.to("cuda"), eval_dataloader=eval_dataloader, accelerator=Accelerator(fp16=True))

Training

In [None]:
# Positional arguments of training_procedure, in order: model, train_dataloader, eval_dataloader,
# output_dir, repo, optimizer, scheduler, num_warmup_steps, num_epochs, learning_rate,
# gradient_accumulation_steps, eval_steps, metric.
# output_dir is model_name, the local directory the Hub repo was cloned into, so the saved
# checkpoints land inside the git working tree that repo.push_to_hub() uploads.
args= (model, train_dataloader, eval_dataloader, model_name, repo, torch.optim.AdamW, "linear", 1_000, 10, 5e-5, 8, 5_00, load_metric("accuracy"))
notebook_launcher(training_procedure, args=args, use_fp16=True)

Inference nakon treninga

In [None]:
# Evaluate the same model object again after training.
evaluate(model=bert_large_model_sentiment, eval_dataloader=eval_dataloader, accelerator=Accelerator(fp16=True))

##### DistilBERT

Inicijalizacija repozitorijuma za ovaj model

In [None]:
# model_name doubles as the local clone directory; repo_name is the full Hub repository name it is cloned from.
model_name = "distillbert-for-multilabel-sentence-classification"
repo_name = get_full_repo_name(model_name)
repo = Repository(model_name, clone_from=repo_name)

In [None]:
# Class id <-> name mappings stored in the model config.
id2label = dict(enumerate(["anger", "joy", "optimism", "sadness"]))
label2id = {name: class_id for class_id, name in id2label.items()}

# DistilBERT classifier built from a config (not from pretrained weights);
# vocab_size follows the tokenizer because emoticon tokens were added to it.
distillbert_config = DistilBertConfig(
    vocab_size=len(distillbert_tokenizer),
    problem_type="multi_label_classification",
    num_labels=4,
    id2label=id2label,
    label2id=label2id,
)
distillbert_model_sentiment = DistilBertForSequenceClassification(distillbert_config)
distillbert_model_sentiment.resize_token_embeddings(len(distillbert_tokenizer))

# The tokenize function reads these two globals, so they must be set before .map() is called.
model = distillbert_model_sentiment
tokenizer = distillbert_tokenizer

data_collator = DataCollatorWithPadding(distillbert_tokenizer, return_tensors="pt")

tokenized_dataset = (
    data_sentiment_analysis
    .map(tokenize_sentence_classification, batched=True)
    .remove_columns("text")
    .rename_column("label", "labels")
)
tokenized_dataset.set_format("torch")

train_dataloader = DataLoader(tokenized_dataset["train"], collate_fn=data_collator, batch_size=16, shuffle=True)
eval_dataloader = DataLoader(tokenized_dataset["validation"], collate_fn=data_collator, batch_size=16)
tokenized_dataset["train"][0:1]

Inference pre treninga

In [None]:
# Baseline: evaluate the untrained model on the validation dataloader.
evaluate(model=model.to("cuda:0"), eval_dataloader=eval_dataloader, accelerator=Accelerator(fp16=True))

Training

In [None]:
# Positional arguments of training_procedure, in order: model, train_dataloader, eval_dataloader,
# output_dir, repo, optimizer, scheduler, num_warmup_steps, num_epochs, learning_rate,
# gradient_accumulation_steps, eval_steps, metric.
# output_dir is model_name, the local directory the Hub repo was cloned into, so the saved
# checkpoints land inside the git working tree that repo.push_to_hub() uploads.
args= (model, train_dataloader, eval_dataloader, model_name, repo, torch.optim.AdamW, "linear", 1_000, 10, 5e-5, 8, 5_00, load_metric("accuracy"))
notebook_launcher(training_procedure, args=args, use_fp16=True)

Inference nakon treninga

In [None]:
# Evaluate the same model object again after training.
evaluate(model=model.to("cuda"), eval_dataloader=eval_dataloader, accelerator=Accelerator(fp16=True))

##### Albert

Inicijalizacija repozitorijuma za ovaj model

In [None]:
# model_name doubles as the local clone directory; repo_name is the full Hub repository name it is cloned from.
model_name = "albert-for-multilabel-sentence-classification"
repo_name = get_full_repo_name(model_name)
repo = Repository(model_name, clone_from=repo_name)

In [None]:
# Class id <-> name mappings stored in the model config.
id2label = dict(enumerate(["anger", "joy", "optimism", "sadness"]))
label2id = {name: class_id for class_id, name in id2label.items()}

# ALBERT classifier built from a config (not from pretrained weights);
# vocab_size follows the tokenizer because emoticon tokens were added to it.
albert_config = AlbertConfig(
    vocab_size=len(albert_tokenizer),
    problem_type="multi_label_classification",
    num_labels=4,
    id2label=id2label,
    label2id=label2id,
)
albert_model_sentiment = AlbertForSequenceClassification(albert_config)
albert_model_sentiment.resize_token_embeddings(len(albert_tokenizer))

# The tokenize function reads these two globals, so they must be set before .map() is called.
model = albert_model_sentiment
tokenizer = albert_tokenizer

data_collator = DataCollatorWithPadding(albert_tokenizer, return_tensors="pt")

tokenized_dataset = (
    data_sentiment_analysis
    .map(tokenize_sentence_classification, batched=True)
    .remove_columns("text")
    .rename_column("label", "labels")
)
tokenized_dataset.set_format("torch")

train_dataloader = DataLoader(tokenized_dataset["train"], collate_fn=data_collator, batch_size=16, shuffle=True)
eval_dataloader = DataLoader(tokenized_dataset["validation"], collate_fn=data_collator, batch_size=16)
tokenized_dataset["train"][0:1]

Inference pre treninga

In [None]:
# Baseline: evaluate the untrained model on the validation dataloader.
evaluate(model=model.to("cuda"), eval_dataloader=eval_dataloader, accelerator=Accelerator(fp16=True))

Training

In [None]:
# Positional arguments of training_procedure, in order: model, train_dataloader, eval_dataloader,
# output_dir, repo, optimizer, scheduler, num_warmup_steps, num_epochs, learning_rate,
# gradient_accumulation_steps, eval_steps, metric.
# output_dir is model_name, the local directory the Hub repo was cloned into, so the saved
# checkpoints land inside the git working tree that repo.push_to_hub() uploads.
args= (model, train_dataloader, eval_dataloader, model_name, repo, torch.optim.AdamW, "linear", 1_000, 10, 5e-5, 8, 5_00, load_metric("accuracy"))
notebook_launcher(training_procedure, args=args, use_fp16=True)

Inference nakon treninga

In [None]:
# Evaluate the same model object again after training.
evaluate(model=model.to("cuda"), eval_dataloader=eval_dataloader, accelerator=Accelerator(fp16=True))

#### Question answering

##### BERT

Inicijalizacija repozitorijuma za ovaj model

In [None]:
# model_name doubles as the local clone directory; repo_name is the full Hub repository name it is cloned from.
model_name = "bert-base-for-question-answering"
repo_name = get_full_repo_name(model_name)
repo = Repository(model_name, clone_from=repo_name)

In [6]:
# BERT-base QA model built from a config (not from pretrained weights);
# vocab_size follows the tokenizer because emoticon tokens were added to it.
bert_base_config = BertConfig(vocab_size=len(bert_tokenizer))
bert_base_model_question_answering = BertForQuestionAnswering(bert_base_config)
bert_base_model_question_answering.resize_token_embeddings(len(bert_tokenizer))

data_collator = DataCollatorWithPadding(bert_tokenizer, padding="max_length", return_tensors="pt")

# The tokenize functions read these two globals, so they must be set before .map() is called.
model = bert_base_model_question_answering
tokenizer = bert_tokenizer

tokenized_dataset = data_question_answering['train'].map(tokenize_question_answering, batched=True, remove_columns=data_question_answering["train"].column_names)
tokenized_dataset.set_format("torch")

# Keeps "example_id" and "offset_mapping", which are needed to turn predicted
# token spans back into answer texts.
tokenized_dataset_validation = data_question_answering["validation"].map(tokenize_question_answering_validation, batched=True, remove_columns=data_question_answering["validation"].column_names) 
tokenized_dataset_validation.set_format("torch")

# The dataloader must only see model inputs: the padding collator cannot turn
# the string ids or the ragged offset lists into tensors.
eval_features = tokenized_dataset_validation.remove_columns(["example_id", "offset_mapping"])
eval_features.set_format("torch")

num_rows_in_val=data_question_answering["validation"].num_rows
desired_no=16

train_dataloader = DataLoader(tokenized_dataset, batch_size=desired_no, shuffle=True, collate_fn=data_collator)
# Smallest batch size >= desired_no that divides the number of raw validation
# rows; falls back to desired_no when there is no such divisor.
eval_batch_size = next((i for i in range(desired_no, num_rows_in_val) if num_rows_in_val % i == 0), desired_no)
eval_dataloader = DataLoader(eval_features, batch_size=eval_batch_size, collate_fn=data_collator)

  0%|          | 0/87 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

In [17]:
# Exploratory check: run the QA model on the first five validation examples on
# the CPU, decode the predicted spans and score them with the squad_v2 metric.
tokenizer = bert_tokenizer
model = bert_base_model_question_answering.to("cpu") 

question, text = data_question_answering["validation"][0:5]["question"], data_question_answering["validation"][0:5]["context"]

inputs = tokenizer(
      question, 
      text, 
      max_length=None, 
      truncation="only_second", 
      return_offsets_mapping=True,
      return_overflowing_tokens=True, 
      stride=128, 
      # If set to a number along with max_length, the overflowing tokens returned when return_overflowing_tokens=True will contain some tokens from the end of the truncated sequence returned to provide 
      # some overlap between truncated and overflowing sequences. The value of this argument defines the number of overlapping tokens.
      padding="max_length",
      return_tensors="pt"
      )

# These two entries are not model inputs, so they are removed before the forward pass.
offset_mapping = inputs.pop("offset_mapping")
overflow_to_sample_mapping = inputs.pop("overflow_to_sample_mapping")

with torch.no_grad():
    outputs = model(**inputs) # NOTE is validation supposed to have start_positions ? If so, what do we even need it for?

# print(outputs.start_logits.shape,"\n")
# Highest-scoring start and end token index for every feature.
answer_start_index = outputs.start_logits.argmax(dim=-1)
answer_end_index = outputs.end_logits.argmax(dim=-1)
# print("Predicted start(s) are: ",answer_start_index)
# print("Real start is: ",data_question_answering["validation"][which_number_to]["start_positions"])
# print("Predicted end(s) are: ",answer_end_index, "\n")

formatted_predictions = []
for index, value in enumerate(inputs.input_ids):
    if(answer_start_index[index] > answer_end_index[index]):
        print("Start index > End index") # TODO figure out what this means and whether it can be fixed; aka does it mean that the answer spans around two contexts?
    # offsets of the current feature (only used by the commented-out line below)
    mapiranja_ofseta = offset_mapping[index]
    # We want to extract an index of the first 1 so we can know where the context part begins    
    idjevi_sekvenci = inputs.sequence_ids(index)
    index_for_context_start = idjevi_sekvenci.index(1) # NOTE index where the context starts
    # index of the last 1, found by searching the reversed list
    index_for_context_end = len(idjevi_sekvenci) - idjevi_sekvenci[ -1: -len(idjevi_sekvenci)-1 : -1].index(1) - 1 
    overflows = overflow_to_sample_mapping[index].item() # NOTE use this to find question's ID
    que = question[overflows]
    if answer_start_index[index] < index_for_context_start or answer_end_index[index] > index_for_context_end:
        print("Answer isn't in the context again?")
    else:
        # start_index, end_index = mapiranja_ofseta[answer_start_index[index].item()][0], mapiranja_ofseta[answer_end_index[index].item()][1]
        solution = tokenizer.decode(inputs.input_ids[index][answer_start_index[index].item():answer_end_index[index].item()+1]) # decode the predicted token span back into text
        print(index,"\t",que,"\t",solution,"\n")
        # print("Real answer:\n",data_question_answering["validation"][which_number_to+overflows]["text"])
        formatted_predictions.append({"id":data_question_answering["validation"][overflows]["id"],"prediction_text":solution, "no_answer_probability":0})
# References in the layout passed to the squad_v2 metric: id plus the answer texts and their start characters.
references = [{"id": row["id"], "answers": { "answer_start" : [answer for index, answer in enumerate(row["start_positions"])], "text":[text for index, text in enumerate(row["text"])] }} for index,row in enumerate(Dataset.from_dict(data_question_answering["validation"][0:5]))]

load_metric("squad_v2").compute(predictions=formatted_predictions, references=references)

Start index > End index
0 	 In what country is Normandy located? 	  

1 	 When were the Normans in Normandy? 	 the normans ( norman : nourmands ; french : normands ; latin : normanni ) were the people who in the 10th and 11th centuries gave their name to normandy, a region in france. they were descended from norse ( " norman " comes from " norseman " ) raiders and pirates from denmark, iceland and norway who, under their leader rollo, agreed to swear fealty to king charles iii of west francia. through generations of assimilation and mixing with the native frankish and roman - gaulish populations, their descendants would gradually merge with the carolingian - based 

2 	 From which countries did the Norse originate? 	 nourmands ; french : normands ; latin : normanni ) were the people who in the 10th and 11th centuries gave their name to normandy, a region in france. they were descended from norse ( " norman " comes from " norseman " ) raiders and pirates from denmark, iceland and norway

{'exact': 0.0,
 'f1': 5.994579945799457,
 'total': 5,
 'HasAns_exact': 0.0,
 'HasAns_f1': 5.994579945799457,
 'HasAns_total': 5,
 'best_exact': 0.0,
 'best_exact_thresh': 0.0,
 'best_f1': 5.994579945799457,
 'best_f1_thresh': 0.0}

In [None]:
# Collect the start/end logits of the QA model over the whole validation dataloader.
model=bert_base_model_question_answering.to("cuda")
model.eval()
accelerator=Accelerator(fp16=True)
eval_dataloader = accelerator.prepare( eval_dataloader )
start_logits = []
end_logits = []
metric=load_metric("squad_v2")
# Iterate over the batches themselves: enumerate() would yield (index, batch)
# tuples, which can neither be unpacked with ** nor asked for .keys().
for batch in eval_dataloader:
  with torch.no_grad():
      outputs = model(**batch)
  # start_logits.append(accelerator.gather(outputs.start_logits).cpu().numpy()) # might not be used if on GPU
  start_logits.append(accelerator.gather(outputs.start_logits))
  # end_logits.append(accelerator.gather(outputs.end_logits).cpu().numpy()) # might not be used if on GPU
  end_logits.append(accelerator.gather(outputs.end_logits))
# Number of batches before concatenation ...
duzina_starta_pre = len(start_logits)
duzina_enda_pre = len(end_logits)
start_logits = torch.cat(start_logits)
end_logits = torch.cat(end_logits)
# ... and number of features (rows) after it.
duzina_starta_posle = len(start_logits)
duzina_enda_posle = len(end_logits)

#NOTE How the metric works
# The metric takes two files or two lists - one representing model predictions and the other the references to compare them to.

# Predictions: a list of dictionaries, one per question-answer pair to score, with the following key-value pairs:

# 'id': the question-answer identification field of the question and answer pair
# 'prediction_text' : the text of the answer
# 'no_answer_probability' : the probability that the question has no answer


# References: List of question-answers dictionaries with the following key-value pairs:

# 'id': id of the question-answer pair (see above),
# 'answers': a list of dicts {'text': text of the answer as a string}
# 'no_answer_threshold': the probability threshold to decide that a question has no answer.


# start_logits = start_logits[: len(validation_dataset)]# TODO:
# end_logits = end_logits[: len(validation_dataset)]# TODO:

'''
references = [{"id": row["id"], "answers": [{"text":text} for index, text in enumerate(row["text"])] , "no_answer_threshold" : 0} for index,row in enumerate(data_question_answering["validation"])]
NOTE
references look like:

[
  { 
    'id': '56ddde6b9a695914005b9628', 
    'answers': [
      {'text': 'France'}, {'text': 'France'}, {'text': 'France'}, {'text': 'France'}
    ], 
    'no_answer_threshold': 0
  }, 
  {
    'id': '56ddde6b9a695914005b9629', 
    'answers': [
      {'text': '10th and 11th centuries'}, {'text': 'in the 10th and 11th centuries'}, {'text': '10th and 11th centuries'}, {'text': '10th and 11th centuries'}
    ], 
    'no_answer_threshold': 0
  }
]

NOTE usage example

from evaluate import load
squad_metric = load("squad_v2")
predictions = [
  {'prediction_text': '1976', 'id': '56e10a3be3433e1400422b22', 'no_answer_probability': 0.}, 
  {'prediction_text': 'Beyonce', 'id': '56d2051ce7d4791d0090260b', 'no_answer_probability': 0.},  
  {'prediction_text': 'climate change', 'id': '5733b5344776f419006610e1', 'no_answer_probability': 0.}
]

references = [
  {'answers':{
    'answer_start': [97], 'text': ['1976']
    }, 
    'id': '56e10a3be3433e1400422b22'
  }, 
  { 'answers': {
    'answer_start': [233], 'text': ['Beyoncé and Bruno Mars']
    }, 'id': '56d2051ce7d4791d0090260b'
  }, 
  {'answers': {
    'answer_start': [891], 'text': ['climate change']
    }, 'id': '5733b5344776f419006610e1'
  }
]
results = squad_v2_metric.compute(predictions=predictions, references=references)


'''


# return metric.compute(predictions=formatted_predictions, references=references)# TODO:

In [12]:
# Sanity check of the "squad_v2" metric on three hand-written examples.
squad_metric = load_metric("squad_v2")

# One prediction per question id, each with a no-answer probability of 0.
predictions = [
  {'prediction_text': answer, 'id': question_id, 'no_answer_probability': 0.}
  for question_id, answer in (
    ('56e10a3be3433e1400422b22', '1976'),
    ('56d2051ce7d4791d0090260b', 'Beyonce'),
    ('5733b5344776f419006610e1', 'climate change'),
  )
]

# Gold answers for the same question ids: answer text plus its start offset.
references = [
  {'answers': {'answer_start': [offset], 'text': [gold]}, 'id': question_id}
  for question_id, offset, gold in (
    ('56e10a3be3433e1400422b22', 97, '1976'),
    ('56d2051ce7d4791d0090260b', 233, 'Beyoncé and Bruno Mars'),
    ('5733b5344776f419006610e1', 891, 'climate change'),
  )
]
results = squad_metric.compute(predictions=predictions, references=references)


Inference pre treninga

In [None]:
# Baseline: score the question-answering model before the training cell runs.
evaluate(
    model=bert_base_model_question_answering.to("cuda"),
    eval_dataloader=eval_dataloader,
    accelerator=Accelerator(fp16=True),
    metric=load_metric("squad_v2"),
)

Trening

In [None]:
# Positional arguments handed to training_procedure by notebook_launcher.
# NOTE(review): the meaning of each position (in particular the numeric
# values) is fixed by training_procedure's signature, which is defined
# elsewhere in the notebook - confirm against it before changing anything.
args = (
    model,
    train_dataloader,
    eval_dataloader,
    repo_name,
    repo,
    torch.optim.AdamW,
    "linear",
    1_500,
    50,
    8.3913e-06,
    8,
    500,
    load_metric("squad_v2"),
)
notebook_launcher(training_procedure, args=args, use_fp16=True)

Inference nakon treninga

In [None]:
# Score the question-answering model after training. This section works with
# the QA evaluation dataloader and the "squad_v2" metric, so the model passed
# in must be the question-answering model (not the sentiment classifier), and
# the call mirrors the pre-training evaluation so both scores are comparable.
evaluate(
    model=bert_base_model_question_answering.to("cuda"),
    eval_dataloader=eval_dataloader,
    accelerator=Accelerator(fp16=True),
    metric=load_metric("squad_v2"),
)

##### BERT Large

##### DistilBERT

##### Albert