In [None]:
import torch
from torch.cuda.amp import GradScaler
from torch.utils.data import Dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer
from torch.optim import AdamW
from torch.utils.data import DataLoader
from project_evaluate import calculate_score
import torch.nn as nn
import gc
import math

# Plain module-level flag. As written this only binds a Python name; it does not
# set an environment variable.
HF_HUB_DISABLE_SYMLINKS_WARNING = False

# Switch on cuDNN benchmark mode for the whole process.
torch.backends.cudnn.benchmark = True

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class TrainExpDataset(Dataset):
    """Labeled German/English training pairs with English root and modifier hints.

    The file at ``path`` is read as blank-line-separated records shaped like::

        German:
        <german text>
        English:
        <english text>
        Roots in English: <roots>
        Modifiers in English: (<m1>, <m2>), (<m3>, <m4>)

    Each item is the 4-tuple ``(german, english, roots, modifiers)`` of strings:

    * ``english`` is everything between the ``English:`` header and the roots
      label, so it keeps its trailing newline;
    * ``roots`` is the raw text after ``Roots in English: ``;
    * ``modifiers`` is flattened to ``"m1, m2; m3, m4"``.

    Records that do not match this layout are reported with ``print`` and
    skipped as a whole, so the four parallel lists always stay aligned.
    """

    _GERMAN_HEADER = "German:\n"
    _ENGLISH_HEADER = "\nEnglish:\n"
    _ROOTS_LABEL = "Roots in English: "
    _MODIFIERS_LABEL = "Modifiers in English: ("

    def __init__(self, path):
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()

        self.german_sents = []
        self.english_sents = []
        self.roots = []
        self.modifiers = []

        for record in text.split("\n\n"):
            # A final newline at end of file (or a run of extra blank lines)
            # would otherwise end up inside the first/last field and break the
            # two-way splits below.
            record = record.strip("\n")
            if not record:
                continue
            parsed = self._parse_record(record)
            if parsed is None:
                print(f"Skipping malformed training record: {record!r}")
                continue
            german, english, roots, modifiers = parsed
            self.german_sents.append(german)
            self.english_sents.append(english)
            self.roots.append(roots)
            self.modifiers.append(modifiers)

    @classmethod
    def _parse_record(cls, record):
        """Split one record into its four fields; return ``None`` if it is malformed."""
        german_part, header, rest = record.partition(cls._ENGLISH_HEADER)
        if not header:
            return None
        english_and_hints = rest.split(cls._ROOTS_LABEL)
        if len(english_and_hints) != 2:
            return None
        english, hints = english_and_hints
        hint_lines = hints.split("\n")
        if len(hint_lines) != 2:
            return None
        roots, modifiers = hint_lines
        modifiers = modifiers.replace(cls._MODIFIERS_LABEL, "").replace("), (", "; ")
        # The slice on ``modifiers`` drops the closing ")" of the last tuple.
        return german_part[len(cls._GERMAN_HEADER):], english, roots, modifiers[:-1]

    def __len__(self):
        return len(self.german_sents)

    def __getitem__(self, index):
        return self.german_sents[index], self.english_sents[index], self.roots[index], self.modifiers[index]

class ValExpDataset(Dataset):
    """Unlabeled German sentences with English root and modifier hints.

    The file at ``path`` is read as blank-line-separated records shaped like::

        German:
        <german text>
        Roots in English: <r1>, <r2>
        Modifiers in English: (<m1>, <m2>), (<m3>, <m4>)

    Each item is ``(german, roots, modifiers)``. ``roots`` is the ``str()`` of a
    list of root strings and ``modifiers`` the ``str()`` of a list of lists.

    Blank records (extra empty lines, trailing newline) are ignored. Any other
    record that does not match the layout raises ``ValueError``, so the dataset
    never ends up with fewer hints than sentences.

    NOTE(review): the split marker below has no trailing space, so the first
    root keeps the space that follows the colon (e.g. ``"[' r1', 'r2']"``).
    Kept as-is to avoid changing the prompts; confirm whether it is intended.
    """

    def __init__(self, path):
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()

        self.german_sents = []
        self.roots = []
        self.modifiers = []

        for record in text.split("\n\n"):
            # Tolerate a final newline and runs of blank lines between records.
            record = record.strip("\n")
            if not record:
                continue

            german_part, marker, hints = record.partition("\nRoots in English:")
            hint_lines = hints.split("\n")
            if not marker or len(hint_lines) != 2:
                raise ValueError(f"Malformed validation record in {path!r}: {record!r}")
            roots, modifiers = hint_lines

            pairs = [
                pair.split(", ")
                for pair in modifiers.replace("Modifiers in English: (", "").split("), (")
            ]
            # Drop the closing ")" left on the very last modifier.
            pairs[-1][-1] = pairs[-1][-1][:-1]

            # The slice removes the 8-character "German:\n" header.
            self.german_sents.append(german_part[8:])
            self.roots.append(str(roots.split(", ")))
            self.modifiers.append(str(pairs))

    def __len__(self):
        return len(self.german_sents)

    def __getitem__(self, index):
        return self.german_sents[index], self.roots[index], self.modifiers[index]

def calc_bleu(epoch, model, tokenizer, val_loader, beams=1, task_prefix="translate German to English: "):
    """Translate every batch of ``val_loader``, write the predictions and score them.

    Args:
        epoch: Used only to name the prediction file ``predict{epoch}.labeled``.
        model: Object exposing ``generate(input_ids, attention_mask=..., max_length=..., num_beams=...)``.
        tokenizer: Callable tokenizer that also provides ``batch_decode``.
        val_loader: Iterable of ``(german_sentences, english_roots, english_modifiers)`` batches.
        beams: Beam width passed to ``model.generate``.
        task_prefix: Text placed at the start of every prompt.

    Returns:
        Whatever ``calculate_score`` returns for the prediction file compared
        against ``"val.labeled"``.

    NOTE(review): the default ``task_prefix`` here differs from the default of
    ``train_epoch`` ("translate German to English with hints: "). Confirm that
    evaluating with a different prefix than training is intended.
    """
    prediction_path = f"predict{epoch}.labeled"
    with open(prediction_path, "w", encoding="utf-8") as f:
        for i, (input_sequences, english_roots, english_modifiers) in enumerate(val_loader):
            if i % 10 == 0:
                print(i)

            prompts = [
                task_prefix + " ||| roots: " + root + " ||| modifiers: " + mod + " ||| " + " german: " + sequence
                for sequence, root, mod in zip(input_sequences, english_roots, english_modifiers)
            ]
            encoding = tokenizer(
                prompts,
                padding="longest",
                max_length=1000,
                truncation=True,
                return_tensors="pt",
            )
            input_ids = encoding.input_ids.to(device, non_blocking=True)
            attention_mask = encoding.attention_mask.to(device, non_blocking=True)

            with torch.no_grad():
                outputs_ids = model.generate(input_ids, attention_mask=attention_mask,
                                             max_length=1000, num_beams=beams)

            # Let the tokenizer drop pad/eos markers itself. The previous code
            # misspelled the keyword (so nothing was skipped) and then cut the
            # first six characters and everything after "</s>" by hand, which
            # turned an empty generation ("<pad></s>") into the text "/s>".
            decoded_outputs = tokenizer.batch_decode(outputs_ids.tolist(), skip_special_tokens=True)

            for inp, out in zip(input_sequences, decoded_outputs):
                f.write("German:\n")
                f.write(inp)
                f.write("\nEnglish:\n")
                f.write(out)
                f.write("\n\n")
    return calculate_score(prediction_path, "val.labeled")


def train_epoch(tokenizer, data_loader, model, optimizer, scaler,
                task_prefix="translate German to English with hints: "):
    """Run one training pass over ``data_loader`` with gradient scaling.

    Args:
        tokenizer: Callable tokenizer exposing ``pad_token_id``.
        data_loader: Iterable of ``(german, english, roots, modifiers)`` batches of strings.
        model: Module whose forward call returns an object with a ``loss`` attribute.
        optimizer: Optimizer over ``model``'s parameters.
        scaler: ``GradScaler`` used to scale the loss and step the optimizer.
        task_prefix: Text placed at the start of every prompt.

    Returns:
        The mean of all non-NaN batch losses, or ``float("nan")`` when no loss
        was recorded (empty loader or every batch produced NaN).
    """
    losses = []
    for i, (input_sequences, target_sequences, english_roots, english_modifiers) in enumerate(data_loader):
        prompts = [
            task_prefix + " ||| roots: " + root + " ||| modifiers: " + mod + " ||| " + " german: " + sequence
            for sequence, root, mod in zip(input_sequences, english_roots, english_modifiers)
        ]
        encoding = tokenizer(
            prompts,
            padding="longest",
            max_length=1000,
            truncation=True,
            return_tensors="pt",
        )
        input_ids = encoding.input_ids.to(device, non_blocking=True)
        attention_mask = encoding.attention_mask.to(device, non_blocking=True)

        # encode the targets
        target_encoding = tokenizer(
            target_sequences,
            padding="longest",
            max_length=1000,
            truncation=True,
            return_tensors="pt",
        )
        labels = target_encoding.input_ids.to(device, non_blocking=True)

        # replace padding token id's of the labels by -100 so it's ignored by the loss
        labels[labels == tokenizer.pad_token_id] = -100

        # forward pass
        with torch.cuda.amp.autocast():
            loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss

        # Read the scalar once; every .item() call forces a device sync.
        loss_value = loss.item()
        if not math.isnan(loss_value):
            losses.append(loss_value)
        if i % 20 == 0:
            print(loss_value)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        # Gradients accumulate across backward() calls. Without this reset
        # every step would use the sum of all earlier batches' gradients, and
        # a single inf/NaN gradient would make the scaler skip every later step.
        optimizer.zero_grad(set_to_none=True)

    if not losses:
        return float("nan")
    return sum(losses) / len(losses)


def main():
    """Fine-tune ``t5-base`` for 15 epochs, scoring and checkpointing after each one.

    Before training, the freshly loaded weights are written to ``weights0.pkl``.
    Epoch ``e`` (0-based) trains on ``train.extra_labeled{e}``, then writes
    ``optimizer_epoch_{e+1}.pt``, scores ``val.unlabeled`` through ``calc_bleu``
    with 4 beams and writes ``weights{e+1}.pkl``. The running score and loss
    histories are printed after every epoch.

    NOTE(review): ``calc_bleu`` is called with its default ``task_prefix``,
    which is not the same text as ``train_epoch``'s default prefix. Confirm
    that this difference is intended.
    """
    n_epochs = 15
    train_batch_size = 8

    torch.cuda.empty_cache()
    gc.collect()
    print("\n\n")
    print(device)
    print("\n\n")

    net = T5ForConditionalGeneration.from_pretrained("t5-base")
    net.to(device)
    adamw = AdamW(net.parameters(), lr=1e-4)
    # Snapshot of the weights exactly as loaded, before any training step.
    torch.save(net.state_dict(), "weights0.pkl")
    tok = T5Tokenizer.from_pretrained("t5-base")

    val_loader = DataLoader(ValExpDataset("val.unlabeled"), batch_size=8, pin_memory=True)

    net.train()
    grad_scaler = GradScaler()

    bleu_history = []
    loss_history = []
    for epoch in range(n_epochs):
        # Every epoch reads its own training file, selected by the epoch index.
        train_loader = DataLoader(
            TrainExpDataset(f"train.extra_labeled{epoch}"),
            batch_size=train_batch_size,
            pin_memory=True,
            shuffle=True,
        )
        print(f'Epoch {epoch}/{n_epochs}')

        net.train()
        epoch_loss = train_epoch(tok, train_loader, net, adamw, grad_scaler)
        print(f"The avg loss is {epoch_loss}")
        loss_history.append(epoch_loss)

        net.eval()
        torch.save(adamw.state_dict(), f"optimizer_epoch_{epoch+1}.pt")
        with torch.no_grad():
            bleu = calc_bleu(epoch, net, tok, val_loader, beams=4)
        print(f"The wider bleu is {bleu}")
        bleu_history.append(bleu)
        torch.save(net.state_dict(), f'weights{epoch+1}.pkl')

        print(bleu_history)
        print(loss_history)

# Cell entry point: the training pipeline starts as soon as this cell runs.
# No `if __name__ == "__main__":` guard is applied here.
main()




cuda



Epoch 0/15
3.9173085689544678
1.8183770179748535
1.8926398754119873
2.014730215072632
1.8306968212127686
1.8531278371810913
1.8489058017730713
1.8255528211593628
1.6332751512527466
1.7383685111999512
1.4514012336730957
1.6570998430252075
1.7615340948104858
1.6897836923599243
1.7192317247390747
1.845238447189331
1.8756437301635742
1.6406128406524658
1.59213125705719
1.579964280128479
1.671155333518982
1.5308290719985962
1.8954477310180664
1.7528046369552612
1.6376148462295532
2.0387399196624756
1.5845390558242798
1.6814318895339966
1.7465016841888428
1.8286787271499634
1.9464972019195557
1.7064290046691895
1.4980711936950684
1.5216928720474243
1.7230327129364014
1.8511635065078735
1.7285088300704956
1.5686490535736084
1.7890788316726685
1.5574474334716797
1.6661704778671265
1.5839061737060547
1.5996133089065552
1.6513259410858154
1.7753080129623413
1.6596182584762573
1.756629467010498
1.646897315979004
1.7504825592041016
1.7946631908416748
1.6473487615585327
1.805225849151611

KeyboardInterrupt: ignored

In [None]:
import torch
from torch.cuda.amp import GradScaler
from torch.utils.data import Dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer
from torch.optim import AdamW
from torch.utils.data import DataLoader
from project_evaluate import calculate_score
import math

# Plain module-level flag. As written this only binds a Python name; it does not
# set an environment variable.
HF_HUB_DISABLE_SYMLINKS_WARNING = False

# Switch on cuDNN benchmark mode for the whole process.
torch.backends.cudnn.benchmark = True

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class ValExpDataset(Dataset):
    """Unlabeled German sentences with English root and modifier hints.

    The file at ``path`` is read as blank-line-separated records shaped like::

        German:
        <german text>
        Roots in English: <r1>, <r2>
        Modifiers in English: (<m1>, <m2>), (<m3>, <m4>)

    Each item is ``(german, roots, modifiers)``. ``roots`` is the ``str()`` of a
    list of root strings and ``modifiers`` the ``str()`` of a list of lists.

    Blank records (extra empty lines, trailing newline) are ignored. Any other
    record that does not match the layout raises ``ValueError``, so the dataset
    never ends up with fewer hints than sentences.

    NOTE(review): the split marker below has no trailing space, so the first
    root keeps the space that follows the colon (e.g. ``"[' r1', 'r2']"``).
    Kept as-is to avoid changing the prompts; confirm whether it is intended.
    """

    def __init__(self, path):
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()

        self.german_sents = []
        self.roots = []
        self.modifiers = []

        for record in text.split("\n\n"):
            # Tolerate a final newline and runs of blank lines between records.
            record = record.strip("\n")
            if not record:
                continue

            german_part, marker, hints = record.partition("\nRoots in English:")
            hint_lines = hints.split("\n")
            if not marker or len(hint_lines) != 2:
                raise ValueError(f"Malformed validation record in {path!r}: {record!r}")
            roots, modifiers = hint_lines

            pairs = [
                pair.split(", ")
                for pair in modifiers.replace("Modifiers in English: (", "").split("), (")
            ]
            # Drop the closing ")" left on the very last modifier.
            pairs[-1][-1] = pairs[-1][-1][:-1]

            # The slice removes the 8-character "German:\n" header.
            self.german_sents.append(german_part[8:])
            self.roots.append(str(roots.split(", ")))
            self.modifiers.append(str(pairs))

    def __len__(self):
        return len(self.german_sents)

    def __getitem__(self, index):
        return self.german_sents[index], self.roots[index], self.modifiers[index]

def calc_bleu(epoch, model, tokenizer, val_loader, task_prefix="translate German to English: ", beams=3):
    """Translate every batch of ``val_loader``, write the predictions and score them.

    Args:
        epoch: Used only to name the prediction file ``predict{epoch}.labeled``.
        model: Object exposing ``generate(input_ids, attention_mask=..., max_length=..., num_beams=...)``.
        tokenizer: Callable tokenizer that also provides ``batch_decode``.
        val_loader: Iterable of ``(german_sentences, english_roots, english_modifiers)`` batches.
        task_prefix: Text placed at the start of every prompt.
        beams: Beam width passed to ``model.generate``. Defaults to 3, the value
            that used to be hard-coded here.

    Returns:
        Whatever ``calculate_score`` returns for the prediction file compared
        against ``"val.labeled"``.
    """
    prediction_path = f"predict{epoch}.labeled"
    with open(prediction_path, "w", encoding="utf-8") as f:
        for i, (input_sequences, english_roots, english_modifiers) in enumerate(val_loader):
            if i % 10 == 0:
                print(i)

            prompts = [
                task_prefix + " ||| roots: " + root + " ||| modifiers: " + mod + " ||| " + " german: " + sequence
                for sequence, root, mod in zip(input_sequences, english_roots, english_modifiers)
            ]
            encoding = tokenizer(
                prompts,
                padding="longest",
                max_length=1000,
                truncation=True,
                return_tensors="pt",
            )
            input_ids = encoding.input_ids.to(device, non_blocking=True)
            attention_mask = encoding.attention_mask.to(device, non_blocking=True)

            with torch.no_grad():
                outputs_ids = model.generate(input_ids, attention_mask=attention_mask,
                                             max_length=1000, num_beams=beams)

            # Let the tokenizer drop pad/eos markers itself. The previous code
            # misspelled the keyword (so nothing was skipped) and then cut the
            # first six characters and everything after "</s>" by hand, which
            # turned an empty generation ("<pad></s>") into the text "/s>".
            decoded_outputs = tokenizer.batch_decode(outputs_ids.tolist(), skip_special_tokens=True)

            for inp, out in zip(input_sequences, decoded_outputs):
                f.write("German:\n")
                f.write(inp)
                f.write("\nEnglish:\n")
                f.write(out)
                f.write("\n\n")
    return calculate_score(prediction_path, "val.labeled")

def main():
    """Load the checkpoint ``weights13.pkl`` into ``t5-base`` and print its validation score.

    The score is whatever ``calc_bleu`` returns for ``val.unlabeled``; the
    predictions are written to ``predict0.labeled`` along the way.
    """
    print("\n\n")
    print(device)
    print("\n\n")
    model = T5ForConditionalGeneration.from_pretrained("t5-base")
    # Load onto the CPU first so a checkpoint saved from a GPU run also opens on
    # a CPU-only machine; the model is moved to `device` right afterwards.
    # NOTE: torch.load unpickles the file, so only open checkpoints you trust.
    model.load_state_dict(torch.load("weights13.pkl", map_location="cpu"))
    model.to(device)
    tokenizer = T5Tokenizer.from_pretrained("t5-base")

    batch_size = 8
    val_dataset = ValExpDataset("val.unlabeled")
    val_loader = DataLoader(val_dataset, batch_size=batch_size, pin_memory=True)
    model.eval()
    bleu = calc_bleu(0, model, tokenizer, val_loader)
    print(bleu)
      
# Cell entry point: the evaluation starts as soon as this cell runs.
# No `if __name__ == "__main__":` guard is applied here.
main()




cuda



Epoch 0/15


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


2.856860876083374
1.9917386770248413
2.0872743129730225
1.8690707683563232
1.5727643966674805
1.788523554801941
1.6146329641342163
1.5288859605789185
1.7197693586349487
1.5664262771606445
1.7692183256149292
1.6935234069824219
1.6563704013824463
1.7148308753967285
1.6467167139053345
1.656943678855896
1.5903360843658447
1.5002624988555908
1.4455012083053589
1.5768861770629883
1.6330647468566895
1.580560564994812
1.6009604930877686
1.5403094291687012
1.7366607189178467
1.378781795501709
1.504751443862915
1.3752506971359253
1.5961337089538574
1.4685829877853394
1.5168734788894653
1.3688346147537231
1.4661755561828613
1.3573142290115356
1.2625622749328613
1.4265047311782837
1.356203317642212
1.3743501901626587
1.2766581773757935
1.3651083707809448
1.615344762802124
1.931228518486023
1.350174069404602
1.5812652111053467
1.2525349855422974
1.4267679452896118
1.3822673559188843
1.307924509048462
1.18259859085083
1.1129114627838135
1.361085057258606
1.2357499599456787
1.4924968481063843
1.47953

KeyboardInterrupt: ignored

In [None]:
from transformers import T5Config
# Build a T5 model from the t5-base configuration rather than from a checkpoint.
# NOTE(review): presumably this yields freshly initialised weights — confirm that is intended.
config = T5Config.from_pretrained('t5-base')
# NOTE(review): sets an `fp16` attribute on the config object; whether the model reads it is not visible here — confirm.
config.fp16 = False
# T5ForConditionalGeneration is not imported in this cell; it must come from an earlier one.
model = T5ForConditionalGeneration(config=config)

In [None]:
# Load the pretrained t5-base model and rebind the notebook-level `model` name to it.
# T5ForConditionalGeneration is not imported in this cell; it must come from an earlier one.
model = T5ForConditionalGeneration.from_pretrained('t5-base')

In [None]:
# Shell escape: upgrade the transformers package in the runtime (no version is pinned).
!pip install transformers --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Display the configuration of the `model` object created in an earlier cell.
model.config

T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "pre

In [None]:
# Exploratory: print the built-in help text for `from_pretrained`.
# T5ForConditionalGeneration is not imported in this cell; it must come from an earlier one.
help(T5ForConditionalGeneration.from_pretrained)

Help on method from_pretrained in module transformers.modeling_utils:

from_pretrained(pretrained_model_name_or_path: Union[str, os.PathLike, NoneType], *model_args, **kwargs) method of builtins.type instance
    Instantiate a pretrained pytorch model from a pre-trained model configuration.
    
    The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train
    the model, you should first set it back in training mode with `model.train()`.
    
    pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
    task.
    
    weights are discarded.
    
    Parameters:
        pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
            Can be either:
    
                - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
                  Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespac

In [None]:
import locale

# Monkeypatch: make locale.getpreferredencoding always report "UTF-8".
# The replacement keeps the standard library's optional `do_setlocale`
# parameter so callers using `getpreferredencoding(False)` keep working
# (a zero-argument lambda would raise TypeError for them).
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

In [None]:
# Shell escapes: install the packages this notebook needs into the runtime.
# No versions are pinned, so each run installs whatever release is current.
!pip install transformers
!pip install evaluate
!pip install sentencepiece
!pip install sacrebleu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.2-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m52.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m104.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.2
Looking in indexes: https://pypi.org/simple, https://u