In [1]:
with open('/mnt/pmldl/paracrawl-release1.en-ru.zipporah0-dedup-clean.en') as f:
    eng_lines = f.readlines()

In [2]:
with open('/mnt/pmldl/paracrawl-release1.en-ru.zipporah0-dedup-clean.ru') as f:
    ru_lines = f.readlines()

In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Use pretrained model and tokenizer

In [4]:
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ru-en")

In [5]:
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-ru-en")

# Freeze encoder weights

In [6]:
for param in model.base_model.parameters():
    param.requires_grad = False

# Split data

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
ru_train, ru_val, eng_train, eng_val = train_test_split(ru_lines, eng_lines, test_size=.1)

In [9]:
ru_train, ru_val, eng_train, eng_val = ru_train[:10000], ru_val[:1000],\
                                        eng_train[:10000], eng_val[:1000]

In [10]:
train_encodings = tokenizer.prepare_seq2seq_batch(ru_train, eng_train, 
                                                  truncation=True, 
                                                  padding=True,
                                                  max_length=100)
val_encodings = tokenizer.prepare_seq2seq_batch(ru_val, eng_val, 
                                                truncation=True, 
                                                padding=True,
                                                max_length=100)

In [11]:
import torch
from torch.utils.data import Dataset

In [12]:
class Seq2seqDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings["labels"])

In [13]:
train_dataset = Seq2seqDataset(train_encodings)

In [14]:
eval_dataset = Seq2seqDataset(val_encodings)

In [15]:
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, AdamW

In [16]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [17]:
model.to(device)
model.train()

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(62518, 512, padding_idx=62517)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(62518, 512, padding_idx=62517)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0): MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
   

In [18]:
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

optim = AdamW(model.lm_head.parameters(), lr=5e-5)

In [19]:
import numpy as np

In [20]:
from tqdm.notebook import tqdm

In [1]:
for epoch in range(3):
    epoch_loss = []
    for batch in tqdm(train_loader):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
#         embeddings = model.base_model(input_ids,\
#                                       decoder_input_ids=labels,\
#                                       attention_mask=attention_mask)/
#                                       .requires_grad(True)
#         outputs = model.lm_head(embeddings)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        epoch_loss.append(loss.item())
        loss.backward()
        optim.step()
        
    print(f"Epoch {epoch} finished; Loss : {np.mean(epoch_loss)}")

model.eval()

NameError: name 'tqdm' is not defined

In [23]:
import os

In [22]:
experiment_name = "marian_model_3_epochs_10k_samples_no_max_length"

In [24]:
model.save_pretrained(os.path.join("models", experiment_name))

In [25]:
!nvidia-smi

Thu Mar 11 16:04:07 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.102.04   Driver Version: 450.102.04   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce GTX 108...  Off  | 00000000:0A:00.0 Off |                  N/A |
|  0%   46C    P2    57W / 280W |   9855MiB / 11178MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  GeForce GTX 108...  Off  | 00000000:43:00.0 Off |                  N/A |
|  0%   22C    P8    10W / 280W |     25MiB / 11176MiB |      0%      Defaul