In [1]:
# pip install datasets    

In [2]:
# pip install transformers

In [3]:
from transformers import EncoderDecoderModel, BertTokenizer,BertModel

# instantiate the encoder and decoder models
encoder_model = BertModel.from_pretrained('bert-base-multilingual-cased')
decoder_model = BertModel.from_pretrained('bert-base-multilingual-cased')

# instantiate the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

decoder_config = decoder_model.config
decoder_config.pad_token_id = tokenizer.pad_token_id

# instantiate the encoder-decoder model
model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
# create a new instance of the BertModel with the last 2 layers set to trainable
# model = BertModel.from_pretrained('bert-base-multilingual-cased')
for name, param in model.named_parameters():
    if 'layer.10' in name or 'layer.11' in name:
        param.requires_grad = True




Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.dens

In [4]:
from datasets import load_dataset
train_data = load_dataset('wmt16', 'de-en', split='train[:1%]')
val_data = load_dataset('wmt16', 'de-en', split='train[1%:2%]')
test_data = load_dataset('wmt16', 'de-en', split='test')

Found cached dataset wmt16 (/home/aayush/.cache/huggingface/datasets/wmt16/de-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227)
Found cached dataset wmt16 (/home/aayush/.cache/huggingface/datasets/wmt16/de-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227)
Found cached dataset wmt16 (/home/aayush/.cache/huggingface/datasets/wmt16/de-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227)


In [5]:
# def tokenize(batch):
#     return tokenizer([example['de'] for example in batch['translation']], padding=True, truncation=True, max_length=512, return_tensors="pt"), tokenizer([example['en'] for example in batch['translation']], padding=True, truncation=True, max_length=512, return_tensors="pt")

def tokenize(batch):
    return {"input_ids": tokenizer([example['de'] for example in batch['translation']], padding="max_length", truncation=True, max_length=512, return_tensors="pt")["input_ids"],
            "attention_mask": tokenizer([example['de'] for example in batch['translation']], padding="max_length", truncation=True, max_length=512, return_tensors="pt")["attention_mask"],
            "labels": tokenizer([example['en'] for example in batch['translation']], padding="max_length", truncation=True, max_length=512, return_tensors="pt")["input_ids"]}

train_data = train_data.map(tokenize, batched=True, batch_size=32)
val_data = val_data.map(tokenize, batched=True, batch_size=32)

train_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Loading cached processed dataset at /home/aayush/.cache/huggingface/datasets/wmt16/de-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227/cache-a16d944701ca5b03.arrow
Loading cached processed dataset at /home/aayush/.cache/huggingface/datasets/wmt16/de-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227/cache-c1ea600773cba7aa.arrow


In [6]:
from torch.optim import AdamW
from torch.utils.data import DataLoader
import torch


device = torch.device('cpu')
def train(model, train_dataloader, val_dataloader, optimizer, criterion, device, epochs):
    train_losses = []
    val_losses = []
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for i, batch in enumerate(train_dataloader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            running_loss += loss.item()
            loss.backward()
            optimizer.step()

        # evaluate on validation set
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for i, batch in enumerate(val_dataloader):
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                val_loss += loss.item()

        train_loss = running_loss / len(train_dataloader)
        val_loss = val_loss / len(val_dataloader)
        print(f"Epoch {epoch+1}/{epochs}, Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")
        train_losses.append(train_loss)
        val_losses.append(val_loss)

        # early stopping
        if epoch > 0 and val_losses[-1] > val_losses[-2]:
            print("Validation loss increased, stopping early...")
            break

    return train_losses, val_losses

# def train(model, optimizer, train_dataset, val_dataset, epochs=5, batch_size=32, device='cpu'):

#     model.to(device)
    
#     for epoch in range(epochs):
#         model.train()
#         train_loss = 0.0
        
#         for i, batch in enumerate(train_dataloader):
#             input_ids = batch['input_ids'].to(device)
#             attention_mask = batch['attention_mask'].to(device)
#             labels = batch['labels'].to(device)
            
#             optimizer.zero_grad()
            
#             outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
#             # logits = outputs.logits
#             loss = outputs.loss
            
#             loss.backward()
#             optimizer.step()
            
#             train_loss += loss.item()
        
#         # Calculate validation loss
#         model.eval()
#         val_loss = 0.0
#         with torch.no_grad():
#             for batch in val_dataloader:
#                 input_ids = batch['input_ids'].to(device)
#                 attention_mask = batch['attention_mask'].to(device)
#                 labels = batch['labels'].to(device)
                
#                 outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
#                 # logits = outputs.logits
#                 loss = outputs.loss
                
#                 val_loss += loss.item()
        
#         train_loss /= len(train_dataloader)
#         val_loss /= len(val_dataloader)
        
#         print(f"Epoch {epoch+1}/{epochs} - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f}")

# # fine-tune the model
# for epoch in range(10):
#     train_loss = 0
#     valid_loss = 0

#     model.train()
#     for batch in train_data:
#         optimizer.zero_grad()
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         labels = batch['labels'].to(device)
#         # decoder_input_ids = batch['decoder_input_ids'].to(device)
#         # decoder_attention_mask = batch['decoder_attention_mask'].to(device)

#         outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
#         logits = outputs.logits
#         loss = outputs.loss
#         loss.backward()
#         optimizer.step()

#         train_loss += loss.item()

#     model.eval()
#     for batch in val_data:
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         # decoder_input_ids = batch['decoder_input_ids'].to(device)
#         # decoder_attention_mask = batch['decoder_attention_mask'].to(device)

#         outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
#         loss = outputs.loss
#         valid_loss += loss.item()

#     train_loss /= len(train_data)
#     valid_loss /= len(val_data)

#     print(f'Epoch {epoch+1}, train_loss={train_loss:.4f}, valid_loss={valid_loss:.4f}')

In [7]:
batch_size = 32
optimizer = AdamW(model.parameters(), lr=5e-5)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=batch_size)
criterion = torch.nn.CrossEntropyLoss()
train(model, train_dataloader, val_dataloader, optimizer, criterion, device, epochs=5)




AttributeError: 'BaseModelOutputWithPoolingAndCrossAttentions' object has no attribute 'logits'