In [1]:
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, GPT2LMHeadModel, GPT2Tokenizer
import pandas as pd
from tqdm import tqdm

# Both tokenizers are loaded once at module level; the dataset, collate and
# training cells below read these globals directly.
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# GPT-2 ships without a padding token. Reuse EOS as the pad token right where
# the tokenizer is created, so batching works on a fresh kernel even before a
# BERTGPT2Hybrid instance has been constructed.
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

In [2]:
class BERTGPT2Hybrid(nn.Module):
    """Holds a pre-trained BERT encoder next to a pre-trained GPT-2 LM head model.

    The two networks are not wired together: BERT only produces a feature
    vector for the input text, while GPT-2 is run as a causal language model
    on the target text. Only the GPT-2 output carries a loss.
    """

    def __init__(self, bert_model="bert-base-uncased", gpt2_model="gpt2"):
        """Load both pre-trained networks.

        Args:
            bert_model: name or path handed to ``BertModel.from_pretrained``.
            gpt2_model: name or path handed to ``GPT2LMHeadModel.from_pretrained``.
        """
        super().__init__()

        self.bert = BertModel.from_pretrained(bert_model)
        self.gpt2 = GPT2LMHeadModel.from_pretrained(gpt2_model)

        # GPT-2 has no pad token of its own, so its EOS token doubles as padding.
        # Note that this mutates the module-level ``gpt2_tokenizer``.
        gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

    def forward(self, input_ids, attention_mask, output_ids=None, output_attention_mask=None):
        """Encode the input with BERT and, if targets are given, score them with GPT-2.

        Args:
            input_ids: BERT token ids for the input text.
            attention_mask: mask for ``input_ids`` passed through to BERT.
            output_ids: GPT-2 token ids of the target text. When ``None`` the
                GPT-2 pass is skipped.
            output_attention_mask: optional mask for ``output_ids`` (non-zero
                for real tokens, zero for padding). When given, it is passed to
                GPT-2 and padded positions are replaced by ``-100`` in the
                labels so they do not contribute to the loss. When omitted,
                every position of ``output_ids`` is used as a label.

        Returns:
            ``(bert_features, gpt2_outputs)`` where ``bert_features`` is the
            first-position (``[CLS]``) slice of BERT's last hidden state and
            ``gpt2_outputs`` is whatever ``self.gpt2`` returns, or ``None``
            when ``output_ids`` is ``None``.
        """
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_features = bert_outputs.last_hidden_state[:, 0, :]

        # Without targets there is nothing for the language model to score;
        # passing None as input_ids would make GPT-2 raise.
        if output_ids is None:
            return bert_features, None

        if output_attention_mask is None:
            gpt2_outputs = self.gpt2(input_ids=output_ids, labels=output_ids)
        else:
            # Padding shares its id with EOS, so only the mask can tell them apart.
            labels = output_ids.masked_fill(output_attention_mask == 0, -100)
            gpt2_outputs = self.gpt2(
                input_ids=output_ids,
                attention_mask=output_attention_mask,
                labels=labels,
            )
        return bert_features, gpt2_outputs

In [3]:
class WikiHowDataset(Dataset):
    """Text pairs read from a CSV file with ``input_text`` / ``output_text`` columns.

    Rows missing either column are dropped at load time. Items are tokenized
    lazily on access: inputs with the module-level ``bert_tokenizer``, outputs
    with the module-level ``gpt2_tokenizer``.
    """

    def __init__(self, csv_path, max_length=256):
        """Read the CSV and remember the truncation length.

        Args:
            csv_path: path of the CSV file (read as ISO-8859-1).
            max_length: maximum number of tokens kept per text by either tokenizer.
        """
        frame = pd.read_csv(csv_path, encoding='ISO-8859-1')
        self.data = frame.dropna(subset=['input_text', 'output_text'])
        self.max_length = max_length

    def __len__(self):
        """Number of rows that have both text columns."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized pair at position ``idx``.

        Returns:
            dict with 1-D ``torch.long`` tensors ``input_ids`` and ``output_ids``.
        """
        row = self.data.iloc[idx]  # positional lookup, done once per item
        input_text = str(row['input_text'])
        output_text = str(row['output_text'])

        input_tokens = bert_tokenizer.encode(input_text, truncation=True, max_length=self.max_length)
        output_tokens = gpt2_tokenizer.encode(output_text, truncation=True, max_length=self.max_length)

        # Build the tensors from plain id lists so the result is always 1-D.
        # A bare .squeeze() on a (1, 1) tensor would collapse a one-token text
        # to a 0-d scalar, which pad_sequence cannot batch.
        return {
            "input_ids": torch.tensor(input_tokens, dtype=torch.long),
            "output_ids": torch.tensor(output_tokens, dtype=torch.long)
        }

In [4]:
def collate_fn(batch):
    """Pad a list of dataset items into rectangular batch tensors.

    Args:
        batch: list of dicts, each holding 1-D ``input_ids`` and ``output_ids`` tensors.

    Returns:
        dict with
        - ``input_ids``: inputs right-padded with the BERT pad id,
        - ``output_ids``: targets right-padded with the GPT-2 pad id
          (falling back to its EOS id when no pad token has been set),
        - ``output_attention_mask``: 1 for real target tokens, 0 for padding.
    """
    pad = torch.nn.utils.rnn.pad_sequence
    input_seqs = [item['input_ids'] for item in batch]
    output_seqs = [item['output_ids'] for item in batch]

    # The GPT-2 pad token may not have been configured yet; padding with None
    # would raise, so fall back to EOS.
    output_pad_id = gpt2_tokenizer.pad_token_id
    if output_pad_id is None:
        output_pad_id = gpt2_tokenizer.eos_token_id

    input_padded = pad(input_seqs, batch_first=True, padding_value=bert_tokenizer.pad_token_id)
    output_padded = pad(output_seqs, batch_first=True, padding_value=output_pad_id)

    # The target pad id can coincide with EOS, so padding cannot be recovered
    # from the ids alone; record the true lengths as a mask while they are known.
    output_mask = pad([torch.ones_like(seq) for seq in output_seqs], batch_first=True, padding_value=0)

    return {
        "input_ids": input_padded,
        "output_ids": output_padded,
        "output_attention_mask": output_mask,
    }

In [None]:
def _masked_lm_loss(logits, token_ids, token_mask):
    """Next-token cross-entropy over ``token_ids`` that skips positions where ``token_mask`` is 0."""
    labels = token_ids.masked_fill(token_mask == 0, -100)
    # Position t predicts token t + 1, so drop the last logit and the first label.
    shifted_logits = logits[:, :-1, :]
    shifted_labels = labels[:, 1:]
    return torch.nn.functional.cross_entropy(
        shifted_logits.reshape(-1, shifted_logits.size(-1)).float(),
        shifted_labels.reshape(-1),
        ignore_index=-100,
    )


def train(model, dataloader, optimizer, device):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: called as ``model(input_ids, attention_mask=..., output_ids=...)``;
            must return ``(features, lm_outputs)`` where ``lm_outputs`` exposes
            ``.loss`` (and ``.logits`` when batches carry a target mask).
        dataloader: iterable of dicts of tensors with at least ``input_ids``
            and ``output_ids``. If a batch also contains
            ``output_attention_mask``, padded target positions are excluded
            from the loss.
        optimizer: optimizer stepping ``model``'s parameters.
        device: device every batch tensor is moved to.

    Returns:
        Mean loss over the batches seen, or ``0.0`` if the loader yielded none.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in tqdm(dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        input_ids = batch['input_ids']
        output_ids = batch['output_ids']

        optimizer.zero_grad()

        # The BERT features are returned by the model but are not part of the loss.
        _bert_features, gpt2_outputs = model(
            input_ids,
            attention_mask=(input_ids != bert_tokenizer.pad_token_id),
            output_ids=output_ids,
        )

        target_mask = batch.get('output_attention_mask')
        if target_mask is None:
            # No length information available: use the loss computed by the LM itself.
            loss = gpt2_outputs.loss
        else:
            loss = _masked_lm_loss(gpt2_outputs.logits, output_ids, target_mask)

        # Gradients are computed with autograd enabled as usual; a no_grad
        # context has no place around the backward pass.
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # Count batches instead of calling len(dataloader) so an empty loader
    # does not divide by zero.
    return total_loss / num_batches if num_batches else 0.0

In [7]:
def main(
    csv_path="C:/Users/aravi/Downloads/archive (2)/wikihowAll.csv",
    batch_size=4,
    max_length=256,
    epochs=3,
    lr=5e-5,
    save_path="bert_gpt2_hybrid_finetuned.pth",
):
    """Fine-tune the hybrid model on the CSV data and save its weights.

    Calling ``main()`` with no arguments uses the defaults above. Note that
    the default ``csv_path`` is an absolute path on one specific machine;
    pass your own path when running elsewhere.

    Args:
        csv_path: CSV file handed to ``WikiHowDataset``.
        batch_size: number of examples per training batch.
        max_length: per-text token limit handed to ``WikiHowDataset``.
        epochs: number of full passes over the data.
        lr: AdamW learning rate.
        save_path: file the final ``state_dict`` is written to.
    """
    # Data pipeline
    dataset = WikiHowDataset(csv_path, max_length=max_length)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

    # Device setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)

    # Move the model to its device first and only then build the optimizer,
    # so the optimizer is created over the parameters as they will be trained.
    model = BERTGPT2Hybrid(bert_model="bert-base-uncased", gpt2_model="gpt2")
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=lr)

    # Training loop
    for epoch in range(epochs):
        avg_loss = train(model, dataloader, optimizer, device)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

    # Save the fine-tuned weights (both sub-models are in the state dict)
    torch.save(model.state_dict(), save_path)
    print("Model saved!")

# Entry point. Inside a notebook kernel __name__ is "__main__", so executing
# this cell starts the full training run immediately.
if __name__ == "__main__":
    main()


  self.data = pd.read_csv(csv_path, encoding='ISO-8859-1').dropna(subset=['input_text', 'output_text'])


cuda


  0%|          | 0/53641 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
  0%|          | 20/53641 [00:25<19:20:28,  1.30s/it]


KeyboardInterrupt: 

In [6]:
# Sanity check: display which device torch selects on this machine
# (the same expression main() uses for its device setup).
torch.device("cuda" if torch.cuda.is_available() else "cpu")

device(type='cuda')