In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item, on access.

    Args:
        texts: Indexable, sized collection of strings.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that
            accepts the keyword arguments used in ``__getitem__`` and returns
            a mapping containing ``input_ids`` and ``attention_mask`` tensors.
        max_len: Length every example is truncated or padded to.
    """

    def __init__(self, texts, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.texts = texts
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return flattened ``input_ids`` and ``attention_mask`` for item *idx*.

        The tensors are returned exactly as the tokenizer produced them and
        are NOT moved to an accelerator here. Device placement belongs to the
        training loop (``Trainer`` moves each batch to the model's device):
        returning CUDA tensors from a dataset breaks pinned-memory loading,
        multi-process data loading, and collators that build masks on the CPU.
        """
        encoding = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The tokenizer returns a leading batch dimension of 1; flatten it so
        # the collator can stack items into (batch, max_len).
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }

def load_data(train_file, dev_file, test_file):
    """Load the three CSV splits and build one text string per row.

    Each row's text is its ``body_parent`` and ``body_child`` values joined by
    a single space. A missing value in either column is treated as an empty
    string, so every returned element is a ``str`` (a plain ``+`` on the raw
    columns would yield float NaN for such rows, which a tokenizer rejects).

    Args:
        train_file: Path to the training CSV.
        dev_file: Path to the development CSV.
        test_file: Path to the test CSV.

    Returns:
        A tuple ``(train_texts, dev_texts, test_texts)`` of lists of strings,
        one entry per CSV row, in file order.

    Raises:
        KeyError: If a file lacks the ``body_parent`` or ``body_child`` column.
    """
    def combine_texts(df):
        # Fill missing cells before concatenating so NaN does not propagate,
        # and coerce to str in case a column was parsed as numeric.
        parent = df['body_parent'].fillna('').astype(str)
        child = df['body_child'].fillna('').astype(str)
        return (parent + ' ' + child).tolist()

    train_texts, dev_texts, test_texts = (
        combine_texts(pd.read_csv(path))
        for path in (train_file, dev_file, test_file)
    )
    return train_texts, dev_texts, test_texts

# Read the three CSV splits from the working directory; each returned list
# holds one combined parent/child text per row (see load_data).
train_texts, dev_texts, test_texts = load_data('train.csv', 'dev.csv', 'test.csv')

# Tokenizer for the same 'bert-base-uncased' checkpoint the model is loaded from.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Passed to TextDataset as the fixed length each example is truncated/padded to.
max_len = 128

# Combining all datasets into one for pretraining
# NOTE(review): this puts dev and test text into the MLM training corpus —
# confirm that is intended for how downstream evaluation is done.
all_texts = train_texts + dev_texts + test_texts
text_dataset = TextDataset(all_texts, tokenizer, max_len)

# Batch collator for masked language modelling (mlm=True); mlm_probability is
# the fraction of input tokens selected for masking in each batch.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)

# Training configuration: 3 epochs, batch size 8 per device, a checkpoint
# saved per epoch under ./pretrained_bert (existing contents may be
# overwritten), logs under ./logs.
train_args = TrainingArguments(
    output_dir='./pretrained_bert',
    overwrite_output_dir=True,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    save_strategy="epoch",
    logging_dir='./logs',
)


# Load the 'bert-base-uncased' checkpoint into a masked-LM model and move it
# to the selected device.
model = BertForMaskedLM.from_pretrained('bert-base-uncased').to(device)

# Wire the model, training arguments, MLM collator and dataset together.
# No eval_dataset is passed, so this run only trains.
trainer = Trainer(
    model=model,
    args=train_args,
    data_collator=data_collator,
    train_dataset=text_dataset,
)

# Run training with the arguments configured in train_args.
trainer.train()

# Save the pretrained model
# The tokenizer is saved to the same directory as the model.
model.save_pretrained('./pretrained_bert')
tokenizer.save_pretrained('./pretrained_bert')
