In [90]:
!pip install --upgrade accelerate



In [91]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from transformers import TrainingArguments, Trainer

In [93]:
# Name of the pre-trained checkpoint shared by the tokenizer and the model.
model_name = 'bert-base-uncased'

# Tokenizer first, then the sequence classifier configured with num_labels=2.
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)


NameError: name 'safe_open' is not defined

In [None]:
# Load the data. Each row holds one text per column; rows with a missing value
# in any of the three columns are dropped.
TEXT_COLUMNS = ['HYPO', 'PARAPHRASES', 'MINIMAL UNITS CORPUS']
# NOTE(review): header=None treats the first line of the file as data --
# confirm that HYPO.tsv really has no header row.
hypo_data = pd.read_csv('HYPO.tsv', sep='\t', header=None, names=TEXT_COLUMNS).dropna()

# Hold out 20% of the rows as test, then 10% of the remainder as dev.
train, test = train_test_split(hypo_data, test_size=0.2, random_state=42)
train, dev = train_test_split(train, test_size=0.1, random_state=42)


def build_texts_and_labels(frame):
    """Flatten one split into parallel lists of texts and binary labels.

    The columns are concatenated in the order given by TEXT_COLUMNS. Texts
    taken from the 'HYPO' column are labelled 1; texts from the other two
    columns are labelled 0.

    Args:
        frame: DataFrame containing the three TEXT_COLUMNS.

    Returns:
        (texts, labels): two lists of equal length.
    """
    texts, labels = [], []
    for column in TEXT_COLUMNS:
        column_texts = frame[column].tolist()
        texts.extend(column_texts)
        labels.extend([1 if column == 'HYPO' else 0] * len(column_texts))
    return texts, labels


train_texts, train_labels = build_texts_and_labels(train)
dev_texts, dev_labels = build_texts_and_labels(dev)
test_texts, test_labels = build_texts_and_labels(test)

# Texts and labels are built positionally, so they must stay the same length.
for split_texts, split_labels in (
    (train_texts, train_labels),
    (dev_texts, dev_labels),
    (test_texts, test_labels),
):
    assert len(split_texts) == len(split_labels)

print(len(train_texts), len(train_labels))

# Tokenize and encode data (each split is padded to its own longest sequence).
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
dev_encodings = tokenizer(dev_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)



In [None]:
def _as_tensor_dataset(encodings, labels):
    """Wrap one split as a TensorDataset of (input_ids, attention_mask, labels)."""
    ids = torch.tensor(encodings['input_ids'])
    mask = torch.tensor(encodings['attention_mask'])
    targets = torch.tensor(labels)
    return torch.utils.data.TensorDataset(ids, mask, targets)


# One dataset per split, built from the tokenizer output and the label lists.
train_dataset = _as_tensor_dataset(train_encodings, train_labels)
dev_dataset = _as_tensor_dataset(dev_encodings, dev_labels)
test_dataset = _as_tensor_dataset(test_encodings, test_labels)

In [None]:
# Fine-tune the model


def collate_tensor_tuples(batch):
    """Collate TensorDataset examples into the keyword batch the model expects.

    The datasets in this notebook yield plain (input_ids, attention_mask, labels)
    tuples, whereas Trainer's default collator requires mapping-like examples.
    This collator stacks each field across the batch and names it.

    Args:
        batch: list of (input_ids, attention_mask, labels) tensor tuples.

    Returns:
        dict with 'input_ids', 'attention_mask' and 'labels' batch tensors.
    """
    input_ids, attention_mask, labels = (torch.stack(field) for field in zip(*batch))
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}


training_args = TrainingArguments(output_dir='./results', num_train_epochs=3)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    data_collator=collate_tensor_tuples,
)

trainer.train()

In [None]:
# Load a fresh pre-trained BERT classifier; this rebinds `model`, discarding
# whatever an earlier cell assigned to that name.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Freeze BERT layers so that only the classification head receives updates.
for param in model.base_model.parameters():
    param.requires_grad = False

# Add a freshly initialised linear classification layer on top
# (hidden_size is the 768 that was previously hard-coded for bert-base).
classifier = torch.nn.Linear(model.config.hidden_size, 2)
model.classifier = classifier

# Define the optimizer LAST. It must be created after the head has been swapped
# in: an optimizer built earlier keeps references to the old head's parameters
# and would never update the new `classifier`, so nothing would be trained.
# Only parameters that still require gradients are handed over.
# NOTE(review): `AdamW` here is the transformers implementation imported at the
# top of the notebook; upstream documents torch.optim.AdamW as its replacement.
trainable_params = [param for param in model.parameters() if param.requires_grad]
optimizer = AdamW(trainable_params, lr=1e-5)

In [None]:
# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Move the model onto the chosen device.
model.to(device)

In [None]:
# Mini-batch iterators over the tensor datasets built earlier in the notebook.
# NOTE(review): no loaders are defined in the visible notebook; the batch size
# below is an assumption -- adjust to the available memory.
BATCH_SIZE = 16
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE)


def train_one_epoch(model, loader, optimizer, device):
    """Run one optimisation pass over `loader`.

    Each batch is an (input_ids, attention_mask, labels) tuple; the loss is the
    one the model returns when it is called with `labels`.
    """
    model.train()
    for input_ids, attention_mask, labels in loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        outputs.loss.backward()
        optimizer.step()


def evaluate(model, loader, device):
    """Return (mean per-batch loss, accuracy) of `model` over `loader`.

    Gradients are disabled and the model is put in eval mode. Accuracy is the
    fraction of examples whose arg-max logit equals the label.
    """
    model.eval()
    loss_sum = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for input_ids, attention_mask, labels in loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss_sum += outputs.loss.item()
            predicted = outputs.logits.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return loss_sum / len(loader), correct / total


epoch_progress_bar = tqdm(range(3), desc="Epochs")

for epoch in epoch_progress_bar:
    train_one_epoch(model, train_loader, optimizer, device)

    # Evaluate model on the dev split after every epoch.
    val_loss, accuracy = evaluate(model, val_loader, device)
    print(f'Epoch {epoch+1}, Validation Loss: {val_loss}, Validation Accuracy: {accuracy}')