In [23]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm


In [24]:
# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Load the data
hypo_data = pd.read_csv('/HYPO.tsv', sep='\t', header=None, names=['HYPO', 'PARAPHRASES', 'MINIMAL UNITS CORPUS'])
# hypo_data['HYPO'].dropna(inplace=False)
# hypo_data['PARAPHRASES'].dropna(inplace=False)
# hypo_data['MINIMAL UNITS CORPUS'].dropna(inplace=False)
hypo_data.dropna(inplace=True)

train, test = train_test_split(hypo_data, test_size=0.2, random_state=42)
train, dev = train_test_split(train, test_size=0.1, random_state=42)

train_texts = train['HYPO'].tolist() + train['PARAPHRASES'].tolist() + train['MINIMAL UNITS CORPUS'].tolist()
train_labels = [1] * len(train['HYPO']) + [0] * (len(train['PARAPHRASES']) + len(train['MINIMAL UNITS CORPUS']))

dev_texts = dev['HYPO'].tolist() + dev['PARAPHRASES'].tolist() + dev['MINIMAL UNITS CORPUS'].tolist()
dev_labels = [1] * len(dev['HYPO']) + [0] * (len(dev['PARAPHRASES']) + len(dev['MINIMAL UNITS CORPUS']))

test_texts = test['HYPO'].tolist() + test['PARAPHRASES'].tolist() + test['MINIMAL UNITS CORPUS'].tolist()
test_labels = [1] * len(test['HYPO']) + [0] * (len(test['PARAPHRASES']) + len(test['MINIMAL UNITS CORPUS']))

print(len(train_texts), len(train_labels))

# removing Nan values
# train_texts = [text for text in train_texts if isinstance(text, str)]
# dev_texts = [text for text in dev_texts if isinstance(text, str)]
# test_texts = [text for text in test_texts if isinstance(text, str)]

print(len(train_texts), len(train_labels))

# Tokenize and encode data
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
dev_encodings = tokenizer(dev_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)



1509 1509
1509 1509


In [32]:
train_dataset = torch.utils.data.TensorDataset(torch.tensor(train_encodings['input_ids']),
                                               torch.tensor(train_encodings['attention_mask']),
                                               torch.tensor(train_labels))

dev_dataset = torch.utils.data.TensorDataset(torch.tensor(dev_encodings['input_ids']),
                                             torch.tensor(dev_encodings['attention_mask']),
                                             torch.tensor(dev_labels))

test_dataset = torch.utils.data.TensorDataset(torch.tensor(test_encodings['input_ids']),
                                              torch.tensor(test_encodings['attention_mask']),
                                              torch.tensor(test_labels))

print(train_dataset)

<torch.utils.data.dataset.TensorDataset object at 0x7a81502e66e0>


In [34]:
pip install accelerate -U



In [36]:
# Define optimizer and learning rate
optimizer = AdamW(model.parameters(), lr=5e-5)

# Define data loaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)

# Define training epochs
num_epochs = 3

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}'):
        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_train_loss = total_loss / len(train_loader)
    print(f'Average training loss for Epoch {epoch + 1}: {avg_train_loss}')

    # Evaluation loop
    model.eval()
    total_eval_loss = 0
    for batch in tqdm(dev_loader, desc=f'Evaluation Epoch {epoch + 1}'):
        input_ids, attention_mask, labels = batch
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_eval_loss += loss.item()
    avg_eval_loss = total_eval_loss / len(dev_loader)
    print(f'Average evaluation loss for Epoch {epoch + 1}: {avg_eval_loss}')

# Evaluate on the test dataset
model.eval()
predictions = []
true_labels = []
for batch in tqdm(test_loader, desc='Test Evaluation'):
    input_ids, attention_mask, labels = batch
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
    predictions.extend(torch.argmax(logits, dim=1).tolist())
    true_labels.extend(labels.tolist())

# Calculate accuracy
correct = sum(p == t for p, t in zip(predictions, true_labels))
accuracy = correct / len(true_labels)
print(f'\nTest accuracy: {accuracy}')

Epoch 1: 100%|██████████| 189/189 [11:03<00:00,  3.51s/it]


Average training loss for Epoch 1: 0.5841232868571761


Evaluation Epoch 1: 100%|██████████| 21/21 [00:14<00:00,  1.45it/s]


Average evaluation loss for Epoch 1: 0.5290474948428926


Epoch 2: 100%|██████████| 189/189 [10:48<00:00,  3.43s/it]


Average training loss for Epoch 2: 0.3923428462295936


Evaluation Epoch 2: 100%|██████████| 21/21 [00:13<00:00,  1.61it/s]


Average evaluation loss for Epoch 2: 0.41362407839014415


Epoch 3: 100%|██████████| 189/189 [10:46<00:00,  3.42s/it]


Average training loss for Epoch 3: 0.24434491427290062


Evaluation Epoch 3: 100%|██████████| 21/21 [00:13<00:00,  1.56it/s]


Average evaluation loss for Epoch 3: 0.47142694358314785


Test Evaluation: 100%|██████████| 53/53 [00:45<00:00,  1.17it/s]

Test accuracy: 0.7833333333333333



