In [4]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import torch.nn as nn


In [5]:
# BERT sequence classifier specialised for the hyperbole / non-hyperbole task
class BertForHyperboleClassification(BertForSequenceClassification):
    """`BertForSequenceClassification` with an explicit two-logit output head.

    The head is a single linear layer mapping `config.hidden_size` features
    to 2 outputs. The notebook labels hyperbolic sentences 1 and all other
    sentences 0, so logit index 1 corresponds to "hyperbole".
    """

    def __init__(self, config):
        super().__init__(config)
        # Re-assign the classification head after the parent constructor has
        # run, fixing its output width at 2 regardless of the config.
        encoder_width = config.hidden_size
        self.classifier = nn.Linear(in_features=encoder_width, out_features=2)

# Checkpoint shared by the classifier and its tokenizer
model_name = 'bert-base-uncased'
# The checkpoint carries no weights for the classification head, so
# `classifier.weight` / `classifier.bias` start out freshly initialised
# (hence the warning in the cell output) and must be fine-tuned before use.
model = BertForHyperboleClassification.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)



Some weights of BertForHyperboleClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Read the tab-separated corpus (three text columns, no header row expected)
# and keep only rows in which all three columns are present.
column_names = ['HYPO', 'PARAPHRASES', 'MINIMAL UNITS CORPUS']
hypo_data = pd.read_csv('/content/HYPO.tsv', sep='\t', header=None, names=column_names).dropna()

# Split on rows, so the three sentences of one row always land in the same
# split: 20% of the rows go to test, then 10% of the remainder to dev.
train, test = train_test_split(hypo_data, test_size=0.2, random_state=42)
train, dev = train_test_split(train, test_size=0.1, random_state=42)


def _texts_and_labels(split):
    """Flatten one split into parallel lists of sentences and 0/1 labels.

    Sentences from the 'HYPO' column come first and are labelled 1; they are
    followed by the 'PARAPHRASES' and 'MINIMAL UNITS CORPUS' sentences, all
    labelled 0.
    """
    positives = split['HYPO'].tolist()
    negatives = split['PARAPHRASES'].tolist() + split['MINIMAL UNITS CORPUS'].tolist()
    return positives + negatives, [1] * len(positives) + [0] * len(negatives)


train_texts, train_labels = _texts_and_labels(train)
dev_texts, dev_labels = _texts_and_labels(dev)
test_texts, test_labels = _texts_and_labels(test)

# Sanity check: there must be exactly one label per sentence.
print(len(train_texts), len(train_labels))

# No per-sentence NaN filtering is done here: removing individual texts would
# leave the label lists longer than the text lists. Incomplete rows were
# already removed by dropna() right after loading, so the counts are unchanged.
print(len(train_texts), len(train_labels))

# Tokenize every split; truncation/padding are applied per split.
train_encodings, dev_encodings, test_encodings = (
    tokenizer(texts, truncation=True, padding=True)
    for texts in (train_texts, dev_texts, test_texts)
)



1509 1509
1509 1509


In [8]:
def _as_tensor_dataset(encodings, labels):
    """Bundle token ids, attention mask and labels into one TensorDataset.

    Each item of the resulting dataset is the triple
    (input_ids, attention_mask, label), in that order.
    """
    columns = (encodings['input_ids'], encodings['attention_mask'], labels)
    return TensorDataset(*(torch.tensor(column) for column in columns))


train_dataset = _as_tensor_dataset(train_encodings, train_labels)
dev_dataset = _as_tensor_dataset(dev_encodings, dev_labels)
test_dataset = _as_tensor_dataset(test_encodings, test_labels)

print(train_dataset)

<torch.utils.data.dataset.TensorDataset object at 0x7c3e41d7dae0>


In [9]:
# Seed the global torch RNG so that batch shuffling and dropout draw from a
# fixed random stream when this cell and the training loop are re-run.
# (The classifier head was initialised when the model was loaded, so its
# starting weights are not covered by this seed.)
torch.manual_seed(42)

# Optimizer: use PyTorch's own AdamW rather than the deprecated
# `transformers.AdamW`. `eps` and `weight_decay` are pinned to the values the
# deprecated class used by default (1e-6 and 0.0), because torch's defaults
# differ (1e-8 and 0.01) and would silently change the training recipe.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Data loaders: only the training set is shuffled; dev and test keep their
# order so predictions can be lined up with the source sentences.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)

# Number of full passes over the training set
num_epochs = 3

# Fine-tune the model, reporting the mean train and dev loss after each epoch
for epoch in range(num_epochs):
    # ---- optimisation pass over the training set ----
    model.train()
    train_batch_losses = []
    for input_ids, attention_mask, labels in tqdm(train_loader, desc=f'Epoch {epoch + 1}'):
        optimizer.zero_grad()
        # Passing `labels` makes the model return the classification loss.
        loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
        train_batch_losses.append(loss.item())
        loss.backward()
        optimizer.step()
    total_loss = sum(train_batch_losses)
    avg_train_loss = total_loss / len(train_loader)
    print(f'Average training loss for Epoch {epoch + 1}: {avg_train_loss}')

    # ---- loss on the dev set, with gradients disabled ----
    model.eval()
    dev_batch_losses = []
    with torch.no_grad():
        for input_ids, attention_mask, labels in tqdm(dev_loader, desc=f'Evaluation Epoch {epoch + 1}'):
            loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
            dev_batch_losses.append(loss.item())
    total_eval_loss = sum(dev_batch_losses)
    avg_eval_loss = total_eval_loss / len(dev_loader)
    print(f'Average evaluation loss for Epoch {epoch + 1}: {avg_eval_loss}')

# Score the held-out test split
model.eval()
predictions, true_labels = [], []
with torch.no_grad():
    for input_ids, attention_mask, labels in tqdm(test_loader, desc='Test Evaluation'):
        # No labels are passed here: only the logits are needed.
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        # Predicted class = index of the larger of the two logits.
        predictions += logits.argmax(dim=1).tolist()
        true_labels += labels.tolist()

# Accuracy = share of test sentences whose predicted class equals the label
correct = sum(1 for predicted, expected in zip(predictions, true_labels) if predicted == expected)
accuracy = correct / len(true_labels)
print(f'\nTest accuracy: {accuracy}')

Epoch 1: 100%|██████████| 189/189 [11:43<00:00,  3.72s/it]


Average training loss for Epoch 1: 0.6080636905614661


Evaluation Epoch 1: 100%|██████████| 21/21 [00:13<00:00,  1.55it/s]


Average evaluation loss for Epoch 1: 0.4196370272409348


Epoch 2: 100%|██████████| 189/189 [11:33<00:00,  3.67s/it]


Average training loss for Epoch 2: 0.5250578620840632


Evaluation Epoch 2: 100%|██████████| 21/21 [00:14<00:00,  1.49it/s]


Average evaluation loss for Epoch 2: 0.6188210774035681


Epoch 3: 100%|██████████| 189/189 [11:38<00:00,  3.70s/it]


Average training loss for Epoch 3: 0.43948854308910473


Evaluation Epoch 3: 100%|██████████| 21/21 [00:13<00:00,  1.53it/s]


Average evaluation loss for Epoch 3: 0.4621313705685593


Test Evaluation: 100%|██████████| 53/53 [00:43<00:00,  1.21it/s]


Test accuracy: 0.7714285714285715





In [12]:
# Text to classify
text = "I feel like a million bucks!"

# Switch to inference mode explicitly instead of relying on an earlier cell
# having done so: with dropout still active the probability below would
# change from run to run.
model.eval()

# Tokenize the text and place the tensors on the same device as the model,
# so the cell keeps working if the model has been moved off the CPU.
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(model.device)

# Inference (no gradients needed)
with torch.no_grad():
    outputs = model(**inputs)

# Convert the two logits into class probabilities
probs = torch.softmax(outputs.logits, dim=1)

# Probability of the positive class (index 1 = hyperbole)
hyperbole_probability = probs[:, 1].item()

# Decision rule: call the text hyperbolic when the probability reaches the threshold
threshold = 0.5
is_hyperbole = hyperbole_probability >= threshold

print(f"Probability of hyperbole: {hyperbole_probability:.2f}")
print(f"Is hyperbole? {'Yes' if is_hyperbole else 'No'}")


Probability of hyperbole: 0.91
Is hyperbole? Yes
