In [1]:
!pip install torch pandas transformers scikit-learn tqdm

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import f1_score, accuracy_score


class TextDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.bert_model = BertModel.from_pretrained('bert-base-uncased')
        # Move the model to GPU as soon as it is loaded
        self.bert_model.to('cuda')
        self.bert_model.eval()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx]['headline']
        label = self.data.iloc[idx]['clickbait']
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        # Directly process and move tensors to GPU
        inputs = {key: value.to('cuda') for key, value in inputs.items()}
        with torch.no_grad():
            outputs = self.bert_model(**inputs)
        embeddings = outputs.last_hidden_state[:, 0, :]
        # Convert label to tensor and move to GPU
        return embeddings.squeeze(), torch.tensor(label).to('cuda')


In [3]:
class LSTMClassifier(torch.nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim):
        super(LSTMClassifier, self).__init__()
        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        _, (hn, _) = self.lstm(x.unsqueeze(1))  # unsqueeze to add a fake sequence length dimension
        return self.fc(hn.squeeze())


In [10]:
def evaluate(model, dataloader, criterion):
    model.eval()
    total_loss = 0
    true_labels = []
    pred_labels = []
    with torch.no_grad():
        for embeddings, labels in dataloader:
            embeddings, labels = embeddings.to('cuda'), labels.to('cuda')
            outputs = model(embeddings)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            true_labels.extend(labels.cpu().tolist())
            pred_labels.extend(predicted.cpu().tolist())

    avg_loss = total_loss / len(dataloader)
    f1 = f1_score(true_labels, pred_labels, average='weighted')
    acc = accuracy_score(true_labels, pred_labels)
    return avg_loss, f1, acc

def train(model, train_dataloader, val_dataloader, test_dataloader, epochs):
    print("Starting training...")
    model = model.to('cuda')
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters())

    best_val_acc = 0.0
    best_model_path = 'best_model.pth'  # Define path to save the best model

    for epoch in tqdm(range(epochs)):
        model.train()
        true_labels = []
        pred_labels = []

        for embeddings, labels in train_dataloader:
            embeddings, labels = embeddings.to('cuda'), labels.to('cuda')

            optimizer.zero_grad()
            outputs = model(embeddings)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            _, predicted = torch.max(outputs, 1)
            true_labels.extend(labels.cpu().tolist())
            pred_labels.extend(predicted.cpu().tolist())

        train_f1 = f1_score(true_labels, pred_labels, average='weighted')
        train_acc = accuracy_score(true_labels, pred_labels)
        val_loss, val_f1, val_acc = evaluate(model, val_dataloader, criterion)

        print(f"Epoch {epoch+1}, Training Loss: {loss.item()}, Training F1: {train_f1:.4f}, Training Acc: {train_acc:.4f}")
        print(f"Validation Loss: {val_loss:.4f}, Validation F1: {val_f1:.4f}, Validation Acc: {val_acc:.4f}")

        # Save the best model
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), best_model_path)
            print("Saved best model")

    # Load the best model for testing
    model.load_state_dict(torch.load(best_model_path))
    test_loss, test_f1, test_acc = evaluate(model, test_dataloader, criterion)
    print(f"Test Loss: {test_loss:.4f}, Test F1: {test_f1:.4f}, Test Acc: {test_acc:.4f}")

# Example usage:
# train(model, train_loader, val_loader, test_loader, epochs=10)


In [11]:
from sklearn.model_selection import train_test_split
data = pd.read_csv('clickbait_data.csv')
# Split the data into training and remaining data
train_data, remaining_data = train_test_split(data, test_size=0.4, random_state=42)

# Split the remaining data into validation and test data
val_data, test_data = train_test_split(remaining_data, test_size=0.5, random_state=42)

model = LSTMClassifier(embedding_dim=768, hidden_dim=128, output_dim=2).to('cuda')
train_dataloader = DataLoader(TextDataset(train_data), batch_size=16, shuffle=True)
val_dataloader = DataLoader(TextDataset(val_data), batch_size=16, shuffle=True)
test_dataloader = DataLoader(TextDataset(test_data), batch_size=16, shuffle=True)


# Train the model
train(model, train_dataloader, val_dataloader, test_dataloader, epochs=10)


Starting training...


 10%|█         | 1/10 [04:40<42:07, 280.84s/it]

Epoch 1, Training Loss: 0.0028090483974665403, Training F1: 0.9706, Training Acc: 0.9706
Validation Loss: 0.0675, Validation F1: 0.9769, Validation Acc: 0.9769
Saved best model


 20%|██        | 2/10 [09:16<37:01, 277.70s/it]

Epoch 2, Training Loss: 0.007919272407889366, Training F1: 0.9798, Training Acc: 0.9798
Validation Loss: 0.0585, Validation F1: 0.9811, Validation Acc: 0.9811
Saved best model


 30%|███       | 3/10 [13:52<32:18, 276.88s/it]

Epoch 3, Training Loss: 0.013588412664830685, Training F1: 0.9833, Training Acc: 0.9833
Validation Loss: 0.0574, Validation F1: 0.9809, Validation Acc: 0.9809


 40%|████      | 4/10 [18:28<27:39, 276.53s/it]

Epoch 4, Training Loss: 0.0008541347342543304, Training F1: 0.9876, Training Acc: 0.9876
Validation Loss: 0.0610, Validation F1: 0.9828, Validation Acc: 0.9828
Saved best model


 50%|█████     | 5/10 [23:03<23:00, 276.09s/it]

Epoch 5, Training Loss: 0.012171884067356586, Training F1: 0.9915, Training Acc: 0.9915
Validation Loss: 0.0653, Validation F1: 0.9791, Validation Acc: 0.9791


 60%|██████    | 6/10 [27:38<18:22, 275.68s/it]

Epoch 6, Training Loss: 0.0003630506689660251, Training F1: 0.9931, Training Acc: 0.9931
Validation Loss: 0.0609, Validation F1: 0.9844, Validation Acc: 0.9844
Saved best model


 70%|███████   | 7/10 [32:12<13:45, 275.19s/it]

Epoch 7, Training Loss: 0.002676372416317463, Training F1: 0.9956, Training Acc: 0.9956
Validation Loss: 0.0655, Validation F1: 0.9825, Validation Acc: 0.9825


 80%|████████  | 8/10 [36:47<09:10, 275.11s/it]

Epoch 8, Training Loss: 0.04988257586956024, Training F1: 0.9964, Training Acc: 0.9964
Validation Loss: 0.0920, Validation F1: 0.9775, Validation Acc: 0.9775


 90%|█████████ | 9/10 [41:24<04:35, 275.58s/it]

Epoch 9, Training Loss: 0.0008059106767177582, Training F1: 0.9968, Training Acc: 0.9968
Validation Loss: 0.0740, Validation F1: 0.9830, Validation Acc: 0.9830


100%|██████████| 10/10 [45:58<00:00, 275.84s/it]

Epoch 10, Training Loss: 0.0018734449986368418, Training F1: 0.9995, Training Acc: 0.9995
Validation Loss: 0.0867, Validation F1: 0.9819, Validation Acc: 0.9819





Test Loss: 0.0544, Test F1: 0.9819, Test Acc: 0.9819
