In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import f1_score, accuracy_score


class TextDataset(Dataset):
    """Dataset that turns each headline into a frozen-BERT embedding plus its label.

    Items are computed on the fly: the ``headline`` value is tokenized, run
    through ``bert-base-uncased`` (eval mode, no gradients), and the hidden
    state at the first token position is returned together with the
    ``clickbait`` value as a class-index tensor.

    Args:
        dataframe: pandas DataFrame with a ``headline`` column (text) and a
            ``clickbait`` column (value convertible to ``int``).
        device: Device for the BERT model and the returned tensors. ``None``
            (default) selects CUDA when available and falls back to CPU.
        max_length: Token limit passed to the tokenizer; longer inputs are
            truncated.
    """

    def __init__(self, dataframe, device=None, max_length=512):
        self.data = dataframe
        # Fall back to CPU instead of crashing on machines without a GPU.
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device)
        self.max_length = max_length
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.bert_model = BertModel.from_pretrained('bert-base-uncased')
        self.bert_model.to(self.device)
        # Inference only: eval() disables dropout so embeddings are deterministic.
        self.bert_model.eval()

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(embedding, label)`` for the row at positional index ``idx``.

        ``embedding`` is the 1-D hidden state of the first token and ``label``
        is a scalar ``torch.long`` tensor; both live on ``self.device``.
        """
        # Fetch the row once instead of doing a positional lookup per column.
        row = self.data.iloc[idx]
        inputs = self.tokenizer(
            row['headline'],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.max_length,
        )
        inputs = {key: value.to(self.device) for key, value in inputs.items()}
        with torch.no_grad():
            outputs = self.bert_model(**inputs)
        # Select the first token: (1, seq, hidden) -> (1, hidden). Only the
        # leading batch axis is squeezed so no other size-1 axis can be lost.
        embedding = outputs.last_hidden_state[:, 0, :].squeeze(0)
        # CrossEntropyLoss expects integer class indices, so pin the dtype.
        label = torch.tensor(int(row['clickbait']), dtype=torch.long, device=self.device)
        return embedding, label


In [None]:
# Mount Google Drive under /content/drive.
# NOTE(review): nothing visible in this notebook reads from that path (the CSV
# is loaded via a relative filename) — confirm whether the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
class LSTMClassifier(torch.nn.Module):
    """Single-layer LSTM followed by a linear layer, applied to one vector per sample.

    Args:
        embedding_dim: Size of each input vector.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output logits (classes).
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Map ``x`` of shape ``(batch, embedding_dim)`` to logits ``(batch, output_dim)``."""
        # Add a length-1 sequence axis: (batch, embedding_dim) -> (batch, 1, embedding_dim).
        _, (hn, _) = self.lstm(x.unsqueeze(1))
        # hn is (num_layers, batch, hidden_dim). Index the layer axis rather
        # than calling squeeze(): a bare squeeze() would also remove the batch
        # axis when the batch holds a single sample, producing 1-D logits that
        # break the loss and the argmax over dim 1.
        return self.fc(hn[-1])


In [None]:
def evaluate(model, dataloader, criterion):
    """Run ``model`` over ``dataloader`` without gradients and report metrics.

    Args:
        model: ``torch.nn.Module`` producing per-class scores of shape
            ``(batch, num_classes)``.
        dataloader: Iterable of ``(embeddings, labels)`` tensor batches.
        criterion: Callable ``criterion(outputs, labels)`` returning a scalar
            loss tensor.

    Returns:
        Tuple ``(avg_loss, f1, acc)``: the mean of the per-batch loss values,
        the weighted F1 score, and the accuracy over all samples.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.eval()
    # Follow the device the model already lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    total_loss = 0.0
    num_batches = 0  # counted while iterating so loaders without __len__ work
    true_labels = []
    pred_labels = []
    with torch.no_grad():
        for embeddings, labels in dataloader:
            embeddings, labels = embeddings.to(device), labels.to(device)
            outputs = model(embeddings)
            total_loss += criterion(outputs, labels).item()
            num_batches += 1

            # Predicted class = index of the largest score along dim 1.
            _, predicted = torch.max(outputs, 1)
            true_labels.extend(labels.cpu().tolist())
            pred_labels.extend(predicted.cpu().tolist())

    if num_batches == 0:
        # Previously this surfaced as an opaque ZeroDivisionError.
        raise ValueError("evaluate() received a dataloader that yielded no batches")

    avg_loss = total_loss / num_batches
    f1 = f1_score(true_labels, pred_labels, average='weighted')
    acc = accuracy_score(true_labels, pred_labels)
    return avg_loss, f1, acc

def train(model, train_dataloader, val_dataloader, test_dataloader, epochs):
    """Train ``model``, keep the checkpoint with the best validation accuracy, then test it.

    Uses cross-entropy loss and Adam with default hyperparameters. After every
    epoch the model is scored on ``val_dataloader``; whenever validation
    accuracy improves, the state dict is written to ``best_model.pth``. When
    training finishes, the best checkpoint is reloaded and scored on
    ``test_dataloader``. Progress and metrics are printed; nothing is returned.

    Args:
        model: ``torch.nn.Module`` mapping an embeddings batch to class scores.
        train_dataloader: Iterable of ``(embeddings, labels)`` training batches.
        val_dataloader: Iterable of validation batches, passed to ``evaluate``.
        test_dataloader: Iterable of test batches, passed to ``evaluate``.
        epochs: Number of passes over ``train_dataloader``.

    Raises:
        ValueError: If ``train_dataloader`` yields no batches in an epoch.
    """
    print("Starting training...")
    # Use the GPU when present, but do not crash on CPU-only machines.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters())

    # Start below any reachable accuracy so the first epoch always writes a
    # checkpoint; starting at 0.0 with a strict '>' could leave no file (or a
    # stale one from an earlier run) to load after training.
    best_val_acc = float('-inf')
    best_model_path = 'best_model.pth'  # Define path to save the best model
    checkpoint_saved = False

    for epoch in tqdm(range(epochs)):
        model.train()
        true_labels = []
        pred_labels = []
        running_loss = 0.0
        num_batches = 0

        for embeddings, labels in train_dataloader:
            embeddings, labels = embeddings.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(embeddings)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

            _, predicted = torch.max(outputs, 1)
            true_labels.extend(labels.cpu().tolist())
            pred_labels.extend(predicted.cpu().tolist())

        if num_batches == 0:
            raise ValueError("train() received a train_dataloader that yielded no batches")

        # Report the epoch's mean batch loss; the loss of the final batch
        # alone is a noisy and misleading summary of the epoch.
        train_loss = running_loss / num_batches
        train_f1 = f1_score(true_labels, pred_labels, average='weighted')
        train_acc = accuracy_score(true_labels, pred_labels)
        val_loss, val_f1, val_acc = evaluate(model, val_dataloader, criterion)

        print(f"Epoch {epoch+1}, Training Loss: {train_loss}, Training F1: {train_f1:.4f}, Training Acc: {train_acc:.4f}")
        print(f"Validation Loss: {val_loss:.4f}, Validation F1: {val_f1:.4f}, Validation Acc: {val_acc:.4f}")

        # Save the best model
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), best_model_path)
            checkpoint_saved = True
            print("Saved best model")

    # Load the best model for testing. Skipped when no epoch ran, so a stale
    # file left by a previous run is never picked up by accident.
    if checkpoint_saved:
        # map_location keeps the load working when the checkpoint was written
        # on a different device than the one in use now.
        model.load_state_dict(torch.load(best_model_path, map_location=device))
    test_loss, test_f1, test_acc = evaluate(model, test_dataloader, criterion)
    print(f"Test Loss: {test_loss:.4f}, Test F1: {test_f1:.4f}, Test Acc: {test_acc:.4f}")

# Example usage:
# train(model, train_loader, val_loader, test_loader, epochs=10)


In [None]:
from sklearn.model_selection import train_test_split
SEED = 42  # shared by the data splits and the torch RNG

data = pd.read_csv('clickbait_data.csv')

# Fail fast with a clear message instead of a KeyError deep inside the
# DataLoader once training has already started.
missing_columns = {'headline', 'clickbait'} - set(data.columns)
if missing_columns:
    raise KeyError(f"clickbait_data.csv is missing required columns: {sorted(missing_columns)}")

# Split the data into training (60%) and remaining data (40%)
train_data, remaining_data = train_test_split(data, test_size=0.4, random_state=SEED)

# Split the remaining data evenly into validation and test data
val_data, test_data = train_test_split(remaining_data, test_size=0.5, random_state=SEED)

# Seed torch so weight initialisation and the training shuffle order repeat
# across runs.
torch.manual_seed(SEED)

# Use the GPU when present, but keep the notebook runnable on CPU-only runtimes.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = LSTMClassifier(embedding_dim=768, hidden_dim=128, output_dim=2).to(device)

train_dataloader = DataLoader(TextDataset(train_data), batch_size=16, shuffle=True)
# Evaluation metrics do not depend on sample order, so shuffling the
# validation and test loaders only adds nondeterminism.
val_dataloader = DataLoader(TextDataset(val_data), batch_size=16, shuffle=False)
test_dataloader = DataLoader(TextDataset(test_data), batch_size=16, shuffle=False)


# Train the model
train(model, train_dataloader, val_dataloader, test_dataloader, epochs=10)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Starting training...


 10%|█         | 1/10 [04:46<43:00, 286.71s/it]

Epoch 1, Training Loss: 0.2110298126935959, Training F1: 0.9714, Training Acc: 0.9714
Validation Loss: 0.0671, Validation F1: 0.9769, Validation Acc: 0.9769
Saved best model


 20%|██        | 2/10 [09:19<37:10, 278.76s/it]

Epoch 2, Training Loss: 0.02915932610630989, Training F1: 0.9794, Training Acc: 0.9794
Validation Loss: 0.0593, Validation F1: 0.9791, Validation Acc: 0.9791
Saved best model


 30%|███       | 3/10 [13:55<32:20, 277.14s/it]

Epoch 3, Training Loss: 0.011797720566391945, Training F1: 0.9837, Training Acc: 0.9838
Validation Loss: 0.0596, Validation F1: 0.9811, Validation Acc: 0.9811
Saved best model


 40%|████      | 4/10 [18:33<27:45, 277.54s/it]

Epoch 4, Training Loss: 0.008705410175025463, Training F1: 0.9872, Training Acc: 0.9872
Validation Loss: 0.0571, Validation F1: 0.9806, Validation Acc: 0.9806


 50%|█████     | 5/10 [23:07<23:01, 276.37s/it]

Epoch 5, Training Loss: 0.0024994679260998964, Training F1: 0.9898, Training Acc: 0.9898
Validation Loss: 0.0623, Validation F1: 0.9816, Validation Acc: 0.9816
Saved best model


 60%|██████    | 6/10 [27:41<18:22, 275.61s/it]

Epoch 6, Training Loss: 0.004873306956142187, Training F1: 0.9933, Training Acc: 0.9933
Validation Loss: 0.0616, Validation F1: 0.9808, Validation Acc: 0.9808


 70%|███████   | 7/10 [32:13<13:43, 274.46s/it]

Epoch 7, Training Loss: 0.006722663063555956, Training F1: 0.9952, Training Acc: 0.9952
Validation Loss: 0.0658, Validation F1: 0.9817, Validation Acc: 0.9817
Saved best model


 80%|████████  | 8/10 [36:46<09:07, 273.76s/it]

Epoch 8, Training Loss: 0.03024662658572197, Training F1: 0.9964, Training Acc: 0.9964
Validation Loss: 0.0872, Validation F1: 0.9798, Validation Acc: 0.9798


 90%|█████████ | 9/10 [41:18<04:33, 273.23s/it]

Epoch 9, Training Loss: 0.00016002925985958427, Training F1: 0.9976, Training Acc: 0.9976
Validation Loss: 0.0845, Validation F1: 0.9812, Validation Acc: 0.9812


100%|██████████| 10/10 [45:49<00:00, 274.98s/it]

Epoch 10, Training Loss: 6.558531458722427e-05, Training F1: 0.9990, Training Acc: 0.9990
Validation Loss: 0.0864, Validation F1: 0.9833, Validation Acc: 0.9833
Saved best model





Test Loss: 0.0735, Test F1: 0.9836, Test Acc: 0.9836
