In [1]:
import torch
import io
import pandas as pd
from transformers import LongformerTokenizer
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from transformers import LongformerForSequenceClassification, AdamW, BertConfig, get_scheduler
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from tabulate import tabulate

# Show whether PyTorch can see a CUDA device; as the cell's last expression,
# the boolean result is displayed as the cell output.
torch.cuda.is_available()

True

In [2]:
!pip install transformers



In [5]:
# Read the preprocessed dataset and coerce the two model-facing columns in one
# step: "text" to str and "label" to int.
df = pd.read_csv("drive/MyDrive/preprocessed_sub_and_body.csv").astype(
    {"text": str, "label": int}
)
df.shape

(11448, 3)

In [None]:
# Summary statistics of the "length" column.
length_column = df["length"]
average_length, max_length, min_length = (
    length_column.mean(),
    length_column.max(),
    length_column.min(),
)

for stat_name, stat_value in (
    ("Average", average_length),
    ("Max", max_length),
    ("Min", min_length),
):
    print(f"{stat_name} length: {stat_value}")

In [None]:

# Split into train / validation / test (64% / 16% / 20% of the data).
# Both splits are stratified on the label so every subset keeps the class
# balance of the full dataset, and both use a fixed random_state so the same
# rows land in the same subset on every run (otherwise the test set changes
# between runs and reported metrics are not comparable).
X_train_val, X_test, y_train_val, y_test = train_test_split(
    df["text"],
    df["label"],
    stratify=df["label"],
    test_size=0.2,
    random_state=42,
)

# Carve the validation set out of the remaining 80%.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    stratify=y_train_val,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Tokenization: load the pretrained tokenizer for the
# "allenai/longformer-base-4096" checkpoint. It is used by encode_data below.
tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")

In [None]:
def encode_data(input_text, max_length=1536):
    """Tokenize a collection of texts with the notebook-level ``tokenizer``.

    Args:
        input_text: Iterable of strings (e.g. a list or a pandas Series).
        max_length: Maximum number of tokens per text, special tokens
            included. Longer texts are truncated. Defaults to 1536.

    Returns:
        Tuple ``(input_ids, attention_mask)`` of PyTorch tensors. Sequences
        are padded to the longest sequence in ``input_text``, so the second
        dimension can differ between calls.
    """
    # Calling the tokenizer directly is the supported API; batch_encode_plus
    # is deprecated in its favour. list() lets callers pass any iterable.
    tokens = tokenizer(
        list(input_text),
        add_special_tokens=True,
        truncation=True,
        max_length=max_length,
        padding=True,
        return_attention_mask=True,
        return_tensors="pt",
    )
    return tokens["input_ids"], tokens["attention_mask"]


# Tokenize the training and validation texts.
train_input_ids, train_attention_masks = encode_data(list(X_train))
val_input_ids, val_attention_masks = encode_data(list(X_val))

# Convert labels to PyTorch tensors. Class indices must be integer (long)
# tensors for CrossEntropyLoss; using torch.long here also matches how the
# test labels are built, instead of storing floats and casting every batch.
train_labels = torch.tensor(y_train.values, dtype=torch.long)
val_labels = torch.tensor(y_val.values, dtype=torch.long)

# Each dataset item is (input_ids, attention_mask, label).
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)

In [None]:
# Training hyperparameters.
batch_size, num_epochs, lr = 16, 4, 3e-5

# Training batches are reshuffled every epoch; validation keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validation_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Pretrained Longformer with a 2-way sequence-classification head.
bert_model = LongformerForSequenceClassification.from_pretrained(
    "allenai/longformer-base-4096",
    num_labels=2,
)

# transformers.AdamW is deprecated in favour of torch.optim.AdamW.
# eps and weight_decay are set explicitly to the values the transformers
# implementation used by default, so the optimisation hyperparameters are
# unchanged (torch's own defaults are eps=1e-8, weight_decay=1e-2).
optimizer = torch.optim.AdamW(
    bert_model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0
)
loss_fn = torch.nn.CrossEntropyLoss()

# One scheduler step per optimizer step. num_epochs comes from the
# hyperparameter cell; it is deliberately not re-assigned here so the two
# values cannot drift apart.
num_training_steps = len(train_loader) * num_epochs

# Create the learning rate scheduler: linear decay to zero, no warmup.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model parameters to the selected device.
bert_model.to(device)

In [None]:
# Fine-tune the model and evaluate on the validation set after every epoch.
progress_bar = tqdm(range(num_training_steps))  # one tick per optimizer step

# Per-epoch metric history (one entry appended per completed epoch).
train_losses = []
val_losses = []
val_accuracies = []
val_precisions = []
val_recalls = []
val_f1_scores = []
val_roc_aucs = []

for epoch in range(num_epochs):
    print(" Epoch {:} / {:}".format(epoch + 1, num_epochs))

    # ---------------- training ----------------
    # reset the accumulated loss for this epoch
    total_train_loss = 0

    # put the model into training mode
    bert_model.train()

    # Loop structure follows:
    # https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
    for batch in train_loader:
        # send tensors to the selected device
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

        # CrossEntropyLoss expects class indices as a LongTensor
        b_labels = b_labels.long()

        # gradients accumulate across backward() calls, so clear them first
        # https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch
        optimizer.zero_grad()

        # forward pass
        outputs = bert_model(b_input_ids, attention_mask=b_input_mask)
        loss = loss_fn(outputs.logits, b_labels)

        total_train_loss += loss.item()  # accumulate loss over all batches

        # backward pass, parameter update, learning-rate update
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        progress_bar.update(1)

    # average training loss over the batches of this epoch
    epoch_train_loss = total_train_loss / len(train_loader)
    train_losses.append(epoch_train_loss)
    print(f"\nTraining epoch {epoch + 1} loss: ", epoch_train_loss)

    # ---------------- validation ----------------
    total_val_loss = 0

    # put the model in evaluation mode
    bert_model.eval()
    all_val_preds = []
    all_val_probs = []  # predicted probability of class 1, used for ROC-AUC
    all_val_labels = []

    # no gradients are needed for validation (forward pass and loss included)
    with torch.no_grad():
        for batch in validation_loader:
            b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
            b_labels = b_labels.long()

            logits = bert_model(b_input_ids, attention_mask=b_input_mask).logits
            total_val_loss += loss_fn(logits, b_labels).item()

            # hard predictions for accuracy/precision/recall/F1,
            # positive-class probabilities for ROC-AUC
            preds = torch.argmax(logits, dim=-1)
            probs = torch.softmax(logits, dim=-1)[:, 1]

            all_val_preds.extend(preds.cpu().numpy())
            all_val_probs.extend(probs.cpu().numpy())
            all_val_labels.extend(b_labels.cpu().numpy())

    epoch_val_loss = total_val_loss / len(validation_loader)
    accuracy = accuracy_score(all_val_labels, all_val_preds)
    precision = precision_score(all_val_labels, all_val_preds, average="binary")
    recall = recall_score(all_val_labels, all_val_preds, average="binary")
    f1 = f1_score(all_val_labels, all_val_preds, average="binary")
    # ROC-AUC must be computed from continuous scores; feeding it 0/1
    # predictions collapses the curve to a single operating point.
    roc_auc = roc_auc_score(all_val_labels, all_val_probs)

    print(f"\nValidation epoch {epoch + 1} loss: {epoch_val_loss}")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
    print(f"ROC-AUC: {roc_auc}")

    val_losses.append(epoch_val_loss)
    val_accuracies.append(accuracy)
    val_precisions.append(precision)
    val_recalls.append(recall)
    val_f1_scores.append(f1)
    val_roc_aucs.append(roc_auc)

    # Early stopping (disabled). To enable it, initialise best_val_loss,
    # best_f1, trigger_times and patience before the epoch loop.
    # if epoch_val_loss < best_val_loss or f1 > best_f1:
    #     best_val_loss = epoch_val_loss
    #     best_f1 = f1
    #     trigger_times = 0
    #     # Save the best model weights
    #     torch.save(bert_model.state_dict(), 'best_model.pt')
    # else:
    #     trigger_times += 1
    #     if trigger_times >= patience:
    #         print("Early stopping!")
    #         bert_model.load_state_dict(torch.load('best_model.pt'))
    #         break

print("")
print("Training complete!")

In [None]:
# Destination files for the fine-tuned weights and the optimizer state.
model_save_path = "drive/MyDrive/bert_lonformer_model.pt"
optimizer_save_path = "drive/MyDrive/longformer_optimizer.pt"

# Write the model state dict first, then the optimizer state dict.
for state_dict, save_path in (
    (bert_model.state_dict(), model_save_path),
    (optimizer.state_dict(), optimizer_save_path),
):
    torch.save(state_dict, save_path)

In [None]:
# !pip install tabulate

In [None]:
# Display the per-epoch metrics in tabular format.
table_data = [
    [
        "Epoch",
        "Train Loss",
        "Val Loss",
        "Val Acc",
        "Val Precision",
        "Val Recall",
        "Val F1",
        "Val ROC-AUC",
    ],
]

# Build one row per recorded epoch. Iterating over the metric histories
# themselves (instead of range(num_epochs)) keeps this cell working when
# training ended early or num_epochs was changed after training; zip stops at
# the last epoch for which every metric was recorded.
epoch_history = zip(
    train_losses,
    val_losses,
    val_accuracies,
    val_precisions,
    val_recalls,
    val_f1_scores,
    val_roc_aucs,
)
for epoch_number, epoch_metrics in enumerate(epoch_history, start=1):
    table_data.append([epoch_number, *epoch_metrics])

print(tabulate(table_data, headers="firstrow", floatfmt=".4f"))

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation loss per epoch, drawn with the explicit Axes API.
epoch_axis = range(1, num_epochs + 1)

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(epoch_axis, train_losses, label='Training Loss')
ax.plot(epoch_axis, val_losses, label='Validation Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Loss')
ax.legend()
plt.show()


# Evaluation on the test data

In [None]:
# Tokenize the held-out test texts with the same encoder used for train/val.
test_input_ids, test_attention_masks = encode_data(X_test.tolist())

# Integer class labels as a long tensor.
test_labels = torch.tensor(y_test.to_numpy(), dtype=torch.long)

# Bundle (input_ids, attention_mask, label) and iterate in dataset order.
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


def evaluate_model(model, dataloader, device, loss_fn=None):
    """Evaluate a binary sequence classifier on a labelled dataloader.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits`` with two columns (class 0, class 1).
        dataloader: Iterable yielding ``(input_ids, attention_mask, labels)``
            tensor batches.
        device: Device the batches are moved to before the forward pass.
        loss_fn: Loss taking ``(logits, labels)``. Defaults to
            ``torch.nn.CrossEntropyLoss()`` when omitted.

    Returns:
        Tuple ``(avg_loss, accuracy, precision, recall, f1, roc_auc)``, where
        ``avg_loss`` is the mean of the per-batch losses.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    if loss_fn is None:
        loss_fn = torch.nn.CrossEntropyLoss()

    model.eval()
    total_test_loss = 0.0
    num_batches = 0
    all_preds = []
    all_probs = []  # predicted probability of class 1, used for ROC-AUC
    all_labels = []

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in dataloader:
            b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
            b_labels = b_labels.long()

            # Use the model passed in, not a notebook-level global.
            logits = model(b_input_ids, attention_mask=b_input_mask).logits

            total_test_loss += loss_fn(logits, b_labels).item()
            num_batches += 1

            preds = torch.argmax(logits, dim=-1)
            probs = torch.softmax(logits, dim=-1)[:, 1]

            all_preds.extend(preds.cpu().numpy())
            all_probs.extend(probs.cpu().numpy())
            all_labels.extend(b_labels.cpu().numpy())

    if num_batches == 0:
        raise ValueError("dataloader yielded no batches; nothing to evaluate")

    avg_loss = total_test_loss / num_batches
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    # ROC-AUC needs continuous scores; hard 0/1 predictions would reduce the
    # curve to a single operating point.
    roc_auc = roc_auc_score(all_labels, all_probs)

    return avg_loss, accuracy, precision, recall, f1, roc_auc


# Run the final evaluation on the held-out test set and unpack the metrics.
test_metrics = evaluate_model(bert_model, test_loader, device)
(
    test_loss,
    test_accuracy,
    test_precision,
    test_recall,
    test_f1,
    test_roc_auc,
) = test_metrics

# One metric per line, four decimal places.
print(
    f"Test loss: {test_loss:.4f}",
    f"Test Accuracy: {test_accuracy:.4f}",
    f"Test Precision: {test_precision:.4f}",
    f"Test Recall: {test_recall:.4f}",
    f"Test F1 Score: {test_f1:.4f}",
    f"Test ROC-AUC: {test_roc_auc:.4f}",
    sep="\n",
)