<a href="https://colab.research.google.com/github/ayoitshasya/ML-BERT-MODEL/blob/main/docbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset
from tqdm import tqdm
import numpy as np

# Directory that is scanned for the ".txt" paper files loaded below
folder_path = "final_papers"  # Replace with your folder path

# Helper function to parse a single file
def parse_file(file_path):
    """Parse one paper file into a ``(label, text)`` pair.

    The file content is split on blank lines. A section that starts with
    "Metadata:" is scanned line by line for a "Category:" entry, which
    determines the label; every other section is stripped and becomes part
    of the body text.

    Args:
        file_path: Path of a UTF-8 encoded text file.

    Returns:
        A tuple ``(label, text)``. ``label`` is 1 when the category value
        contains "Publishable" and is not a negated form (for example
        "Non-Publishable"), 0 for any other category value, and ``None``
        when no "Category:" line was found inside a metadata section. If
        several "Category:" lines exist, the last one wins. ``text`` is the
        non-metadata sections joined by single spaces, with surrounding
        whitespace removed.
    """
    # Negated spellings still contain the substring "Publishable", so a plain
    # substring test would label them as positive. Matched case-insensitively.
    negated_forms = (
        "non-publishable",
        "non publishable",
        "nonpublishable",
        "not publishable",
        "un-publishable",
        "unpublishable",
    )

    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    label = None
    body_parts = []
    for section in content.split("\n\n"):
        if not section.startswith("Metadata:"):
            body_parts.append(section.strip())
            continue
        for line in section.split("\n"):
            if line.startswith("Category:"):
                value = line[len("Category:"):].strip()
                lowered = value.lower()
                is_negated = any(form in lowered for form in negated_forms)
                label = 1 if "Publishable" in value and not is_negated else 0

    # join() avoids rebuilding the string once per section.
    return label, " ".join(body_parts).strip()

# Load every parsable ".txt" paper from the folder into parallel lists:
# labels[i] is the class of texts[i].
labels = []
texts = []

# os.listdir returns entries in arbitrary order; sorting keeps the sample
# order stable so the seeded KFold split is reproducible across runs and
# machines.
for file_name in tqdm(sorted(os.listdir(folder_path))):
    if not file_name.endswith(".txt"):
        continue
    file_path = os.path.join(folder_path, file_name)
    label, text = parse_file(file_path)
    if label is not None:  # Skip files where no category could be parsed
        labels.append(label)
        texts.append(text)

# Load the pretrained "bert-base-uncased" tokenizer; it is used by
# tokenize_texts and saved alongside the model at the end of the script
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize data
def tokenize_texts(texts):
    """Encode texts with the module-level ``tokenizer``.

    Every input is truncated and padded to exactly 512 tokens, and the
    result is requested as PyTorch tensors.

    Args:
        texts: The text(s) to pass to the tokenizer.

    Returns:
        Whatever the tokenizer call returns for these options.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
        "return_tensors": "pt",
    }
    return tokenizer(texts, **encode_options)

# Custom function to compute evaluation metrics
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for an evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``. The predicted class of each
            row is the index of the largest logit along axis 1.

    Returns:
        A dict with the keys "accuracy", "precision", "recall" and "f1".
        Precision, recall and F1 use ``average='binary'``.
    """
    raw_scores, references = eval_pred
    # Pick the highest-scoring class per row, then hand NumPy to sklearn.
    predicted = torch.tensor(raw_scores).argmax(dim=1).numpy()

    acc = accuracy_score(references, predicted)
    prf = precision_recall_fscore_support(references, predicted, average='binary')
    # prf is (precision, recall, f1, support); support is not reported.
    return dict(zip(("accuracy", "precision", "recall", "f1"), (acc, prf[0], prf[1], prf[2])))

# 5-fold cross-validation with a seeded shuffle; per-fold results are
# collected in all_metrics.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
all_metrics = []


def create_dataset(encodings, labels):
    """Build a ``Dataset`` from tokenizer output and a list of labels."""
    columns = {
        "input_ids": encodings["input_ids"],
        "attention_mask": encodings["attention_mask"],
        "labels": torch.tensor(labels, dtype=torch.long),
    }
    return Dataset.from_dict(columns)


for fold, (train_index, val_index) in enumerate(kf.split(texts)):
    print(f"Fold {fold + 1}")

    # Pick this fold's samples by index
    train_texts = [texts[idx] for idx in train_index]
    train_labels = [labels[idx] for idx in train_index]
    val_texts = [texts[idx] for idx in val_index]
    val_labels = [labels[idx] for idx in val_index]

    # Encode each split and wrap it as a dataset
    train_encodings = tokenize_texts(train_texts)
    val_encodings = tokenize_texts(val_texts)
    train_dataset = create_dataset(train_encodings, train_labels)
    val_dataset = create_dataset(val_encodings, val_labels)

    # Every fold starts from a fresh pretrained two-class model
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased", num_labels=2
    )

    training_args = TrainingArguments(
        # Where checkpoints and logs for this fold are written
        output_dir=f"./results_fold_{fold + 1}",
        logging_dir=f"./logs_fold_{fold + 1}",
        logging_steps=10,
        # Optimisation settings
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        weight_decay=0.01,
        # Evaluate and checkpoint once per epoch; keep the most accurate one
        evaluation_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    # Fine-tune, then score the held-out split
    trainer.train()
    metrics = trainer.evaluate(val_dataset)
    print(f"Metrics for Fold {fold + 1}: {metrics}")
    all_metrics.append(metrics)

# Mean of each reported metric over the per-fold result dicts
avg_metrics = {
    metric_key: np.mean([fold_result[metric_key] for fold_result in all_metrics])
    for metric_key in ("eval_accuracy", "eval_precision", "eval_recall", "eval_f1")
}

print("Average Metrics Across Folds:", avg_metrics)

# Save the model and tokenizer to ./docbert_model.
# `model` is the instance created in the last cross-validation iteration, so
# only that fold's weights are written (not an average or the best fold).
model.save_pretrained("./docbert_model")
tokenizer.save_pretrained("./docbert_model")


100%|██████████| 15/15 [00:00<00:00, 5979.33it/s]


Fold 1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.632935,0.666667,0.666667,1.0,0.8
2,No log,0.624344,0.666667,0.666667,1.0,0.8
3,No log,0.618857,0.666667,0.666667,1.0,0.8


Metrics for Fold 1: {'eval_loss': 0.632935106754303, 'eval_accuracy': 0.6666666666666666, 'eval_precision': 0.6666666666666666, 'eval_recall': 1.0, 'eval_f1': 0.8, 'eval_runtime': 0.1413, 'eval_samples_per_second': 21.23, 'eval_steps_per_second': 7.077, 'epoch': 3.0}
Fold 2


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.633587,0.666667,0.666667,1.0,0.8
2,No log,0.641367,0.666667,0.666667,1.0,0.8
3,No log,0.641071,0.666667,0.666667,1.0,0.8


Metrics for Fold 2: {'eval_loss': 0.633587121963501, 'eval_accuracy': 0.6666666666666666, 'eval_precision': 0.6666666666666666, 'eval_recall': 1.0, 'eval_f1': 0.8, 'eval_runtime': 0.1436, 'eval_samples_per_second': 20.897, 'eval_steps_per_second': 6.966, 'epoch': 3.0}
Fold 3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.613576,0.666667,0.666667,1.0,0.8
2,No log,0.626333,0.666667,0.666667,1.0,0.8
3,No log,0.621465,0.666667,0.666667,1.0,0.8


Metrics for Fold 3: {'eval_loss': 0.6135764718055725, 'eval_accuracy': 0.6666666666666666, 'eval_precision': 0.6666666666666666, 'eval_recall': 1.0, 'eval_f1': 0.8, 'eval_runtime': 0.1337, 'eval_samples_per_second': 22.443, 'eval_steps_per_second': 7.481, 'epoch': 3.0}
Fold 4


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.94505,0.333333,0.333333,1.0,0.5
2,No log,0.980789,0.333333,0.333333,1.0,0.5
3,No log,0.986417,0.333333,0.333333,1.0,0.5


Metrics for Fold 4: {'eval_loss': 0.9450497031211853, 'eval_accuracy': 0.3333333333333333, 'eval_precision': 0.3333333333333333, 'eval_recall': 1.0, 'eval_f1': 0.5, 'eval_runtime': 0.1354, 'eval_samples_per_second': 22.156, 'eval_steps_per_second': 7.385, 'epoch': 3.0}
Fold 5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.384497,1.0,1.0,1.0,1.0
2,No log,0.44421,1.0,1.0,1.0,1.0
3,No log,0.447513,1.0,1.0,1.0,1.0


Metrics for Fold 5: {'eval_loss': 0.384496808052063, 'eval_accuracy': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_runtime': 0.1436, 'eval_samples_per_second': 20.886, 'eval_steps_per_second': 6.962, 'epoch': 3.0}
Average Metrics Across Folds: {'eval_accuracy': 0.6666666666666667, 'eval_precision': 0.6666666666666667, 'eval_recall': 1.0, 'eval_f1': 0.78}


('./docbert_model/tokenizer_config.json',
 './docbert_model/special_tokens_map.json',
 './docbert_model/vocab.txt',
 './docbert_model/added_tokens.json')

In [None]:
from google.colab import files
import shutil

# Zip the contents of the folder into "<folder name>.zip"
folder_to_download = "docbert_model"  # Replace with your folder name
shutil.make_archive(
    base_name=folder_to_download,
    format='zip',
    root_dir=folder_to_download,
)

# Hand the archive to the Colab download helper
files.download(f"{folder_to_download}.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>