<a href="https://colab.research.google.com/github/baptiste-roelens/MyBiblio/blob/main/20240305_finetune_pubmedbert_for_biblio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers evaluate accelerate datasets
from huggingface_hub import notebook_login
from transformers import AutoTokenizer

# Log in to the Hugging Face Hub from inside the notebook. The training
# configuration further down sets push_to_hub=True, which needs credentials.
notebook_login()

# Tokenizer for the BiomedBERT checkpoint; the same checkpoint name is used
# when the classification model is loaded later in the notebook.
tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext")

In [None]:
from datasets import Dataset
import pandas as pd

# NOTE(review): the TSV lives on Google Drive, so Drive must already be mounted
# at /content/drive before this cell runs — no mount call is visible in this notebook.
df = (
    pd.read_csv(
        "/content/drive/MyDrive/WIP/Biblio/biblio2.tsv",
        sep="\t",
        encoding="ISO-8859-1",
    )
    # A row is only usable for supervised training if it has both text and label.
    .dropna(subset=["label", "abstract"])
    # Cast labels to integer class ids.
    .astype({"label": int})
)

# Keep only the two columns the model consumes.
df_dataset = df[["abstract", "label"]]

# preserve_index=False keeps the pandas index out of the Dataset. The previous
# approach (convert, then remove "__index_level_0__") raised whenever dropna()
# removed no rows, because that column is only created for a non-default index.
dataset = Dataset.from_pandas(df_dataset, preserve_index=False)

# 80/20 train/test split; the fixed seed makes the split reproducible across
# kernel restarts so evaluation numbers are comparable between runs.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np

def preprocess_function(examples):
    """Tokenize the ``abstract`` field of a batch of examples.

    Uses the notebook-level ``tokenizer``. Inputs longer than 512 tokens are
    truncated; no padding argument is passed here.
    """
    abstracts = examples["abstract"]
    return tokenizer(abstracts, truncation=True, max_length=512)

def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook-level ``accuracy`` metric.

    ``eval_pred`` unpacks into ``(predictions, labels)``. The predicted class
    for each row is the argmax of ``predictions`` along axis 1, and the result
    is whatever ``accuracy.compute`` returns for those classes and the labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

# Metric object that compute_metrics looks up by name when it is called.
accuracy = evaluate.load("accuracy")

# Apply the tokenizer to the split dataset, one batch of examples at a time.
tokenized_bilio = dataset.map(preprocess_function, batched=True)

# Collator built from the same tokenizer; it is handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Class id -> readable name; the reverse mapping is derived from it so the two
# dictionaries cannot drift apart.
id2label = {0: "Not Relevant", 1: "Potentially Interesting"}
label2id = {name: class_id for class_id, name in id2label.items()}

# Load the BiomedBERT checkpoint for sequence classification with one output
# per entry in the label map.
model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# Hyperparameters and checkpoint policy for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="PubMed_interests",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    # `evaluation_strategy` was renamed to `eval_strategy` in transformers; the
    # old keyword is rejected by current releases, which is what the unpinned
    # install at the top of this notebook pulls in.
    eval_strategy="epoch",
    # Must use the same schedule as evaluation for load_best_model_at_end to work.
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Uploads to the Hugging Face Hub; relies on the login done in the first cell.
    push_to_hub=True,
)

# Wire the model, the tokenized train/test splits, the padding collator and the
# accuracy callback into a single Trainer.
# NOTE(review): newer transformers releases deprecate the `tokenizer=` argument
# in favour of `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_bilio["train"],
    eval_dataset=tokenized_bilio["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run fine-tuning with the arguments configured in training_args.
trainer.train()