## Fine-tune Bert model 

In [23]:
import json

# Load the preprocessed data from the JSON file
preprocessed_file_path = 'outputs/extracted_information.json'  # Replace with your file path
with open(preprocessed_file_path, 'r', encoding='utf-8') as f:
    preprocessed_data = json.load(f)

# Function to preprocess text to ensure it is clean for training
def preprocess_text(text):
    if isinstance(text, str):  # Check if text is a string before processing
        text = text.strip()  # Remove leading and trailing whitespace
    return text

# Open the output text file for writing
training_output_file_path = 'outputs/bert_preprocessed_training_texts.txt'  # Replace with your desired file path
with open(training_output_file_path, 'w', encoding='utf-8') as f:
    # Loop through each entry in the data and extract Title, Abstract, and Keywords
    for table in preprocessed_data['tables']:
        title = preprocess_text(table.get('Title', ''))
        abstract = preprocess_text(table.get('Abstract', ''))
        keywords = preprocess_text(table.get('Keywords', ''))

        # Combine the Title, Abstract, and Keywords into a single text entry
        combined_text = f"Title: {title}\nAbstract: {abstract}\nKeywords: {keywords}\n\n"

        # Write the combined text to the output file
        f.write(combined_text)

print(f"BERT training text file created successfully at {training_output_file_path}")


BERT training text file created successfully at outputs/bert_preprocessed_training_texts.txt


In [7]:
from transformers import BertTokenizer, BertForMaskedLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
import os

# Step 1: Load the preprocessed text data into a dataset
dataset_path = 'outputs/bert_preprocessed_training_texts.txt'  # Replace with your file path

# Load the dataset using Hugging Face's datasets library
dataset = load_dataset('text', data_files={'train': dataset_path})

# Step 2: Load the BERT tokenizer and model
model_name = 'bert-base-uncased'  # You can change this to any other pre-trained BERT model if needed
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name)

# Step 3: Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)

# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Step 4: Set up data collator for Masked Language Modeling (MLM)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15  # 15% of tokens will be masked for prediction
)

# Step 5: Define training arguments for fine-tuning
training_args = TrainingArguments(
    output_dir='./fine_tuned_bert',  # Directory to save the model
    overwrite_output_dir=True,
    num_train_epochs=10,  # You can change the number of epochs based on your needs
    per_device_train_batch_size=16,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy='no'  # Use 'steps' if you have a validation set
)

# Step 6: Set up the Trainer for fine-tuning the BERT model
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset['train']
)

# Step 7: Fine-tune the model
trainer.train()

# Step 8: Save the fine-tuned model and tokenizer
model.save_pretrained('./fine_tuned_bert')
tokenizer.save_pretrained('./fine_tuned_bert')

print("Model training completed and saved to './fine_tuned_bert'")


  from .autonotebook import tqdm as notebook_tqdm


Downloading and preparing dataset text/default to C:/Users/dionusia/.cache/huggingface/datasets/text/default-af52c41c1bb4c6cf/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files: 100%|██████████| 1/1 [00:00<?, ?it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 22.73it/s]
                                                        

Dataset text downloaded and prepared to C:/Users/dionusia/.cache/huggingface/datasets/text/default-af52c41c1bb4c6cf/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00, 21.74it/s]
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
 27%|██▋       | 100/370 [37:14<1:55:24, 25.64s/it]

{'loss': 2.8675, 'learning_rate': 1.4594594594594596e-05, 'epoch': 2.7}


 54%|█████▍    | 200/370 [1:12:54<1:09:13, 24.43s/it]

{'loss': 2.469, 'learning_rate': 9.189189189189191e-06, 'epoch': 5.41}


 81%|████████  | 300/370 [1:53:42<26:25, 22.64s/it]  

{'loss': 2.3543, 'learning_rate': 3.7837837837837844e-06, 'epoch': 8.11}


100%|██████████| 370/370 [2:20:43<00:00, 22.82s/it]


{'train_runtime': 8443.5394, 'train_samples_per_second': 0.701, 'train_steps_per_second': 0.044, 'train_loss': 2.494269541147593, 'epoch': 10.0}
Model training completed and saved to './fine_tuned_bert'
