In [1]:
import json
from datasets import Dataset

def load_bio_json(json_file):
    """
    Load a BIO-tagged JSON file into a Hugging Face ``Dataset``.

    The file is expected to hold a JSON list of entries, each with a
    ``"tokens"`` list whose items are objects carrying a ``"token"`` value and
    its ``"label"`` tag.  Any other field on an entry (for example the raw
    ``"sentence"`` text) is ignored and therefore not required.

    Args:
        json_file: Path to the UTF-8 encoded JSON file.

    Returns:
        A ``Dataset`` with two columns: ``"tokens"`` (the token values of each
        entry) and ``"labels"`` (the tag values of each entry, aligned
        one-to-one with ``"tokens"``).

    Raises:
        KeyError: If an entry lacks ``"tokens"``, or a token object lacks
            ``"token"`` or ``"label"``.
    """
    with open(json_file, "r", encoding="utf-8") as file:
        data = json.load(file)

    token_lists = []
    label_lists = []

    for entry in data:
        # Read every token object once so tokens and labels stay in lock-step.
        pairs = [(item["token"], item["label"]) for item in entry["tokens"]]
        token_lists.append([token for token, _ in pairs])
        label_lists.append([label for _, label in pairs])

    dataset = Dataset.from_dict({"tokens": token_lists, "labels": label_lists})
    return dataset

# Load the dataset and hold out 10% of it for evaluation.
# A fixed seed keeps the split (and with it the example printed below and the
# validation numbers) reproducible across kernel restarts.
train_dataset = load_bio_json("train_more_1.json")
train_dataset = train_dataset.train_test_split(test_size=0.1, seed=42)


print(train_dataset["train"][30])

{'tokens': ['I', 'had', 'been', 'made', ',', 'without', 'consultation', ',', 'into', 'a', 'symbol', 'and', 'figurehead', 'for', 'all', 'kinds', 'of', 'ideas', '.'], 'labels': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}


In [2]:
# Tag inventory: a tag's position in this list is its integer class id.
label_list = ["O", "B-COUNTRY", "I-COUNTRY", "B-DATE", "I-DATE", "B-FATALITY", "I-FATALITY"]
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}

print(id2label)

{0: 'O', 1: 'B-COUNTRY', 2: 'I-COUNTRY', 3: 'B-DATE', 4: 'I-DATE', 5: 'B-FATALITY', 6: 'I-FATALITY'}


In [3]:
def convert_labels(example):
    """Return the example with its tags replaced by integer class ids.

    Each tag is looked up in the notebook-level ``label2id`` mapping (a tag
    missing from the mapping raises ``KeyError``); the token list is passed
    through unchanged.
    """
    numeric_labels = []
    for tag in example["labels"]:
        numeric_labels.append(label2id[tag])
    return {"tokens": example["tokens"], "labels": numeric_labels}

# Encode the tags of both splits as integer ids.
# NOTE(review): this overwrites train_dataset, so re-running the cell would look
# up already-converted integer labels in label2id — confirm before re-executing.
train_dataset = train_dataset.map(convert_labels)

Map:   0%|          | 0/33421 [00:00<?, ? examples/s]

Map:   0%|          | 0/3714 [00:00<?, ? examples/s]

In [4]:
# Same example index as printed after the split, now showing integer label ids.
print(train_dataset["train"][30])

{'tokens': ['I', 'had', 'been', 'made', ',', 'without', 'consultation', ',', 'into', 'a', 'symbol', 'and', 'figurehead', 'for', 'all', 'kinds', 'of', 'ideas', '.'], 'labels': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [5]:
from transformers import RobertaTokenizerFast, RobertaForTokenClassification

MODEL_PATH = "roberta_finetuned_MIC_3"

# Load tokenizer and model from MODEL_PATH.
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_PATH)
# Size the classification head from label_list rather than a literal 7, and
# write the tag names into the model config so that the checkpoint saved later
# carries this notebook's id <-> tag mapping.
model = RobertaForTokenClassification.from_pretrained(
    MODEL_PATH,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

In [6]:
from transformers import DataCollatorForTokenClassification

def tokenize_and_align_labels(examples, max_length=128, padding="max_length"):
    """
    Tokenize pre-split words and align integer BIO labels to the sub-tokens.

    Alignment rule:
      * positions with no source word (special / padding tokens) get -100,
      * the first sub-token of a word gets the word's label,
      * later sub-tokens of the same word keep the label only when it is an
        inside ("I-") tag; otherwise they get -100.

    -100 is used as the "no training target" marker; it is PyTorch's default
    ``ignore_index`` for cross-entropy, so those positions are expected to be
    skipped by the loss.

    Args:
        examples: Batch with "tokens" (one word list per example) and "labels"
            (one list of integer label ids per example, parallel to "tokens").
        max_length: Maximum sequence length handed to the tokenizer.
        padding: Padding strategy handed to the tokenizer.

    Returns:
        The tokenizer output with an added "labels" entry holding one aligned
        label-id list per example.
    """
    # Derive the ids of the inside tags from the label names instead of
    # hard-coding [2, 4, 6], so the rule survives edits to label_list.
    inside_label_ids = {idx for idx, name in id2label.items() if name.startswith("I-")}

    tokenized_inputs = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        padding=padding,
        truncation=True,
        max_length=max_length,
    )

    labels = []
    for i, word_labels in enumerate(examples["labels"]):
        # One entry per sub-token: index of its source word, or None.
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First sub-token of a new word.
                label_ids.append(word_labels[word_idx])
            elif word_labels[word_idx] in inside_label_ids:
                # Continuation of a word tagged I-*: keep the tag.
                label_ids.append(word_labels[word_idx])
            else:
                # Continuation of a word tagged O or B-*: no target.
                label_ids.append(-100)

            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Collator for token-classification batches, built on the loaded tokenizer
data_collator = DataCollatorForTokenClassification(tokenizer)
# Run tokenisation + label alignment over every split, one batch at a time
train_tokenized = train_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/33421 [00:00<?, ? examples/s]

Map:   0%|          | 0/3714 [00:00<?, ? examples/s]

In [7]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./roberta_finetuned_MIC",
    eval_strategy="epoch",  # current name of the deprecated `evaluation_strategy`
    save_strategy="epoch",  # kept equal to eval_strategy, as load_best_model_at_end requires
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,
    # Validation loss can bottom out before the final epoch. Restore the
    # checkpoint with the lowest eval loss when training ends, so the model
    # held by the trainer afterwards is the best one rather than the last one.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized["train"],
    eval_dataset=train_tokenized["test"],
    # `tokenizer=` is deprecated on Trainer; `processing_class` replaces it.
    processing_class=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(


In [8]:
# Run fine-tuning; per training_args, evaluation and checkpointing happen once per epoch.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0215,0.019229
2,0.0116,0.020002
3,0.0089,0.022669
4,0.0036,0.023861
5,0.0026,0.025486


TrainOutput(global_step=20890, training_loss=0.011581634915919325, metrics={'train_runtime': 7937.9645, 'train_samples_per_second': 21.051, 'train_steps_per_second': 2.632, 'total_flos': 1.091649208797312e+16, 'train_loss': 0.011581634915919325, 'epoch': 5.0})

In [9]:
# Persist the model and its tokenizer side by side in one directory
FINAL_MODEL_DIR = "./roberta_finetuned_MIC_"
model.save_pretrained(FINAL_MODEL_DIR)
tokenizer.save_pretrained(FINAL_MODEL_DIR)

('./roberta_finetuned_MIC_\\tokenizer_config.json',
 './roberta_finetuned_MIC_\\special_tokens_map.json',
 './roberta_finetuned_MIC_\\vocab.json',
 './roberta_finetuned_MIC_\\merges.txt',
 './roberta_finetuned_MIC_\\added_tokens.json',
 './roberta_finetuned_MIC_\\tokenizer.json')