In [None]:
import glob
import os

# HTTP and HTTPS share one proxy endpoint (proxy.alcf.anl.gov:3128); export it
# through both proxy environment variables of this process.
http_proxy_url = https_proxy_url = "http://proxy.alcf.anl.gov:3128"
os.environ.update(http_proxy=http_proxy_url, https_proxy=https_proxy_url)

In [None]:
import pandas as pd
import glob
import os
import torch
import datasets
import pandas as pd
import evaluate
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, AutoTokenizer, BitsAndBytesConfig
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np
from datasets import load_dataset_builder, load_dataset
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import pandas as pd
from datasets import Dataset
import os
import evaluate
import numpy as np
from datasets import Dataset
from transformers import AutoModelForTokenClassification, Trainer, TrainingArguments, AutoTokenizer, DataCollatorForTokenClassification

In [None]:
# Name used both as the Trainer output directory and as the argument handed to
# trainer.push_to_hub() at the end of main().
publishing_name="roberta-conll"

import numpy as np
import evaluate
from datasets import load_dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)

def load_data():
    """Load the ``eriktks/conll2003`` dataset with ``load_dataset`` and return it."""
    return load_dataset("eriktks/conll2003")

def prepare_model_and_tokenizer(label_list, model_name="FacebookAI/roberta-base"):
    """Load a token-classification model and its tokenizer from one checkpoint.

    Args:
        label_list: Ordered label names; the position of each name is used as
            its class id when building ``id2label`` / ``label2id``.
        model_name: Checkpoint identifier handed to both ``from_pretrained``
            calls, so the model and tokenizer cannot drift apart. Defaults to
            the RoBERTa base checkpoint that was previously hard-coded.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # Materialize once so any iterable of names can be measured and reused.
    label_list = list(label_list)
    id2label = dict(enumerate(label_list))
    label2id = {label: idx for idx, label in id2label.items()}

    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id,
    )
    # add_prefix_space=True is kept from the original call; the inputs are
    # tokenized later with is_split_into_words=True, which RoBERTa's tokenizer
    # documents as requiring this flag.
    tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
    return model, tokenizer

def tokenize_and_align_labels(examples, tokenizer):
    """Tokenize pre-split words and attach one label id per produced token.

    For every example, a token receives the tag of its source word only when
    it is the first token of that word. Tokens with no source word and the
    remaining tokens of an already-labelled word receive ``-100``.

    Args:
        examples: Batch with a ``"tokens"`` entry (lists of words) and a
            ``"ner_tags"`` entry (matching lists of tag ids).
        tokenizer: Callable that tokenizes the batch and whose result exposes
            ``word_ids(batch_index=...)`` and supports item assignment.

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    for batch_idx, word_tags in enumerate(examples["ner_tags"]):
        token_word_ids = encoded.word_ids(batch_index=batch_idx)
        # Pair each token's word id with the word id of the token before it;
        # the very first token is compared against None.
        shifted = [None, *token_word_ids]
        aligned.append(
            [
                -100 if (wid is None or wid == prev_wid) else word_tags[wid]
                for wid, prev_wid in zip(token_word_ids, shifted)
            ]
        )

    encoded["labels"] = aligned
    return encoded

def _label_names(label_list):
    """Return label names as a list that can be indexed by integer class id."""
    if hasattr(label_list, "keys"):
        # id2label-style mapping: order by class id instead of trusting
        # insertion order.
        return [label_list[idx] for idx in sorted(label_list)]
    return list(label_list)


def compute_metrics(p, label_list=None, metric=None):
    """Compute precision, recall, F1 and accuracy for token classification.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-class
            scores with the class axis last (argmax is taken over axis 2);
            ``labels`` holds class ids, with ``-100`` marking positions to
            ignore.
        label_list: Label names indexable by class id, or an ``id2label``
            mapping. When omitted, a module-level ``model`` is consulted for
            ``model.config.id2label``.
        metric: Object with a ``compute(predictions=..., references=...)``
            method. When omitted, ``evaluate.load("seqeval")`` is used.

    Returns:
        Dict with ``"precision"``, ``"recall"``, ``"f1"`` and ``"accuracy"``.

    Raises:
        ValueError: If ``label_list`` is omitted and no module-level ``model``
            exists to take the labels from.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    if label_list is None:
        # Fallback for sessions where `model` lives at module scope. Inside
        # main() the model is a local, so callers there must pass label_list.
        fallback_model = globals().get("model")
        if fallback_model is None:
            raise ValueError(
                "compute_metrics needs `label_list` when no module-level `model` is defined"
            )
        label_list = fallback_model.config.id2label
    # A dict view such as id2label.values() cannot be indexed; use a list.
    names = _label_names(label_list)

    if metric is None:
        metric = evaluate.load("seqeval")

    true_labels = [
        [names[l] for l in label if l != -100]
        for label in labels
    ]
    true_predictions = [
        [names[pred] for (pred, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


def main():
    """Load the data, fine-tune the model, evaluate it, and push it to the Hub."""
    # Load dataset
    dataset = load_data()

    # Get label list
    label_list = dataset["train"].features["ner_tags"].feature.names

    # Prepare model and tokenizer
    model, tokenizer = prepare_model_and_tokenizer(label_list)

    # Tokenize and align labels
    tokenized_datasets = dataset.map(
        lambda examples: tokenize_and_align_labels(examples, tokenizer),
        batched=True,
    )

    # Prepare data collator
    data_collator = DataCollatorForTokenClassification(tokenizer)

    # Load the metric once instead of on every evaluation pass.
    seqeval = evaluate.load("seqeval")

    def metrics_fn(eval_pred):
        # The Trainer calls this with a single argument, so the label names
        # and the metric are bound here rather than looked up as globals.
        return compute_metrics(eval_pred, label_list=label_list, metric=seqeval)

    # Prepare training arguments
    training_args = TrainingArguments(
        output_dir=publishing_name,
        load_best_model_at_end=True,
        push_to_hub=True,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=4,
    )

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=metrics_fn,
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    results = trainer.evaluate()
    print(results)

    # Push the model to the Hugging Face Hub
    # NOTE(review): per the Trainer API docs the first positional parameter of
    # push_to_hub is the commit message, so the name is used as that message.
    # Confirm this is intended.
    trainer.push_to_hub(publishing_name)

# Run the pipeline only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()