In [1]:
!pip install transformers datasets seqeval


Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m783.3 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading d

# **Load the Dataset**

In [2]:
# Mount Google Drive at /content/drive so files stored there can be read
# through the regular filesystem.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
import json

# Folder inside the mounted Google Drive that holds the JSON data files
drive_data_dir = '/content/drive/My Drive/Colab Notebooks'

# Full paths to the training and test files
train_file_path = drive_data_dir + '/train.json'
test_file_path = drive_data_dir + '/test.json'

In [4]:
# Helper for reading the train and test JSON files
def load_data(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return its content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for that file.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

# Parse both JSON files into memory; each result is whatever json.load returns
# for that file (process_data below iterates over it as a sequence of documents).
train_data = load_data(train_file_path)
test_data = load_data(test_file_path)

# **Preprocess the Data**

In [5]:
def process_data(data):
    """Split a collection of documents into parallel token and label lists.

    Args:
        data: Iterable of dict-like documents. Each must have a ``'tokens'``
            entry; ``'labels'`` is optional.

    Returns:
        A ``(all_texts, all_labels)`` tuple of lists with one entry per
        document. Documents without a ``'labels'`` key get a list of ``'O'``
        tags, one per token.
    """
    pairs = [
        (doc['tokens'], doc.get('labels', ['O'] * len(doc['tokens'])))
        for doc in data
    ]
    all_texts = [tokens for tokens, _ in pairs]
    all_labels = [labels for _, labels in pairs]
    return all_texts, all_labels

In [6]:
# Split each corpus into parallel lists of token sequences and tag sequences.
# Documents lacking a 'labels' key are given all-'O' tags by process_data.
train_texts, train_labels = process_data(train_data)
test_texts, test_labels = process_data(test_data)

In [7]:
from datasets import Dataset


# Wrap the token lists (plus tags, for training) in Hugging Face Dataset objects.
# The test dataset carries tokens only.
train_columns = {'tokens': train_texts, 'ner_tags': train_labels}
test_columns = {'tokens': test_texts}

train_dataset = Dataset.from_dict(train_columns)
test_dataset = Dataset.from_dict(test_columns)



In [8]:
# BIO tag inventory: 'O' plus a B-/I- pair for each PII entity type.
# A tag's position in this list is its integer class id.
labels_list = [
    'O',
    'B-NAME_STUDENT', 'I-NAME_STUDENT',
    'B-EMAIL', 'I-EMAIL',
    'B-USERNAME', 'I-USERNAME',
    'B-ID_NUM', 'I-ID_NUM',
    'B-PHONE_NUM', 'I-PHONE_NUM',
    'B-URL_PERSONAL', 'I-URL_PERSONAL',
    'B-STREET_ADDRESS', 'I-STREET_ADDRESS',
]

# Lookup tables in both directions: tag string <-> class id
id2label = dict(enumerate(labels_list))
label2id = {tag: idx for idx, tag in id2label.items()}


In [9]:
def align_labels_with_tokens(labels, word_ids):
    """Project word-level labels onto a sequence of sub-token positions.

    Args:
        labels: Sequence of per-word labels, indexable by word id.
        word_ids: One entry per sub-token: the index of the word it belongs
            to, or ``None``.

    Returns:
        A list as long as ``word_ids``. A position receives its word's label
        when its word id differs from the previous position's (i.e. the first
        sub-token of a word); positions with a ``None`` word id and repeated
        word ids receive ``-100``.
    """
    aligned = []
    prev_id = None
    for current_id in word_ids:
        starts_new_word = current_id is not None and current_id != prev_id
        aligned.append(labels[current_id] if starts_new_word else -100)
        prev_id = current_id
    return aligned

# **Load and Fine-Tune the DeBERTa Model**

In [10]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from transformers import DataCollatorForTokenClassification


In [11]:
# Load the DeBERTa tokenizer and a token-classification model sized for our tag set.
# id2label / label2id are stored in the model config so that predictions carry the
# real BIO tag names (e.g. 'B-EMAIL') rather than generic 'LABEL_<n>' placeholders;
# the NER pipeline's entity aggregation relies on those B-/I- prefixes.
model_checkpoint = "microsoft/deberta-v3-large"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(labels_list),
    id2label=id2label,
    label2id=label2id,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Sanity check: list the columns currently present in the test dataset.
print(test_dataset.column_names)


['tokens']


In [13]:
# Tokenize and align the labels
def tokenize_and_align_labels(examples, labels_available=True, max_length=None):
    """Tokenize a batch of pre-split documents and attach integer label ids.

    Relies on the notebook-level ``tokenizer``, ``label2id`` and
    ``align_labels_with_tokens``.

    Args:
        examples: Batch dict with a ``'tokens'`` column (list of word lists)
            and, when ``labels_available`` is true, an ``'ner_tags'`` column
            holding one tag string per word.
        labels_available: When false, only the tokenizer output is returned.
        max_length: Optional maximum sequence length forwarded to the
            tokenizer. ``None`` (the default) keeps the previous behaviour of
            passing no explicit limit.

    Returns:
        The tokenizer output; when labels are available it also has a
        ``"labels"`` entry with one list of class ids per example, using
        ``-100`` for positions that carry no label.

    Raises:
        ValueError: If a tag is not present in ``label2id``.
    """
    tokenized_inputs = tokenizer(
        examples['tokens'],
        truncation=True,
        is_split_into_words=True,
        padding=True,
        max_length=max_length,
    )

    if labels_available:
        new_labels = []
        for i, labels in enumerate(examples['ner_tags']):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            aligned = align_labels_with_tokens(labels, word_ids)

            # Fail loudly on tags outside the known inventory: silently mapping
            # them to -100 would exclude them from the loss without any signal.
            unknown = {tag for tag in aligned if tag != -100 and tag not in label2id}
            if unknown:
                raise ValueError(
                    f"Unknown NER tag(s) {sorted(map(str, unknown))}; "
                    "add them to labels_list."
                )

            # Convert tag strings to class ids with the notebook's own mapping;
            # -100 (the "no label" marker) passes through unchanged.
            new_labels.append(
                [tag if tag == -100 else label2id[tag] for tag in aligned]
            )
        tokenized_inputs["labels"] = new_labels

    return tokenized_inputs


In [14]:
# Tokenize the training set in batches, attaching aligned labels
train_dataset = train_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    fn_kwargs={"labels_available": True},
)

# Tokenize the test set in batches; it has no 'ner_tags' column, so skip labels
test_dataset = test_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    fn_kwargs={"labels_available": False},
)


Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [15]:
# Collator that assembles token-classification batches using the tokenizer
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


In [16]:
# Define the evaluation metric
from seqeval.metrics import accuracy_score, f1_score, classification_report

def compute_metrics(pred):
    """Score predictions with seqeval, skipping positions labelled ``-100``.

    Args:
        pred: Object exposing ``label_ids`` (gold class ids per position) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class id).

    Returns:
        Dict with ``"accuracy"``, ``"f1"`` and ``"classification_report"``
        entries as computed by the seqeval functions of the same names.
    """
    gold_ids = pred.label_ids
    predicted_ids = pred.predictions.argmax(-1)

    # Gold tag names, dropping ignored positions
    true_labels = []
    for gold_row in gold_ids:
        true_labels.append([id2label[g] for g in gold_row if g != -100])

    # Predicted tag names at exactly the positions kept above
    true_preds = []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        true_preds.append(
            [id2label[p] for p, g in zip(pred_row, gold_row) if g != -100]
        )

    accuracy = accuracy_score(true_labels, true_preds)
    f1 = f1_score(true_labels, true_preds)
    report = classification_report(true_labels, true_preds)
    return {"accuracy": accuracy, "f1": f1, "classification_report": report}

In [17]:
# Training arguments, collected in one place so they are easy to tweak
hyperparameters = dict(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    # 8 sequences x 2 accumulation steps = 16 sequences per optimizer step per device
    gradient_accumulation_steps=2,
)
training_args = TrainingArguments(**hyperparameters)



In [18]:
# Trainer setup
# The test set has no gold labels, so accuracy/F1 cannot be computed on it.
# Hold out 10% of the labelled training data as a validation set instead, with a
# fixed seed so the split is reproducible across runs.
train_eval_split = train_dataset.train_test_split(test_size=0.1, seed=42)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_eval_split["train"],
    eval_dataset=train_eval_split["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [None]:
# Train the model: runs the fine-tuning loop configured on `trainer`.
# Left as the cell's last expression so its return value is displayed.
trainer.train()

# **Inference Pipeline with Aggregation Strategy**

In [None]:
from transformers import pipeline

# Build an NER pipeline around the in-memory model; the "simple" aggregation
# strategy merges token-level predictions into entity groups
ner_pipeline = pipeline(
    task="ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

# Run the pipeline on a made-up sentence and show the detected entities
text = "John Doe's email is john.doe@example.com and his phone number is 123-456-7890."
ner_results = ner_pipeline(text)

print(ner_results)
