# NERC model: SVM


Datasets used:
Few-NERD (https://huggingface.co/datasets/DFKI-SLT/few-nerd?library=datasets) + CoNLL-2003


Why we chose an SVM: https://ieeexplore.ieee.org/abstract/document/10762517

In this paper, SVM is compared to several other approaches and achieves the best scores. It is also worth mentioning that the paper uses the CoNLL-2003 dataset as well, which is also part of our training data.

In [1]:
import pandas as pd
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer 
import re
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
path = "NER-test.tsv"

# Fix error: some rows contain an extra tab (i.e., an extra column).
# Every row is normalised to at most 4 tab-separated fields before the file
# is parsed with pandas. Rows with fewer than 4 fields are written back
# unchanged apart from whitespace trimming.
fixed_lines = []

with open(path, "r", encoding="latin1") as file:
    for line in file:
        # Split on tabs and trim stray whitespace around each field.
        parts = [p.strip() for p in line.strip().split("\t")]
        if len(parts) > 4:
            # Fold all surplus fields into the fourth (last) column.
            parts = parts[:3] + [" ".join(parts[3:])]
        fixed_lines.append("\t".join(parts))  # Keep the (possibly fixed) line

# Rewrite the original file in place with the fixed data.
with open(path, "w", encoding="latin1") as file:
    file.write("\n".join(fixed_lines))

# NOTE(review): read_csv applies its default quote handling and NA-string
# conversion here, so a literal '"' token or a token such as "NA"/"null"
# would not survive as-is -- confirm the test file contains none.
test_dataset = pd.read_csv(path, sep="\t", encoding="latin1")


In [3]:
# Clean the original training data: every line of train.txt is copied to
# train_cleaned.txt after the substitutions below are applied in order.
label_rewrites = (
    # Stick to test set labelling
    (r'\bB-PER\b', 'B-PERSON'),
    (r'\bI-PER\b', 'I-PERSON'),
    # Remove MISC labels
    (r'\bB-MISC\b', 'O'),
    (r'\bI-MISC\b', 'O'),
)

with open("train.txt", "r", encoding="utf-8") as f, open("train_cleaned.txt", "w", encoding="utf-8") as out_f:
    for line in f:
        for pattern, replacement in label_rewrites:
            line = re.sub(pattern, replacement, line)
        out_f.write(line)

# Tag names that are converted to the WORK_OF_ART BIO labels. Only "art" is
# active; the following names are currently left out: "art-painting",
# "art-music", "art-film", "art-broadcastprogram", "art-other",
# "art-writtenart".
art_labels = ["art"]

# Load the dataset that supplies the work-of-art instances.
ds = load_dataset("DFKI-SLT/few-nerd", "supervised")

# Tag names of the training split's "ner_tags" feature.
labels = ds["train"].features["ner_tags"].feature.names


def label_in_bio_format(tokens, ner_tags, label_names=None, target_labels=None):
    """Convert integer NER tag ids into BIO labels for work-of-art spans.

    A tag whose name is in ``target_labels`` becomes ``B-WORK_OF_ART`` when
    the previous token carried a different tag and ``I-WORK_OF_ART`` when it
    repeats the previous token's tag. Every other tag becomes ``O``.

    Args:
        tokens: Tokens of the sentence. Not used by the conversion; kept so
            that existing callers passing it keep working.
        ner_tags: Sequence of integer tag ids, one per token.
        label_names: Sequence mapping a tag id to its tag name. Defaults to
            the module-level ``labels``.
        target_labels: Collection of tag names treated as work of art.
            Defaults to the module-level ``art_labels``.

    Returns:
        A list of BIO label strings with the same length as ``ner_tags``.
    """
    # Fall back to the module-level tables so existing two-argument calls
    # behave exactly as before.
    if label_names is None:
        label_names = labels
    if target_labels is None:
        target_labels = art_labels

    bio_labels = []
    previous_tag = 'O'
    for tag_id in ner_tags:
        tag = label_names[tag_id]
        if tag == 'O' or tag not in target_labels:
            # Anything that is not a work-of-art tag is mapped to the outside label.
            bio_labels.append('O')
        elif tag == previous_tag:
            bio_labels.append('I-WORK_OF_ART')
        else:
            bio_labels.append('B-WORK_OF_ART')
        # Track the raw tag (not the BIO label) so a change of tag starts a new span.
        previous_tag = tag
    return bio_labels


# Keep only the training instances that contain at least one work-of-art label.
work_of_art_prefixes = ('B-WORK_OF_ART', 'I-WORK_OF_ART')
art_instances = []
for instance in ds["train"]:
    tokens = instance["tokens"]
    ner_tags = instance["ner_tags"]
    bio_labels = label_in_bio_format(tokens, ner_tags)
    contains_art = any(label.startswith(work_of_art_prefixes) for label in bio_labels)
    if contains_art:
        art_instances.append({"tokens": tokens, "ner_tags": bio_labels})

# Combine the cleaned training data with the work-of-art instances: one
# "token tag" line per token, with an empty line after every instance.
with open("train_cleaned.txt", "a", encoding="utf-8") as out_f:
    for art_instance in art_instances:
        token_tag_pairs = zip(art_instance["tokens"], art_instance["ner_tags"])
        out_f.writelines([f"{token} {tag}\n" for token, tag in token_tag_pairs])
        out_f.write("\n")

In [4]:
# Build one feature dict (the token itself) and one gold label per
# non-empty line of the cleaned training file.
training_features = []
training_gold_labels = []

with open('train_cleaned.txt', 'r', encoding="utf-8") as f:
    for raw_line in f:
        fields = raw_line.split()
        # Skip empty lines and -DOCSTART- marker lines.
        if not fields or fields[0].startswith("-DOCSTART-"):
            continue
        # First column is the token, last column is its label.
        training_features.append({'words': fields[0]})
        training_gold_labels.append(fields[-1])

In [5]:
# One feature dict per test token, keyed the same way as the training features.
token_list = test_dataset["token"].values
test_features = [{'words': token} for token in token_list]

# Gold labels of the test set.
test_labels = test_dataset["BIO_NER_tag"].values

In [6]:
# Learn the feature vocabulary from the training data only, then encode the
# test data with that vocabulary. Fitting the vectorizer on train + test
# leaks the test vocabulary into the feature space; DictVectorizer.transform
# silently ignores features it did not see during fitting.
vec = DictVectorizer()
training_features = vec.fit_transform(training_features)
test_features = vec.transform(test_features)

In [7]:
# Train a linear SVM on the vectorised training tokens; fit() returns the
# fitted estimator, so construction and training are chained.
lin_clf = svm.LinearSVC().fit(training_features, training_gold_labels)



In [8]:
# Predict a label for every test token and print per-class metrics.
y_pred = lin_clf.predict(test_features)
# Some labels are never predicted, which makes their precision undefined.
# zero_division=0 reports those scores as 0.0 (the same value the default
# setting prints) without emitting an UndefinedMetricWarning per metric.
print(classification_report(test_labels, y_pred, zero_division=0))

               precision    recall  f1-score   support

        B-LOC       1.00      0.14      0.25         7
        B-ORG       0.25      0.33      0.29         3
     B-PERSON       1.00      0.09      0.17        11
B-WORK_OF_ART       0.50      0.11      0.18         9
        I-LOC       0.00      0.00      0.00         1
        I-ORG       0.00      0.00      0.00         2
     I-PERSON       0.00      0.00      0.00         8
I-WORK_OF_ART       0.60      0.30      0.40        10
            O       0.83      0.99      0.90       186

     accuracy                           0.81       237
    macro avg       0.46      0.22      0.24       237
 weighted avg       0.77      0.81      0.75       237



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Classification Report 

### Result Analysis 