# NERC model: SVM


Poster:
+ Dataset used: http://huggingface.co/datasets/DFKI-SLT/few-nerd?library=datasets + CONLL2003
+ explain process of getting the training data (i.e., how we combined CONLL2003 dataset with the dataset from huggingface)
+ why features: pos tag, 
+ why we chose SVM -> https://ieeexplore.ieee.org/abstract/document/10762517. In this paper, SVM is compared with several other approaches and scores best. It is also worth mentioning that the paper uses the CONLL2003 dataset, which is part of our training data as well.

In [1]:
import pandas as pd
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer 
import re
from datasets import load_dataset
import spacy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
path = "NER-test.tsv"

# Repair rows that contain a stray tab, i.e. more than the expected four
# columns, before handing the file to pandas.
fixed_lines = []

with open(path, "r", encoding="latin1") as source:
    for raw_line in source:
        cells = [cell.strip() for cell in raw_line.strip().split("\t")]
        if len(cells) > 4:
            # Fold every surplus column back into the fourth one.
            head, overflow = cells[:3], cells[3:]
            cells = head + [" ".join(overflow)]
        # Rows with four or fewer columns are kept as they are.
        fixed_lines.append("\t".join(cells))

# Overwrite the input file with the repaired rows.
with open(path, "w", encoding="latin1") as target:
    target.write("\n".join(fixed_lines))

test_dataset = pd.read_csv(path, sep="\t", encoding="latin1")


In [None]:
# Clean the original training data so its label set follows the test set:
# PER is renamed to PERSON, and MISC entities are relabelled as "O".
label_rewrites = [
    (re.compile(r'\bB-PER\b'), 'B-PERSON'),
    (re.compile(r'\bI-PER\b'), 'I-PERSON'),
    (re.compile(r'\bB-MISC\b'), 'O'),
    (re.compile(r'\bI-MISC\b'), 'O'),
]

with open("train.txt", "r", encoding="utf-8") as f, \
        open("train_cleaned.txt", "w", encoding="utf-8") as out_f:
    for raw_line in f:
        cleaned = raw_line
        # Apply the rewrites in order; every line is written back, changed or not.
        for pattern, replacement in label_rewrites:
            cleaned = pattern.sub(replacement, cleaned)
        out_f.write(cleaned)

# Tag names that are converted to WORK_OF_ART labels in BIO format.
art_labels = ["art"]

# Load the Few-NERD dataset ("supervised" configuration); it is the source of
# the work-of-art instances.
ds = load_dataset("DFKI-SLT/few-nerd", "supervised")

# Class names of the "ner_tags" feature of the train split; the integer tag
# ids of an instance index into this list.
train_split = ds["train"]
labels = train_split.features["ner_tags"].feature.names


def label_in_bio_format(tokens, ner_tags, label_names=None, target_labels=None):
    """Convert per-token tag ids into BIO labels for work-of-art entities.

    Every token whose tag name is one of ``target_labels`` becomes
    ``B-WORK_OF_ART`` when the previous token carried a different tag and
    ``I-WORK_OF_ART`` when it carried the same tag. All other tokens are
    labelled ``O``.

    Args:
        tokens: The tokens of the sentence. Not used for the conversion; kept
            so existing callers do not have to change.
        ner_tags: Sequence of integer tag ids, one per token.
        label_names: Sequence mapping a tag id to its tag name. Defaults to
            the module-level ``labels``.
        target_labels: Collection of tag names to convert. Defaults to the
            module-level ``art_labels``.

    Returns:
        A list of BIO label strings with the same length as ``ner_tags``.

    Note:
        Two directly adjacent entities with the same tag name cannot be told
        apart from the tag ids alone, so they are emitted as a single span.
    """
    if label_names is None:
        label_names = labels
    if target_labels is None:
        target_labels = art_labels

    bio_labels = []
    previous_tag = 'O'
    for tag_id in ner_tags:
        tag = label_names[tag_id]
        if tag == 'O' or tag not in target_labels:
            bio_labels.append('O')
        elif tag != previous_tag:
            # First token of a new entity span.
            bio_labels.append('B-WORK_OF_ART')
        else:
            # Continuation of the span started by the previous token.
            bio_labels.append('I-WORK_OF_ART')
        previous_tag = tag
    return bio_labels


# Keep only the training sentences that contain at least one work-of-art
# token, stored together with their BIO labels.
art_instances = []
entity_prefixes = ('B-WORK_OF_ART', 'I-WORK_OF_ART')
for instance in ds["train"]:
    tokens, ner_tags = instance["tokens"], instance["ner_tags"]
    bio_labels = label_in_bio_format(tokens, ner_tags)
    contains_art = any(label.startswith(entity_prefixes) for label in bio_labels)
    if not contains_art:
        continue
    art_instances.append({"tokens": tokens, "ner_tags": bio_labels})

from spacy.tokens import Doc

nlp = spacy.load("en_core_web_sm")

# Combine the cleaned training data with work of art instances.
# Each appended row has the form: <token> "" <dependency label> <BIO tag>,
# so the tag read from column index 2 and the label read from the last
# column line up with the rows already in the file.
with open("train_cleaned.txt", "a", encoding="utf-8") as out_f:
    out_f.write("\n")
    for art_instance in art_instances:
        tokens = art_instance["tokens"]
        ner_tags = art_instance["ner_tags"]

        # Build the Doc from the existing tokens so spaCy cannot re-tokenize
        # the sentence; this keeps parsed tokens and tags aligned one-to-one.
        doc = Doc(nlp.vocab, words=tokens)
        for _, component in nlp.pipeline:
            doc = component(doc)

        for parsed_token, tag in zip(doc, ner_tags):
            # NOTE(review): despite the name, this is spaCy's dependency
            # label (`dep_`), not a constituency/chunk tag - confirm that it
            # is comparable with column 3 of the original training rows.
            constituency_tag = parsed_token.dep_

            out_f.write(f"{parsed_token.text} \"\" {constituency_tag} {tag}\n")

In [None]:
def extract_train_features(token, constituency_tag):
    """Build the feature dictionary for one training token.

    Args:
        token: The token, stored under the key ``'words'``.
        constituency_tag: The tag stored under the key ``'constituency_tag'``.

    Returns:
        A dict with the two features, in the order ``'words'``,
        ``'constituency_tag'``.
    """
    features = {}
    features['words'] = token
    features['constituency_tag'] = constituency_tag
    return features
    
def extract_test_features(token):
    """Build the feature dictionary for one test token.

    The token is parsed on its own with the module-level spaCy pipeline
    ``nlp``, and the dependency label of the first parsed token is used as
    the ``'constituency_tag'`` feature.

    Args:
        token: The token text to extract features for.

    Returns:
        A dict with the keys ``'words'`` and ``'constituency_tag'``.

    Note:
        NOTE(review): the token is parsed without its sentence context, so
        the label may differ from the one the same token would get inside a
        full sentence - confirm that this matches the training features.
    """
    doc = nlp(token)
    # An empty parse has no first token; fall back to an empty tag instead of
    # raising IndexError.
    constituency_tag = doc[0].dep_ if len(doc) > 0 else ""
    return {
        'words': token,
        'constituency_tag': constituency_tag,
    }

In [None]:
# Read the combined training file into parallel lists: one feature dict and
# one gold label per token row.
training_features = []
training_gold_labels = []

with open('train_cleaned.txt', 'r', encoding="utf-8") as train_file:
    for raw_line in train_file:
        row = raw_line.strip()
        # Skip blank lines and document markers.
        if not row or row.startswith("-DOCSTART-"):
            continue
        # Whitespace-separated columns: the token comes first, the tag used
        # as feature is at index 2 and the gold label is the last column.
        fields = row.split()
        training_features.append(extract_train_features(fields[0], fields[2]))
        training_gold_labels.append(fields[-1])

In [7]:
# One feature dict per test token, in the row order of the test set.
token_list = test_dataset["token"].values
test_features = [extract_test_features(token) for token in token_list]

# Gold labels of the test set.
test_labels = test_dataset["BIO_NER_tag"].values

In [8]:
# Learn the feature vocabulary from the training data only and reuse it for
# the test data, so that no information from the test set enters the
# preprocessing step. Feature values that occur only in the test set are
# ignored by `transform`.
vec = DictVectorizer()
training_features = vec.fit_transform(training_features)
test_features = vec.transform(test_features)

In [9]:
# Train a linear support vector classifier on the vectorized training data;
# `fit` returns the fitted estimator itself.
lin_clf = svm.LinearSVC().fit(training_features, training_gold_labels)



In [10]:
# Predict a label for every test token and report per-class scores.
y_pred = lin_clf.predict(test_features)
# Some classes are never predicted, which makes their precision undefined.
# zero_division=0 reports those scores as 0 explicitly instead of emitting an
# UndefinedMetricWarning for each of them.
print(classification_report(test_labels, y_pred, zero_division=0))

               precision    recall  f1-score   support

        B-LOC       1.00      0.43      0.60         7
        B-ORG       0.40      0.67      0.50         3
     B-PERSON       1.00      0.09      0.17        11
B-WORK_OF_ART       1.00      0.11      0.20         9
        I-LOC       0.00      0.00      0.00         1
        I-ORG       0.00      0.00      0.00         2
     I-PERSON       0.00      0.00      0.00         8
I-WORK_OF_ART       0.67      0.20      0.31        10
            O       0.83      1.00      0.91       186

     accuracy                           0.82       237
    macro avg       0.54      0.28      0.30       237
 weighted avg       0.80      0.82      0.76       237



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Side-by-side table of every test token with its predicted and gold label.
# To list only the errors, filter the rows on pred != actual.
results = [
    {"Token": token, "Predicted": pred, "Actual": actual}
    for token, pred, actual in zip(test_dataset["token"].values, y_pred, test_labels)
]

results_df = pd.DataFrame(results)

results_df


Unnamed: 0,Token,Predicted,Actual
0,I,O,O
1,would,O,O
2,n't,O,O
3,have,O,O
4,gone,O,O
...,...,...,...
232,Elizabeth,O,I-PERSON
233,II,O,I-PERSON
234,was,O,O
235,phenomenal,O,O


### Classification Report 

### Result Analysis 