# NERC model: SVM


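Datasets used: `ner_dataset.csv` and, later, `train.txt` (extended with art-related Few-NERD sentences) for training; `NER-test.tsv` for testing.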

In [None]:
import pandas as pd
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer 
import numpy as np
import re
from datasets import load_dataset
from collections import Counter

In [11]:
# Load the training CSV and forward-fill missing cells in a single chain.
# NOTE(review): presumably the gaps are in a per-sentence id column that is
# only filled on the first token of each sentence — confirm.
path = 'ner_dataset.csv'
dataset = pd.read_csv(path, encoding='latin1').ffill()

In [5]:
path = "NER-test.tsv"

# Normalise every row of the TSV: trim each field and, when a row has more
# than four tab-separated fields, fold the surplus into the fourth one.
# Rows with fewer than four fields are kept as they are.
fixed_lines = []

with open(path, "r", encoding="latin1") as src:
    for raw_line in src:
        fields = [field.strip() for field in raw_line.strip().split("\t")]
        if len(fields) > 4:
            fields = fields[:3] + [" ".join(fields[3:])]
        fixed_lines.append("\t".join(fields))

# The repaired rows overwrite the original file in place.
with open(path, "w", encoding="latin1") as dst:
    dst.write("\n".join(fixed_lines))

# Parse the repaired file.
test_dataset = pd.read_csv(path, sep="\t", encoding="latin1")


In [13]:
# Column views on the training frame (pos_list is not used by the features below).
pos_list = dataset["POS"].values
token_list = dataset["Word"].values

# One feature dict per token; the surface form is the only feature.
train_text = [{'words': token} for token in token_list]

# Gold tag for every training token.
train_labels = dataset["Tag"].values

In [14]:
token_list = test_dataset["token"].values

# One feature dict per test token, mirroring the training representation.
test_text = [{'words': token} for token in token_list]

# Gold BIO tag for every test token.
test_labels = test_dataset["BIO_NER_tag"].values

In [15]:
# Learn the feature space from the training tokens only, so that the test
# vocabulary cannot influence the vectoriser.
vec = DictVectorizer()
kaggle_training_features = vec.fit_transform(train_text)

# Feature names that were not seen during fitting are silently ignored by
# transform(), so both matrices share the same columns.
kaggle_test_features = vec.transform(test_text)

len_training_features = len(train_text)
assert kaggle_training_features.shape[0] == len_training_features
assert kaggle_test_features.shape[0] == len(test_text)

In [16]:
from collections import Counter 
# Show how often each gold tag occurs in the test set (displayed as the cell output).
Counter(test_labels)

Counter({'O': 186,
         'B-PERSON': 11,
         'I-WORK_OF_ART': 10,
         'B-WORK_OF_ART': 9,
         'I-PERSON': 8,
         'B-LOC': 7,
         'B-ORG': 3,
         'I-ORG': 2,
         'I-LOC': 1})

In [None]:
# Linear SVM on the one-hot token features. The seed is fixed so repeated
# runs of the notebook produce the same model.
lin_clf = svm.LinearSVC(random_state=42)
lin_clf.fit(kaggle_training_features, train_labels)

# The test matrix built by the vectorising cell is reused as is.
y_pred = lin_clf.predict(kaggle_test_features)

# NOTE(review): the recorded report lists tags such as B-geo / B-per with zero
# support next to the test set's B-PERSON / B-LOC, which suggests the training
# and test tag sets do not match — confirm before trusting these scores.
# zero_division=0 reports 0.0 for undefined metrics without emitting a warning
# for every such class.
print(classification_report(test_labels, y_pred, zero_division=0))

In [None]:
# Re-encode the test tokens with the fitted vectoriser and score the model.
kaggle_test_features = vec.transform(test_text)

y_pred = lin_clf.predict(kaggle_test_features)

# zero_division=0 reports 0.0 for classes without predictions or support
# instead of raising one undefined-metric warning per affected metric.
print(classification_report(test_labels, y_pred, zero_division=0))

               precision    recall  f1-score   support

        B-LOC       0.00      0.00      0.00         7
        B-ORG       0.00      0.00      0.00         3
     B-PERSON       0.00      0.00      0.00        11
B-WORK_OF_ART       0.00      0.00      0.00         9
        B-eve       0.00      0.00      0.00         0
        B-geo       0.00      0.00      0.00         0
        B-gpe       0.00      0.00      0.00         0
        B-org       0.00      0.00      0.00         0
        B-per       0.00      0.00      0.00         0
        B-tim       0.00      0.00      0.00         0
        I-LOC       0.00      0.00      0.00         1
        I-ORG       0.00      0.00      0.00         2
     I-PERSON       0.00      0.00      0.00         8
I-WORK_OF_ART       0.00      0.00      0.00        10
        I-org       0.00      0.00      0.00         0
        I-per       0.00      0.00      0.00         0
            O       0.88      0.98      0.93       186

     acc

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [40]:
import re

def clean_ner_labels(file_path, output_path):
    """Copy a column-formatted NER file while normalising its label column.

    The label is taken to be the last whitespace-delimited field of a line.
    ``B-PER``/``I-PER`` become ``B-PERSON``/``I-PERSON`` and
    ``B-MISC``/``I-MISC`` become ``O``. Every other field, including a token
    that happens to be spelled like a label, is copied unchanged, as are
    blank lines and the original line endings.

    Args:
        file_path: Path of the UTF-8 input file.
        output_path: Path of the UTF-8 file to (over)write.
    """
    label_map = {
        # Standardize labels
        "B-PER": "B-PERSON",
        "I-PER": "I-PERSON",
        # Remove MISC labels
        "B-MISC": "O",
        "I-MISC": "O",
    }
    # Last non-blank field of the line plus whatever whitespace follows it.
    last_field = re.compile(r'(\S+)(\s*)$')

    def _remap(match):
        # Swap the label if it is in the map; keep the trailing whitespace.
        label, trailing = match.group(1), match.group(2)
        return label_map.get(label, label) + trailing

    with open(file_path, "r", encoding="utf-8") as f, open(output_path, "w", encoding="utf-8") as out_f:
        for line in f:
            out_f.write(last_field.sub(_remap, line, count=1))

# Write a label-normalised copy of the raw training file.
clean_ner_labels(file_path="train.txt", output_path="train_cleaned.txt")


In [27]:
from datasets import load_dataset

# Few-NERD, "supervised" configuration.
ds = load_dataset("DFKI-SLT/few-nerd", name="supervised")

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 131767/131767 [00:00<00:00, 502124.95 examples/s]
Generating validation split: 100%|██████████| 18824/18824 [00:00<00:00, 457654.15 examples/s]
Generating test split: 100%|██████████| 37648/37648 [00:00<00:00, 560841.46 examples/s]


In [39]:
from collections import Counter

# Training split and the id -> name table of its fine-grained tags.
train_set = ds["train"]
id2label = ds["train"].features["fine_ner_tags"].feature.names

# Concatenate the per-sentence tag lists into one flat list of tag ids.
flat_ner_tags = []
for sentence_tags in train_set["fine_ner_tags"]:
    flat_ner_tags.extend(sentence_tags)

# Frequency of every tag id.
label_counts = Counter(flat_ner_tags)

# Same counts, keyed by the readable tag name instead of the id.
label_counts_named = dict(
    (id2label[tag_id], occurrences) for tag_id, occurrences in label_counts.items()
)

print(label_counts_named)


{'O': 2547020, 'person-artist/author': 22287, 'person-actor': 10720, 'art-writtenart': 11309, 'person-director': 4175, 'person-other': 50906, 'organization-other': 43392, 'organization-company': 29012, 'organization-sportsteam': 17190, 'organization-sportsleague': 7534, 'product-car': 5276, 'event-protest': 1341, 'organization-government/governmentagency': 14984, 'other-biologything': 8905, 'location-GPE': 91196, 'location-other': 16434, 'person-athlete': 17527, 'art-broadcastprogram': 6101, 'product-other': 11248, 'building-other': 24834, 'product-weapon': 4272, 'building-airport': 3642, 'building-sportsfacility': 3706, 'person-scholar': 4704, 'art-music': 10907, 'event-other': 9951, 'other-language': 3551, 'other-chemicalthing': 5550, 'art-film': 8431, 'building-hospital': 4116, 'other-law': 6281, 'product-airplane': 5783, 'location-road/railway/highway/transit': 14604, 'person-soldier': 4365, 'location-mountain': 4500, 'organization-education': 23843, 'organization-media/newspaper':

In [62]:
# Token features and gold labels collected from the cleaned training file.
training_features = []
training_gold_labels = []

with open('train_cleaned.txt', 'r', encoding="utf-8") as f:
    for raw_line in f:
        line = raw_line.strip()
        # Blank lines and document markers carry no token.
        if not line or line.startswith("-DOCSTART-"):
            continue

        parts = line.split()
        # A usable row needs at least a token and a label.
        if len(parts) < 2:
            continue

        # First column is the token, last column is the NER label.
        training_features.append({'words': parts[0]})
        training_gold_labels.append(parts[-1])

# Both lists must have the same length.
print(f"Number of training features: {len(training_features)}")
print(f"Number of training gold labels: {len(training_gold_labels)}")


Number of training features: 507463
Number of training gold labels: 507463


In [63]:
token_list = test_dataset["token"].values

# One feature dict per test token; the surface form is the only feature.
test_features = [{'words': token} for token in token_list]

# Gold BIO tag for every test token.
test_labels = test_dataset["BIO_NER_tag"].values


In [64]:
# Learn the feature space from the training tokens only, so that the test
# vocabulary cannot influence the vectoriser.
vec = DictVectorizer()
len_training_features = len(training_features)

# NOTE: both names are rebound from lists of dicts to feature matrices, so the
# two feature-building cells must be re-run before this cell is run again.
training_features = vec.fit_transform(training_features)
# Feature names not seen during fitting are silently ignored by transform().
test_features = vec.transform(test_features)

assert training_features.shape[0] == len_training_features

In [65]:
# Fixed seed so that repeated runs of the notebook train the same model.
lin_clf = svm.LinearSVC(random_state=42)

In [66]:
# Train the SVM on the vectorised training tokens and their gold labels.
lin_clf.fit(X=training_features, y=training_gold_labels)



In [67]:
# Predict a tag for every test token and compare against the gold tags.
y_pred = lin_clf.predict(test_features)

# zero_division=0 reports 0.0 for classes that are never predicted instead of
# raising an undefined-metric warning for each of them.
print(classification_report(test_labels, y_pred, zero_division=0))

               precision    recall  f1-score   support

        B-LOC       1.00      0.14      0.25         7
        B-ORG       0.25      0.33      0.29         3
     B-PERSON       1.00      0.09      0.17        11
B-WORK_OF_ART       0.50      0.11      0.18         9
        I-LOC       0.00      0.00      0.00         1
        I-ORG       0.00      0.00      0.00         2
     I-PERSON       0.00      0.00      0.00         8
I-WORK_OF_ART       0.60      0.30      0.40        10
            O       0.83      0.99      0.90       186

     accuracy                           0.81       237
    macro avg       0.46      0.22      0.24       237
 weighted avg       0.77      0.81      0.75       237



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [52]:
# List every misclassified test token as: features, predicted tag, gold tag.
# The tokens are read from the test frame because `test_features` has been
# replaced by a feature matrix in the vectorising cell and no longer holds the
# readable {'words': token} dicts.
for token, pred, label in zip(test_dataset["token"].values, y_pred, test_labels):
    if pred != label:
        print({'words': token}, pred, label)

{'words': 'United'} B-LOC I-ORG
{'words': 'Barcelona'} B-ORG B-LOC
{'words': 'Coldplay'} O B-ORG
{'words': 'Lionel'} O B-PERSON
{'words': 'Messi'} O I-PERSON
{'words': 'Miami'} B-LOC I-ORG
{'words': 'Titanic'} O B-WORK_OF_ART
{'words': 'The'} O B-WORK_OF_ART
{'words': 'Catcher'} O I-WORK_OF_ART
{'words': 'in'} O I-WORK_OF_ART
{'words': 'the'} O I-WORK_OF_ART
{'words': 'Rye'} O I-WORK_OF_ART
{'words': 'Holden'} O B-PERSON
{'words': 'Caulfield'} O I-PERSON
{'words': 'Sharapova'} O I-PERSON
{'words': 'Serena'} O B-PERSON
{'words': 'Banksy'} O B-PERSON
{'words': 'Bristol'} B-ORG B-LOC
{'words': 'Harry'} O B-WORK_OF_ART
{'words': 'Potter'} O I-WORK_OF_ART
{'words': 'Brandenburg'} O B-LOC
{'words': 'Gate'} O I-LOC
{'words': 'Stranger'} O B-WORK_OF_ART
{'words': 'Things'} O I-WORK_OF_ART
{'words': 'Eleven'} O B-PERSON
{'words': '1984'} O B-WORK_OF_ART
{'words': 'Orwell'} O I-PERSON
{'words': 'Barbie'} O B-WORK_OF_ART
{'words': 'Oppenheimer'} B-ORG B-WORK_OF_ART
{'words': 'To'} O B-WORK_OF_ART

In [None]:
# Clean the original training data
def clean_ner_labels(file_path, output_path):
    """Copy a column-formatted NER file while normalising its label column.

    The label is taken to be the last whitespace-delimited field of a line.
    ``B-PER``/``I-PER`` become ``B-PERSON``/``I-PERSON`` (the test set's
    naming) and ``B-MISC``/``I-MISC`` become ``O``. Every other field,
    including a token that happens to be spelled like a label, is copied
    unchanged, as are blank lines and the original line endings.

    Args:
        file_path: Path of the UTF-8 input file.
        output_path: Path of the UTF-8 file to (over)write.
    """
    label_map = {
        # Stick to test set labelling
        "B-PER": "B-PERSON",
        "I-PER": "I-PERSON",
        # Remove MISC labels
        "B-MISC": "O",
        "I-MISC": "O",
    }
    # Last non-blank field of the line plus whatever whitespace follows it.
    last_field = re.compile(r'(\S+)(\s*)$')

    def _remap(match):
        # Swap the label if it is in the map; keep the trailing whitespace.
        label, trailing = match.group(1), match.group(2)
        return label_map.get(label, label) + trailing

    with open(file_path, "r", encoding="utf-8") as f, open(output_path, "w", encoding="utf-8") as out_f:
        for line in f:
            out_f.write(last_field.sub(_remap, line, count=1))

# (Re)create the cleaned training file from the raw one.
clean_ner_labels(file_path="train.txt", output_path="train_cleaned.txt")

# Load Few-NERD ("supervised" configuration) as the source of art-related sentences
ds = load_dataset("DFKI-SLT/few-nerd", name="supervised")

# Id -> name table for the "ner_tags" feature of the training split.
# NOTE(review): this presumably holds Few-NERD's coarse tags, in which case
# only the plain "art" entry below can ever match — confirm.
id2label = ds["train"].features["ner_tags"].feature.names

# Tag names that are treated as works of art.
art_labels = [
    "art",
    "art-painting",
    "art-music",
    "art-film",
    "art-broadcastprogram",
    "art-other",
    "art-writtenart",
]

def convert_to_bio_format(tokens, ner_tags):
    """
    Map one sentence's tag ids to BIO labels that only mark art entities.

    Each tag id is resolved through the module-level ``id2label`` table. A
    token whose tag name is listed in the module-level ``art_labels`` gets
    ``B-WORK_OF_ART`` when the previous token had a different tag name and
    ``I-WORK_OF_ART`` otherwise; every other token gets ``O``.

    Returns a list with one label per (token, tag id) pair.
    """
    bio_labels = []
    previous_tag = 'O'  # tag name of the preceding token

    for _token, tag_id in zip(tokens, ner_tags):
        current_tag = id2label[tag_id]

        if current_tag == 'O' or current_tag not in art_labels:
            # Outside any entity, or an entity type that is not art-related.
            bio_labels.append('O')
        elif previous_tag != current_tag:
            # The tag changed, so a new art entity starts here.
            bio_labels.append('B-WORK_OF_ART')
        else:
            # Same tag as the previous token: the entity continues.
            bio_labels.append('I-WORK_OF_ART')

        previous_tag = current_tag

    return bio_labels

# Collect every training sentence that contains at least one art entity,
# together with its converted BIO labels.
art_related_instances = []
for example in ds["train"]:
    tokens, ner_tags = example["tokens"], example["ner_tags"]

    bio_labels = convert_to_bio_format(tokens, ner_tags)

    # Sentences without any WORK_OF_ART label are skipped.
    has_art_entity = any(
        label.startswith(('B-WORK_OF_ART', 'I-WORK_OF_ART')) for label in bio_labels
    )
    if has_art_entity:
        art_related_instances.append({"tokens": tokens, "ner_tags": bio_labels})

# Step 3: Combine the cleaned training data with art-related instances
# by appending the art-related sentences to the cleaned training file.
with open("train_cleaned.txt", "a", encoding="utf-8") as out_f:
    # Start on a fresh line so the first appended token cannot be glued onto
    # the file's last existing line if that line lacks a trailing newline.
    out_f.write("\n")
    for art_instance in art_related_instances:
        tokens = art_instance["tokens"]
        ner_tags = art_instance["ner_tags"]

        # One "token tag" pair per line.
        for token, tag in zip(tokens, ner_tags):
            out_f.write(f"{token} {tag}\n")
        out_f.write("\n")  # Blank line after each sentence

# Report how many sentences were appended.
print(f"Added {len(art_related_instances)} art-related instances to the training data.")



Added 9864 art-related instances to the training data.


### Classification Report 

### Result Analysis 