In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report

In [None]:
df = pd.read_csv('ner_train.csv')
df.head()

Unnamed: 0,text,labels
0,Thousands of demonstrators have marched throug...,O O O O O O B-geo O O O O O B-geo O O O O O B-...
1,Iranian officials say they expect to get acces...,B-gpe O O O O O O O O O O O O O O B-tim O O O ...
2,Helicopter gunships Saturday pounded militant ...,O O B-tim O O O O O B-geo O O O O O B-org O O ...
3,They left after a tense hour-long standoff wit...,O O O O O O O O O O O
4,U.N. relief coordinator Jan Egeland said Sunda...,B-geo O O B-per I-per O B-tim O B-geo O B-gpe ...


In [3]:
# preprocessing
# not all IOB tags in the dataset are the same as in the test set, so they need to be mapped
tag_mapping = {
    'B-geo': 'B-LOCATION',
    'I-geo': 'I-LOCATION',
    'B-gpe': 'B-LOCATION',
    'I-gpe': 'I-LOCATION',
    'B-art': 'B-WORK_OF_ART',
    'I-art': 'I-WORK_OF_ART',
    'B-org': 'B-ORG',
    'I-org': 'I-ORG',
    'B-per': 'B-PERSON',
    'I-per': 'I-PERSON',
    'O': 'O'
}

# Function to process each row
def process_row(row):
    tokens = row['text'].split()
    tags = row['labels'].split()

    if len(tokens) != len(tags):
        print("Token-tag mismatch detected. Skipping row.")
        return None

    # Map tags
    standardized_tags = [tag_mapping.get(tag, tag) for tag in tags]

    return list(zip(tokens, standardized_tags))

# Apply processing
df['token_tag_pairs'] = df.apply(process_row, axis=1)

# Drop any rows that failed processing
df = df[df['token_tag_pairs'].notnull()]

In [5]:
df

Unnamed: 0,text,labels,token_tag_pairs
0,Thousands of demonstrators have marched throug...,O O O O O O B-geo O O O O O B-geo O O O O O B-...,"[(Thousands, O), (of, O), (demonstrators, O), ..."
1,Iranian officials say they expect to get acces...,B-gpe O O O O O O O O O O O O O O B-tim O O O ...,"[(Iranian, B-LOCATION), (officials, O), (say, ..."
2,Helicopter gunships Saturday pounded militant ...,O O B-tim O O O O O B-geo O O O O O B-org O O ...,"[(Helicopter, O), (gunships, O), (Saturday, B-..."
3,They left after a tense hour-long standoff wit...,O O O O O O O O O O O,"[(They, O), (left, O), (after, O), (a, O), (te..."
4,U.N. relief coordinator Jan Egeland said Sunda...,B-geo O O B-per I-per O B-tim O B-geo O B-gpe ...,"[(U.N., B-LOCATION), (relief, O), (coordinator..."
...,...,...,...
47941,Opposition leader Mir Hossein Mousavi has said...,O O O B-per I-per O O O O O O O O O O O O O O ...,"[(Opposition, O), (leader, O), (Mir, O), (Hoss..."
47942,"On Thursday , Iranian state media published a ...",O B-tim O B-gpe O O O O O O O O B-org I-org O ...,"[(On, O), (Thursday, B-tim), (,, O), (Iranian,..."
47943,"Following Iran 's disputed June 12 elections ,...",O B-geo O O B-tim I-tim O O O O O O O O O O O ...,"[(Following, O), (Iran, B-LOCATION), ('s, O), ..."
47944,"Since then , authorities have held public tria...",O O O O O O O O O O O O O O O O O O O O O,"[(Since, O), (then, O), (,, O), (authorities, ..."


In [16]:
# pip install sklearn-crfsuite
from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics

# feature extraction
def word2features(sent, i):
    word = sent[i][0]
    
    features = {
        'bias': 1.0,                     
        'word.lower()': word.lower(),    # lowercase form of the word
        'word[-3:]': word[-3:],          # last 3 characters (useful for suffixes)
        'word[-2:]': word[-2:],          # last 2 characters
        'word.isupper()': word.isupper(),# is the word all uppercase?
        'word.istitle()': word.istitle(),# is the word capitalized like a title?
        'word.isdigit()': word.isdigit(),# is the word a digit?
    }

    # features from the previous word, if it exists
    if i > 0:
        word1 = sent[i-1][0]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        # indicate beginning of sentence
        features['BOS'] = True

    # features from the next word, if it exists
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        # indicate end of sentence
        features['EOS'] = True

    return features

# convert full sentence to list of feature dicts
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

# extract labels from sentence
def sent2labels(sent):
    return [label for token, label in sent]

# extract tokens only from sentence (optional)
def sent2tokens(sent):
    return [token for token, label in sent]

# assume df['token_tag_pairs'] contains list of (token, tag) pairs per sentence
sentences = df['token_tag_pairs'].tolist()

In [17]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

# initialize the crf model with hyperparameters
crf = CRF(
    algorithm='lbfgs',           
    c1=0.1,                    
    c2=0.1,                    
    max_iterations=100,          # max number of training iterations
    all_possible_transitions=True  # include transitions not seen in training
)

# training the crf model
crf.fit(X, y)

In [None]:
# load test set
df_test = pd.read_csv("NER-test.tsv", sep="\t")
grouped = df_test.groupby("sentence_id")

# extract tokens and tags by sentence
test_sentences = []
test_labels = []
for _, group in grouped:
    tokens = group["token"].tolist()
    labels = group["BIO_NER_tag"].tolist()
    test_sentences.append(list(zip(tokens, labels)))
    test_labels.append(labels)

# convert test data to features
X_test = [sent2features(s) for s in test_sentences]

# use a simplified sent2labels that expects just the list of tags
def sent2labels(sent_labels):
    return sent_labels



In [30]:
y_pred = crf.predict(X_test)
y_test = [sent2labels(tags) for tags in test_labels]


# Flatten the nested lists
flat_y_test = [label for seq in y_test for label in seq]
flat_y_pred = [label for seq in y_pred for label in seq]

# Get only the labels present in the test set
labels = sorted(set(flat_y_test), key=lambda x: (x.split('-')[-1], x[0]))

print(classification_report(flat_y_test, flat_y_pred, labels=labels, zero_division=0))


               precision    recall  f1-score   support

   B-LOCATION       0.38      1.00      0.55         3
   I-LOCATION       0.67      1.00      0.80         2
            O       0.89      0.97      0.93       159
        B-ORG       0.50      0.38      0.43         8
        I-ORG       0.38      0.60      0.46         5
     B-PERSON       0.89      0.67      0.76        12
     I-PERSON       1.00      0.54      0.70        13
B-WORK_OF_ART       0.00      0.00      0.00         6
I-WORK_OF_ART       0.00      0.00      0.00         8

    micro avg       0.84      0.83      0.84       216
    macro avg       0.52      0.57      0.51       216
 weighted avg       0.80      0.83      0.81       216

