## NER data SequenceList creation

In [2]:
from utils.NERcorpus import NERCorpus

In [3]:
data_path = "../nlp_d2_data/"

In [4]:
corpus = NERCorpus()
train_seq = corpus.read_sequence_list_csv(f"{data_path}train_data_ner.csv")
test_seq = corpus.read_sequence_list_csv(f"{data_path}test_data_ner.csv")

In [5]:
corpus.tag_dict

{'O': 0,
 'B-geo': 1,
 'B-gpe': 2,
 'B-tim': 3,
 'B-org': 4,
 'I-geo': 5,
 'B-per': 6,
 'I-per': 7,
 'I-org': 8,
 'B-art': 9,
 'I-art': 10,
 'I-tim': 11,
 'I-gpe': 12,
 'B-nat': 13,
 'I-nat': 14,
 'B-eve': 15,
 'I-eve': 16}

In [6]:
print('Sequence example:')
sequence = train_seq[0]
sequence

Sequence example:


0/0 1/0 2/0 3/0 4/0 5/0 6/1 7/0 8/0 9/0 10/0 11/0 12/1 13/0 14/0 9/0 15/0 1/0 16/2 17/0 18/0 19/0 20/0 21/0 

In [7]:
print('Sequence example in corresponding words and tags:')
sequence.to_words(sequence_list=train_seq)

Sequence example in corresponding words and tags:


'Thousands/O of/O demonstrators/O have/O marched/O through/O London/B-geo to/O protest/O the/O war/O in/O Iraq/B-geo and/O demand/O the/O withdrawal/O of/O British/B-gpe troops/O from/O that/O country/O ./O '

In [8]:
print('Train and test tag dictionaries assign the same integers to the same tags?', test_seq.y_dict == train_seq.y_dict)

Train and test tag dictionaries assign the same integers to the same tags? True


In [9]:
print('Length of train word dictionary:', len(train_seq.x_dict))
print('Length of test word dictionary:', len(test_seq.x_dict))

Length of train word dictionary: 55145
Length of test word dictionary: 55145


## FeatureMapper

In [10]:
from skseq.sequences.id_feature import IDFeatures

# Step 1: Instantiate the feature mapper with the training SequenceList
feature_mapper = IDFeatures(train_seq)

# Step 2: Build features from the training data
feature_mapper.build_features()

We can see the computed features here:

In [None]:
feature_mapper.feature_dict

{'init_tag:O': 0,
 'id:Thousands::O': 1,
 'id:of::O': 2,
 'prev_tag:O::O': 3,
 'id:demonstrators::O': 4,
 'id:have::O': 5,
 'id:marched::O': 6,
 'id:through::O': 7,
 'id:London::B-geo': 8,
 'prev_tag:O::B-geo': 9,
 'id:to::O': 10,
 'prev_tag:B-geo::O': 11,
 'id:protest::O': 12,
 'id:the::O': 13,
 'id:war::O': 14,
 'id:in::O': 15,
 'id:Iraq::B-geo': 16,
 'id:and::O': 17,
 'id:demand::O': 18,
 'id:withdrawal::O': 19,
 'id:British::B-gpe': 20,
 'prev_tag:O::B-gpe': 21,
 'id:troops::O': 22,
 'prev_tag:B-gpe::O': 23,
 'id:from::O': 24,
 'id:that::O': 25,
 'id:country::O': 26,
 'id:.::O': 27,
 'final_prev_tag:O': 28,
 'id:Helicopter::O': 29,
 'id:gunships::O': 30,
 'id:Saturday::B-tim': 31,
 'prev_tag:O::B-tim': 32,
 'id:pounded::O': 33,
 'prev_tag:B-tim::O': 34,
 'id:militant::O': 35,
 'id:hideouts::O': 36,
 'id:Orakzai::B-geo': 37,
 'id:tribal::O': 38,
 'id:region::O': 39,
 'id:,::O': 40,
 'id:where::O': 41,
 'id:many::O': 42,
 'id:Taliban::B-org': 43,
 'prev_tag:O::B-org': 44,
 'id:milita

In [13]:
len(feature_mapper.feature_dict)

39802

In [None]:
# In feature_list: [initial_features, transition_features, final_features, emission_features]
feature_mapper.feature_list

[[[[0]],
  [[3],
   [3],
   [3],
   [3],
   [3],
   [9],
   [11],
   [3],
   [3],
   [3],
   [3],
   [9],
   [11],
   [3],
   [3],
   [3],
   [3],
   [21],
   [23],
   [3],
   [3],
   [3],
   [3]],
  [[28]],
  [[1],
   [2],
   [4],
   [5],
   [6],
   [7],
   [8],
   [10],
   [12],
   [13],
   [14],
   [15],
   [16],
   [17],
   [18],
   [13],
   [19],
   [2],
   [20],
   [22],
   [24],
   [25],
   [26],
   [27]]],
 [[[0]],
  [[3],
   [32],
   [34],
   [3],
   [3],
   [3],
   [3],
   [9],
   [11],
   [3],
   [3],
   [3],
   [3],
   [44],
   [46],
   [3],
   [3],
   [3],
   [3],
   [3],
   [3],
   [3],
   [3],
   [3],
   [3],
   [3],
   [3],
   [3],
   [9],
   [58],
   [59]],
  [[28]],
  [[29],
   [30],
   [31],
   [33],
   [35],
   [36],
   [15],
   [13],
   [37],
   [38],
   [39],
   [40],
   [41],
   [42],
   [43],
   [45],
   [47],
   [48],
   [10],
   [5],
   [49],
   [10],
   [50],
   [51],
   [52],
   [53],
   [54],
   [15],
   [55],
   [56],
   [57],
   [27]]],
 [[[0]],
  [[3], [

 Each feature represents a condition, such as "this word is 'London' and its tag is `B-geo`" (`id:London::B-geo`) or "the previous tag was `O` and the current tag is `B-geo`" (`prev_tag:O::B-geo`). When the feature_mapper.build_features() method is called, it loops through the dataset and records every such feature it finds, storing them in feature_dict (a mapping from feature descriptions to unique indices) and feature_list (the same features in list form). These features are not tied to specific positions in the data but are collected globally across all sequences to build a vocabulary of useful signals.

At inference or training time, a model would processes one sequence at a time. For each position in a sequence (e.g., a word and its tag), the model would activate a subset of these predefined features, those that apply to the current word, current tag, and previous tag. 

In [20]:
# for any position i, this will be len 4, corresponding to 
# initial, emission, transition, and final features
m = 0
len(feature_mapper.feature_list[m])

4

In [45]:
feature_mapper.feature_dict_inv = {v: k for k, v in feature_mapper.feature_dict.items()}

Let's visualize the emission features for the first sequence, that is, which tags could be possibly be assigned to each word based on what has the feature mapper seen in the whole corpus.

In [51]:
import pandas as pd

sequence = train_seq.seq_list[0]  # First sequence
words = [train_seq.x_dict.get_label_name(wid) for wid in sequence.x]
tags = [train_seq.y_dict.get_label_name(tid) for tid in range(len(train_seq.y_dict))]

# Initialize an empty DataFrame with tags as index and words as columns
df = pd.DataFrame("", index=tags, columns=words)

for pos, word in enumerate(words):
    for tag_id, tag in enumerate(tags):
        emission_feats = feature_mapper.get_emission_features(sequence, pos, tag_id)
        if emission_feats:
            feat_names = [feature_mapper.feature_dict.get_label_name(fid) for fid in emission_feats]
            df.at[tag, word] = feat_names

df

Unnamed: 0,Thousands,of,demonstrators,have,marched,through,London,to,protest,the,...,demand,the.1,withdrawal,of.1,British,troops,from,that,country,.
O,[id:Thousands::O],id:of::O,[id:demonstrators::O],[id:have::O],[id:marched::O],[id:through::O],,[id:to::O],[id:protest::O],id:the::O,...,[id:demand::O],id:the::O,[id:withdrawal::O],id:of::O,[id:British::O],[id:troops::O],[id:from::O],[id:that::O],[id:country::O],[id:.::O]
B-geo,,id:of::B-geo,,,,,[id:London::B-geo],,,id:the::B-geo,...,,id:the::B-geo,,id:of::B-geo,,,,,,
B-gpe,,,,,,,[id:London::B-gpe],,,,...,,,,,[id:British::B-gpe],,,,,
B-tim,,id:of::B-tim,,,,[id:through::B-tim],,[id:to::B-tim],,id:the::B-tim,...,,id:the::B-tim,,id:of::B-tim,,,[id:from::B-tim],[id:that::B-tim],,
B-org,,id:of::B-org,,,,,[id:London::B-org],,,,...,,,,id:of::B-org,[id:British::B-org],,,,,
I-geo,,id:of::I-geo,,,,,[id:London::I-geo],,,id:the::I-geo,...,,id:the::I-geo,,id:of::I-geo,,,,,,
B-per,,,,,,,,,,,...,,,,,,,,,,
I-per,,id:of::I-per,,,,,[id:London::I-per],,,,...,,,,id:of::I-per,,,,,,
I-org,,id:of::I-org,,,,,,[id:to::I-org],,id:the::I-org,...,,id:the::I-org,,id:of::I-org,[id:British::I-org],,,,,
B-art,,,,,,,,,,,...,,,,,[id:British::B-art],,,,,


Now let's check the transition features, which transition features exist for tag combinations between adjacent words.

In [58]:
transition_feats = feature_mapper.feature_list[0][1]  # index 1 = transition features

# Build tag names
tags = [train_seq.y_dict.get_label_name(i) for i in range(len(train_seq.y_dict))]

# For visualization
rows = []

for pos in range(1, len(sequence.x)):
    word = train_seq.x_dict.get_label_name(sequence.x[pos])
    word_prev = train_seq.x_dict.get_label_name(sequence.x[pos - 1])

    for prev_tag_id, prev_tag in enumerate(tags):
        for curr_tag_id, curr_tag in enumerate(tags):
            # Get features for this transition
            feat_ids = feature_mapper.get_transition_features(sequence, pos, curr_tag_id, prev_tag_id)
            feat_ids = [f for f in feat_ids if f != -1]

            if feat_ids:
                feat_names = [feature_mapper.feature_dict.get_label_name(fid) for fid in feat_ids]
                rows.append({
                    "Pos": pos,
                    "Prev Word": word_prev,
                    "Word": word,
                    "Prev Tag": prev_tag,
                    "Curr Tag": curr_tag,
                    "Features": ", ".join(feat_names)
                })

df = pd.DataFrame(rows)
df.head(20)


Unnamed: 0,Pos,Prev Word,Word,Prev Tag,Curr Tag,Features
0,1,Thousands,of,O,O,prev_tag:O::O
1,1,Thousands,of,O,B-geo,prev_tag:O::B-geo
2,1,Thousands,of,O,B-gpe,prev_tag:O::B-gpe
3,1,Thousands,of,O,B-tim,prev_tag:O::B-tim
4,1,Thousands,of,O,B-org,prev_tag:O::B-org
5,1,Thousands,of,O,B-per,prev_tag:O::B-per
6,1,Thousands,of,O,B-art,prev_tag:O::B-art
7,1,Thousands,of,O,B-nat,prev_tag:O::B-nat
8,1,Thousands,of,O,B-eve,prev_tag:O::B-eve
9,1,Thousands,of,B-geo,O,prev_tag:B-geo::O
