In [1]:
import pandas as pd
import numpy as np

# Load the pipe-delimited name table and blank out missing cells so the string
# operations below never see NaN.
# NOTE(review): read_csv's default NA parsing turns literal cells such as 'NA'
# or 'NULL' into NaN, which fillna('') then blanks -- confirm no real name is
# spelled that way (keep_default_na=False would preserve them).
df = pd.read_csv('data/clean_x.csv', sep='|', header=0).fillna('')

# Column 'A' holds the first name, 'B' the middle name and '1' the last name.
# Joining with single spaces leaves a double space when the middle name is
# empty (and leading/trailing blanks when an outer part is empty), so the
# joined string is re-split and re-joined to normalise the whitespace.
df['full_name'] = (df['A'] + ' ' + df['B'] + ' ' + df['1']).str.split().str.join(' ')
df['last_name'] = df['1']
df['first_name'] = df['A']
df['middle_name'] = df['B']

In [2]:
# Preview the first ten rows, including the derived *_name columns.
df.head(10)

Unnamed: 0,0,1,2,A,B,is_surname_in_list,full_name,last_name,first_name,middle_name
0,20227007,QUE,ANGELA GIMENA,ANGELA,GIMENA,True,ANGELA GIMENA QUE,QUE,ANGELA,GIMENA
1,7008,QUERONA,NORLEY MAEKA YAN,NORLEY MAEKA,YAN,True,NORLEY MAEKA YAN QUERONA,QUERONA,NORLEY MAEKA,YAN
2,7009,QUERUBIN,MA TERESA BRAÑA,MA TERESA,BRAÑA,True,MA TERESA BRAÑA QUERUBIN,QUERUBIN,MA TERESA,BRAÑA
3,7010,QUERUBIN,MARIBELLE RAZON,MARIBELLE,RAZON,True,MARIBELLE RAZON QUERUBIN,QUERUBIN,MARIBELLE,RAZON
4,7011,QUEVEDO,RHEA JAINE LAGRAMA,RHEA JAINE,LAGRAMA,True,RHEA JAINE LAGRAMA QUEVEDO,QUEVEDO,RHEA JAINE,LAGRAMA
5,7012,QUEZADA,CRISTINA VERANO,CRISTINA,VERANO,True,CRISTINA VERANO QUEZADA,QUEZADA,CRISTINA,VERANO
6,7013,QUIA,NONITA SANTIAGO,NONITA,SANTIAGO,True,NONITA SANTIAGO QUIA,QUIA,NONITA,SANTIAGO
7,7014,QUIAMCO,MARIA LORETH OLETRES,MARIA LORETH OLETRES,,False,MARIA LORETH OLETRES QUIAMCO,QUIAMCO,MARIA LORETH OLETRES,
8,7015,QUIAÑO,SHARA MAE FOLLERO,SHARA MAE,FOLLERO,True,SHARA MAE FOLLERO QUIAÑO,QUIAÑO,SHARA MAE,FOLLERO
9,7016,QUIBIAL,YVEZ DE LOS ANGELES,YVEZ,DE LOS ANGELES,True,YVEZ DE LOS ANGELES QUIBIAL,QUIBIAL,YVEZ,DE LOS ANGELES


In [2]:
# Generational suffixes that can appear among first-name tokens but are not
# given names. Kept identical to the suffix set used for feature extraction so
# 'SR' and 'II' are filtered out here as well.
suffixes = {'JR', 'SR', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII'}

# common_names[segment] -> up to 100 most frequent names for that segment.
common_names = {}

for segment in ['middle', 'last']:
    segment_names = df[f'{segment}_name']
    # Keep single-word values only; empty strings split to zero words and
    # missing values never compare equal to 1, so both are excluded here.
    single_word = segment_names[segment_names.str.split().str.len() == 1]
    common_names[segment] = (
        single_word.groupby(single_word)
        .size()
        .sort_values(ascending=False)
        .index[:100]
        .tolist()
    )

# First names are counted per word because many given names are compound.
# value_counts() labels its index differently across pandas versions
# ('index' after reset_index on 1.x, the source column name on 2.x), so the
# axis is named explicitly instead of renaming an 'index' column afterwards.
word_count = (
    df['first_name']
    .str.split()
    .explode()
    .value_counts()
    .rename_axis('Word')
    .reset_index(name='Frequency')
)
word_count = word_count[~word_count['Word'].isin(suffixes)]
common_names['first'] = word_count['Word'].head(100).tolist()


In [4]:
def process_name_bio(row):
    """Tokenise one name record and produce a BIO label per name-part word.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with string values
            under 'full_name', 'first_name', 'middle_name' and 'last_name'.

    Returns:
        dict with:
            'full_name': the record's full name, unchanged.
            'tokens': the full name split on whitespace.
            'labels': for FIRST, MIDDLE and LAST in that order, 'B-<PART>'
                for the part's first word and 'I-<PART>' for each further
                word; parts with no words contribute nothing.
    """
    tokens = row['full_name'].split()

    labels = []
    for name_segment in ['FIRST', 'MIDDLE', 'LAST']:
        word_total = len(row[f'{name_segment.lower()}_name'].split())
        if word_total:
            # One "begin" tag, then an "inside" tag for every remaining word.
            labels.append(f'B-{name_segment}')
            labels += [f'I-{name_segment}'] * (word_total - 1)

    return {
        'full_name': row['full_name'],
        'tokens': tokens,
        'labels': labels
    }


In [5]:
# Label every row, then gather the per-row dicts into a frame with the
# columns full_name / tokens / labels.
labelled_rows = df.apply(process_name_bio, axis=1)
data = labelled_rows.tolist()
data_df = pd.DataFrame(data)

In [6]:
# Inspect the per-name tokens and their BIO labels.
data_df

Unnamed: 0,full_name,tokens,labels
0,ANGELA GIMENA QUE,"[ANGELA, GIMENA, QUE]","[B-FIRST, B-MIDDLE, B-LAST]"
1,NORLEY MAEKA YAN QUERONA,"[NORLEY, MAEKA, YAN, QUERONA]","[B-FIRST, I-FIRST, B-MIDDLE, B-LAST]"
2,MA TERESA BRAÑA QUERUBIN,"[MA, TERESA, BRAÑA, QUERUBIN]","[B-FIRST, I-FIRST, B-MIDDLE, B-LAST]"
3,MARIBELLE RAZON QUERUBIN,"[MARIBELLE, RAZON, QUERUBIN]","[B-FIRST, B-MIDDLE, B-LAST]"
4,RHEA JAINE LAGRAMA QUEVEDO,"[RHEA, JAINE, LAGRAMA, QUEVEDO]","[B-FIRST, I-FIRST, B-MIDDLE, B-LAST]"
...,...,...,...
1183447,BRITTANNY AIKI MARI FUSILERO,"[BRITTANNY, AIKI, MARI, FUSILERO]","[B-FIRST, I-FIRST, B-MIDDLE, B-LAST]"
1183448,DIANA ROSE IDOS FUSILERO,"[DIANA, ROSE, IDOS, FUSILERO]","[B-FIRST, I-FIRST, B-MIDDLE, B-LAST]"
1183449,JMKELVIN MIRANDA FUSIN,"[JMKELVIN, MIRANDA, FUSIN]","[B-FIRST, B-MIDDLE, B-LAST]"
1183450,JUDY ANN SAGION FUSTER,"[JUDY, ANN, SAGION, FUSTER]","[B-FIRST, I-FIRST, B-MIDDLE, B-LAST]"


In [7]:
# Lookup sets consulted by extract_features for the current token and its
# immediate neighbours.
suffixes = {
    'JR', 'SR', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII',
}
surname_indicators = {
    'DE', 'DEL', 'DELA', 'DELOS', 'SAN', 'SANTA', 'SANTO',
}


def extract_features(tokens, index, common_first_names, common_last_names):
    """Build the feature dict for the token at ``index`` within ``tokens``.

    Args:
        tokens: Sequence of name tokens (strings).
        index: Position of the token to describe.
        common_first_names: Container tested with ``in`` for the
            'word.in_common_first_names' flag.
        common_last_names: Container tested with ``in`` for the
            'word.in_common_last_names' flag.

    Returns:
        dict of features: the token itself, its length and position, lexicon
        and punctuation flags, then either 'BOS' or the previous token's
        features, then either 'EOS' or the next token's features.
    """
    token_count = len(tokens)
    final_index = token_count - 1
    current = tokens[index]

    features = {
        'word': current,
        'word.length': len(current),
        'word.is_first': index == 0,
        'word.is_last': index == final_index,
        'word.position': index,
        'word.relative_position': index / token_count,
        'word.is_suffix': current in suffixes,
        'word.is_surname_indicator': current in surname_indicators,
        'word.in_common_first_names': current in common_first_names,
        'word.in_common_last_names': current in common_last_names,
        'word.contains_hyphen': '-' in current,
        'word.contains_period': '.' in current,
    }

    # Left context: flag the start of the sequence, otherwise describe the
    # preceding token.
    if index <= 0:
        features['BOS'] = True
    else:
        before = tokens[index - 1]
        features['prev_word'] = before
        features['prev_word.is_suffix'] = before in suffixes
        features['prev_word.is_surname_indicator'] = before in surname_indicators

    # Right context: flag the end of the sequence, otherwise describe the
    # following token.
    if index >= final_index:
        features['EOS'] = True
    else:
        after = tokens[index + 1]
        features['next_word'] = after
        features['next_word.is_suffix'] = after in suffixes
        features['next_word.is_surname_indicator'] = after in surname_indicators

    return features
    
# Build the lexicon sets once. The original rebuilt both sets for every token
# of every row, which is pure loop-invariant work on a frame this size.
common_first_name_set = set(common_names['first'])
common_last_name_set = set(common_names['last'])

# One feature dict per token, kept as a list per name so it lines up with the
# 'tokens' and 'labels' columns. Only 'tokens' is needed, so apply over that
# column instead of materialising every row with DataFrame.apply(axis=1).
data_df['features'] = data_df['tokens'].apply(
    lambda tokens: [
        extract_features(tokens, i, common_first_name_set, common_last_name_set)
        for i in range(len(tokens))
    ]
)

In [8]:
# Inspect data_df again now that it carries the 'features' column.
data_df

Unnamed: 0,full_name,tokens,labels,features
0,ANGELA GIMENA QUE,"[ANGELA, GIMENA, QUE]","[B-FIRST, B-MIDDLE, B-LAST]","[{'word': 'ANGELA', 'word.length': 6, 'word.is..."
1,NORLEY MAEKA YAN QUERONA,"[NORLEY, MAEKA, YAN, QUERONA]","[B-FIRST, I-FIRST, B-MIDDLE, B-LAST]","[{'word': 'NORLEY', 'word.length': 6, 'word.is..."
2,MA TERESA BRAÑA QUERUBIN,"[MA, TERESA, BRAÑA, QUERUBIN]","[B-FIRST, I-FIRST, B-MIDDLE, B-LAST]","[{'word': 'MA', 'word.length': 2, 'word.is_fir..."
3,MARIBELLE RAZON QUERUBIN,"[MARIBELLE, RAZON, QUERUBIN]","[B-FIRST, B-MIDDLE, B-LAST]","[{'word': 'MARIBELLE', 'word.length': 9, 'word..."
4,RHEA JAINE LAGRAMA QUEVEDO,"[RHEA, JAINE, LAGRAMA, QUEVEDO]","[B-FIRST, I-FIRST, B-MIDDLE, B-LAST]","[{'word': 'RHEA', 'word.length': 4, 'word.is_f..."
...,...,...,...,...
1183447,BRITTANNY AIKI MARI FUSILERO,"[BRITTANNY, AIKI, MARI, FUSILERO]","[B-FIRST, I-FIRST, B-MIDDLE, B-LAST]","[{'word': 'BRITTANNY', 'word.length': 9, 'word..."
1183448,DIANA ROSE IDOS FUSILERO,"[DIANA, ROSE, IDOS, FUSILERO]","[B-FIRST, I-FIRST, B-MIDDLE, B-LAST]","[{'word': 'DIANA', 'word.length': 5, 'word.is_..."
1183449,JMKELVIN MIRANDA FUSIN,"[JMKELVIN, MIRANDA, FUSIN]","[B-FIRST, B-MIDDLE, B-LAST]","[{'word': 'JMKELVIN', 'word.length': 8, 'word...."
1183450,JUDY ANN SAGION FUSTER,"[JUDY, ANN, SAGION, FUSTER]","[B-FIRST, I-FIRST, B-MIDDLE, B-LAST]","[{'word': 'JUDY', 'word.length': 4, 'word.is_f..."


In [9]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the names; random_state is fixed so the split is repeatable.
# NOTE(review): common_names was computed from the full df before this split,
# so the 'word.in_common_*' features have already seen held-out rows --
# confirm that is acceptable for the reported scores.
train_df, test_df = train_test_split(data_df, test_size=0.2, random_state=42)

# Prepare data for CRF: per-name lists of feature dicts (X) and BIO labels (y).
X_train, y_train = train_df['features'].tolist(), train_df['labels'].tolist()
X_test, y_test = test_df['features'].tolist(), test_df['labels'].tolist()

# Display sample
print(train_df.head())
print("\nSample X_train:", X_train[0], sep="\n")
print("\nSample y_train:", y_train[0], sep="\n")

                          full_name                            tokens  \
279258       CHAIRA DE GUIA LIWANAG       [CHAIRA, DE, GUIA, LIWANAG]   
691845  BEULAH GRACE ARROYO DACANAY  [BEULAH, GRACE, ARROYO, DACANAY]   
531360        CHRISTINE ALIN CABANG         [CHRISTINE, ALIN, CABANG]   
169950    MICHELLE MARVIL AGCAMARAN     [MICHELLE, MARVIL, AGCAMARAN]   
494038       SOPHIA CELMAR GALENDEZ        [SOPHIA, CELMAR, GALENDEZ]   

                                       labels  \
279258  [B-FIRST, B-MIDDLE, I-MIDDLE, B-LAST]   
691845   [B-FIRST, I-FIRST, B-MIDDLE, B-LAST]   
531360            [B-FIRST, B-MIDDLE, B-LAST]   
169950            [B-FIRST, B-MIDDLE, B-LAST]   
494038            [B-FIRST, B-MIDDLE, B-LAST]   

                                                 features  
279258  [{'word': 'CHAIRA', 'word.length': 6, 'word.is...  
691845  [{'word': 'BEULAH', 'word.length': 6, 'word.is...  
531360  [{'word': 'CHRISTINE', 'word.length': 9, 'word...  
169950  [{'word': 'MICHELL

In [10]:
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn.metrics import classification_report

# Initialize the CRF model; the hyper-parameters are passed through unchanged.
crf_settings = dict(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
crf = sklearn_crfsuite.CRF(**crf_settings)

# Train on the training sequences, then label the held-out sequences.
crf.fit(X_train, y_train)
y_pred = crf.predict(X_test)

# Evaluate: per-label precision / recall / F1 over all test tokens.
print("CRF Model Performance:")
report = metrics.flat_classification_report(y_test, y_pred)
print(report)

# Rank the learned state features, keyed by (attribute, label), by the
# magnitude of their weight, largest first.
feature_importances = crf.state_features_
sorted_features = sorted(
    feature_importances.items(),
    key=lambda item: abs(item[1]),
    reverse=True,
)

print("\nTop 10 most important features:")
for feature, weight in sorted_features[:10]:
    print(f"{feature}: {weight}")

CRF Model Performance:
              precision    recall  f1-score   support

     B-FIRST       1.00      1.00      1.00    236691
      B-LAST       1.00      1.00      1.00    236691
    B-MIDDLE       0.96      1.00      0.98    220981
     I-FIRST       0.99      0.94      0.96    133215
      I-LAST       1.00      0.99      0.99      7456
    I-MIDDLE       0.99      0.99      0.99      6501

    accuracy                           0.99    841535
   macro avg       0.99      0.98      0.99    841535
weighted avg       0.99      0.99      0.99    841535


Top 10 most important features:
('word.is_last', 'B-MIDDLE'): -11.391395
('word.is_last', 'I-FIRST'): -11.062834
('word.is_last', 'I-MIDDLE'): -10.978739
('prev_word:STA', 'I-MIDDLE'): 10.158458
('prev_word:STO', 'I-MIDDLE'): 8.787141
('word:DC', 'I-FIRST'): 7.604971
('word:MAE', 'I-FIRST'): 6.923913
('word:DG', 'I-FIRST'): 6.857006
('word:DL', 'I-FIRST'): 6.783962
('word:PEARL', 'I-FIRST'): 6.772407
