In [None]:
%pip install sklearn_crfsuite

Collecting sklearn_crfsuite
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn_crfsuite)
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting tabulate>=0.4.2 (from sklearn_crfsuite)
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: python-crfsuite, tabulate, sklearn_crfsuite
Successfully installed python-crfsuite-0.9.10 sklearn_crfsuite-0.5.0 tabulate-0.9.0


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV loaded later in this cell is read from that mount.
drive.mount('/content/drive')

import pandas as pd
import numpy as np

# Load the pipe-delimited names file and turn every missing cell into an empty
# string, so the name columns can be concatenated and split without NaN checks.
# NOTE(review): pandas' default NA parsing also maps literal values such as "NA"
# or "NULL" to missing; confirm no real names in the file are spelled that way.
df = pd.read_csv('/content/drive/MyDrive/data/clean_x.csv', sep='|', header=0).fillna('')

# Expose the raw columns '1', 'A' and 'B' under the descriptive names used downstream.
df = df.assign(
    last_name=df['1'],
    first_name=df['A'],
    middle_name=df['B'],
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def process_name_bio(row, reversed=False):
    """Build one BIO-tagged training example from a row of name parts.

    Args:
        row: Mapping-like object (e.g. a DataFrame row or dict) with string
            values under 'first_name', 'middle_name' and 'last_name'.
        reversed: When False the name is laid out as "FIRST MIDDLE LAST".
            When True it is laid out as "LAST, FIRST MIDDLE"; the comma is
            attached to the final last-name token.

    Returns:
        dict with
            'full_name': the assembled name string,
            'tokens':    'full_name' split on whitespace,
            'labels':    one 'B-<SEGMENT>' / 'I-<SEGMENT>' tag per token.

    The tokens and labels always have the same length. In particular, a
    reversed name with an empty last name no longer gets a leading ", ",
    which used to produce a stray ',' token that had no label.
    """
    segments = ['LAST', 'FIRST', 'MIDDLE'] if reversed else ['FIRST', 'MIDDLE', 'LAST']
    parts = {segment: row[f'{segment.lower()}_name'] for segment in segments}

    if reversed:
        given_names = ' '.join(parts[segment] for segment in segments[1:] if parts[segment])
        if parts['LAST'].split():
            full_name = parts['LAST'] + ', ' + given_names
        else:
            # No surname words: emitting ", " would create an unlabeled ',' token.
            full_name = given_names
    else:
        full_name = ' '.join(parts[segment] for segment in segments if parts[segment])

    tokens = full_name.split()

    # Tag the first word of each non-empty segment B-, every following word I-.
    labels = []
    for segment in segments:
        word_count = len(parts[segment].split())
        if word_count:
            labels.append(f'B-{segment}')
            labels.extend([f'I-{segment}'] * (word_count - 1))

    return {
        'full_name': full_name,
        'tokens': tokens,
        'labels': labels
    }


In [None]:
import random

# Use a dedicated, seeded generator so the same rows receive the reversed
# ("LAST, FIRST MIDDLE") layout on every run; the unseeded module-level
# random.choice produced a different dataset each time the cell executed.
name_order_rng = random.Random(42)

# Each row becomes a dict with 'full_name', 'tokens' and 'labels'.
data = df.apply(
    lambda row: process_name_bio(row, reversed=name_order_rng.choice([True, False])),
    axis=1,
).tolist()
data_df = pd.DataFrame(data)

In [None]:
# Lookup tables for the lexical features (entries are upper-case, without punctuation)
suffixes = {'JR', 'SR', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII'}
surname_indicators = {'DE', 'DEL', 'DELA', 'DELOS', 'SAN', 'SANTA', 'SANTO'}


def _lookup_form(token):
    """Return the token upper-cased with leading/trailing '.' and ',' removed."""
    return token.strip('.,').upper()


def extract_features(tokens, index):
    """Build the CRF feature dict for the token at ``index``.

    Args:
        tokens: List of whitespace-separated name tokens for one name.
        index: Position in ``tokens`` of the token to describe.

    Returns:
        dict of features for the token itself, plus features of the previous
        and next token when they exist; 'BOS' / 'EOS' flags mark the first
        and last positions instead.

    The raw token text is kept for the 'word', 'prev_word' and 'next_word'
    features, but suffix / surname-indicator membership is tested on a
    normalised form. Tokens in reversed names carry the separating comma
    (e.g. 'JR,'), and abbreviations may carry a period ('JR.'); comparing the
    raw token against the lookup sets made those flags always False.
    """
    word = tokens[index]
    word_key = _lookup_form(word)
    features = {
        'word': word,
        'word.length': len(word),
        'word.relative_position': index / len(tokens),
        'word.is_suffix': word_key in suffixes,
        'word.is_surname_indicator': word_key in surname_indicators,
        'word.contains_hyphen': '-' in word,
        'word.contains_period': '.' in word,
        'word.contains_comma': ',' in word,
    }

    if index > 0:
        prev_word = tokens[index - 1]
        prev_key = _lookup_form(prev_word)
        features.update({
            'prev_word': prev_word,
            'prev_word.is_suffix': prev_key in suffixes,
            'prev_word.is_surname_indicator': prev_key in surname_indicators,
        })
    else:
        # First token of the name
        features['BOS'] = True

    if index < len(tokens) - 1:
        next_word = tokens[index + 1]
        next_key = _lookup_form(next_word)
        features.update({
            'next_word': next_word,
            'next_word.is_suffix': next_key in suffixes,
            'next_word.is_surname_indicator': next_key in surname_indicators,
        })
    else:
        # Last token of the name
        features['EOS'] = True

    return features

# One feature dict per token, stored as a list alongside each row's tokens.
data_df['features'] = data_df.apply(
    lambda row: [extract_features(row['tokens'], position) for position, _ in enumerate(row['tokens'])],
    axis=1,
)

In [None]:
# Cross-validation
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold

# Initialise the CRF model: L-BFGS training with L1 (c1) and L2 (c2)
# regularisation, capped at 100 iterations.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)

# Prepare for cross-validation. shuffle=True without a random_state draws a
# different permutation on every run, so the folds (and every reported metric)
# were not reproducible; fix the seed.
n_splits = 10
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Accumulators for the true and predicted label sequences across all folds
all_y_true = []
all_y_pred = []

# Convert the feature and label columns to plain lists so they can be indexed by fold position
X = data_df['features'].tolist()
y = data_df['labels'].tolist()

# Cross-validation loop: fit on the training folds, evaluate on the held-out fold
for fold, (train_index, val_index) in enumerate(kf.split(X), start=1):
    print(f"\nFold {fold}")

    # Select this fold's sequences by position
    X_train, y_train = [X[i] for i in train_index], [y[i] for i in train_index]
    X_val, y_val = [X[i] for i in val_index], [y[i] for i in val_index]

    # Refit the model on the training portion and label the held-out sequences
    crf.fit(X_train, y_train)
    y_pred = crf.predict(X_val)

    # Keep the gold and predicted sequences for the pooled report after the loop
    all_y_true.extend(y_val)
    all_y_pred.extend(y_pred)

    # Per-fold token-level report
    fold_report = metrics.flat_classification_report(y_val, y_pred)
    print(fold_report)

# Pooled report over every validation fold
print("\nOverall CRF Model Performance:")
overall_report = metrics.flat_classification_report(all_y_true, all_y_pred)
print(overall_report)

# Rank the learned state-feature weights by magnitude. `crf` holds whatever model
# its most recent fit() call produced, so these weights describe that one fit only.
feature_importances = crf.state_features_
sorted_features = sorted(feature_importances.items(), key=lambda item: -abs(item[1]))

print("\nTop 10 most important features:")
for feature, weight in sorted_features[:10]:
    print(f"{feature}: {weight}")


Fold 1
              precision    recall  f1-score   support

     B-FIRST       1.00      1.00      1.00    118346
      B-LAST       0.98      0.98      0.98    118346
    B-MIDDLE       0.93      0.98      0.95    110697
     I-FIRST       0.99      0.92      0.95     66668
      I-LAST       0.98      0.97      0.97      3667
    I-MIDDLE       0.96      0.98      0.97      3258

    accuracy                           0.97    420982
   macro avg       0.97      0.97      0.97    420982
weighted avg       0.97      0.97      0.97    420982


Fold 2
              precision    recall  f1-score   support

     B-FIRST       1.00      1.00      1.00    118346
      B-LAST       0.99      0.98      0.98    118346
    B-MIDDLE       0.94      0.98      0.96    110521
     I-FIRST       0.99      0.93      0.96     66154
      I-LAST       0.99      0.96      0.97      3579
    I-MIDDLE       0.96      0.98      0.97      3263

    accuracy                           0.98    420209
   macr