In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report
import random

# Load only the first 10,000 rows of the pipe-delimited name file. Passing
# ``nrows`` stops the parser early instead of reading the whole file and then
# discarding everything after ``head(10000)``.
# Missing values become empty strings so the name parts can be concatenated
# and split without NaN checks; the chained (non in-place) call keeps this
# cell idempotent on re-run.
df = pd.read_csv('../data/clean_x.csv', sep='|', header=0, nrows=10000).fillna('')

# Give the raw columns descriptive names.
# NOTE(review): the mapping '1' -> last, 'A' -> first, 'B' -> middle is taken
# as-is from the existing code — confirm against the CSV header.
df['last_name'] = df['1']
df['first_name'] = df['A']
df['middle_name'] = df['B']

# Lookup sets used as boolean token features by extract_features.
suffixes = {'JR', 'SR', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII'}
surname_indicators = {'DE', 'DEL', 'DELA', 'DELOS', 'SAN', 'SANTA', 'SANTO'}

def extract_features(tokens, index):
    """Return the feature dictionary describing ``tokens[index]``.

    The dictionary holds the token itself, its length, its position divided
    by the number of tokens, membership flags for the module-level
    ``suffixes`` and ``surname_indicators`` sets, and punctuation flags.
    The same identity/membership features are added for the previous and
    next tokens when they exist; otherwise ``'BOS'`` (no previous token) or
    ``'EOS'`` (no next token) is set to ``True``.

    Args:
        tokens: sequence of string tokens making up one name.
        index: position of the token to describe.

    Returns:
        dict mapping feature names to str / int / float / bool values.
    """
    current = tokens[index]
    total = len(tokens)

    def describe_neighbour(prefix, token):
        # Identity plus set-membership flags for an adjacent token.
        return {
            prefix: token,
            f'{prefix}.is_suffix': token in suffixes,
            f'{prefix}.is_surname_indicator': token in surname_indicators,
        }

    feats = {
        'word': current,
        'word.length': len(current),
        'word.relative_position': index / total,
        'word.is_suffix': current in suffixes,
        'word.is_surname_indicator': current in surname_indicators,
    }
    for label, char in (('hyphen', '-'), ('period', '.'), ('comma', ',')):
        feats[f'word.contains_{label}'] = char in current

    # Left context, or the beginning-of-sequence marker.
    if index <= 0:
        feats['BOS'] = True
    else:
        feats.update(describe_neighbour('prev_word', tokens[index - 1]))

    # Right context, or the end-of-sequence marker.
    if index >= total - 1:
        feats['EOS'] = True
    else:
        feats.update(describe_neighbour('next_word', tokens[index + 1]))

    return feats


def process_name_bio(row, reversed=False):
    """Render one name record as a token sequence with aligned BIO labels.

    Tokens and labels are produced from the same per-segment word lists, so
    they always have the same length. Building the string first and splitting
    it afterwards could emit a stand-alone "," token with no label when the
    last name was empty or had trailing whitespace, which shifted every
    subsequent label once the per-name lists were flattened.

    Args:
        row: mapping (e.g. a DataFrame row or dict) with string values under
            'first_name', 'middle_name' and 'last_name'.
        reversed: when True, emit "LAST, FIRST MIDDLE" with the comma attached
            to the final last-name token; otherwise emit "FIRST MIDDLE LAST".
            (The name shadows the builtin but is kept for existing callers.)

    Returns:
        dict with:
            'full_name': the tokens joined by single spaces,
            'tokens': list of whitespace-separated tokens,
            'labels': list of 'B-<SEGMENT>' / 'I-<SEGMENT>' tags, one per token.
        Empty segments contribute neither tokens nor labels.
    """
    if reversed:
        segments = ('LAST', 'FIRST', 'MIDDLE')
    else:
        segments = ('FIRST', 'MIDDLE', 'LAST')

    tokens = []
    labels = []
    for name_segment in segments:
        segment_words = row[f'{name_segment.lower()}_name'].split()
        if not segment_words:
            continue
        if reversed and name_segment == 'LAST':
            # The separator belongs to the last name's final token, exactly
            # as it appears after splitting "LAST, FIRST MIDDLE".
            segment_words[-1] += ','
        tokens.extend(segment_words)
        labels.append(f'B-{name_segment}')
        labels.extend([f'I-{name_segment}'] * (len(segment_words) - 1))

    return {
        'full_name': ' '.join(tokens),
        'tokens': tokens,
        'labels': labels
    }


# Apply the processing function to create a new dataframe with BIO labels.
# Each record is rendered in either "FIRST MIDDLE LAST" or "LAST, FIRST MIDDLE"
# order, chosen at random. The generator is seeded so that a fresh
# Restart & Run All reproduces the same dataset (and therefore the same metrics).
random.seed(42)
bio_data = df.apply(lambda row: process_name_bio(row, reversed=random.choice([True, False])), axis=1).tolist()
bio_df = pd.DataFrame(bio_data)

# Add features to the DataFrame: one feature dict per token of every name.
# Iterating the tokens column directly avoids a second row-wise DataFrame.apply.
bio_df['features'] = [
    [extract_features(tokens, i) for i in range(len(tokens))]
    for tokens in bio_df['tokens']
]

# Flatten the per-name lists into one token-level sample list (X) and the
# matching token-level label list (y).
X = [
    token_features
    for name_features in bio_df['features']
    for token_features in name_features
]
y = [
    token_label
    for name_labels in bio_df['labels']
    for token_label in name_labels
]

# One-hot encode string features and pass numeric/boolean ones through,
# producing a dense matrix.
vec = DictVectorizer(sparse=False)
X_vec = vec.fit_transform(X)

# Map the BIO tag strings to integer class ids.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

# Hold out 20% of the tokens for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X_vec,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# Initialize and train the Naive Bayes model
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Make predictions on the test set
y_pred = nb.predict(X_test)

# Evaluate the model.
# ``labels`` is passed explicitly so that every encoded class gets a row and
# stays matched to its name in ``target_names`` even when a rare class is
# absent from both y_test and y_pred; without it the number of inferred
# labels can differ from len(le.classes_).
print("Naive Bayes Model Performance:")
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
))

# # Function to predict on new data (disabled; uncomment to use)
# def predict_name_parts_nb(name, nb_model, vec, le):
#     tokens = name.split()
#     # extract_features describes one token at a time, so build one dict per token
#     features = [extract_features(tokens, i) for i in range(len(tokens))]
#     X_new = vec.transform(features)
#     y_pred = nb_model.predict(X_new)
#     labels = le.inverse_transform(y_pred)
#     return list(zip(tokens, labels))

# # Example usage
# sample_name = 'JUAN DELA CRUZ SANTOS'
# predicted_parts = predict_name_parts_nb(sample_name, nb, vec, le)
# print(f"\nPrediction for '{sample_name}':")
# for token, label in predicted_parts:
#     print(f"{token}: {label}")

# Get feature importance.
# DictVectorizer.get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when available and fall back for older releases.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()

# Each feature is scored by its largest per-class log probability.
# NOTE(review): this ranks features by how likely they are under some class,
# which favours large-valued/frequent features — not a discriminative
# importance measure.
feature_importances = nb.feature_log_prob_
sorted_features = sorted(
    zip(feature_names, feature_importances.max(axis=0)),
    key=lambda x: x[1],
    reverse=True
)

print("\nTop 10 most important features:")
for feature, importance in sorted_features[:10]:
    print(f"{feature}: {importance}")

Naive Bayes Model Performance:
              precision    recall  f1-score   support

     B-FIRST       0.88      0.85      0.87      2026
      B-LAST       0.85      0.72      0.78      1990
    B-MIDDLE       0.62      0.95      0.75      1932
     I-FIRST       0.97      0.49      0.65      1084
      I-LAST       0.00      0.00      0.00        35
    I-MIDDLE       0.71      0.07      0.13        70

    accuracy                           0.77      7137
   macro avg       0.67      0.51      0.53      7137
weighted avg       0.81      0.77      0.77      7137


Top 10 most important features:
word.length: -0.8361709610977037
word.relative_position: -3.329575571424014
BOS: -3.3648798337406056
EOS: -3.470331912250785
word.contains_comma: -3.4984262524764294
prev_word.is_surname_indicator: -5.2873015309305895
word=MAE: -5.832044239730096
prev_word=DE: -6.053876180045353
word=JOY: -6.080940287146721
word.is_suffix: -6.208279709523323


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
X_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])