# In [2]:
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from itertools import combinations
import numpy as np
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
import jellyfish

### Newest variant (commented out; the active generate_blocking_keys is defined further below)
# def generate_blocking_keys(df):
#     # Fill missing values with empty strings
#     df.fillna('', inplace=True)

#     ### hierarchical blocking: fileid
#     # df['filename_blocking_key'] = df['id'].str.lower().str.strip()
    
#     # Generate partial blocking keys from names
#     df['last_name_blocking_key'] = df['entity_1_last_name'].str.lower().str[:3] + '_' + df['entity_2_last_name'].str.lower().str[:3]
#     df['first_name_blocking_key'] = df['entity_1_first_name'].str.lower().str[:3] + '_' + df['entity_2_first_name'].str.lower().str[:3]
    
#     # Generate a blocking key based on context similarity
#     df['context_similar'] = np.where(df['context_similarity'].astype(float) > 0.7, 'context_similar', 'context_not_similar')
    
#     # Integrate filename into the blocking key
    
#     # Combine all blocking keys into a single combined_blocking_key
#     df['combined_blocking_key'] = df.apply(lambda x: '_'.join([
#         x['first_name_blocking_key'],
#         x['last_name_blocking_key'],
#         x['context_similar'],
#     ]), axis=1)
    
#     return df



def generate_blocking_keys(df, context_similarity_threshold=0.9, jaccard_similarity_threshold=0.5, name_similarity_threshold=0.7):
    """Add name/context blocking-key columns to ``df`` and return it.

    ``df`` is modified in place: missing values are replaced by ``''`` across
    the whole frame, the name and similarity columns listed in
    ``string_columns`` are cast to ``str``, and four columns are added. The
    same object is returned.

    Columns added:
        last_name_blocking_key: first three lower-cased characters of each
            entity's last name, joined with ``_``.
        first_name_blocking_key: the same, built from the first names.
        context_similar: ``'context_similar'`` when ``context_similarity`` is
            strictly greater than ``context_similarity_threshold``, otherwise
            ``'context_not_similar'``. A missing score counts as not similar.
        combined_blocking_key:
            ``<first_name_blocking_key>_<last_name_blocking_key>_<context_similar>``.

    Args:
        df: DataFrame holding every column named in ``string_columns``.
        context_similarity_threshold: Cut-off applied to ``context_similarity``.
        jaccard_similarity_threshold: Accepted for interface compatibility;
            not used by this implementation.
        name_similarity_threshold: Accepted for interface compatibility;
            not used by this implementation.

    Returns:
        The same ``df`` object, with the columns above added.

    Raises:
        KeyError: If one of the columns in ``string_columns`` is absent.
        ValueError: If a non-empty ``context_similarity`` value cannot be
            parsed as a float.
    """
    # Fill NaN values with an empty string and ensure all entries are strings
    df.fillna('', inplace=True)
    string_columns = [
        'entity_1_last_name',
        'entity_2_last_name',
        'entity_1_first_name',
        'entity_2_first_name',
        'context_similarity',
        'jaccard_similarity_context',
        'first_name_similarity',
        'last_name_similarity',
    ]
    for col in string_columns:
        df[col] = df[col].astype(str)

    df['last_name_blocking_key'] = df['entity_1_last_name'].str.lower().str[:3] + '_' + df['entity_2_last_name'].str.lower().str[:3]
    df['first_name_blocking_key'] = df['entity_1_first_name'].str.lower().str[:3] + '_' + df['entity_2_first_name'].str.lower().str[:3]

    # Missing scores were turned into '' by the fillna above, and float('')
    # raises. Map '' back to NaN before converting; NaN > threshold is False,
    # so rows without a score fall into 'context_not_similar'.
    raw_score = df['context_similarity']
    context_score = raw_score.mask(raw_score == '').astype(float)
    df['context_similar'] = np.where(context_score > context_similarity_threshold, 'context_similar', 'context_not_similar')

    # Combine all keys to form a comprehensive blocking key. All three parts
    # are string columns, so plain concatenation replaces the per-row lambda.
    df['combined_blocking_key'] = df['first_name_blocking_key'] + '_' + df['last_name_blocking_key'] + '_' + df['context_similar']

    return df

# def generate_blocking_keys(df, context_similarity_threshold=0.9, jaccard_similarity_threshold=0.5, name_similarity_threshold=0.7):
#     # Fill NaN values with an empty string and ensure all entries are strings
#     df.fillna('', inplace=True)
#     for col in ['entity_1_last_name', 'entity_2_last_name', 'entity_1_first_name', 'entity_2_first_name', 'context_similarity', 'jaccard_similarity_context', 'first_name_similarity', 'last_name_similarity']:
#         df[col] = df[col].astype(str)

#     # Phonetic encoding for the first and last names using Metaphone
#     df['entity_1_last_name_phonetic'] = df['entity_1_last_name'].apply(lambda x: jellyfish.metaphone(x))
#     df['entity_2_last_name_phonetic'] = df['entity_2_last_name'].apply(lambda x: jellyfish.metaphone(x))
#     df['entity_1_first_name_phonetic'] = df['entity_1_first_name'].apply(lambda x: jellyfish.metaphone(x))
#     df['entity_2_first_name_phonetic'] = df['entity_2_first_name'].apply(lambda x: jellyfish.metaphone(x))

#     # Create phonetic blocking keys based on the phonetic encoding
#     df['phonetic_blocking_key'] = df.apply(lambda x: '_'.join([x['entity_1_last_name_phonetic'], x['entity_2_last_name_phonetic'], x['entity_1_first_name_phonetic'], x['entity_2_first_name_phonetic']]), axis=1)

#     # Integrate context similarity into blocking keys
#     df['context_similar'] = np.where(df['context_similarity'].astype(float) > context_similarity_threshold, 'context_similar', 'context_not_similar')

#     # Additional context check with Jaccard similarity
#     df['jaccard_context_similar'] = np.where(df['jaccard_similarity_context'].astype(float) > jaccard_similarity_threshold, 'jaccard_similar', 'jaccard_not_similar')

#     # Name similarity checks
#     df['name_similar'] = np.where((df['first_name_similarity'].astype(float) > name_similarity_threshold) & (df['last_name_similarity'].astype(float) > name_similarity_threshold), 'name_similar', 'name_not_similar')

#     # Rank similarity (assuming encoded rank similarity needs a specific threshold, this is arbitrary for the example)
#     df['rank_similar'] = np.where(abs(df['entity_1_rank_encoded'].astype(float) - df['entity_2_rank_encoded'].astype(float)) < 0.5, 'rank_similar', 'rank_not_similar')

#     # Combine all keys to form a comprehensive blocking key
#     df['combined_blocking_key'] = df.apply(lambda x: '_'.join([x['phonetic_blocking_key'], x['context_similar'],]), axis=1)

#     return df

def yield_record_pairs(df):
    """Yield candidate record pairs that share a combined blocking key.

    Rows of ``df`` are bucketed by their ``combined_blocking_key`` value. For
    each bucket, every unordered pair of index labels is yielded as a 2-tuple.
    A bucket holding a single row produces no pairs.
    """
    for _key, block in df.groupby('combined_blocking_key'):
        # combinations() is empty for fewer than two labels, so singleton
        # buckets are skipped without an explicit length check.
        yield from combinations(block.index, 2)

def train_model(X_train, y_train, n_splits=2, random_state=None, model_path='../data/output/trained_model.pkl'):
    """Cross-validate, fit and pickle a random-forest classifier.

    A ``RandomForestClassifier`` is scored with shuffled stratified k-fold
    cross-validation (accuracy). The mean and standard deviation of the fold
    scores are printed, the model is then fitted on all of
    ``X_train``/``y_train`` and pickled to ``model_path``.

    Args:
        X_train: Feature matrix passed to scikit-learn.
        y_train: Target labels passed to scikit-learn.
        n_splits: Number of stratified folds. Defaults to 2, the value that
            was previously hard-coded.
        random_state: Seed for the classifier. The default ``None`` keeps the
            previous unseeded behaviour; pass an int for reproducible forests.
        model_path: Destination of the pickled model. Missing parent
            directories are created before writing.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    # Local import keeps the block self-contained; pathlib is standard library.
    from pathlib import Path

    model = RandomForestClassifier(random_state=random_state)
    k_fold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    cv_results = cross_val_score(model, X_train, y_train, cv=k_fold, scoring='accuracy')  # Changed to 'accuracy' for simplicity
    print("Cross-Validation Results:")
    print("Mean Accuracy:", cv_results.mean())
    print("Standard Deviation:", cv_results.std())
    model.fit(X_train, y_train)

    # Create the output directory first so the write does not fail with
    # FileNotFoundError when it does not exist yet.
    output_path = Path(model_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open('wb') as f:
        pickle.dump(model, f)
    return model


if __name__ == "__main__":
    # Load the engineered pair features produced by the feature-engineering step.
    df = pd.read_csv("../../ts-feature-engineering/data/output/features.csv")

    # Numeric code for each label, in this fixed order (0..3).
    label_mapping = {
        name: code
        for code, name in enumerate(
            ['easy match', 'hard match', 'easy non-match', 'hard non-match']
        )
    }
    # Labels are lower-cased before lookup; values absent from the mapping
    # come back from .map() as NaN.
    df['label_numeric'] = (
        df['label']
        .str.lower()
        .map(label_mapping)
    )

    # Attach the blocking-key columns and show a sample of the result.
    df = generate_blocking_keys(df)
    print(df.head())
    # X = []
    # y = []

    # for record_pair in yield_record_pairs(df):
    #     record1 = df.loc[record_pair[0]]
    #     record2 = df.loc[record_pair[1]]

    #     features = [
    #         record1['first_name_similarity'],
    #         record1['last_name_similarity'],
    #         record1['context_similarity'],
    #     ]
    #     label = record1['label_numeric']

    #     X.append(features)
    #     y.append(label)

    # if len(X) > 0:
    #     trained_model = train_model(np.array(X), np.array(y))
    #     y_pred = trained_model.predict(np.array(X))
    #     print("Classification Report:")
    #     print(classification_report(y, y_pred))
    # else:
    #     print("No record pairs found satisfying similarity threshold.")

# Cell output: FileNotFoundError: [Errno 2] No such file or directory: '../../ts-feature-engineering/data/output/features.csv'