In [58]:
import os
import pickle
from itertools import combinations

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV


In [59]:
def read_csv(path="../../ts-blocking/data/output/ts-blocks.csv"):
    """Load the blocking output CSV into a DataFrame.

    :param path: Location of the CSV file to read. Defaults to the path the
        notebook originally hard-coded, so ``read_csv()`` behaves as before.
    :return: DataFrame holding the contents of the file.
    """
    return pd.read_csv(path)

In [60]:
def yield_record_pairs(df):
    """Lazily produce every pair of records that share a blocking key.

    Rows are grouped on the ``combined_blocking_key`` column; within each
    group every unordered pair of index labels is yielded.

    :param df: DataFrame containing a ``combined_blocking_key`` column.
    :return: Generator of ``(index_label_1, index_label_2)`` tuples.
    """
    for _key, block in df.groupby('combined_blocking_key'):
        # A block holding a single record cannot form a pair.
        if len(block) >= 2:
            yield from combinations(block.index, 2)


def prepare_features_for_model(df, record_pairs):
    """Build the feature matrix and label vector for a set of record pairs.

    For each pair the feature vector is the ``first_name_similarity``,
    ``last_name_similarity`` and ``context_similarity`` values of the FIRST
    record of the pair; the label is 1 when both records carry the same
    ``label_numeric`` value and 0 otherwise.

    :param df: DataFrame holding the three similarity columns and
        ``label_numeric``; pairs refer to its index labels.
    :param record_pairs: Iterable (a one-shot generator is fine) of
        ``(index_label_1, index_label_2)`` tuples.
    :return: ``(features, labels)`` where ``features`` is a float array of
        shape ``(n_pairs, 3)`` and ``labels`` an int array of shape
        ``(n_pairs,)``. With no pairs the shapes are ``(0, 3)`` and ``(0,)``.
    :raises KeyError: if a pair refers to a label missing from ``df.index``.
    :raises ValueError: if a referenced index label is not unique in ``df``.
    """
    feature_columns = [
        'first_name_similarity',
        'last_name_similarity',
        'context_similarity',
    ]

    # Materialise once so a generator argument can be traversed twice below.
    pairs = list(record_pairs)
    if not pairs:
        # Keep the matrix 2-D even when empty so downstream code sees (0, 3).
        return (
            np.empty((0, len(feature_columns)), dtype=float),
            np.empty((0,), dtype=int),
        )

    first_ids = [pair[0] for pair in pairs]
    second_ids = [pair[1] for pair in pairs]

    # NOTE(review): only the first record's similarity columns feed the
    # features; the second record influences the label alone. Kept as-is —
    # confirm this is the intended feature design.
    # One vectorised lookup per side instead of two row lookups per pair.
    features = df.loc[first_ids, feature_columns].to_numpy(dtype=float)
    first_labels = df.loc[first_ids, 'label_numeric'].to_numpy()
    second_labels = df.loc[second_ids, 'label_numeric'].to_numpy()

    # Duplicate index labels make .loc return extra rows, which would silently
    # misalign features and labels.
    if not (len(features) == len(first_labels) == len(second_labels) == len(pairs)):
        raise ValueError("Index labels referenced by record_pairs must be unique in df")

    # Binary labels: 1 for match, 0 for non-match.
    labels = (first_labels == second_labels).astype(int)
    return features, labels

def generate_candidate_pairs(df, trained_model):
    """
    Generate candidate pairs for the given data using the trained ML model.

    Every pair of records sharing a ``combined_blocking_key`` is scored with
    ``trained_model.predict``; only pairs predicted as class 1 are returned.

    :param df: DataFrame that already contains the ``combined_blocking_key``
        column plus the feature and label columns used by
        ``prepare_features_for_model``.
    :param trained_model: Fitted model exposing ``predict``.
    :return: DataFrame with columns ``RecordID_1``, ``RecordID_2`` and
        ``Predicted_Match`` (always 1); empty when nothing is predicted as a
        match or when no blocking key is shared by two records.
    """
    record_pairs = list(yield_record_pairs(df))

    if record_pairs:
        X_test, _ = prepare_features_for_model(df, record_pairs)

        # Predict which pairs are likely matches
        predictions = trained_model.predict(X_test)

        # Filter pairs predicted as matches
        likely_matches = [
            pair for pair, prediction in zip(record_pairs, predictions) if prediction == 1
        ]
    else:
        # Nothing to score: skip predict(), which cannot handle an empty
        # feature array, and fall through to an empty result frame.
        likely_matches = []

    # Create DataFrame for likely matches
    candidate_pairs = pd.DataFrame({
        'RecordID_1': [pair[0] for pair in likely_matches],
        'RecordID_2': [pair[1] for pair in likely_matches],
        'Predicted_Match': [1] * len(likely_matches)
    })

    return candidate_pairs




def train_model(X_train, y_train, param_grid=None, cv=3, scoring='accuracy', verbose=2):
    """Tune a random forest with a cross-validated grid search.

    :param X_train: Training feature matrix.
    :param y_train: Training labels.
    :param param_grid: Hyper-parameter grid passed to ``GridSearchCV``. When
        ``None`` the notebook's original grid is used.
    :param cv: Cross-validation setting passed to ``GridSearchCV`` (default 3).
    :param scoring: Metric used to rank candidates (default ``'accuracy'``).
    :param verbose: Verbosity passed to ``GridSearchCV`` (default 2, which
        logs every individual fit).
    :return: ``best_estimator_`` of the fitted grid search.
    """
    if param_grid is None:
        param_grid = {
            'n_estimators': [50, 100, 200],  # Number of trees in the forest
            'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
            'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
            'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required at each leaf node
        }

    # Fixed seed keeps the forest reproducible across runs.
    model = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=cv,
        n_jobs=-1,  # use every available core
        verbose=verbose,
        scoring=scoring,
    )

    # Fit the grid search to the data
    grid_search.fit(X_train, y_train)

    # Report the winning configuration
    print("Best Parameters:", grid_search.best_params_)

    return grid_search.best_estimator_


def evaluate_model(model, X_test, y_test):
    """Print the accuracy and classification report of ``model`` on held-out data.

    :param model: Fitted estimator exposing ``predict``.
    :param X_test: Feature matrix to predict on.
    :param y_test: True labels corresponding to ``X_test``.
    :return: None; the metrics are written to stdout.
    """
    predicted = model.predict(X_test)

    accuracy = accuracy_score(y_test, predicted)
    print("Accuracy on Test Set:", accuracy)

    report = classification_report(y_test, predicted)
    print("Classification Report:\n", report)


df = read_csv()

# yield_record_pairs returns a one-shot generator; keep the pairs in a list so
# the variable stays usable (and inspectable) after features are built.
record_pairs = list(yield_record_pairs(df))
if not record_pairs:
    # Fail here with a clear message instead of deep inside train_test_split.
    raise ValueError("No candidate record pairs found: no blocking key is shared by two or more records")

X, y = prepare_features_for_model(df, record_pairs)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

trained_model = train_model(X_train, y_train)
evaluate_model(trained_model, X_test, y_test)

# Make sure the output directory exists so the trained model is not lost to a
# FileNotFoundError after the grid search has finished.
model_path = '../data/output/trained_blocking_model.pkl'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, 'wb') as f:
    pickle.dump(trained_model, f)

# Score the same frame the model was trained on and show the predicted matches.
candidate_pairs = generate_candidate_pairs(df, trained_model)
print(candidate_pairs)


Fitting 3 folds for each of 108 candidates, totalling 324 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.1s
[CV] END max_de