In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

# The GMM k-NN function remains the same (keeps feature column prefix 'feature_').
def find_k_nearest_neighbors_within_cluster(df_features_and_clusters, k=10):
    """
    Find the k nearest P1 buckets to every P1 bucket inside its own GMM cluster.

    Rows are grouped by 'GMM_Cluster_ID'; within each group a Euclidean
    nearest-neighbour search is run over all columns whose name starts with
    'feature_'. Clusters with k or fewer members are skipped, because they
    cannot supply k neighbours other than the source row itself.

    Args:
        df_features_and_clusters: DataFrame containing 'P1_Bucket_ID',
            'GMM_Cluster_ID' and one or more 'feature_*' columns.
        k: Number of neighbours to report per source bucket (must be >= 1).

    Returns:
        DataFrame with one row per source bucket and the columns
        'Source_P1_Bucket_ID', 'GMM_Cluster_ID', then for each rank j in 1..k
        'Closest_{j}_P1_ID' and 'Distance_{j}'. When no cluster is large
        enough the frame is empty but still carries these columns.

    Raises:
        ValueError: If k < 1 or no 'feature_*' column is present.
    """
    if k < 1:
        raise ValueError("k must be a positive integer.")

    feature_cols = [col for col in df_features_and_clusters.columns if col.startswith('feature_')]

    if not feature_cols:
        raise ValueError("Feature columns (e.g., 'feature_1' to 'feature_96') not found.")

    # Fixed schema so that an empty result still exposes the expected columns
    # (callers merge on 'Source_P1_Bucket_ID').
    output_columns = ['Source_P1_Bucket_ID', 'GMM_Cluster_ID']
    for rank in range(1, k + 1):
        output_columns += [f'Closest_{rank}_P1_ID', f'Distance_{rank}']

    all_results = []
    for cluster_id, cluster_data in df_features_and_clusters.groupby('GMM_Cluster_ID'):
        X = cluster_data[feature_cols].values
        p1_ids = cluster_data['P1_Bucket_ID'].values

        if X.shape[0] <= k:
            # skip clusters that don't have enough members
            continue

        nn_model = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='auto')
        nn_model.fit(X)
        # Calling kneighbors() without a query returns the neighbours of each
        # fitted point with the point itself excluded. Querying with X and
        # dropping column 0 is not reliable: when several rows share identical
        # features, "self" is not guaranteed to be the first hit.
        distances, indices = nn_model.kneighbors(return_distance=True)

        for source_p1, neighbor_indices, neighbor_distances in zip(p1_ids, indices, distances):
            result = {'Source_P1_Bucket_ID': source_p1, 'GMM_Cluster_ID': cluster_id}
            closest_p1_ids = p1_ids[neighbor_indices]
            for rank, (neighbor_p1, distance) in enumerate(zip(closest_p1_ids, neighbor_distances), start=1):
                result[f'Closest_{rank}_P1_ID'] = neighbor_p1
                result[f'Distance_{rank}'] = float(distance)
            all_results.append(result)

    return pd.DataFrame(all_results, columns=output_columns)

# --- Example Usage: load real artifacts when available ---
try:
    # Neighbour count shared by the k-NN call and the output-column loops
    # below, so the three places cannot drift apart.
    k_neighbors = 10

    # 1. Load Cluster Data (P1/P2)
    cluster_csv_candidates = [
        "../data/processed/gmm_cluster_product_details_for_analysis.csv",
        "data/processed/gmm_cluster_product_details_for_analysis.csv",
    ]
    # First candidate path that exists on disk, or None.
    cluster_path = next((p for p in cluster_csv_candidates if os.path.exists(p)), None)
    if cluster_path is None:
        raise FileNotFoundError("Could not find gmm_cluster_product_details_for_analysis.csv in expected locations.")
    cluster_df = pd.read_csv(cluster_path)
    cluster_df = cluster_df[['P1_Bucket_ID', 'GMM_Cluster_ID']].drop_duplicates()

    # 2. Load bucket-level features (prefer X_bucket_pca.npy) and bucket index mapping
    feature_npy_candidates = [
        "artifacts/features/X_bucket_pca.npy",
        "artifacts/features/X_product_pca.npy",
        "../notebooks/artifacts/features/X_bucket_pca.npy",
        "notebooks/artifacts/features/X_bucket_pca.npy",
    ]
    bucket_index_candidates = [
        "artifacts/features/bucket_index.parquet",
        "artifacts/features/bucket_index.csv",
        "artifacts/buckets/bucket_index.parquet",
        "notebooks/artifacts/features/bucket_index.parquet",
    ]

    feature_npy_path = next((p for p in feature_npy_candidates if os.path.exists(p)), None)
    bucket_index_path = next((p for p in bucket_index_candidates if os.path.exists(p)), None)

    if feature_npy_path is None or bucket_index_path is None:
        raise FileNotFoundError("Could not find bucket features (X_bucket_pca.npy) or bucket_index.parquet.")

    print(f"Loading bucket features from: {feature_npy_path}")
    X_buckets = np.load(feature_npy_path)

    print(f"Loading bucket index mapping from: {bucket_index_path}")
    # bucket_index is expected to have a column linking each row of X_buckets to a P1_Bucket_ID
    if bucket_index_path.endswith('.parquet'):
        bucket_index = pd.read_parquet(bucket_index_path)
    else:
        bucket_index = pd.read_csv(bucket_index_path)

    # Try to find P1 id column in bucket_index
    if 'P1_Bucket_ID' in bucket_index.columns:
        p1_col = 'P1_Bucket_ID'
    elif 'P1' in bucket_index.columns:
        p1_col = 'P1'
    elif bucket_index.index.is_unique:
        # No named id column: fall back to the (unique) index as the id.
        bucket_index = bucket_index.reset_index()
        p1_col = bucket_index.columns[0]
    else:
        raise ValueError("Could not infer P1_Bucket_ID column from bucket_index file.")

    p1_list = bucket_index[p1_col].values
    if len(p1_list) != X_buckets.shape[0]:
        raise ValueError("Length mismatch: bucket features rows do not match bucket_index rows.")

    # Create feature dataframe
    feature_cols = [f'feature_{i}' for i in range(1, X_buckets.shape[1] + 1)]
    df_features = pd.DataFrame(X_buckets, columns=feature_cols)
    df_features['P1_Bucket_ID'] = p1_list

    # 3. Create the required input for the k-NN function
    df_features_and_clusters = pd.merge(cluster_df, df_features, on='P1_Bucket_ID', how='inner')

    # 4. Run the k-NN calculation (P1 to P1)
    df_closest_neighbors = find_k_nearest_neighbors_within_cluster(df_features_and_clusters, k=k_neighbors)
    if df_closest_neighbors.empty:
        # Fail with a clear message instead of an opaque KeyError in the merge below.
        raise ValueError(
            f"k-NN produced no rows: no GMM cluster has more than {k_neighbors} buckets "
            "after joining clusters with features."
        )

    # 5. Load / infer P0 -> P1 mapping (try several artifact locations)
    p0p1_candidates = [
        'artifacts/buckets/product_to_bucket.parquet',
        'artifacts/features/meta_product.parquet',
        'artifacts/graph/nodes_p1_buckets.csv',
        '../notebooks/artifacts/buckets/product_to_bucket.parquet',
        '../notebooks/artifacts/features/meta_product.parquet',
        '../data/processed/products_with_prices_ingredients_nutrition.csv',
    ]
    p0p1_path = next((p for p in p0p1_candidates if os.path.exists(p)), None)

    if p0p1_path is None:
        raise FileNotFoundError("Could not find a P0->P1 mapping file. Looked for product_to_bucket.parquet or meta_product.parquet.")

    print(f"Loading P0->P1 mapping from: {p0p1_path}")
    if p0p1_path.endswith('.parquet'):
        df_map = pd.read_parquet(p0p1_path)
    else:
        df_map = pd.read_csv(p0p1_path)

    # Normalize columns to 'product_id' and 'P1_Bucket_ID'.
    # Exact names are tried first: a substring match alone would pick the
    # first column containing 'product' (e.g. a product-name column) even
    # when a proper 'product_id' column is present.
    if 'product_id' in df_map.columns and 'P1_Bucket_ID' in df_map.columns:
        df_p0_p1_map = df_map[['product_id', 'P1_Bucket_ID']].copy()
    else:
        possible_pid = [c for c in df_map.columns if 'product' in c.lower()][:1]
        possible_p1 = [c for c in df_map.columns if 'p1' in c.lower() or 'bucket' in c.lower()][:1]
        if not possible_pid or not possible_p1:
            raise ValueError("Could not find product_id and P1 mapping columns in the P0->P1 file.")
        df_p0_p1_map = df_map[[possible_pid[0], possible_p1[0]]].copy()
        df_p0_p1_map.columns = ['product_id', 'P1_Bucket_ID']

    # 6. Join the P0 IDs to the k-NN results for SOURCE product
    # (left join: a bucket mapped to several products yields one row per product)
    df_final = pd.merge(df_closest_neighbors, df_p0_p1_map, left_on='Source_P1_Bucket_ID', right_on='P1_Bucket_ID', how='left')
    df_final = df_final.rename(columns={'product_id': 'Source_P0_Product_ID'})
    df_final = df_final.drop(columns=['P1_Bucket_ID'])

    # 7. Map neighbor P1 buckets to P0 product IDs
    print("Mapping neighbor P1 buckets to P0 product IDs...")
    # NOTE: a dict holds one value per key, so when a bucket maps to several
    # products only the last one listed in the mapping is kept here.
    p1_to_p0 = df_p0_p1_map.set_index('P1_Bucket_ID')['product_id'].to_dict()

    for j in range(1, k_neighbors + 1):
        p1_col = f'Closest_{j}_P1_ID'
        p0_col = f'Closest_{j}_P0_Product_ID'
        if p1_col in df_final.columns:
            df_final[p0_col] = df_final[p1_col].map(p1_to_p0)
            # Drop the P1 column since we only want P0 in final output
            df_final = df_final.drop(columns=[p1_col])

    # Reorder columns: Source (P0, P1, P2), then neighbors (P0, Distance)
    base_cols = ['Source_P0_Product_ID', 'Source_P1_Bucket_ID', 'GMM_Cluster_ID']
    neighbor_cols = []
    for j in range(1, k_neighbors + 1):
        neighbor_cols.extend([f'Closest_{j}_P0_Product_ID', f'Distance_{j}'])

    remaining_cols = [c for c in df_final.columns if c not in base_cols and c not in neighbor_cols]
    df_final = df_final[base_cols + neighbor_cols + remaining_cols]

    # 8. Output the results to CSV
    output_filename = '../data/processed/product_substitution_neighbors_p0.csv'
    os.makedirs(os.path.dirname(output_filename), exist_ok=True)
    df_final.to_csv(output_filename, index=False)

    print(f"\nSuccessfully calculated and saved {len(df_final)} rows of closest neighbors to {output_filename}")
    print("✓ All neighbors are from the SAME GMM cluster as the source product")
    print("\nColumns:")
    print("  - Source_P0_Product_ID (P0), Source_P1_Bucket_ID (P1), GMM_Cluster_ID (P2)")
    print(f"  - Closest_1..{k_neighbors}_P0_Product_ID (neighbor P0 IDs within same cluster)")
    print(f"  - Distance_1..{k_neighbors} (Euclidean distance in PCA feature space)")
    print("\nResult Head:")
    try:
        print(df_final.head().to_markdown(index=False, numalign="left", stralign="left"))
    except ImportError:
        # to_markdown needs the optional 'tabulate' package; fall back to the plain repr.
        print(df_final.head())

except Exception as e:
    # Top-level boundary of the notebook cell: report the problem instead of a traceback.
    print("\nAn error occurred during execution. Ensure bucket feature artifacts and the P0->P1 mapping file are available.")
    print(f"Error details: {e}")

Loading bucket features from: artifacts/features/X_bucket_pca.npy
Loading bucket index mapping from: artifacts/features/bucket_index.parquet
Loading P0->P1 mapping from: artifacts/buckets/product_to_bucket.parquet
Mapping neighbor P1 buckets to P0 product IDs...
Loading P0->P1 mapping from: artifacts/buckets/product_to_bucket.parquet
Mapping neighbor P1 buckets to P0 product IDs...

Successfully calculated and saved 35429 rows of closest neighbors to ../data/processed/product_substitution_neighbors_p0.csv
✓ All neighbors are from the SAME GMM cluster as the source product

Columns:
  - Source_P0_Product_ID (P0), Source_P1_Bucket_ID (P1), GMM_Cluster_ID (P2)
  - Closest_1..10_P0_Product_ID (neighbor P0 IDs within same cluster)
  - Distance_1..10 (Euclidean distance in PCA feature space)

Result Head:
   Source_P0_Product_ID  Source_P1_Bucket_ID  GMM_Cluster_ID  \
0                  2343                  844               0   
1                 10917                  945               0 