In [None]:
# standard data science packages
import pandas as pd
import numpy as np
import math

# data viz imports
import matplotlib.pyplot as plt
import folium
import branca
import seaborn as sns

# file system imports
import os

# sklearn imports
from sklearn.model_selection import train_test_split

# scipy imports
from scipy.optimize import linear_sum_assignment

In [None]:
# X / y arrays for dataset 361260. Neither file depends on the seed, so both
# are read once here instead of once per loop iteration.
# The first row of each file is dropped, exactly as the original analysis did
# (presumably a numeric header/index row -- TODO confirm against the export script).
X_full = np.loadtxt("../data_openml/X_361260.csv", delimiter=",")[1:, :]
y_full = np.loadtxt("../data_openml/y_361260.csv", delimiter=",")[1:]
cols_miami = ['LATITUDE', 'LONGITUDE', 'LND_SQFOOT', 'TOT_LVG_AREA',
              'SPEC_FEAT_VAL', 'RAIL_DIST', 'OCEAN_DIST', 'WATER_DIST',
              'CNTR_DIST', 'SUBCNTR_DI', 'HWY_DIST', 'age', 'avno60plus',
              'month_sold', 'structure_quality']

# The clusterings use k = 4; ids 0..3 are shown to the reader as clusters 1..4.
cluster_ids = list(range(4))
color_map = {0: 'red', 1: 'green', 2: 'blue', 3: 'purple'}
colors = [color_map[cluster] for cluster in cluster_ids]


def _count_label_pairs(row_labels, col_labels):
    """Return the 4x4 table whose [i, j] entry counts the points labelled
    ``i`` by ``row_labels`` and ``j`` by ``col_labels``."""
    table = np.zeros((len(cluster_ids), len(cluster_ids)), dtype=int)
    # np.add.at accumulates repeated index pairs (a plain fancy-indexed `+=`
    # would count each distinct pair only once).
    np.add.at(table, (row_labels, col_labels), 1)
    return table


def _cluster_boxplot(ax, clusters, values, title, ylabel):
    """Draw one box of ``values`` per cluster id on ``ax``.

    The category order and the colours are pinned to ``cluster_ids`` /
    ``color_map`` so that a cluster which is absent from ``clusters`` leaves an
    empty slot instead of shifting the remaining boxes under the wrong tick
    label and colour.
    """
    clusters = np.asarray(clusters)
    values = np.asarray(values)
    if len(clusters) > 0:
        sns.boxplot(ax=ax, x=clusters, y=values, hue=clusters,
                    order=cluster_ids, hue_order=cluster_ids,
                    palette=color_map, dodge=False)
        # Depending on the seaborn version a legend may or may not have been
        # created for the redundant hue, so only remove it if it exists.
        legend = ax.get_legend()
        if legend is not None:
            legend.remove()
    else:
        # Nothing to draw (e.g. the two clusterings agree everywhere).
        ax.text(0.5, 0.5, "No points", ha="center", va="center",
                transform=ax.transAxes)
    ax.set_title(title, fontsize=14)
    ax.set_xticks(cluster_ids)
    ax.set_xticklabels([str(cluster + 1) for cluster in cluster_ids], fontsize=11)
    ax.tick_params(axis='y', labelsize=11)
    ax.set_xlabel("Cluster", fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)


for seed in range(5):

    # read in shap values
    # NOTE(review): shap_miami and lotla_miami are not used anywhere in this
    # cell; they are still loaded so that any later cell relying on these
    # names keeps working. Drop both reads if nothing else needs them.
    shap_miami = np.loadtxt(f"../lfi-values/fulldata/rf/seed{seed}/361260/shap.csv",
                            delimiter=",")

    # read in lotla values
    lotla_miami = np.loadtxt(f"../lfi-values/fulldata/rf/seed{seed}/361260/" +
                             "elastic_nonnormed_nosquared_norank.csv", delimiter=",")

    # keep the test half of a 50/50 split made with this seed
    _, X_miami, _, y_miami = train_test_split(X_full, y_full,
                                              test_size=0.5, random_state=seed)

    # read in cluster labels
    base = f"../cluster-results/rf/kmeans/linear/361260/seed{seed}/"
    labels_shap = np.loadtxt(base + "k4_shap_labels.csv", delimiter=",").astype(int)
    labels_lotla = np.loadtxt(base +
        "k4_elastic_nonnormed_nosquared_norank_labels.csv", delimiter=",").astype(int)

    # Everything below pairs labels with rows of the test half by position,
    # so fail early with a readable message if the lengths do not line up.
    if not (len(labels_shap) == len(labels_lotla) == len(X_miami)):
        raise ValueError(
            f"seed {seed}: {len(labels_shap)} shap labels, {len(labels_lotla)} "
            f"lotla labels and {len(X_miami)} test rows do not line up")

    # confusion matrix of the raw labels (rows: shap, columns: lotla)
    confusion_matrix = _count_label_pairs(labels_shap, labels_lotla)
    print(f"Confusion Matrix for Seed #{seed}:")
    print(confusion_matrix)

    # One-to-one shap -> lotla relabelling that maximises the number of points
    # on which the two clusterings agree (negated because the solver minimises).
    row_ind, col_ind = linear_sum_assignment(-confusion_matrix)
    cluster_mapping = {int(i): int(j) for i, j in zip(row_ind, col_ind)}

    print(f"Cluster Mapping for Seed #{seed}:", cluster_mapping)

    # apply the relabelling to every shap label at once
    relabel = np.empty(len(cluster_ids), dtype=int)
    relabel[row_ind] = col_ind
    labels_shap_mapped = relabel[labels_shap]

    # confusion matrix after relabelling
    confusion_matrix = _count_label_pairs(labels_shap_mapped, labels_lotla)
    print(f"Updated Confusion Matrix for Seed #{seed}:")
    print(confusion_matrix)

    # get clusters for lotla
    lotla_clusters = pd.DataFrame(X_miami, columns=cols_miami)
    lotla_clusters['cluster'] = labels_lotla
    # get clusters for shap
    shap_clusters = pd.DataFrame(X_miami, columns=cols_miami)
    shap_clusters['cluster'] = labels_shap_mapped

    # get set of points where lotla and shap disagree
    disagree = labels_shap_mapped != labels_lotla
    shap_disagree = labels_shap_mapped[disagree]
    lotla_disagree = labels_lotla[disagree]
    X_disagree = X_miami[disagree]
    y_disagree = y_miami[disagree]
    shap_clusters_disagree = pd.DataFrame(X_disagree, columns=cols_miami)
    shap_clusters_disagree['cluster'] = shap_disagree
    lotla_clusters_disagree = pd.DataFrame(X_disagree, columns=cols_miami)
    lotla_clusters_disagree['cluster'] = lotla_disagree

    # Figure 1: living area per cluster (left: disagreement points, right: all points)
    fig, axs = plt.subplots(2, 2, figsize=(12, 10), sharex=True)
    area_panels = [
        (axs[0, 0], lotla_clusters_disagree, "Living Area by Cluster (LoTLA - Disagree)"),
        (axs[0, 1], lotla_clusters, "LoTLA - Full Data"),
        (axs[1, 0], shap_clusters_disagree, "Living Area by Cluster (SHAP - Disagree)"),
        (axs[1, 1], shap_clusters, "SHAP - Full Data"),
    ]
    for ax, frame, title in area_panels:
        _cluster_boxplot(ax, frame['cluster'], frame['TOT_LVG_AREA'],
                         title, "Living Area (Sq. Ft.)")
    plt.tight_layout()
    plt.show()
    plt.close(fig)  # figures are created once per seed; release them explicitly

    # Figure 2: sale price per cluster (left: all points, right: disagreement points)
    fig, axs = plt.subplots(2, 2, figsize=(12, 10), sharex=True)
    price_panels = [
        (axs[0, 0], labels_lotla, y_miami, "Sale Price by Cluster (LoTLA - Full)"),
        (axs[0, 1], lotla_disagree, y_disagree, "Sale Price by Cluster (LoTLA - Disagree)"),
        (axs[1, 0], labels_shap_mapped, y_miami, "Sale Price by Cluster (SHAP - Full)"),
        (axs[1, 1], shap_disagree, y_disagree, "Sale Price by Cluster (SHAP - Disagree)"),
    ]
    for ax, clusters, prices, title in price_panels:
        _cluster_boxplot(ax, clusters, prices, title, "Sale Price (Log-USD)")
    plt.tight_layout()
    plt.show()
    plt.close(fig)

In [None]:
def get_maps(seed):
    """Build folium maps of the shap-based and lotla-based clusterings for one seed.

    Reads the feature matrix, keeps the test half of a 50/50 split made with
    ``random_state=seed``, loads the two k=4 label files for that seed,
    relabels the shap clusters so that they coincide with the lotla clusters on
    as many points as possible, and draws every point as a circle marker
    coloured by its cluster.

    Parameters
    ----------
    seed : int
        Used as the split's ``random_state`` and to select the ``seed{seed}``
        directory the label files are read from.

    Returns
    -------
    tuple
        ``(shap_map, lotla_map)``, two ``folium.Map`` objects. The shap map
        uses the relabelled shap clusters.

    Raises
    ------
    ValueError
        If the two label files and the test half do not have the same length.
    """
    cols_miami = ['LATITUDE', 'LONGITUDE', 'LND_SQFOOT', 'TOT_LVG_AREA',
                  'SPEC_FEAT_VAL', 'RAIL_DIST', 'OCEAN_DIST', 'WATER_DIST',
                  'CNTR_DIST', 'SUBCNTR_DI', 'HWY_DIST', 'age', 'avno60plus',
                  'month_sold', 'structure_quality']
    n_clusters = 4
    color_map = {0: 'red', 1: 'green', 2: 'blue', 3: 'purple'}

    # Fixed-position legend; its colours mirror color_map (cluster id i is
    # shown as "Cluster #i+1").
    legend_html = '''
    <div style="position: fixed; 
        bottom: 50px; right: 50px; width: 110px; height: 85px; 
        border:2px solid grey; z-index:9999; font-size:14px;
        background-color:white; opacity: 1.0;">
        &nbsp; Cluster #1 &nbsp; <i class="fa fa-circle" style="color:red"></i><br>
        &nbsp; Cluster #2 &nbsp; <i class="fa fa-circle" style="color:green"></i><br>
        &nbsp; Cluster #3 &nbsp; <i class="fa fa-circle" style="color:blue"></i><br>
        &nbsp; Cluster #4 &nbsp; <i class="fa fa-circle" style="color:purple"></i>
    </div>
    '''

    def _build_map(points, labels):
        """Return a map with one cluster-coloured marker per row of ``points``."""
        cluster_map = folium.Map(location=[25.7, -80.3], zoom_start=10)
        for lat, lon, cluster in zip(points['LATITUDE'], points['LONGITUDE'],
                                     labels.tolist()):
            folium.CircleMarker(
                [lat, lon],
                radius=1,
                color=color_map[cluster],
                fill=True,
                fill_color=color_map[cluster],
                fill_opacity=0.6
            ).add_to(cluster_map)
        cluster_map.get_root().html.add_child(folium.Element(legend_html))
        return cluster_map

    # The first row of the file is dropped, as in the rest of the notebook
    # (presumably a numeric header/index row -- TODO confirm).
    X_full = np.loadtxt("../data_openml/X_361260.csv", delimiter=",")[1:, :]

    # Keep the test half of the 50/50 split. The target is not needed here:
    # without stratification the split depends only on the number of rows and
    # random_state, so splitting X alone selects the same rows.
    _, X_test = train_test_split(X_full, test_size=0.5, random_state=seed)

    # read in cluster labels
    base = f"../cluster-results/rf/kmeans/linear/361260/seed{seed}/"
    labels_shap = np.loadtxt(base + "k4_shap_labels.csv", delimiter=",").astype(int)
    labels_lotla = np.loadtxt(base +
        "k4_elastic_nonnormed_nosquared_norank_labels.csv", delimiter=",").astype(int)

    # Labels are paired with test rows by position; check that they line up.
    if not (len(labels_shap) == len(labels_lotla) == len(X_test)):
        raise ValueError(
            f"seed {seed}: {len(labels_shap)} shap labels, {len(labels_lotla)} "
            f"lotla labels and {len(X_test)} test rows do not line up")

    # confusion matrix of the raw labels (rows: shap, columns: lotla);
    # np.add.at accumulates repeated (shap, lotla) pairs
    confusion_matrix = np.zeros((n_clusters, n_clusters), dtype=int)
    np.add.at(confusion_matrix, (labels_shap, labels_lotla), 1)

    # One-to-one shap -> lotla relabelling that maximises agreement
    # (negated because the solver minimises).
    row_ind, col_ind = linear_sum_assignment(-confusion_matrix)
    relabel = np.empty(n_clusters, dtype=int)
    relabel[row_ind] = col_ind
    labels_shap_mapped = relabel[labels_shap]

    points = pd.DataFrame(X_test, columns=cols_miami)
    lotla_map = _build_map(points, labels_lotla)
    shap_map = _build_map(points, labels_shap_mapped)

    return shap_map, lotla_map

In [None]:
def get_disagree_maps(seed):
    """Build folium maps of the points on which the two clusterings disagree.

    Reads the feature matrix, keeps the test half of a 50/50 split made with
    ``random_state=seed``, loads the two k=4 label files for that seed and
    relabels the shap clusters so that they coincide with the lotla clusters on
    as many points as possible. Only the points whose relabelled shap cluster
    still differs from their lotla cluster are drawn; both maps show the same
    points, coloured by the shap cluster on one map and by the lotla cluster on
    the other.

    Parameters
    ----------
    seed : int
        Used as the split's ``random_state`` and to select the ``seed{seed}``
        directory the label files are read from.

    Returns
    -------
    tuple
        ``(shap_map, lotla_map)``, two ``folium.Map`` objects. Both are empty
        of markers when the clusterings agree everywhere.

    Raises
    ------
    ValueError
        If the two label files and the test half do not have the same length.
    """
    cols_miami = ['LATITUDE', 'LONGITUDE', 'LND_SQFOOT', 'TOT_LVG_AREA',
                  'SPEC_FEAT_VAL', 'RAIL_DIST', 'OCEAN_DIST', 'WATER_DIST',
                  'CNTR_DIST', 'SUBCNTR_DI', 'HWY_DIST', 'age', 'avno60plus',
                  'month_sold', 'structure_quality']
    n_clusters = 4
    color_map = {0: 'red', 1: 'green', 2: 'blue', 3: 'purple'}

    # Fixed-position legend; its colours mirror color_map (cluster id i is
    # shown as "Cluster #i+1").
    legend_html = '''
    <div style="position: fixed; 
        bottom: 50px; right: 50px; width: 110px; height: 85px; 
        border:2px solid grey; z-index:9999; font-size:14px;
        background-color:white; opacity: 1.0;">
        &nbsp; Cluster #1 &nbsp; <i class="fa fa-circle" style="color:red"></i><br>
        &nbsp; Cluster #2 &nbsp; <i class="fa fa-circle" style="color:green"></i><br>
        &nbsp; Cluster #3 &nbsp; <i class="fa fa-circle" style="color:blue"></i><br>
        &nbsp; Cluster #4 &nbsp; <i class="fa fa-circle" style="color:purple"></i>
    </div>
    '''

    def _build_map(points, labels):
        """Return a map with one cluster-coloured marker per row of ``points``."""
        cluster_map = folium.Map(location=[25.7, -80.3], zoom_start=10)
        for lat, lon, cluster in zip(points['LATITUDE'], points['LONGITUDE'],
                                     labels.tolist()):
            folium.CircleMarker(
                [lat, lon],
                radius=1,
                color=color_map[cluster],
                fill=True,
                fill_color=color_map[cluster],
                fill_opacity=0.6
            ).add_to(cluster_map)
        cluster_map.get_root().html.add_child(folium.Element(legend_html))
        return cluster_map

    # The first row of the file is dropped, as in the rest of the notebook
    # (presumably a numeric header/index row -- TODO confirm).
    X_full = np.loadtxt("../data_openml/X_361260.csv", delimiter=",")[1:, :]

    # Keep the test half of the 50/50 split. The target is not needed here:
    # without stratification the split depends only on the number of rows and
    # random_state, so splitting X alone selects the same rows.
    _, X_test = train_test_split(X_full, test_size=0.5, random_state=seed)

    # read in cluster labels
    base = f"../cluster-results/rf/kmeans/linear/361260/seed{seed}/"
    labels_shap = np.loadtxt(base + "k4_shap_labels.csv", delimiter=",").astype(int)
    labels_lotla = np.loadtxt(base +
        "k4_elastic_nonnormed_nosquared_norank_labels.csv", delimiter=",").astype(int)

    # Labels are paired with test rows by position; check that they line up.
    if not (len(labels_shap) == len(labels_lotla) == len(X_test)):
        raise ValueError(
            f"seed {seed}: {len(labels_shap)} shap labels, {len(labels_lotla)} "
            f"lotla labels and {len(X_test)} test rows do not line up")

    # confusion matrix of the raw labels (rows: shap, columns: lotla);
    # np.add.at accumulates repeated (shap, lotla) pairs
    confusion_matrix = np.zeros((n_clusters, n_clusters), dtype=int)
    np.add.at(confusion_matrix, (labels_shap, labels_lotla), 1)

    # One-to-one shap -> lotla relabelling that maximises agreement
    # (negated because the solver minimises).
    row_ind, col_ind = linear_sum_assignment(-confusion_matrix)
    relabel = np.empty(n_clusters, dtype=int)
    relabel[row_ind] = col_ind
    labels_shap_mapped = relabel[labels_shap]

    # points on which the two clusterings still differ after relabelling
    disagree = labels_shap_mapped != labels_lotla
    points_disagree = pd.DataFrame(X_test[disagree], columns=cols_miami)

    shap_map = _build_map(points_disagree, labels_shap_mapped[disagree])
    lotla_map = _build_map(points_disagree, labels_lotla[disagree])

    return shap_map, lotla_map
    

In [None]:
# Build the two disagreement maps for seed 4 (only points where the clusterings differ).
shap_disagree_map, lotla_disagree_map = get_disagree_maps(4)

In [None]:
# Render the shap-coloured disagreement map as this cell's output.
# NOTE(review): lotla_disagree_map is built alongside it but not displayed in any cell visible here.
shap_disagree_map

In [None]:
# Build the full cluster maps (every test-half point) for seed 1.
# NOTE(review): the disagreement maps use seed 4 while these use seed 1 -- confirm this is intended.
shap_map, lotla_map = get_maps(1)

In [None]:
# Render the shap cluster map (relabelled to match the lotla cluster ids) as this cell's output.
shap_map

In [None]:
# Render the lotla cluster map as this cell's output.
lotla_map