In [6]:
import numpy as np
from scipy.optimize import linear_sum_assignment

In [7]:
# Hungarian algorithm
def match_labels(cost_matrix):
    """Find the one-to-one row/column pairing with the largest total value.

    Despite the parameter name, the entries are treated as scores to be
    maximized (e.g. co-occurrence counts), not costs to be minimized.

    Parameters
    ----------
    cost_matrix : array-like, shape (n_rows, n_cols)
        Score of pairing row ``i`` with column ``j``. Rectangular input is
        allowed; only ``min(n_rows, n_cols)`` pairs are returned.

    Returns
    -------
    row_ind, col_ind : numpy.ndarray
        Index arrays such that ``cost_matrix[row_ind, col_ind].sum()`` is
        maximal. Both are empty when the matrix has no rows or no columns.

    Raises
    ------
    ValueError
        Propagated from ``linear_sum_assignment`` when the input is not 2-D
        or contains NaN / invalid entries.
    """
    # Cast once so lists and integer/boolean arrays are all handled uniformly.
    score_matrix = np.asarray(cost_matrix, dtype=float)

    # Ask the solver to maximize directly instead of building ``max - matrix``
    # by hand; the manual transform called np.max, which raises on an empty
    # matrix.
    row_ind, col_ind = linear_sum_assignment(score_matrix, maximize=True)

    return row_ind, col_ind

In [8]:
def weighted_cost_matrix(matrix):
    """Rescale each row of ``matrix`` so that all rows share the same sum.

    Row ``i`` is multiplied by ``total / (n_rows * row_sum_i)``, where
    ``total`` is the sum of every entry. After scaling, each row with a
    non-zero sum adds up to ``total / n_rows``, so a row with large raw
    values no longer dominates a subsequent assignment.

    Parameters
    ----------
    matrix : array-like, shape (n_rows, n_cols)
        Input values (e.g. a count table).

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``matrix``. Rows whose sum is zero
        get weight 0 and come back as all zeros instead of NaN/inf.

    Raises
    ------
    ValueError
        If ``matrix`` is not two-dimensional.
    """
    matrix = np.asarray(matrix)
    if matrix.ndim != 2:
        raise ValueError(
            f"expected a 2-D matrix, got an array with {matrix.ndim} dimension(s)"
        )

    rows = matrix.shape[0]
    total_sum = matrix.sum()
    row_sums = matrix.sum(axis=1)

    # Only divide where the row sum is non-zero; an empty row would otherwise
    # produce inf/NaN weights that poison the downstream assignment solver.
    weights = np.zeros(rows, dtype=float)
    nonzero = row_sums != 0
    weights[nonzero] = total_sum / (rows * row_sums[nonzero])

    # Broadcast one weight per row across its columns.
    return matrix * weights[:, np.newaxis]

In [9]:
# Balanced case: the original note says the result here is the same as the
# initial one.
cost_matrix = np.array([[10, 20, 30], [10, 2, 3], [20, 30, 50]])

# Solve the assignment on the raw (unweighted) matrix.
row_ind, col_ind = match_labels(cost_matrix)

# Show the matched (row, column) index pairs and the sum of the matched entries.
print(
    "聚类标签到真实标签的映射（聚类簇索引 -> 真实标签索引）:",
    list(zip(row_ind, col_ind)),
)
print("匹配的数量总和:", np.sum(cost_matrix[row_ind, col_ind]))

聚类标签到真实标签的映射（聚类簇索引 -> 真实标签索引）: [(0, 1), (1, 0), (2, 2)]
匹配的数量总和: 80


In [10]:
# Inflate row 0 (true class 0) relative to the balanced example; the Hungarian
# result now differs. The values are 100x the balanced ones
# (10/20/30 -> 1000/2000/3000), although the original note said 10x.
# NOTE(review): the printed label reads "cluster index -> true-label index",
# while the note above treats rows as true classes — confirm the orientation.
cost_matrix = np.array([[1000, 2000, 3000], [10, 2, 3], [20, 30, 50]])

# Solve the assignment on the raw (unweighted) matrix.
row_ind, col_ind = match_labels(cost_matrix)

# Show the matched index pairs and the sum of the matched entries, then mark
# this run as a failure (printed unconditionally).
print(
    "聚类标签到真实标签的映射（聚类簇索引 -> 真实标签索引）:",
    list(zip(row_ind, col_ind)),
)
print("匹配的数量总和:", np.sum(cost_matrix[row_ind, col_ind]))
print('Fail')

聚类标签到真实标签的映射（聚类簇索引 -> 真实标签索引）: [(0, 2), (1, 0), (2, 1)]
匹配的数量总和: 3040
Fail


In [11]:
# Same imbalanced matrix as the previous cell, now row-reweighted before matching.
cost_matrix = np.array([[1000, 2000, 3000], [10, 2, 3], [20, 30, 50]])
w_cost_matrix = weighted_cost_matrix(cost_matrix)

# Solve the assignment on the weighted matrix.
row_ind, col_ind = match_labels(w_cost_matrix)

# Show the matched index pairs (marked "successful" in the original note).
# The total is taken over the weighted matrix, so it is not a raw sample count.
print(
    "聚类标签到真实标签的映射（聚类簇索引 -> 真实标签索引）:",
    list(zip(row_ind, col_ind)),
)
print("匹配的数量总和:", np.sum(w_cost_matrix[row_ind, col_ind]))

聚类标签到真实标签的映射（聚类簇索引 -> 真实标签索引）: [(0, 1), (1, 0), (2, 2)]
匹配的数量总和: 3057.5


## Case: subtype_match

In [12]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix

# Load the ground-truth and predicted tables for the subtype-match simulation.
truth_df = pd.read_csv("data/simulate/Subtype_match/truth_SpatialData.csv")
pred_df = pd.read_csv("data/simulate/Subtype_match/pred_SpatialData.csv")

# Relabel predicted groups 'A'/'B'/'C' as '1'/'2'/'3'; any label missing from
# the mapping becomes NaN.
mapping = {'A': '1', 'B': '2', 'C': '3'}
pred_df['group_mapped'] = pred_df['group'].astype(str).map(mapping)

# Pair every truth row with its prediction through the shared 'Unnamed: 0'
# column (presumably the saved row index — verify against the CSVs).
aligned_df = truth_df.merge(pred_df[['Unnamed: 0', 'group_mapped']], on='Unnamed: 0', suffixes=('_truth', '_pred'))

unique_groups_truth = np.sort(aligned_df['group'].unique())
unique_groups_pred_mapped = np.sort(aligned_df['group_mapped'].unique())

# Count (truth, prediction) co-occurrences in one vectorized pass instead of
# incrementing a cell per row with iterrows. The reindex pins the sorted label
# order, and clearing the axis names keeps the printed table layout unchanged.
matrix = (
    pd.crosstab(aligned_df['group'], aligned_df['group_mapped'])
    .reindex(index=unique_groups_truth, columns=unique_groups_pred_mapped, fill_value=0)
    .rename_axis(index=None, columns=None)
)

print(matrix)


     1    2   3
A  100  120  90
B   30   50  20
C    2    3  11


In [13]:
# Hard-coded copy of the truth-by-prediction count table printed by the
# previous cell (rows A, B, C; columns 1, 2, 3).
cost_matrix = np.array(
    [
        [100, 120, 90],
        [30, 50, 20],
        [2, 3, 11],
    ]
)

In [14]:
# Solve the assignment on the raw (unweighted) counts.
# NOTE(review): if cost_matrix is still the matrix from the preceding cell, the
# identity assignment also totals 100 + 50 + 11 = 161, so the maximum is tied
# and which tied solution is returned is up to the solver.
row_ind, col_ind = match_labels(cost_matrix)

# Show the matched index pairs and the sum of the matched entries, then mark
# this run as a failure (printed unconditionally).
print(
    "聚类标签到真实标签的映射（聚类簇索引 -> 真实标签索引）:",
    list(zip(row_ind, col_ind)),
)
print("匹配的数量总和:", np.sum(cost_matrix[row_ind, col_ind]))
print('Fail')

聚类标签到真实标签的映射（聚类簇索引 -> 真实标签索引）: [(0, 1), (1, 0), (2, 2)]
匹配的数量总和: 161
Fail


In [17]:
# Row-reweight the current cost_matrix, then solve the assignment on the result.
w_cost_matrix = weighted_cost_matrix(cost_matrix)
row_ind, col_ind = match_labels(w_cost_matrix)

# Show the matched index pairs (marked "successful" in the original note).
# The total is taken over the weighted matrix, so it is not a raw sample count.
print(
    "聚类标签到真实标签的映射（聚类簇索引 -> 真实标签索引）:",
    list(zip(row_ind, col_ind)),
)
print("匹配的数量总和:", np.sum(w_cost_matrix[row_ind, col_ind]))

聚类标签到真实标签的映射（聚类簇索引 -> 真实标签索引）: [(0, 0), (1, 1), (2, 2)]
匹配的数量总和: 214.43145161290323


## Case2.2

In [19]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix

# Load the ground-truth and predicted tables for Case2.2.
truth_df = pd.read_csv("data/simulate/Case2 (FN_FP)/Case2.2/Truth_SpatialData.csv")
pred_df = pd.read_csv("data/simulate/Case2 (FN_FP)/Case2.2/BA_SpatialData.csv")

# Relabel predicted groups 'A'/'B'/'C' as '1'/'2'/'3'; any label missing from
# the mapping becomes NaN.
mapping = {'A': '1', 'B': '2', 'C': '3'}
pred_df['group_mapped'] = pred_df['group'].astype(str).map(mapping)

# Pair every truth row with its prediction through the shared 'Unnamed: 0'
# column (presumably the saved row index — verify against the CSVs).
aligned_df = truth_df.merge(pred_df[['Unnamed: 0', 'group_mapped']], on='Unnamed: 0', suffixes=('_truth', '_pred'))

unique_groups_truth = np.sort(aligned_df['group'].unique())
unique_groups_pred_mapped = np.sort(aligned_df['group_mapped'].unique())

# Count (truth, prediction) co-occurrences in one vectorized pass instead of
# incrementing a cell per row with iterrows. The reindex pins the sorted label
# order, and clearing the axis names keeps the printed table layout unchanged.
matrix = (
    pd.crosstab(aligned_df['group'], aligned_df['group_mapped'])
    .reindex(index=unique_groups_truth, columns=unique_groups_pred_mapped, fill_value=0)
    .rename_axis(index=None, columns=None)
)

print(matrix)

    1   2   3
A  31   0   0
B  12  19   0
C   0   0  31


In [20]:
# Hard-coded copy of the truth-by-prediction count table printed by the
# previous cell (rows A, B, C; columns 1, 2, 3).
cost_matrix = np.array(
    [
        [31, 0, 0],
        [12, 19, 0],
        [0, 0, 31],
    ]
)

In [21]:
# Solve the assignment on the raw (unweighted) counts.
row_ind, col_ind = match_labels(cost_matrix)

# Show the matched index pairs and the sum of the matched entries.
# NOTE(review): 'Fail' is printed unconditionally; the output recorded for this
# cell is the identity mapping, the same as the weighted run that follows —
# confirm whether this case is really meant to be a failure.
print(
    "聚类标签到真实标签的映射（聚类簇索引 -> 真实标签索引）:",
    list(zip(row_ind, col_ind)),
)
print("匹配的数量总和:", np.sum(cost_matrix[row_ind, col_ind]))
print('Fail')

聚类标签到真实标签的映射（聚类簇索引 -> 真实标签索引）: [(0, 0), (1, 1), (2, 2)]
匹配的数量总和: 81
Fail


In [22]:
# Row-reweight the current cost_matrix, then solve the assignment on the result.
w_cost_matrix = weighted_cost_matrix(cost_matrix)
row_ind, col_ind = match_labels(w_cost_matrix)

# Show the matched index pairs (marked "successful" in the original note).
# The total is taken over the weighted matrix, so it is not a raw sample count.
print(
    "聚类标签到真实标签的映射（聚类簇索引 -> 真实标签索引）:",
    list(zip(row_ind, col_ind)),
)
print("匹配的数量总和:", np.sum(w_cost_matrix[row_ind, col_ind]))

聚类标签到真实标签的映射（聚类簇索引 -> 真实标签索引）: [(0, 0), (1, 1), (2, 2)]
匹配的数量总和: 81.0


## Case3.2

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix

# NOTE(review): this cell sits under the "## Case3.2" heading but reads the
# Case2.2 files, identical to the earlier Case2.2 cell — this looks like a
# copy-paste leftover; confirm the intended Case3.2 paths.
truth_df = pd.read_csv("data/simulate/Case2 (FN_FP)/Case2.2/Truth_SpatialData.csv")
pred_df = pd.read_csv("data/simulate/Case2 (FN_FP)/Case2.2/BA_SpatialData.csv")

# Relabel predicted groups 'A'/'B'/'C' as '1'/'2'/'3'; any label missing from
# the mapping becomes NaN.
mapping = {'A': '1', 'B': '2', 'C': '3'}
pred_df['group_mapped'] = pred_df['group'].astype(str).map(mapping)

# Pair every truth row with its prediction through the shared 'Unnamed: 0'
# column (presumably the saved row index — verify against the CSVs).
aligned_df = truth_df.merge(pred_df[['Unnamed: 0', 'group_mapped']], on='Unnamed: 0', suffixes=('_truth', '_pred'))

unique_groups_truth = np.sort(aligned_df['group'].unique())
unique_groups_pred_mapped = np.sort(aligned_df['group_mapped'].unique())

# Count (truth, prediction) co-occurrences in one vectorized pass instead of
# incrementing a cell per row with iterrows. The reindex pins the sorted label
# order, and clearing the axis names keeps the printed table layout unchanged.
matrix = (
    pd.crosstab(aligned_df['group'], aligned_df['group_mapped'])
    .reindex(index=unique_groups_truth, columns=unique_groups_pred_mapped, fill_value=0)
    .rename_axis(index=None, columns=None)
)

print(matrix)