In [None]:
import os
import pandas as pd
import numpy as np
from constants import complete_city_list

# Following Xinghan's domain expertise, all features are divided into 5 groups:
#   feature_list_0 / _1 / _2 come from the "general" table,
#   feature_list_3 from the "banner" table and feature_list_4 from the "color" table.
feature_list_0 = ['vegetation', 'Street_Scale']
feature_list_1 = ['B', 'C', 'G', 'M', 'O', 'P', 'S', 'W']
feature_list_2 = [
    'Art_Deco', 'Brutalism', 'Eastern_Asian_Regional', 'Eastern_European_Regional',
    'Georgian', 'Greystone', 'High-tech',
    'International', 'Middle_Eastern_Regional',
    'Modern_high-rise_Apartment', 'Neoclassical', 'Nordic_Regional',
    'Postmodern', 'Ranch-style', 'Scandinavian_Vernacular',
    'Southeast_Asian_Regional', 'Southern_Asian_Regional',
    'Southern_European_Regional', 'Tube-shaped_Apartment', 'Victorian',
    'Western_European_Vernacular', 'Worker_Cottage',
]
general_feature_list = feature_list_0 + feature_list_1 + feature_list_2

# Banner and color feature names are taken from the header of one reference city
# file, with the identifier/coordinate columns removed.
# Only the header is needed, so `nrows=0` avoids parsing the whole CSV.
# NOTE(review): this presumes every city file shares Amsterdam's columns - confirm.
feature_list_3 = pd.read_csv(
    '/hpc2ssd/JH_DATA/spooler/xzeng159/data/banner/Amsterdam.csv', nrows=0
).columns.drop(['ID', 'lat', 'lon', 'area']).tolist()
feature_list_4 = pd.read_csv(
    '/hpc2ssd/JH_DATA/spooler/xzeng159/data/color/Amsterdam.csv', nrows=0
).columns.drop(['ID', 'Lat', 'Lon']).tolist()

all_feature_list = general_feature_list + feature_list_3 + feature_list_4

In [None]:
# Define the append_prob_matrix helper
def append_prob_matrix(df, super_set):
    """
    Append one probability column per class to `df`.

    For every row, the column named after `cls1` receives `prob1` and the column
    named after `cls2` receives `prob2`; every other class column is 0.0.
    When both labels of a row are identical, `prob2` wins (it is written last).

    Parameters:
    - df: DataFrame with the columns `cls1`, `prob1`, `cls2` and `prob2`.
    - super_set: iterable of class names; each becomes one output column.

    Returns:
    - DataFrame, `df` with a fresh 0..n-1 index followed by the class columns.
      All class columns are float64. A label that is not part of `super_set`
      gets its own trailing column that is NaN for the rows not using it.
    """
    df = df.reset_index(drop=True)  # make sure the index is unique and positional
    n_rows = len(df)

    # Labels are stringified the same way an f-string would format them.
    first_labels = [f'{value}' for value in df['cls1']]
    second_labels = [f'{value}' for value in df['cls2']]

    base_columns = list(super_set)
    columns = list(base_columns)
    seen = set(columns)
    # Labels outside super_set get a trailing column, in order of first appearance.
    for label_pair in zip(first_labels, second_labels):
        for label in label_pair:
            if label not in seen:
                seen.add(label)
                columns.append(label)

    # A float matrix from the start: the probabilities are floats, so an
    # integer-initialised frame would need an implicit dtype upcast on assignment.
    matrix = np.zeros((n_rows, len(columns)), dtype=float)
    matrix[:, len(base_columns):] = np.nan

    position = {name: j for j, name in enumerate(columns)}
    row_idx = np.arange(n_rows)
    # prob1 is written before prob2, so prob2 overrides it when both labels match.
    for labels, prob_column in ((first_labels, 'prob1'), (second_labels, 'prob2')):
        col_idx = np.fromiter((position[label] for label in labels),
                              dtype=np.intp, count=n_rows)
        matrix[row_idx, col_idx] = df[prob_column].to_numpy(dtype=float)

    prob_matrix_df = pd.DataFrame(matrix, index=df.index, columns=columns)
    return pd.concat([df, prob_matrix_df], axis=1)


def load_and_merge_city_data(city_name, base_folder):
    """
    Load and merge the general, color and banner feature tables of one city.

    Parameters:
    - city_name: str, city name; `<city_name>.csv` is looked up in each feature folder.
    - base_folder: str, root directory containing the general, color and banner folders.

    Returns:
    - DataFrame, the tables merged on `ID` plus a constant `city` column, with
      every row that contains a missing value removed. A missing file is reported
      on stdout and skipped; if no file exists the result has no rows.
    """
    # None (not an empty frame) marks "nothing loaded yet". Testing `.empty`
    # instead would let a first table that is empty after cleaning be silently
    # replaced by the next table, yielding rows that lack the earlier features.
    merged_city_data = None

    # Walk through every feature folder
    for feature in ['general', 'color', 'banner']:
        file_path = os.path.join(base_folder, feature, f'{city_name}.csv')

        # Skip (but report) feature files that do not exist
        if not os.path.exists(file_path):
            print(f"未找到文件: {file_path}")
            continue

        temp_data = pd.read_csv(file_path)
        if feature == 'general':
            temp_data = temp_data.drop(columns=['lat', 'lon'])
            temp_data = append_prob_matrix(temp_data, feature_list_2)
            temp_data = temp_data[['ID'] + general_feature_list].dropna()
        elif feature == 'color':
            # infinities are treated as missing values and dropped with them
            temp_data = temp_data.drop(columns=['Lat', 'Lon'])
            temp_data = temp_data.replace([np.inf, -np.inf], np.nan).dropna()
        elif feature == 'banner':
            temp_data = temp_data.drop(columns=['lat', 'lon', 'area'])
            temp_data = temp_data.replace([np.inf, -np.inf], np.nan).dropna()

        if merged_city_data is None:
            # the first table found becomes the left side of all later merges
            merged_city_data = temp_data
        else:
            # merge on the ID column; pandas suffixes clashing column names
            merged_city_data = pd.merge(merged_city_data, temp_data, on='ID', how='left')

    if merged_city_data is None:
        merged_city_data = pd.DataFrame()
    # assign() builds a new frame, so no loaded table is mutated in place
    merged_city_data = merged_city_data.assign(city=city_name)
    # rows left unmatched by the left merges carry NaN and are removed here
    merged_city_data = merged_city_data.dropna()
    print(f"{city_name} finished")
    return merged_city_data

In [None]:
from concurrent.futures import ProcessPoolExecutor

# Root directory passed to the loading functions below as their base folder.
base_folder = '/hpc2ssd/JH_DATA/spooler/xzeng159/data'

# Cities to load; complete_city_list is imported from the project's constants module.
city_list = complete_city_list

def process_city(city_name, data_root=None):
    """
    Worker entry point: load and merge all feature tables of one city.

    Parameters:
    - city_name: str, the city to load.
    - data_root: str or None, root data directory; None falls back to the
      module-level `base_folder`.

    Returns:
    - DataFrame, the result of `load_and_merge_city_data` for that city.
    """
    print(f"Processing {city_name}")
    if data_root is None:
        data_root = base_folder
    return load_and_merge_city_data(city_name, data_root)

# Use multiple processes to speed up loading and merging
def load_data_concurrently(city_list, base_folder, max_workers=16):
    """
    Load every city in parallel and stack the results into one DataFrame.

    Parameters:
    - city_list: iterable of city names.
    - base_folder: str, root data directory forwarded to each worker.
    - max_workers: int, size of the process pool (tune to the available cores).

    Returns:
    - DataFrame, all city frames concatenated in `city_list` order with a fresh
      index; an empty DataFrame when `city_list` is empty.
    """
    from itertools import repeat

    cities = list(city_list)
    if not cities:
        return pd.DataFrame()

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # executor.map keeps the input order; base_folder is forwarded explicitly
        # so the argument is honoured instead of the module-level global.
        city_frames = list(executor.map(process_city, cities, repeat(base_folder)))

    # One concat at the end instead of re-copying the growing frame per city.
    return pd.concat(city_frames, ignore_index=True)

# Load every city with the parallel loader and preview the first rows
all_data = load_data_concurrently(city_list, base_folder)
print(all_data.head())

In [None]:
# Total number of rows loaded across all cities
print(len(all_data))

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch.nn.functional as F


In [None]:
def create_category_mapping(series):
    """
    Build a category -> integer index lookup for a pandas Series.

    Indices follow the order in which `series.unique()` returns the values.
    """
    distinct_values = series.unique()
    return dict(zip(distinct_values, range(len(distinct_values))))

# Turn a column of category values into a tensor of their integer indices

def column_to_tensor(df, mapping):
    """
    Look up every element of `df` in `mapping` and return the indices as a tensor.

    `df` is iterated directly, so it is expected to be a column / sequence of
    category values. A value missing from `mapping` raises KeyError.
    """
    indices = []
    for category in df:
        indices.append(mapping[category])
    return torch.tensor(indices)

# Encode the city names as integer class ids (ids follow order of first appearance).
city_mapping = create_category_mapping(all_data['city'])
city_tensor = column_to_tensor(all_data['city'], city_mapping)
# NOTE: this overwrites the 'city' column in place, so re-running the cell would
# rebuild city_mapping from the integer ids instead of the city names.
all_data['city'] = city_tensor

In [None]:
# Feature matrix and labels; the label is the integer-encoded 'city' column.
X = all_data[all_feature_list].values
y = all_data['city'].values

# Hold out 20% of the rows as the test set (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2)

# Column positions of each feature group inside all_feature_list.
feature_groups = (feature_list_0, feature_list_1, feature_list_2,
                  feature_list_3, feature_list_4)
(group0_indices, group1_indices, group2_indices,
 group3_indices, group4_indices) = [
    [all_feature_list.index(name) for name in group] for group in feature_groups
]
group_indices = (group0_indices, group1_indices, group2_indices,
                 group3_indices, group4_indices)

# One independent scaler per feature group.
scaler0, scaler1, scaler2, scaler3, scaler4 = [StandardScaler() for _ in feature_groups]
scalers = (scaler0, scaler1, scaler2, scaler3, scaler4)

# Fit each scaler on its training columns only, then reuse it for the test columns.
(X_train_group0, X_train_group1, X_train_group2,
 X_train_group3, X_train_group4) = [
    scaler.fit_transform(X_train[:, columns])
    for scaler, columns in zip(scalers, group_indices)
]
(X_test_group0, X_test_group1, X_test_group2,
 X_test_group3, X_test_group4) = [
    scaler.transform(X_test[:, columns])
    for scaler, columns in zip(scalers, group_indices)
]

# Wrap everything as Torch tensors: one float tensor per group, long labels last.
train_data = TensorDataset(
    *[torch.tensor(block, dtype=torch.float)
      for block in (X_train_group0, X_train_group1, X_train_group2,
                    X_train_group3, X_train_group4)],
    torch.tensor(y_train, dtype=torch.long))
test_data = TensorDataset(
    *[torch.tensor(block, dtype=torch.float)
      for block in (X_test_group0, X_test_group1, X_test_group2,
                    X_test_group3, X_test_group4)],
    torch.tensor(y_test, dtype=torch.long))

In [None]:
from models import MLPEmbedding

# Mini-batch size shared by both loaders; only the training loader shuffles.
batch_size = 32768
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

# One input size per feature group, in the same order as the dataset tensors.
input_sizes = [len(group0_indices), len(group1_indices), len(group2_indices), len(group3_indices), len(group4_indices)]
embedding_sizes = [32, 32, 32, 32, 32]  # every group embedding uses an output size of 32
hidden_size = 128
# Number of distinct city labels present in y.
num_classes = len(np.unique(y))

# MLPEmbedding comes from the project's models module; its internals are not shown here.
model = MLPEmbedding(input_sizes, embedding_sizes, hidden_size, num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

In [None]:
from tqdm import tqdm
from utils import calculate_accuracy, calculate_metrics

# Training loop
epochs = 10
num_classes = len(np.unique(y))
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    # Each batch is (group0, ..., group4, labels); the groups are passed positionally.
    for *groups, labels in tqdm(train_loader):
        optimizer.zero_grad()
        outputs = model(*groups)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Report loss and accuracy every 5th epoch.
    if (epoch+1) % 5 == 0 and epoch != 0:
        train_accuracy = calculate_accuracy(model, train_loader)  # accuracy on the training set
        test_accuracy = calculate_accuracy(model, test_loader)  # accuracy on the test set
        print(f'Epoch {epoch+1}, Loss: {running_loss/len(train_loader)}, Train Acc: {train_accuracy*100:.2f}%, Test Acc: {test_accuracy*100:.2f}%')
        
# Per-class precision and recall after training (calculate_metrics is a project helper).
train_precision, train_recall = calculate_metrics(model, train_loader, num_classes)
test_precision, test_recall = calculate_metrics(model, test_loader, num_classes)
for i in range(num_classes):
    print(f'Class {i} - Train Precision: {train_precision[i]*100:.2f}%, Train Recall: {train_recall[i]*100:.2f}%')
    print(f'Class {i} - Test Precision: {test_precision[i]*100:.2f}%, Test Recall: {test_recall[i]*100:.2f}%')

In [None]:
# memory may overflow
# Drop the scaled per-group arrays now that train_data / test_data have been built
# from them, then ask the garbage collector to reclaim the memory.

import gc
# del scaler0, scaler1, scaler2, scaler3, scaler4
del X_train_group0, X_train_group1, X_train_group2, X_train_group3, X_train_group4
del X_test_group0, X_test_group1, X_test_group2, X_test_group3, X_test_group4
gc.collect()

In [None]:
# Generate Confusion Matrix

from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import itertools

def get_predictions_and_labels(model, data_loader):
    """
    Run `model` over every batch of `data_loader` and gather predictions and labels.

    The model is switched to eval mode and gradients are disabled. Each batch is
    unpacked as (*groups, labels); the predicted class is the index of the
    largest output along dimension 1.

    Returns:
    - (predictions, labels_list): two flat Python lists in loader order.
    """
    model.eval()
    predictions, labels_list = [], []

    with torch.no_grad():
        for batch in data_loader:
            *inputs, targets = batch
            logits = model(*inputs)
            predicted = logits.max(dim=1).indices
            predictions += list(predicted.cpu().numpy())
            labels_list += list(targets.cpu().numpy())

    return predictions, labels_list

# Predicted and true class ids for the whole test set, in loader order.
test_predictions, test_labels = get_predictions_and_labels(model, test_loader)


In [None]:
# Visualize a confusion matrix
def plot_confusion_matrix(conf_matrix, classes, normalize=False, title='Confusion Matrix', cmap=plt.cm.Blues):
    """
    Draw `conf_matrix` as an annotated heat map on the current matplotlib figure.

    Parameters:
    - conf_matrix: 2-D array, rows are true labels and columns predicted labels.
    - classes: sequence of tick labels, one per class.
    - normalize: bool, if True every row is divided by its sum before plotting.
    - title: str, figure title.
    - cmap: matplotlib colormap used for the cells.
    """
    if normalize:
        row_totals = conf_matrix.sum(axis=1)[:, np.newaxis]
        # A class without any true sample has a zero row sum; keep that row at 0
        # instead of dividing by zero and filling it with NaN.
        conf_matrix = np.divide(conf_matrix.astype('float'), row_totals,
                                out=np.zeros(conf_matrix.shape, dtype=float),
                                where=row_totals != 0)
    plt.imshow(conf_matrix, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text so they stay readable.
    thresh = conf_matrix.max() / 2.
    for i, j in itertools.product(range(conf_matrix.shape[0]), range(conf_matrix.shape[1])):
        plt.text(j, i, format(conf_matrix[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if conf_matrix[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so the axis labels are taken into account.
    plt.tight_layout()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
from sklearn.metrics import confusion_matrix

# Build the confusion matrix.
# The label set is pinned to 0..num_classes-1 so the matrix is always
# num_classes x num_classes, even if some class never occurs in the test
# labels or predictions; otherwise it could be smaller than the list of
# class names used to label it below.
conf_matrix = confusion_matrix(test_labels, test_predictions, labels=np.arange(num_classes))

# Visualize a confusion matrix
def plot_confusion_matrix(conf_matrix, classes, normalize=False, title='Confusion Matrix', cmap=plt.cm.Blues):
    """
    Draw `conf_matrix` as an annotated heat map on the current matplotlib figure.

    Parameters:
    - conf_matrix: 2-D array, rows are true labels and columns predicted labels.
    - classes: sequence of tick labels, one per class.
    - normalize: bool, if True every row is divided by its sum before plotting.
    - title: str, figure title.
    - cmap: matplotlib colormap used for the cells.
    """
    if normalize:
        row_totals = conf_matrix.sum(axis=1)[:, np.newaxis]
        # A class without any true sample has a zero row sum; keep that row at 0
        # instead of dividing by zero and filling it with NaN.
        conf_matrix = np.divide(conf_matrix.astype('float'), row_totals,
                                out=np.zeros(conf_matrix.shape, dtype=float),
                                where=row_totals != 0)
    plt.imshow(conf_matrix, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text so they stay readable.
    thresh = conf_matrix.max() / 2.
    for i, j in itertools.product(range(conf_matrix.shape[0]), range(conf_matrix.shape[1])):
        plt.text(j, i, format(conf_matrix[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if conf_matrix[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so the axis labels are taken into account.
    plt.tight_layout()

# Save a confusion matrix as a CSV file
def save_confusion_matrix_to_csv(conf_matrix, class_names, filename):
    """
    Write `conf_matrix` to `filename` as CSV, labelled with `class_names`.

    Parameters:
    - conf_matrix: 2-D array with one row and one column per class.
    - class_names: sequence used as both the row index and the column header.
    - filename: output path; a missing parent directory is created first.
    """
    if isinstance(filename, (str, os.PathLike)):
        parent_dir = os.path.dirname(filename)
        if parent_dir:
            # to_csv does not create directories, so a fresh checkout would fail here
            os.makedirs(parent_dir, exist_ok=True)
    df_cm = pd.DataFrame(conf_matrix, index=class_names, columns=class_names)
    df_cm.to_csv(filename)

# Class names, ordered by the integer id each city received in city_mapping.
# Indexing city_list by position is only right while every listed city
# contributed rows; inverting the mapping is right regardless.
# NOTE: city_mapping must still map city names to ids (build it only once).
index_to_city = {idx: name for name, idx in city_mapping.items()}
class_names = [index_to_city[i] for i in range(num_classes)]

# Make sure the output directory exists before anything is written into it.
os.makedirs('./707', exist_ok=True)

# Save the raw (count) confusion matrix as CSV
save_confusion_matrix_to_csv(conf_matrix, class_names, './707/confusion_matrix.csv')
# Row-normalise; a row without any true sample stays 0 instead of becoming NaN.
row_totals = conf_matrix.sum(axis=1)[:, np.newaxis]
conf_matrix_normalized = np.divide(conf_matrix.astype('float'), row_totals,
                                   out=np.zeros(conf_matrix.shape, dtype=float),
                                   where=row_totals != 0)
conf_matrix_normalized = np.round(conf_matrix_normalized, 2)  # keep two decimals
save_confusion_matrix_to_csv(conf_matrix_normalized, class_names, './707/normalize_confusion_matrix.csv')

# Plot the raw confusion matrix
plt.figure(figsize=(50, 50))
plot_confusion_matrix(conf_matrix, classes=class_names, title='Confusion matrix, without normalization')
plt.savefig('./707/1.png')

# Plot the normalised confusion matrix (plot_confusion_matrix normalises internally)
plt.figure(figsize=(50, 50))
plot_confusion_matrix(conf_matrix, classes=class_names, normalize=True, title='Normalized confusion matrix')
plt.savefig('./707/2.png')


In [None]:
def collect_data(data_loader):
    """
    Materialise a whole data loader as NumPy arrays.

    Each batch is unpacked as (*groups, labels). The result is a list with one
    array per group (all batches concatenated along axis 0) and one array with
    all labels. The order is whatever order the loader yields.
    """
    group_batches, label_batches = [], []
    for batch in data_loader:
        *batch_groups, batch_labels = batch
        group_batches.append(batch_groups)
        label_batches.append(batch_labels)

    stacked_groups = []
    for position in range(len(group_batches[0])):
        pieces = [batch_groups[position].numpy() for batch_groups in group_batches]
        stacked_groups.append(np.concatenate(pieces, axis=0))

    stacked_labels = np.concatenate(label_batches, axis=0)
    return stacked_groups, stacked_labels

# Collect the intermediate-layer embeddings together with their labels.
# train_loader is created with shuffle=True, so every pass over it yields a
# different row order. Labels and embeddings are therefore gathered in the SAME
# pass; gathering them in two separate passes pairs each embedding with the
# label of an unrelated row.
model.eval()  # make sure the model is in evaluation mode
embeddings = []
label_batches = []
with torch.no_grad():
    for *batch_groups, batch_labels in tqdm(train_loader):
        # the batches are already tensors; .to() converts the dtype only if needed
        inputs = [g.to(torch.float32) for g in batch_groups]
        embedding = model(*inputs, return_embedding=True).cpu().numpy()
        embeddings.append(embedding)
        label_batches.append(batch_labels.cpu().numpy())
embeddings = np.concatenate(embeddings, axis=0)
labels = np.concatenate(label_batches, axis=0)

In [None]:
# TSNE is otherwise imported only by a later cell, so a top-to-bottom run
# would hit a NameError here.
from sklearn.manifold import TSNE

# embeddings and labels must have the same length at this point
assert len(embeddings) == len(labels), "The embeddings and labels should have the same length."

# Reduce the embeddings to 2-D with t-SNE
perplexity = 30
tsne = TSNE(n_components=2, verbose=1, perplexity=perplexity)
tsne_results = tsne.fit_transform(embeddings)

# Visualize.
# Class names follow the integer ids stored in city_mapping; indexing city_list
# by position is only right while every listed city contributed rows.
index_to_city = {idx: name for name, idx in city_mapping.items()}
class_names = [index_to_city[i] for i in range(len(np.unique(labels)))]
num_classes = len(class_names)
os.makedirs('./520', exist_ok=True)  # savefig does not create directories
plt.figure(figsize=(32,20))
for class_idx in range(num_classes):
    indices = labels == class_idx
    plt.scatter(tsne_results[indices, 0], tsne_results[indices, 1], label=f'{class_names[class_idx]}', alpha=0.1)
plt.title(f'All data points -- perplexity: {perplexity}')
plt.legend()
plt.savefig(f'./520/all_points_perplexity-{perplexity}.png')
plt.show()

In [None]:
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt


def sample_data(data_loader, sample_size):
    """
    Draw a random sample, without replacement, from the FIRST batch of a loader.

    Parameters:
    - data_loader: iterable yielding (*groups, labels) batches.
    - sample_size: int, requested number of rows; capped at the size of the
      first batch, since only that batch is sampled.

    Returns:
    - (groups_sampled, labels_sampled): the list of group tensors and the label
      tensor, all restricted to the same sampled rows.

    Raises:
    - ValueError: if the loader yields no batch at all.
    """
    for *groups, labels in data_loader:
        n_available = len(labels)
        # replace=False cannot draw more rows than the batch holds
        n_take = min(sample_size, n_available)
        sampled_indices = np.random.choice(n_available, size=n_take, replace=False)
        groups_sampled = [group[sampled_indices] for group in groups]
        labels_sampled = labels[sampled_indices]
        # only the first batch is used
        return groups_sampled, labels_sampled
    raise ValueError("data_loader yielded no batches to sample from")

# Sample the data
sample_size = 30000  # how many points to sample
# Sampling draws from the first test batch only (see sample_data).
groups_sampled, labels_sampled = sample_data(test_loader, sample_size)

# Collect the intermediate-layer output
model.eval()  # make sure the model is in evaluation mode
with torch.no_grad():
    embeddings = model(*groups_sampled, return_embedding=True).cpu().numpy()

# Reduce the embeddings to 2-D with t-SNE
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` -
# confirm against the installed version.
perplexity = 20
tsne = TSNE(n_components=2, verbose=1, perplexity=perplexity, n_iter=300)
tsne_results = tsne.fit_transform(embeddings)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Assumes these variables were defined by earlier cells:
# tsne_results, labels_sampled, sample_size, perplexity
# NOTE(review): culture_group and culture_group_labels are not defined in the
# visible cells; they must be provided before this cell runs.

# Class names follow the integer ids stored in city_mapping; indexing city_list
# by position is only right while every listed city contributed rows.
index_to_city = {idx: name for name, idx in city_mapping.items()}
class_names = [index_to_city[i] for i in range(len(index_to_city))]

# Colour per distinct culture-group label. dict.fromkeys keeps first-seen order,
# so the colour assignment is stable between runs (set order is not).
unique_labels = list(dict.fromkeys(culture_group_labels))
colors = plt.cm.rainbow(np.linspace(0, 1, len(unique_labels)))  # one colour per label
color_map = dict(zip(unique_labels, colors))  # label -> colour

plt.figure(figsize=(32, 20))
sampled_label_array = labels_sampled.numpy()

# Iterate over the class ids actually present in the sample. The sample may
# miss some classes, so range(number of distinct ids) would visit the wrong
# ids; the global num_classes is deliberately left untouched.
for class_idx in np.unique(sampled_label_array):
    city = index_to_city[int(class_idx)]
    indices = sampled_label_array == class_idx
    label = None
    for i in range(len(culture_group)):
        if city in culture_group[i]:
            label = culture_group_labels[i]
            break
    if label is None:
        # raise instead of exit(): exit() would shut down the notebook kernel
        raise ValueError(f"City {city!r} is not assigned to any culture group")

    # look up the colour for this culture group
    color = color_map.get(label, 'k')  # fall back to black for unknown labels
    plt.scatter(tsne_results[indices, 0], tsne_results[indices, 1], label=label, color=color, alpha=0.3)

# Show every culture label only once in the legend. A separate name is used so
# the `labels` array from the embedding cells is not overwritten.
handles, legend_labels = plt.gca().get_legend_handles_labels()
by_label = dict(zip(legend_labels, handles))  # drops duplicate labels
plt.legend(by_label.values(), by_label.keys())

plt.title(f'Randomly sampled {sample_size} points -- perplexity: {perplexity}')
os.makedirs('./520', exist_ok=True)  # savefig does not create directories
plt.savefig(f'./520/{sample_size}points_perplexity-{perplexity}.png')
plt.show()


In [None]:
import pandas as pd
import plotly.express as px
from IPython.display import HTML


# Class names follow the integer ids stored in city_mapping. Building the list
# from city_list positions and the current num_classes breaks when num_classes
# was re-assigned from a sample, or when a listed city contributed no rows.
index_to_city = {idx: name for name, idx in city_mapping.items()}
class_names = [index_to_city[i] for i in range(len(index_to_city))]
# Assumes these variables were defined by earlier cells:
# tsne_results, labels_sampled, sample_size, perplexity, culture_group, culture_group_labels

df_tsne = pd.DataFrame(tsne_results, columns=['tsne-2d-one', 'tsne-2d-two'])
df_tsne['city_label'] = [class_names[i] for i in labels_sampled.numpy()]

# Add a column holding the culture-group label of every point
df_tsne['culture_label'] = None  # initialise the culture-group column
for i in range(len(culture_group)):
    df_tsne.loc[df_tsne['city_label'].isin(culture_group[i]), 'culture_label'] = culture_group_labels[i]

# Optional check that every city has a culture-group label
# assert not df_tsne['culture_label'].isnull().any(), "Some city labels don't have a corresponding culture group label."

# Interactive scatter plot coloured by culture-group label
fig = px.scatter(df_tsne, x='tsne-2d-one', y='tsne-2d-two', color='culture_label',
                 title=f'Randomly Sampled {sample_size} Points -- Perplexity: {perplexity}',
                 labels={"culture_label": "Culture Group Label"})

fig.update_layout(width=1800, height=1200)
# To save a static image use write_image, which needs the extra kaleido package
# fig.write_image(f'./506/{sample_size}points_perplexity-{perplexity}.png')
HTML(fig.to_html())


In [None]:
# Export the current plotly figure as a standalone HTML file.
fig.write_html('./test.html')

In [None]:
# Attach the city name of every sampled point and display the column.
df_tsne['label'] = [class_names[i] for i in labels_sampled.numpy()]
df_tsne['label']

In [None]:
import pandas as pd
# Rebuild the 2-D t-SNE frame, this time labelled by city name.
df_tsne = pd.DataFrame(tsne_results, columns=['tsne-2d-one', 'tsne-2d-two'])
df_tsne['label'] = [class_names[i] for i in labels_sampled.numpy()]

# Interactive scatter plot with plotly.express, coloured by city
fig = px.scatter(df_tsne, x='tsne-2d-one', y='tsne-2d-two', color='label',
                 title=f'Randomly Sampled {sample_size} Points -- Perplexity: {perplexity}',
                 labels={"label": "City Label"})

fig.update_layout(width=1800, height=1200)
# To save a static image use write_image, which needs the extra kaleido package
# fig.write_image(f'./506/{sample_size}points_perplexity-{perplexity}.png')
HTML(fig.to_html())

In [None]:
# Export the city-coloured plotly figure as a standalone HTML file.
fig.write_html('./test_city.html')