In [1]:

from tqdm import tqdm
from glob import glob
import os
import pandas as pd
import numpy as np
import gc
import datetime
import argparse
import h3


In [2]:
# Constants
ROOTFOLDER = "/lustre1/g/geog_pyloo/05_timemachine"
VALFOLDER = (
    "/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier_img_yolo8_inf_dir"
)
CURATED_FOLDER = (
    "/lustre1/g/geog_pyloo/05_timemachine/_curated/c_city_classifiier_prob"
)
TRAIN_TEST_FOLDER = "/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier_img_yolo8"
RAW_PATH = "/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_rgb/{city}/gsvmeta/{city}_meta.csv"

PANO_PATH = "{ROOTFOLDER}/GSV/gsv_rgb/{cityabbr}/gsvmeta/gsv_pano.csv"
PATH_PATH = "{ROOTFOLDER}/GSV/gsv_rgb/{cityabbr}/gsvmeta/gsv_path.csv"

CURATE_FOLDER_SOURCE = "/lustre1/g/geog_pyloo/05_timemachine/_curated/c_city_classifiier_prob_hex_summary"
CURATE_FOLDER_EXPORT = "/lustre1/g/geog_pyloo/05_timemachine/_curated/c_city_classifiier_prob_similarity"

if not os.path.exists(CURATE_FOLDER_EXPORT):
    os.makedirs(CURATE_FOLDER_EXPORT)
    
vector_ls = [str(x) for x in range(0, 127)]


In [None]:
# create scripts for batch processing
lines = """python /home/yuanzf/uvi-time-machine/_script/A-city-never-was/B5_prob_vector_summary.py --city {city}"""
city_meta = pd.read_csv("/home/yuanzf/uvi-time-machine/_script/city_meta.csv")
city_ls = city_meta.City.values
# split the cities into four groups to run the script in parallel
N = len(city_ls) // 10
for i in range(N):
    with open(f"run_b5_{i}.sh", "w") as f:
        for city in city_ls[i*10:(i+1)*10]:
            f.write(lines.format(city=city) + "\n")

# Construct similarity indexes

In [None]:
# construct the similarity matrix among all cities
# load the results first
# compute the similarity matrix among all cells
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import gc

RES_EXCLUDE = 11

# OUTPUT_FILE_NAME = "prob_city={city}_res_exclude={res_exclude}.parquet"
def load_all(res_sel,
             res_exclude=RES_EXCLUDE,
             ):
    files = glob(CURATE_FOLDER_SOURCE + f"/*res_exclude={res_exclude}.parquet")
    print(len(files))
    df_all = []
    for f in files:
        temp = pd.read_parquet(f)
        temp = temp[temp.res == res_sel].reset_index(drop = True)
        temp['city'] = os.path.basename(f).split("_")[1].replace("city=", "")
        df_all.append(temp)
    df_all = pd.concat(df_all).drop_duplicates('hex_id').reset_index(drop = True)
    df_all = df_all.drop(columns = ["res"])
    print("Data loaded", df_all.shape[0])
    n_cells = df_all.shape[0]
    X = df_all[vector_ls].values
    # create a new dataframe that has shape of (n_cells, 2) to store the similarity matrix
    # compute the similarity matrix
    similarity_matrix = cosine_similarity(X)
    print("Similarity matrix computed", similarity_matrix.shape)

    # only keep the upper triangle of the matrix
    similarity_matrix = np.triu(similarity_matrix, k=1)
    print("Upper triangle extracted", similarity_matrix.shape)
    
    gc.collect()
    hex_ls = df_all.hex_id.values
    similarity_df = pd.DataFrame(similarity_matrix, index = hex_ls, columns = hex_ls)
    gc.collect()
    similarity_df = similarity_df.stack()
    similarity_df = pd.DataFrame(similarity_df).reset_index()
    similarity_df.columns = ["hex_id1", "hex_id2", "similarity"]

    gc.collect()
    similarity_df = similarity_df.merge(df_all[['hex_id', 'city']].drop_duplicates(), left_on = 'hex_id1', right_on = 'hex_id')\
        .drop(["hex_id"], axis = 1)\
        .merge(df_all[['hex_id', 'city']].drop_duplicates(), left_on = "hex_id2", right_on = 'hex_id', suffixes = ["_1", "_2"])\
            .drop(["hex_id"], axis = 1)
    for city in city_ls:
        temp = similarity_df[(similarity_df.city_1 == city)]
        temp.to_parquet(os.path.join(CURATE_FOLDER_EXPORT, f'similarity_connection_res={res_sel}_city={city}.parquet'))
    print("Similarity df saved city by city")

    summarydf = similarity_df.groupby(['city_1', 'city_2']).size().reset_index()
    summarydf.to_csv(os.path.join(CURATE_FOLDER_EXPORT, f'similarity_summary_connection_res={res_sel}.csv'))
    
    return summarydf

In [36]:
import gc
gc.collect()

0

In [37]:
def create_city_pair_ls():
    city_meta = pd.read_csv("/home/yuanzf/uvi-time-machine/_script/city_meta.csv")
    city_ls = city_meta.City.values
    pair_ls = np.array(np.meshgrid(city_ls, city_ls)).T.reshape(-1, 2)
    pair_ls = pair_ls[pair_ls[:, 0] != pair_ls[:, 1]]  # exclude same city
    return pair_ls

pair_ls = create_city_pair_ls()

In [39]:
pair_ls.shape

(16002, 2)

In [34]:
# convert the similarity matrix to a dataframe
res_exclude = 11
city_1 = "Hong Kong"
city_2 = "Sheffield"
filename = "prob_city={city_name}_res_exclude={res_exclude}.parquet"

for res_sel in [6]:
    print(f"Processing res_sel={res_sel}")
    files = glob(
        CURATE_FOLDER_SOURCE
        + "/"
        + filename.format(city_name=city_1, res_exclude=res_exclude)
    ) + glob(
        CURATE_FOLDER_SOURCE
        + "/"
        + filename.format(city_name=city_2, res_exclude=res_exclude)
    )
    print(len(files))
    df_all = []
    for f in files:
        temp = pd.read_parquet(f)
        temp = temp[temp.res == res_sel].reset_index(drop = True)
        temp['city'] = os.path.basename(f).split("_")[1].replace("city=", "")
        df_all.append(temp)
    df_all = pd.concat(df_all).drop_duplicates('hex_id').reset_index(drop = True)
    df_all = df_all.drop(columns = ["res"])
    print("Data loaded", df_all.shape[0])
    n_cells = df_all.shape[0]
    X = df_all[vector_ls].values
    # create a new dataframe that has shape of (n_cells, 2) to store the similarity matrix
    # compute the similarity matrix
    similarity_matrix = cosine_similarity(X)
    print("Similarity matrix computed", similarity_matrix.shape)

    # only keep the upper triangle of the matrix
    similarity_matrix = np.triu(similarity_matrix, k=1)
    print("Upper triangle extracted", similarity_matrix.shape)

    gc.collect()
    hex_ls = df_all.hex_id.values
    similarity_df = pd.DataFrame(similarity_matrix, index = hex_ls, columns = hex_ls)
    gc.collect()
    similarity_df = similarity_df.stack()
    similarity_df = pd.DataFrame(similarity_df).reset_index()
    similarity_df.columns = ["hex_id1", "hex_id2", "similarity"]

    gc.collect()
    # similarity_df = similarity_df.merge(df_all[['hex_id', 'city']].drop_duplicates(), left_on = 'hex_id1', right_on = 'hex_id')\
    #     .drop(["hex_id"], axis = 1)\
    #     .merge(df_all[['hex_id', 'city']].drop_duplicates(), left_on = "hex_id2", right_on = 'hex_id', suffixes = ["_1", "_2"])\
    #         .drop(["hex_id"], axis = 1)
    # city_ls = df_all.city.unique()
    # for city in city_ls:
    #     temp = similarity_df[(similarity_df.city_1 == city)]
    #     temp.to_parquet(os.path.join(CURATE_FOLDER_EXPORT, f'similarity_connection_res={res_sel}_city={city}.parquet'))
    # print("Similarity df saved city by city")

    # summarydf = similarity_df.groupby(['city_1', 'city_2']).size().reset_index()
    # summarydf.to_csv(os.path.join(CURATE_FOLDER_EXPORT, f'similarity_summary_connection_res={res_sel}.csv'))

Processing res_sel=6
2
Data loaded 73
Similarity matrix computed (73, 73)
Upper triangle extracted (73, 73)


In [35]:
similarity_df[(similarity_df["hex_id1"] == hex_1) & (similarity_df["hex_id2"] == hex_2)]

Unnamed: 0,hex_id1,hex_id2,similarity
57,864103487ffffff,8619420cfffffff,0.005752


In [9]:
city_ls = df_all.city.unique()
for city in city_ls:
    temp = similarity_df[(similarity_df.city_1 == city)]
    temp.to_parquet(os.path.join(CURATE_FOLDER_EXPORT, f'similarity_connection_res={res_sel}_city={city}.parquet'))
print("Similarity df saved city by city")

summarydf = similarity_df.groupby(['city_1', 'city_2']).size().reset_index()
summarydf.to_csv(os.path.join(CURATE_FOLDER_EXPORT, f'similarity_summary_connection_res={res_sel}.csv'))

Similarity df saved city by city


# QC.1. Why two vectors have 0 cosine similarity

In [3]:
city = "Hong Kong"
res_sel = 6
temp = pd.read_parquet(
    os.path.join(
        CURATE_FOLDER_EXPORT, f"similarity_connection_res={res_sel}_city={city}.parquet"
    )
)

In [23]:
temp = temp.reset_index(drop=True)
temp.head()

Unnamed: 0,hex_id1,hex_id2,similarity,city_1,city_2
0,864103487ffffff,8619420c7ffffff,0.0,Hong Kong,Sheffield
1,864103487ffffff,8619420cfffffff,0.0,Hong Kong,Sheffield
2,864103487ffffff,8619420d7ffffff,0.0,Hong Kong,Sheffield
3,864103487ffffff,8619420dfffffff,0.0,Hong Kong,Sheffield
4,864103487ffffff,8619420e7ffffff,0.0,Hong Kong,Sheffield


In [24]:
hex_1 = '864103487ffffff'
hex_2 = "8619420cfffffff"
city_1 = "Hong Kong"
city_2 = "Sheffield"
RES_EXCLUDE = 11
filename = "prob_city={city_name}_res_exclude={res_exclude}.parquet"
# load the probablity vectors for the two hexagons
temp1 = pd.read_parquet(
    os.path.join(
        CURATE_FOLDER_SOURCE,
        filename.format(city_name=city_1, res_exclude=RES_EXCLUDE),
    )
)
vec1 = temp1[temp1.hex_id == hex_1][vector_ls].values[0]
temp2 = pd.read_parquet(
    os.path.join(
        CURATE_FOLDER_SOURCE,
        filename.format(city_name=city_2, res_exclude=RES_EXCLUDE),
    )
)
vec2 = temp2[temp2.hex_id == hex_2][vector_ls].values[0]
# compute the cosine similarity

In [25]:
vec1.shape

(127,)

In [26]:
vec2.shape

(127,)

In [27]:
# manual check the cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity([vec2], [vec1])

array([[0.00575179]], dtype=float32)

# QC.2. Why the cosine similarity is asymetrical?