In [1]:

from tqdm import tqdm
from glob import glob
import os
import pandas as pd
import numpy as np
import gc
import datetime
import argparse
import h3


In [2]:
# Constants
# Root of the project tree on the cluster file system.
ROOTFOLDER = "/lustre1/g/geog_pyloo/05_timemachine"
VALFOLDER = (
    "/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier_img_yolo8_inf_dir"
)
# NOTE: "classifiier" (double "i") is part of the folder names below; it is
# kept verbatim because these strings are used as file-system paths.
CURATED_FOLDER = (
    "/lustre1/g/geog_pyloo/05_timemachine/_curated/c_city_classifiier_prob"
)
TRAIN_TEST_FOLDER = "/lustre1/g/geog_pyloo/05_timemachine/_transformed/t_classifier_img_yolo8"
# Template with a {city} placeholder (to be filled with str.format).
RAW_PATH = "/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_rgb/{city}/gsvmeta/{city}_meta.csv"

# Templates with {ROOTFOLDER} and {cityabbr} placeholders; these are plain
# strings (not f-strings), so ROOTFOLDER is NOT substituted until .format() is called.
PANO_PATH = "{ROOTFOLDER}/GSV/gsv_rgb/{cityabbr}/gsvmeta/gsv_pano.csv"
PATH_PATH = "{ROOTFOLDER}/GSV/gsv_rgb/{cityabbr}/gsvmeta/gsv_path.csv"

# Input folder globbed for per-city parquet files, and output folder for the similarity exports.
CURATE_FOLDER_SOURCE = "/lustre1/g/geog_pyloo/05_timemachine/_curated/c_city_classifiier_prob_hex_summary"
CURATE_FOLDER_EXPORT = "/lustre1/g/geog_pyloo/05_timemachine/_curated/c_city_classifiier_prob_similarity"

# exist_ok=True avoids the check-then-create race when several jobs start at once.
os.makedirs(CURATE_FOLDER_EXPORT, exist_ok=True)

# Column names "0" .. "126" used to select the vector columns from the hex summary frames.
vector_ls = [str(x) for x in range(0, 127)]


In [None]:
# Create shell scripts for batch processing: one command line per city,
# grouped into run_b5_<i>.sh files so the groups can be run in parallel.
lines = """python /home/yuanzf/uvi-time-machine/_script/A-city-never-was/B5_prob_vector_summary.py --city {city}"""
city_meta = pd.read_csv("/home/yuanzf/uvi-time-machine/_script/city_meta.csv")
city_ls = city_meta.City.values
# Number of cities written to each script.
CITIES_PER_SCRIPT = 10
# Ceiling division: plain floor division would silently drop the last,
# partially filled group whenever len(city_ls) is not a multiple of the chunk size.
N = -(-len(city_ls) // CITIES_PER_SCRIPT)
for i in range(N):
    with open(f"run_b5_{i}.sh", "w") as f:
        for city in city_ls[i * CITIES_PER_SCRIPT:(i + 1) * CITIES_PER_SCRIPT]:
            f.write(lines.format(city=city) + "\n")

# Construct similarity indexes

In [None]:
# construct the similarity matrix among all cities
# load the results first
# compute the similarity matrix among all cells
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import gc

RES_EXCLUDE = 11

# OUTPUT_FILE_NAME = "prob_city={city}_res_exclude={res_exclude}.parquet"
def load_all(res_sel,
             res_exclude=RES_EXCLUDE,
             ):
    """Compute and export pairwise cosine similarity between all hex cells.

    Reads every ``*res_exclude={res_exclude}.parquet`` file found in
    ``CURATE_FOLDER_SOURCE``, keeps the rows whose ``res`` column equals
    ``res_sel``, tags each row with the city parsed from the file name, and
    de-duplicates on ``hex_id``. The cosine similarity between the
    ``vector_ls`` columns of all remaining cells is reshaped into a long
    table (``hex_id1``, ``hex_id2``, ``similarity``, ``city_1``, ``city_2``).

    Side effects: writes one
    ``similarity_connection_res={res_sel}_city={city}.parquet`` file per city
    (rows whose ``city_1`` is that city) and one
    ``similarity_summary_connection_res={res_sel}.csv`` file into
    ``CURATE_FOLDER_EXPORT``.

    Parameters
    ----------
    res_sel
        Value matched against the ``res`` column (presumably the H3
        resolution - confirm against the producer of the source files).
    res_exclude
        Value used in the ``res_exclude=`` part of the source file names.

    Returns
    -------
    pandas.DataFrame
        Row counts of the long table per ``(city_1, city_2)`` pair.

    Raises
    ------
    ValueError
        If no source file matches the glob pattern.
    """
    files = glob(CURATE_FOLDER_SOURCE + f"/*res_exclude={res_exclude}.parquet")
    print(len(files))
    if not files:
        # pd.concat would raise an opaque "No objects to concatenate" here.
        raise ValueError(
            f"No source files matching res_exclude={res_exclude} in {CURATE_FOLDER_SOURCE}"
        )

    frames = []
    for f in files:
        temp = pd.read_parquet(f)
        temp = temp[temp.res == res_sel].reset_index(drop = True)
        # City is the second "_"-separated token of the file name;
        # assumes city names themselves contain no underscore - TODO confirm.
        temp['city'] = os.path.basename(f).split("_")[1].replace("city=", "")
        frames.append(temp)
    df_all = pd.concat(frames).drop_duplicates('hex_id').reset_index(drop = True)
    df_all = df_all.drop(columns = ["res"])
    print("Data loaded", df_all.shape[0])
    X = df_all[vector_ls].values

    # compute the similarity matrix
    similarity_matrix = cosine_similarity(X)
    print("Similarity matrix computed", similarity_matrix.shape)

    # only keep the upper triangle of the matrix
    # NOTE(review): np.triu zeroes the diagonal and lower triangle rather than
    # removing them, so the stack() below still yields one row per ordered
    # pair (those entries carry similarity 0). Kept as-is to preserve the
    # exported files; confirm whether downstream code relies on it.
    similarity_matrix = np.triu(similarity_matrix, k=1)
    print("Upper triangle extracted", similarity_matrix.shape)

    gc.collect()
    hex_ls = df_all.hex_id.values
    similarity_df = pd.DataFrame(similarity_matrix, index = hex_ls, columns = hex_ls)
    gc.collect()
    # wide (hex x hex) -> long (hex_id1, hex_id2, similarity)
    similarity_df = similarity_df.stack()
    similarity_df = pd.DataFrame(similarity_df).reset_index()
    similarity_df.columns = ["hex_id1", "hex_id2", "similarity"]

    gc.collect()
    # Attach the city of each end of the pair; the shared "city" column of the
    # second merge is disambiguated into city_1 / city_2 by the suffixes.
    hex_city = df_all[['hex_id', 'city']].drop_duplicates()
    similarity_df = similarity_df.merge(hex_city, left_on = 'hex_id1', right_on = 'hex_id')\
        .drop(["hex_id"], axis = 1)\
        .merge(hex_city, left_on = "hex_id2", right_on = 'hex_id', suffixes = ["_1", "_2"])\
            .drop(["hex_id"], axis = 1)

    # Derive the city list from the loaded data instead of relying on a
    # notebook-level `city_ls` that may not exist (or may list other cities).
    city_ls = df_all.city.unique()
    for city in city_ls:
        temp = similarity_df[(similarity_df.city_1 == city)]
        temp.to_parquet(os.path.join(CURATE_FOLDER_EXPORT, f'similarity_connection_res={res_sel}_city={city}.parquet'))
    print("Similarity df saved city by city")

    summarydf = similarity_df.groupby(['city_1', 'city_2']).size().reset_index()
    summarydf.to_csv(os.path.join(CURATE_FOLDER_EXPORT, f'similarity_summary_connection_res={res_sel}.csv'))

    return summarydf

In [5]:
import gc
# Force a garbage-collection pass; the value shown as the cell output is
# gc.collect()'s return value.
gc.collect()

476

In [10]:
# Build the long-form hex-to-hex similarity table at notebook level.
# df_all, similarity_df and res_sel are intentionally left as globals.
res_exclude = 11
for res_sel in [7]:
    print(f"Processing res_sel={res_sel}")
    files = glob(CURATE_FOLDER_SOURCE + f"/*res_exclude={res_exclude}.parquet")
    print(len(files))

    # One frame per source file, restricted to the selected resolution and
    # tagged with the city parsed from the file name.
    df_all = []
    for parquet_path in files:
        city_cells = pd.read_parquet(parquet_path)
        city_cells = city_cells[city_cells.res == res_sel].reset_index(drop=True)
        city_cells['city'] = os.path.basename(parquet_path).split("_")[1].replace("city=", "")
        df_all.append(city_cells)
    df_all = (
        pd.concat(df_all)
        .drop_duplicates('hex_id')
        .reset_index(drop=True)
        .drop(columns=["res"])
    )
    n_cells = df_all.shape[0]
    print("Data loaded", n_cells)

    # Cosine similarity between the vector columns of every pair of cells.
    X = df_all[vector_ls].values
    similarity_matrix = cosine_similarity(X)
    print("Similarity matrix computed", similarity_matrix.shape)

    # Zero out the diagonal and everything below it.
    similarity_matrix = np.triu(similarity_matrix, k=1)
    print("Upper triangle extracted", similarity_matrix.shape)

    gc.collect()
    hex_ls = df_all.hex_id.values
    similarity_df = pd.DataFrame(similarity_matrix, index=hex_ls, columns=hex_ls)
    gc.collect()
    # Wide (hex x hex) -> long (hex_id1, hex_id2, similarity).
    similarity_df = similarity_df.stack().reset_index()
    similarity_df.columns = ["hex_id1", "hex_id2", "similarity"]

    gc.collect()
    # Look up the city for both ends of each pair; the second merge produces
    # city_1 / city_2 through the suffixes.
    hex_city = df_all[['hex_id', 'city']].drop_duplicates()
    similarity_df = (
        similarity_df
        .merge(hex_city, left_on='hex_id1', right_on='hex_id')
        .drop(["hex_id"], axis=1)
        .merge(hex_city, left_on="hex_id2", right_on='hex_id', suffixes=["_1", "_2"])
        .drop(["hex_id"], axis=1)
    )

    # One parquet file per city, holding the rows whose first cell is in that city.
    city_ls = df_all.city.unique()
    for city in city_ls:
        city_file = os.path.join(CURATE_FOLDER_EXPORT, f'similarity_connection_res={res_sel}_city={city}.parquet')
        similarity_df[similarity_df.city_1 == city].to_parquet(city_file)
    print("Similarity df saved city by city")

    # Row counts per (city_1, city_2) pair.
    summarydf = similarity_df.groupby(['city_1', 'city_2']).size().reset_index()
    summary_file = os.path.join(CURATE_FOLDER_EXPORT, f'similarity_summary_connection_res={res_sel}.csv')
    summarydf.to_csv(summary_file)

Processing res_sel=7
112
Data loaded 23886
Similarity matrix computed (23886, 23886)
Upper triangle extracted (23886, 23886)
Similarity df saved city by city


In [9]:
# Export step only: relies on df_all, similarity_df and res_sel already
# existing in the kernel (they are not defined in this cell).
city_ls = df_all.city.unique()
for city in city_ls:
    # rows whose first cell belongs to this city
    city_file = os.path.join(CURATE_FOLDER_EXPORT, f'similarity_connection_res={res_sel}_city={city}.parquet')
    similarity_df[similarity_df.city_1 == city].to_parquet(city_file)
print("Similarity df saved city by city")

# Row counts per (city_1, city_2) pair, written next to the per-city files.
summary_file = os.path.join(CURATE_FOLDER_EXPORT, f'similarity_summary_connection_res={res_sel}.csv')
summarydf = similarity_df.groupby(['city_1', 'city_2']).size().reset_index()
summarydf.to_csv(summary_file)

Similarity df saved city by city


In [16]:
# Notebook-level values; get_percentage uses its own arguments and does not read these.
n_author = 3
is_first = True
def get_percentage(n_author = 3, is_first=True):
    """Return a weighted share of 100 split into ``3 + n_author`` equal units.

    One unit is ``100 / (3 + n_author)``. The result is three units when
    ``is_first`` is truthy and two units otherwise (presumably an authorship
    contribution percentage - the weights 3 and 2 are hard-coded).
    """
    unit = 100 / (3 + n_author)
    weight = 3 if is_first else 2
    return weight * unit
# paper id -> [number of authors, is-first flag], in the argument order of get_percentage
all_paper = {
    1: [3, False],
    2: [2, False],
    3: [4, True],
    4: [3, False],
    5: [4, True],
    6: [2, True],
    7: [5, True],
    8: [5, False],
    9: [2, True],
    10: [4, False],
    11: [2, True],
}
result = []
for i, (paper_authors, paper_first) in all_paper.items():
    pct = get_percentage(paper_authors, paper_first)
    print(i, pct)
    result.append(pct)
# average percentage over all listed papers
print(np.mean(result))

1 33.333333333333336
2 40.0
3 42.85714285714286
4 33.333333333333336
5 42.85714285714286
6 60.0
7 37.5
8 25.0
9 60.0
10 28.571428571428573
11 60.0
42.13203463203463
