# Goal:
1. load image pair file and select images to pair together
2. For each image meta file, we assign each lat/lon pair a resolution-12 H3 index (approximate edge length is 10 meters, maximum distance 20 meters): https://h3geo.org/docs/core-library/restable/
3. For ease of analysis, we only select the locations within 5 km of the city center (`dist_thred = 5000` meters in the code below)
4. `dist_hav`: distance in meters from the city center. Refer to the spreadsheet: https://docs.google.com/spreadsheets/d/1o5gFmZPUoDwrrbfE6M26uJF3HnEZll02ivnOxP6K6Xw/edit#gid=0

## Output format:
1. Pairs of images in the format: base64 encoding, latitude, longitude (centroid), distance from city center


In [52]:
import numpy as np
import pandas as pd
import PIL
import os
from PIL import Image
import base64
import h3


In [58]:
# --- Run configuration -------------------------------------------------
# Storage root of the street-view images and the city processed in this run.
folder = "/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_rgb"
city = 'hongkong'
# Sub-folder (under <folder>/<city>) that holds the metadata files.
meta_folder = "gsvmeta"

# Capture-year groups; defined here but not referenced by the cells shown below.
yeargroup0 = [2016, 2017]
yeargroup1 = [2019, 2020]
yeargroup2 = [2022, 2023]

# Distance threshold compared against the `dist_hav` column when selecting images.
dist_thred = 5000

# List the metadata files available for the chosen city.
meta_dir = os.path.join(folder, city, meta_folder)
metafiles = os.listdir(meta_dir)
metafiles

['hongkong_meta.csv',
 'gsv_pano.p',
 'gsv_pano.csv',
 'gsv_pano_label.csv',
 'gsv_path.csv',
 'gsv_pano_label_before15.csv',
 'sentPt.p',
 'forviz.csv']

In [19]:
# Preview the full path of the metadata CSV for the configured city.
os.path.join(os.path.join(folder, city, meta_folder), 'hongkong_meta.csv')

'/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_rgb/hongkong/gsvmeta/hongkong_meta.csv'

In [34]:
# Load the per-image metadata and tag every row with its resolution-12 H3 cell.
df = pd.read_csv(os.path.join(folder, city, meta_folder, 'hongkong_meta.csv'))

# h3-py 4.x renamed `geo_to_h3` to `latlng_to_cell`; pick whichever this
# installation provides so the cell runs on both major versions.
_latlng_to_cell = getattr(h3, 'latlng_to_cell', None) or h3.geo_to_h3

# Walk the two coordinate columns directly instead of `df.apply(..., axis=1)`,
# which builds a full row object for every record just to read two values.
df['h3_res12'] = [
    _latlng_to_cell(lat, lon, 12) for lat, lon in zip(df['lat'], df['lon'])
]
# df.to_csv(os.path.join(folder, city, meta_folder, 'hongkong_meta.csv'), index = False) # overwrite the file for later usage
df.head()

Unnamed: 0,path,panoid,angle,size,lat,lon,year,month,id,dist_hav,h3_res8,h3_res9,h3_res13,h3_res12
0,./data/gsv_rgb/hongkong/img_rgb/6_1/d/8/wKWRvk...,wKWRvkKL2PCumn4qlDHz5A,0,50251,22.226549,114.199823,2020.0,10.0,38827,8782.527726,8841034821fffff,89410348203ffff,8d410348203297f,8c41034820329ff
1,./data/gsv_rgb/hongkong/img_rgb/6_1/d/8/wKWRvk...,wKWRvkKL2PCumn4qlDHz5A,90,40307,22.226549,114.199823,2020.0,10.0,38827,8782.527726,8841034821fffff,89410348203ffff,8d410348203297f,8c41034820329ff
2,./data/gsv_rgb/hongkong/img_rgb/6_1/d/8/wKWRvk...,wKWRvkKL2PCumn4qlDHz5A,180,36605,22.226549,114.199823,2020.0,10.0,38827,8782.527726,8841034821fffff,89410348203ffff,8d410348203297f,8c41034820329ff
3,./data/gsv_rgb/hongkong/img_rgb/6_1/d/8/wKWRvk...,wKWRvkKL2PCumn4qlDHz5A,270,43963,22.226549,114.199823,2020.0,10.0,38827,8782.527726,8841034821fffff,89410348203ffff,8d410348203297f,8c41034820329ff
4,./data/gsv_rgb/hongkong/img_rgb/6_1/d/8/Zt4pQV...,Zt4pQVSLaglLUGrQ_quQlA,0,36231,22.248623,114.156795,2020.0,11.0,4598,6370.818091,88410349d7fffff,89410349d63ffff,8d410349d601d3f,8c410349d601dff


In [62]:
# Keep only the images whose `dist_hav` is below the threshold.
dist_thred = 5000
near_center = df['dist_hav'].lt(dist_thred)
df_sel = df.loc[near_center].reset_index(drop=True)

# Number of distinct panoramas per capture year in the selection.
df_sel.groupby('year')['panoid'].nunique()

year
2015.0     239
2016.0    1266
2017.0     680
2018.0       1
2019.0    1662
2020.0    1571
2021.0    1828
2022.0    4366
2023.0    2981
Name: panoid, dtype: int64

In [63]:
# One row per panorama, kept aside for exporting / visually checking the selection.
viztest = df_sel.drop_duplicates(subset='panoid').reset_index(drop=True)
# viztest.to_csv(os.path.join(folder, city, meta_folder,'forviz.csv'), index = False)

# For every row, the number of distinct panoramas that share its resolution-12 H3 cell.
panos_per_cell = df_sel.groupby('h3_res12')['panoid'].transform('nunique')
df_sel['panoid_num'] = panos_per_cell
df_sel.head()

Unnamed: 0,path,panoid,angle,size,lat,lon,year,month,id,dist_hav,h3_res8,h3_res9,h3_res13,h3_res12,panoid_num
0,./data/gsv_rgb/hongkong/img_rgb/6_1/d/8/dyPS4_...,dyPS4_m8kzbt52M4QrpZQg,0,37758,22.3454,114.191242,2016.0,12.0,84058,4961.188527,88411c84c3fffff,89411c84c3bffff,8d411c84c383c7f,8c411c84c383dff,1
1,./data/gsv_rgb/hongkong/img_rgb/6_1/d/8/dyPS4_...,dyPS4_m8kzbt52M4QrpZQg,90,32596,22.3454,114.191242,2016.0,12.0,84058,4961.188527,88411c84c3fffff,89411c84c3bffff,8d411c84c383c7f,8c411c84c383dff,1
2,./data/gsv_rgb/hongkong/img_rgb/6_1/d/8/dyPS4_...,dyPS4_m8kzbt52M4QrpZQg,180,31637,22.3454,114.191242,2016.0,12.0,84058,4961.188527,88411c84c3fffff,89411c84c3bffff,8d411c84c383c7f,8c411c84c383dff,1
3,./data/gsv_rgb/hongkong/img_rgb/6_1/d/8/dyPS4_...,dyPS4_m8kzbt52M4QrpZQg,270,39165,22.3454,114.191242,2016.0,12.0,84058,4961.188527,88411c84c3fffff,89411c84c3bffff,8d411c84c383c7f,8c411c84c383dff,1
4,./data/gsv_rgb/hongkong/img_rgb/6_1/d/8/zXd6kO...,zXd6kOYWYxaUVTH0iwKmKA,0,39883,22.310641,114.176056,2016.0,12.0,79643,889.861953,88411c8697fffff,89411c86967ffff,8d411c8696514ff,8c411c8696515ff,2


In [65]:
# Cells that contain more than one distinct panorama are the pairing candidates.
has_multiple_panos = df_sel['panoid_num'].gt(1)
df_pair = df_sel.loc[has_multiple_panos].reset_index(drop=True)
print(len(df_pair))

# Swap the "./data/gsv_rgb" part of each stored path for the storage root in `folder`.
df_pair['path'] = df_pair['path'].map(
    lambda rel_path: folder + rel_path.replace("./data/gsv_rgb", "")
)

df_pair

28404


Unnamed: 0,path,panoid,angle,size,lat,lon,year,month,id,dist_hav,h3_res8,h3_res9,h3_res13,h3_res12,panoid_num
0,/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_r...,zXd6kOYWYxaUVTH0iwKmKA,0,39883,22.310641,114.176056,2016.0,12.0,79643,889.861953,88411c8697fffff,89411c86967ffff,8d411c8696514ff,8c411c8696515ff,2
1,/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_r...,zXd6kOYWYxaUVTH0iwKmKA,90,46593,22.310641,114.176056,2016.0,12.0,79643,889.861953,88411c8697fffff,89411c86967ffff,8d411c8696514ff,8c411c8696515ff,2
2,/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_r...,zXd6kOYWYxaUVTH0iwKmKA,180,39932,22.310641,114.176056,2016.0,12.0,79643,889.861953,88411c8697fffff,89411c86967ffff,8d411c8696514ff,8c411c8696515ff,2
3,/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_r...,zXd6kOYWYxaUVTH0iwKmKA,270,35743,22.310641,114.176056,2016.0,12.0,79643,889.861953,88411c8697fffff,89411c86967ffff,8d411c8696514ff,8c411c8696515ff,2
4,/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_r...,FmKVuLeGOvCK8UwDFT1Yzg,0,40409,22.333633,114.167634,2016.0,12.0,45173,3576.851717,88411c84dbfffff,89411c84da3ffff,8d411c84da0d43f,8c411c84da0d5ff,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28399,/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_r...,KjbZVDr0MkyZWrpCP6TdSA,270,41637,22.317625,114.163645,2021.0,11.0,91850,2167.768607,88411cb361fffff,89411cb3607ffff,8d411cb36a9243f,8c411cb36a925ff,2
28400,/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_r...,tjYONFxcirl1ZccddVJUiw,0,40348,22.339187,114.202591,2019.0,11.0,58834,4823.273080,88411c84cdfffff,89411c84c8bffff,8d411c84c8964bf,8c411c84c8965ff,21
28401,/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_r...,tjYONFxcirl1ZccddVJUiw,90,36334,22.339187,114.202591,2019.0,11.0,58834,4823.273080,88411c84cdfffff,89411c84c8bffff,8d411c84c8964bf,8c411c84c8965ff,21
28402,/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_r...,tjYONFxcirl1ZccddVJUiw,180,35947,22.339187,114.202591,2019.0,11.0,58834,4823.273080,88411c84cdfffff,89411c84c8bffff,8d411c84c8964bf,8c411c84c8965ff,21


In [66]:
# Function to encode the image
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as base64 text.

    The file is opened in binary mode, read in full, base64-encoded and the
    resulting bytes are decoded to a ``str`` with UTF-8.
    """
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

In [67]:
# Read every image from disk and store its base64 text next to the metadata.
df_pair['img_encoding'] = df_pair['path'].map(encode_image)

# Write the result to parquet, leaving out the two helper columns.
out_path = os.path.join(folder, city, meta_folder, f'img_encoding_{city}_5k.parquet')
df_pair.drop(columns=['panoid_num', 'h3_res8']).to_parquet(out_path, index=False)