In [25]:
import glob
import os
import re
import shutil

import gspread
import h3
import numpy as np
import pandas as pd
from haversine import haversine, Unit
from tqdm import tqdm

In [None]:
# Select street-view images located within 5.5 km of each city center.
# The city list and center coordinates come from a Google Sheet read via gspread.

# Path to the Google service-account key file. Set the GSPREAD_SERVICE_ACCOUNT
# environment variable to point elsewhere instead of editing the notebook;
# when it is unset the original location is used.
serviceaccount = os.environ.get(
    "GSPREAD_SERVICE_ACCOUNT", "/home/yuanzf/google_drive_personal.json"
)

# Authenticated gspread client, used by read_url() below.
gc = gspread.service_account(filename=serviceaccount)

def read_url(url, SHEET_NAME):
    """Load one worksheet of a Google Sheets document into a DataFrame.

    Parameters
    ----------
    url : str
        URL of the spreadsheet. The spreadsheet key is taken from the path
        segment that follows ``/d/``.
    SHEET_NAME : str
        Title of the worksheet (tab) to read.

    Returns
    -------
    tuple
        ``(df_spread, worksheet)``: a ``pandas.DataFrame`` built from
        ``worksheet.get_all_records()`` and the worksheet object itself.

    Notes
    -----
    Relies on the module-level gspread client ``gc``.
    """
    # Find the key by its "/d/" marker instead of by position, so URLs with
    # extra path segments (".../spreadsheets/u/0/d/<key>/edit") or with the
    # query string directly after the key still resolve to the right id.
    key_match = re.search(r"/d/([^/?#]+)", url)
    if key_match:
        SHEET_ID = key_match.group(1)
    else:
        # No "/d/" marker present: keep the original positional lookup.
        SHEET_ID = url.split("/")[5]
    spreadsheet = gc.open_by_key(SHEET_ID)
    worksheet = spreadsheet.worksheet(SHEET_NAME)
    rows = worksheet.get_all_records()
    df_spread = pd.DataFrame(rows)
    return df_spread, worksheet


# Spreadsheet holding the city metadata; the "select_city" tab is loaded into
# `city_meta`, and the worksheet handle is kept alongside it.
url = "https://docs.google.com/spreadsheets/d/1o5gFmZPUoDwrrbfE6M26uJF3HnEZll02ivnOxP6K6Xw/edit?usp=sharing"
SHEETNAME = "select_city"
city_meta, other_worksheet = read_url(url=url, SHEET_NAME=SHEETNAME)

In [44]:
# Root data folder: load_data() reads <folder>/<cityabbr>/<meta_folder>/<cityabbr>_meta.csv
folder = "./data/gsv_rgb"
meta_folder = "gsvmeta"
# Destination root: copy_gsv() copies the selected images into <folder_label>/<cityabbr>
folder_label = "./data/gsv_label"

# No per-city folder is created here: `cityabbr` is not defined at this point
# on a fresh kernel (the old check raised NameError on Restart & Run All), and
# the per-city paths are built inside load_data() / copy_gsv().

# Maximum distance from the city center, in meters (get_dist() uses unit='m').
dist_thred = 5500

In [45]:
def get_dist(city, df, dist_thred):
    """Keep the rows of ``df`` that lie closer than ``dist_thred`` to the city center.

    Parameters
    ----------
    city : str
        Value to look up in the ``City`` column of the module-level
        ``city_meta`` frame; its ``center_lat`` / ``center_lng`` are the
        reference point.
    df : pandas.DataFrame
        Must provide ``lat`` and ``lon`` columns. It is not modified.
    dist_thred : number
        Distance threshold in meters (exclusive upper bound).

    Returns
    -------
    pandas.DataFrame
        Copy of the matching rows with an added ``dist_hav`` column (haversine
        distance to the center, in meters) and a fresh 0..n-1 index.

    Raises
    ------
    IndexError
        If ``city`` has no row in ``city_meta``.
    """
    center_rows = city_meta.loc[city_meta['City'] == city, ['center_lat', 'center_lng']]
    if center_rows.empty:
        # Same exception type as the bare `.values[0]` lookup, with a usable message.
        raise IndexError("city {!r} not found in city_meta['City']".format(city))
    center_lat, center_lng = center_rows.values[0]
    center = (center_lat, center_lng)

    # Iterate over the two coordinate columns directly: cheaper than a row-wise
    # DataFrame.apply and well defined when `df` has no rows.
    distances = np.array(
        [haversine((lat, lon), center, unit='m') for lat, lon in zip(df['lat'], df['lon'])],
        dtype=float,
    )

    # assign() returns a new frame, so the caller's DataFrame keeps its columns.
    df_dist = df.assign(dist_hav=distances)
    return df_dist[df_dist['dist_hav'] < dist_thred].reset_index(drop=True)

def load_data(city):
    """Read the metadata CSV of one city.

    The city name is lower-cased with spaces removed to form ``cityabbr``; the
    file read is ``<folder>/<cityabbr>/<meta_folder>/<cityabbr>_meta.csv``,
    where ``folder`` and ``meta_folder`` are module-level settings.

    Parameters
    ----------
    city : str
        City name, e.g. ``"Hong Kong"``.

    Returns
    -------
    pandas.DataFrame
        Contents of the metadata CSV as returned by ``pandas.read_csv``.
    """
    META_FILE = "{cityabbr}_meta.csv"

    cityabbr = city.replace(" ", "").lower()
    # The directory listing previously taken here was never used; read_csv
    # itself reports a missing folder or file.
    meta_file_path = os.path.join(
        folder, cityabbr, meta_folder, META_FILE.format(cityabbr=cityabbr)
    )
    return pd.read_csv(meta_file_path)

def sel_data(city, n_cells=200):
    """Pick one image row per H3 cell for the cells closest to the city center.

    Steps: load the city's metadata, keep rows closer than the module-level
    ``dist_thred`` to the center, index every row into an H3 cell at
    resolution 12, rank the cells by mean ``dist_hav`` (ascending), keep the
    first ``n_cells`` cells and return the first row of each.

    Parameters
    ----------
    city : str
        City name as used by ``load_data`` and ``get_dist``.
    n_cells : int, optional
        Number of H3 cells to keep (default 200, the previously fixed value).

    Returns
    -------
    pandas.DataFrame
        One row per selected cell. ``reset_index()`` is called without
        ``drop``, so the previous row labels are kept as an extra column.
    """
    df_sel = get_dist(city, load_data(city), dist_thred)

    # h3-py 4.x renamed geo_to_h3 to latlng_to_cell; use whichever is available.
    to_cell = getattr(h3, "latlng_to_cell", None) or h3.geo_to_h3
    # Column-wise iteration instead of a row-wise apply: same values, and it
    # also works when no row passed the distance filter.
    df_sel['h3_res12'] = [
        to_cell(lat, lon, 12) for lat, lon in zip(df_sel['lat'], df_sel['lon'])
    ]
    print("Number of h3 index: ", df_sel['h3_res12'].nunique())

    # Cells ordered by their mean distance to the center, nearest first.
    nearest_cells = (
        df_sel.groupby(['h3_res12'])['dist_hav']
        .mean()
        .sort_values(ascending=True)
        .head(n_cells)
        .reset_index()
    )
    # First row (in file order) of every retained cell.
    in_nearest = df_sel['h3_res12'].isin(nearest_cells['h3_res12'])
    df_sel_first = df_sel[in_nearest].groupby('h3_res12').head(1).reset_index()
    return df_sel_first

def copy_gsv(city, src_root="/group/geog_pyloo/08_GSV/"):
    """Copy the images selected for ``city`` into its label folder.

    The rows come from ``sel_data(city)``; each value of their ``path`` column
    is copied into ``<folder_label>/<cityabbr>``, where ``cityabbr`` is the
    lower-cased city name without spaces. The destination folder is created
    when missing.

    Parameters
    ----------
    city : str
        City name, e.g. ``"Hong Kong"``.
    src_root : str, optional
        Prefix substituted for a leading ``"./"`` in each path. Defaults to the
        previously hard-coded location.

    Returns
    -------
    None
    """
    df_sel_100 = sel_data(city)
    print("Images selected: ", df_sel_100.shape[0])

    def _to_abs(rel_path):
        # Swap only the leading "./"; str.replace would also rewrite any later
        # "./" inside the path (e.g. in a "../" segment) and corrupt it.
        if rel_path.startswith("./"):
            return src_root + rel_path[2:]
        return rel_path

    df_sel_100['path'] = df_sel_100['path'].apply(_to_abs)

    folder_label_city = os.path.join(folder_label, city.replace(" ", "").lower())
    os.makedirs(folder_label_city, exist_ok=True)
    for p in tqdm(df_sel_100['path']):
        shutil.copy(p, folder_label_city)
        

In [46]:
# Run the selection-and-copy step for a single city.
city = 'Hong Kong'
copy_gsv(city=city)

Number of h3 index:  11716
Images selected:  200


100%|██████████| 200/200 [00:08<00:00, 24.45it/s]


In [36]:
# Cities present in the metadata sheet, minus the ones named below.
excluded_cities = ('New York', 'Hong Kong', 'London', 'Tokyo', 'Nairobi')
cityls = [
    name
    for name in city_meta['City'].unique()
    if name not in excluded_cities
]

In [48]:
# Cities handled in this batch, one copy_gsv() call each.
batch_cities = [
    'Munich', 'Madrid', 'Dubai', 'Capetown', 'New York',
    'London', 'Tokyo', 'Nairobi', 'Sao Paulo',
]
for city in batch_cities:
    print("Now processing: ", city)
    copy_gsv(city)

Now processing:  Munich
Number of h3 index:  18326
Images selected:  200


100%|██████████| 200/200 [00:10<00:00, 19.01it/s]


Now processing:  Madrid
Number of h3 index:  7394
Images selected:  200


100%|██████████| 200/200 [00:04<00:00, 49.51it/s]


Now processing:  Dubai
Number of h3 index:  5977
Images selected:  200


100%|██████████| 200/200 [00:09<00:00, 20.86it/s]


Now processing:  Capetown
Number of h3 index:  7251
Images selected:  200


100%|██████████| 200/200 [00:11<00:00, 17.26it/s]


Now processing:  New York
Number of h3 index:  11269
Images selected:  200


100%|██████████| 200/200 [00:04<00:00, 49.53it/s]


Now processing:  London
Number of h3 index:  20129
Images selected:  200


100%|██████████| 200/200 [00:12<00:00, 15.76it/s]


Now processing:  Tokyo
Number of h3 index:  11501
Images selected:  200


100%|██████████| 200/200 [00:08<00:00, 24.51it/s]


Now processing:  Nairobi
Number of h3 index:  23288
Images selected:  200


100%|██████████| 200/200 [00:10<00:00, 18.51it/s]


Now processing:  Sao Paulo
Number of h3 index:  20113
Images selected:  200


100%|██████████| 200/200 [00:06<00:00, 30.60it/s]


In [49]:
# use tar zip the label folder
!tar -czvf gsv_label.tar.gz ./data/gsv_label

./data/gsv_label/
./data/gsv_label/hongkong/
./data/gsv_label/hongkong/UfdTxePZ1N8yJK3h05nEQQ_0.jpg
./data/gsv_label/hongkong/GdBsqdZs9jffsWcfMTSgig_0.jpg
./data/gsv_label/hongkong/KLhnwUWQBtXvq2cJ9ZHLfw_0.jpg
./data/gsv_label/hongkong/4GyfziisVd1omv-a9zYaCw_0.jpg
./data/gsv_label/hongkong/-rgrH71zhiRJUWaPWS3wZg_0.jpg
./data/gsv_label/hongkong/4ZJpDg2Ib6pvxfA3-BtjSQ_0.jpg
./data/gsv_label/hongkong/zWVY6KlggBJtCxYWnW6TSw_0.jpg
./data/gsv_label/hongkong/dVBDNEPYufRW2XOUwl6Syw_0.jpg
./data/gsv_label/hongkong/oQvYLc2Nu_ne1EmiUpyoig_0.jpg
./data/gsv_label/hongkong/gIlJbN4XoqtXAeX6zH_INQ_0.jpg
./data/gsv_label/hongkong/S-HH_XBVgo_nTAyqVaWeTQ_0.jpg
./data/gsv_label/hongkong/j8oKrHro9BfZ35ilqPMXeg_0.jpg
./data/gsv_label/hongkong/mCtIeICnGtJDCRN2kws55g_0.jpg
./data/gsv_label/hongkong/7O_6BQJmH_yZ8LZh5n_dng_0.jpg
./data/gsv_label/hongkong/ONOoW_KNtevfmeYAbjNmLQ_0.jpg
./data/gsv_label/hongkong/ftQRaEGBkZEzo_fZuhbhlA_0.jpg
./data/gsv_label/hongkong/-ymBg5sCSP3vE32ByzSM9A_0.jpg
./data/gsv_label/hon