In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import shutil
from math import sin, cos, sqrt, atan2, radians
import random
from tqdm import tqdm
import multiprocessing
from multiprocessing import Pool
import glob
import sys
ROOT_DIR = os.path.abspath("../")
sys.path.append(ROOT_DIR)
from utils.gsvload import GSVSummary
import concurrent.futures

# Goal:
1. Check downloaded data
2. Create the updated {city}_meta.csv file for each city 
3. The {city}_meta.csv file should only include the downloaded GSV images that are also selected in gsv_pano_label.csv
4. This meta file is used for segmentation and for the "city never was" project. The remaining downloaded data will stay in the gsv_path.csv file

In [3]:
# File-name template for a city's meta CSV; `citylower` is filled with the
# lower-cased city name with all separators removed.
META_FILE = "{citylower}_meta.csv"

city = "New York"
city_abbr = city.lower().replace(" ", "_")  # "new_york"
meta_file = META_FILE.format(citylower=city_abbr.replace("_", ""))  # "newyork_meta.csv"

citysummary = GSVSummary(city)
GSV_META_FOLDER = citysummary.metafolder

# nrows=1 loads only the first data row of the meta file.
meta_df = pd.read_csv(os.path.join(GSV_META_FOLDER, meta_file), nrows=1)

In [4]:
# Human-readable description of every column in a {city}_meta.csv file,
# keyed by column name (insertion order follows the listing below).
meta = dict(
    path="path to the gsv",
    panoid="unique identifier",
    angle="image angle",
    size="image size",
    lat="latitude",
    lon="longitude",
    year="year taken",
    month="month",
    id="image id",
    dist_hav="distance from the city center",
    h3_res8="h3 level 8",
    h3_res9="h3 level 9",
)

# 1. Check all data availability

In [5]:

import gspread

# Service-account key file used to authenticate the gspread client.
serviceaccount = "../../google_drive_personal.json"

# NOTE: this binds the name `gc` to the gspread client (not the stdlib gc module).
gc = gspread.service_account(filename=serviceaccount)


def read_url(url, SHEET_NAME):
    """Load one worksheet of a Google Sheet into a DataFrame.

    Parameters
    ----------
    url : str
        Spreadsheet URL; the spreadsheet key is taken as the sixth
        "/"-separated component of this string.
    SHEET_NAME : str
        Name of the worksheet (tab) to open.

    Returns
    -------
    tuple
        ``(df, worksheet)`` where ``df`` is a DataFrame built from
        ``worksheet.get_all_records()`` and ``worksheet`` is the opened
        gspread worksheet, so the caller can write back to it.
    """
    sheet_id = url.split("/")[5]
    # `gc` is the notebook-level gspread client.
    worksheet = gc.open_by_key(sheet_id).worksheet(SHEET_NAME)
    return pd.DataFrame(worksheet.get_all_records()), worksheet


# Spreadsheet that tracks per-city progress, and the tab to read from it.
url = "https://docs.google.com/spreadsheets/d/1o5gFmZPUoDwrrbfE6M26uJF3HnEZll02ivnOxP6K6Xw/edit?usp=sharing"
SHEETNAME = "select_city"

city_meta, other_worksheet = read_url(url, SHEETNAME)

# Keep only the rows whose 'City' cell is filled in.
city_meta = city_meta.loc[city_meta["City"].ne("")].reset_index(drop=True)

In [6]:
# Base directory on the storage system (assumed from the path itself — TODO confirm);
# none of the other cells shown in this notebook reference it.
ROOT = "/lustre1/g/geog_pyloo"


In [8]:
def get_path(subfolder):
    """Return every ``.jpg`` path located exactly two directory levels below ``subfolder``."""
    pattern = os.path.join(subfolder, "*/*/*.jpg")
    return glob.glob(pattern)


def get_file_size(file):
    """Return ``(file, size_in_bytes)``.

    If the size cannot be read (``OSError``, e.g. the file is missing), the
    error is printed and ``(file, None)`` is returned instead.
    """
    size = None
    try:
        size = os.path.getsize(file)
    except OSError as err:
        print(f"Error: {err}")
    return file, size


class GSVnew(GSVSummary):
    """GSVSummary extended with a helper that measures image file sizes."""

    def __init__(self, city):
        super().__init__(city)

    def get_gsv_file_size(self, gsv_meta_df):
        """Return a copy of ``gsv_meta_df`` with a ``size`` column added.

        Each value in the ``path`` column is passed to ``get_file_size`` on a
        pool of 4 threads. Paths whose size could not be read get size 0
        (the file is treated as removed). The input frame is not modified.
        """
        sized_df = gsv_meta_df.copy()

        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as pool:
            # executor.map yields results in the same order as the inputs.
            size_by_path = {
                path: (0 if nbytes is None else nbytes)
                for path, nbytes in pool.map(get_file_size, sized_df["path"].values)
            }

        sized_df["size"] = sized_df["path"].apply(lambda p: size_by_path[p])
        # Nothing is written to disk here; saving gsv_path.csv is left to the caller.
        return sized_df

## 2. Check all downloaded GSV
1. Count the number of GSV images downloaded
2. Generate the gsv_path.csv file for all downloaded GSV images

In [9]:
import datetime

# Today's local date formatted as YYYY-MM-DD.
today = datetime.date.today().isoformat()

# Name of the per-city CSV that lists every downloaded GSV image.
gsv_path_file = "gsv_path.csv"

In [None]:
# Cities currently being transferred; they can be summarized later.
city_to_transfer = [
    "jakarta",
    "lima",  # fully transferred
    "metromanila",  # fully transferred
    "nagoya",  # fully transferred
    "riodejaneiro",  # fully transferred
    "milan",
    "bangkok",
    "toronto",
    "kualalumpur",
]

In [6]:
# For each city, check whether more GSV images have been downloaded than are
# listed in the city's gsv_path.csv, and append any new ones to that file.
# If the file does not exist yet, it is created from the full download listing.
for city in tqdm(['lima']):
    print(city)
    citysummary = GSVnew(city)
    gsvpath = citysummary.load_finshed_gsv()
    gsv_path_csv = os.path.join(citysummary.metafolder, "gsv_path.csv")

    # Load the previous gsv path listing. Only a missing file is treated as
    # "first run"; any other failure is allowed to surface instead of silently
    # overwriting the existing listing.
    try:
        gsvpath_original = pd.read_csv(gsv_path_csv)
    except FileNotFoundError:
        print("no path file exists")
        gsvpath.to_csv(gsv_path_csv, index=False)
        continue

    print(gsvpath_original.shape[0], ": original")

    # Rows whose panoid is not yet present in the saved listing.
    is_known = gsvpath["panoid"].isin(gsvpath_original["panoid"])
    remain = gsvpath[~is_known].reset_index(drop=True)

    if remain.shape[0] > 0:
        gsvpath_update = citysummary.get_gsv_file_size(remain)

        gsvpath_new = pd.concat([gsvpath_original, gsvpath_update], axis=0)
        print(gsvpath_new.shape[0], ": original+new")
        gsvpath_new.to_csv(gsv_path_csv, index=False)
    else:
        print("no new gsv")
    # NOTE: the listing is intentionally NOT removed afterwards. A previous
    # version called os.remove() on this same gsv_path.csv, which deleted the
    # file that had just been read/updated.

  0%|          | 0/3 [00:00<?, ?it/s]

buenosaires



  0%|          | 0/16 [00:00<?, ?it/s][A
  6%|▋         | 1/16 [00:29<07:14, 28.94s/it][A
 12%|█▎        | 2/16 [00:29<02:54, 12.46s/it][A
 56%|█████▋    | 9/16 [00:55<00:34,  4.89s/it][A
 62%|██████▎   | 10/16 [00:56<00:25,  4.32s/it][A
 94%|█████████▍| 15/16 [00:56<00:02,  2.12s/it][A
100%|██████████| 16/16 [00:56<00:00,  3.54s/it][A


no path file exists


 33%|███▎      | 1/3 [01:37<03:14, 97.04s/it]

dhaka



  0%|          | 0/16 [00:00<?, ?it/s][A
  6%|▋         | 1/16 [00:01<00:15,  1.02s/it][A
 25%|██▌       | 4/16 [00:01<00:02,  4.24it/s][A
 56%|█████▋    | 9/16 [00:01<00:00,  7.81it/s][A
100%|██████████| 16/16 [00:01<00:00,  8.33it/s][A


no path file exists


 67%|██████▋   | 2/3 [01:58<00:52, 52.52s/it]

hyderabad



  0%|          | 0/16 [00:00<?, ?it/s][A
  6%|▋         | 1/16 [00:00<00:08,  1.86it/s][A
 62%|██████▎   | 10/16 [00:00<00:00, 17.28it/s][A
100%|██████████| 16/16 [00:00<00:00, 19.31it/s][A


no path file exists


100%|██████████| 3/3 [02:15<00:00, 45.09s/it]


In [None]:
# recalculate the number of images per city
# count = []
# for city in tqdm(city_meta["City"].values.tolist()):
#     print(city)
#     citysummary = GSVnew(city)
#     gsvpath = pd.read_csv(os.path.join(citysummary.metafolder, "gsv_path.csv"))
#     count.append(gsvpath.shape[0])

In [19]:
# Cities added most recently; the inspection cells below iterate over them.
new_cities = [
    "buenosaires",
    "dhaka",
    "hyderabad",
]

In [8]:
# Collect three per-city indicators for every entry of `new_cities`:
#   road_result - value returned by check_road()
#   pano_result - second element of the value returned by check_pano()
#   gsv_result  - number of rows returned by load_gsv_path()
# A failing check is recorded as 0 so the three lists stay aligned with
# `new_cities`, but the error is reported rather than silently discarded.
road_result = []
pano_result = []
gsv_result = []

for city in tqdm(new_cities):
    result = GSVnew(city)

    try:
        road_result.append(result.check_road())
    except Exception as exc:  # best effort; do not trap KeyboardInterrupt/SystemExit
        print(f"{city}: check_road failed ({exc!r}); recording 0")
        road_result.append(0)

    try:
        pano_result.append(result.check_pano()[1])
    except Exception as exc:
        print(f"{city}: check_pano failed ({exc!r}); recording 0")
        pano_result.append(0)

    try:
        gsv_result.append(result.load_gsv_path().shape[0])
    except Exception as exc:
        print(f"{city}: load_gsv_path failed ({exc!r}); recording 0")
        gsv_result.append(0)

100%|██████████| 3/3 [00:49<00:00, 16.46s/it]


In [11]:
# One row per city in `new_cities`, with the indicators gathered above as columns.
inspectdf = pd.DataFrame(
    dict(
        zip(
            ("city", "num_panoid", "road", "GSV Downloaded"),
            (new_cities, pano_result, road_result, gsv_result),
        )
    )
)
inspectdf

Unnamed: 0,city,num_panoid,road,GSV Downloaded
0,buenosaires,606938,1,2365132
1,dhaka,515490,1,2054805
2,hyderabad,183285,1,732376


In [20]:
# Index the sheet data by a normalized city key (lower case, spaces removed)
# so it lines up with the city names used in `inspectdf`.
city_meta["city"] = [name.lower().replace(" ", "") for name in city_meta["City"]]
city_meta = city_meta.set_index("city")

# Copy the freshly computed indicators into the sheet data; for each city the
# first matching row of `inspectdf` is used.
for city in inspectdf["city"].unique():
    city_rows = inspectdf.loc[inspectdf["city"] == city]
    for column in ("num_panoid", "road", "GSV Downloaded"):
        city_meta.at[city, column] = city_rows[column].iloc[0]

In [22]:
# Discard the current index (replaced by a fresh 0..n-1 range) and convert every cell to a string.
city_meta = city_meta.reset_index(drop=True).astype(str)

Unnamed: 0,City,Country,center_lat,center_lng,"GSV Pair (15-18, 20-23)",label,h3_9_count,h3_8_count,num_panoid,road,GSV Downloaded,Transfer Progress,Encoding Progress,oneformer Segmentation
0,Mumbai,India,19.076,72.8777,,35,,,21640,1,49301,TRUE,FALSE,FALSE
1,Kampala,Uganda,0.347596,32.58252,0,10,1390,215,38839,1,116248,TRUE,FALSE,FALSE
2,Miami,USA,25.7617,-80.1918,,39,,,806659,1,123133,TRUE,FALSE,FALSE
3,San Francisco,USA,37.7749,-122.4194,,37,,,1346152,1,129104,TRUE,FALSE,FALSE
4,Paris,France,48.864716,2.349014,4256,13,1151,170,864312,1,133100,TRUE,FALSE,FALSE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,Rio de Janeiro,Brazil,-22.911502,-43.181365,,,,,,,,TRUE,,
63,Tehran,Iran,35.721568,51.335854,,,,,,,,TRUE,,
64,Milan,Italy,45.465776,9.187412,,,,,,,,TRUE,,
65,Kuala Lumpur,Malaysia,3.1495,101.694178,,,,,,,,TRUE,,


In [27]:
# Write to the Google Sheet: the header row first, then one row per city.
header_row = city_meta.columns.values.tolist()
other_worksheet.update([header_row] + city_meta.values.tolist())

{'spreadsheetId': '1o5gFmZPUoDwrrbfE6M26uJF3HnEZll02ivnOxP6K6Xw',
 'updatedRange': 'select_city!A1:N68',
 'updatedRows': 68,
 'updatedColumns': 14,
 'updatedCells': 952}

# Create final pano meta file for analysis


In [13]:
# File-name template for a city's meta CSV; `cityabbr` is the lower-cased city
# name with spaces removed, i.e. city.lower().replace(" ", "").
# NOTE: this rebinds META_FILE, which an earlier cell defines with a
# `{citylower}` placeholder instead.
META_FILE = "{cityabbr}_meta.csv"

# Sub-folder of each city's directory in which its meta files are looked up.
META_FOLDER = "gsvmeta"

# Directory that holds one sub-directory per city.
GSVROOT = "/lustre1/g/geog_pyloo/05_timemachine/GSV/gsv_rgb/"

In [20]:
# Aliased import: the plain name `gc` is already bound to the gspread client
# earlier in this notebook, and `import gc` would overwrite it.
import gc as py_gc

# Build the {cityabbr}_meta.csv file for every listed city that does not have one yet.
for city in tqdm([
    # 'buenosaires', 'dhaka',
    # 'lima',          # direct transfer
    # 'metromanila',   # direct transfer
    # 'nagoya',        # direct transfer
    # 'riodejaneiro',  # direct transfer
    'milan',        # tar - 3572
    'bangkok',      # transferred - 3573 - suspect data not fully transferred
    'toronto',      # tar - 3574
    'kualalumpur',  # tar - 3575
    'jakarta',
]):
    # Step 0: check whether the meta file already exists for this city.
    cityabbr = city.lower().replace(" ", "")
    metafiles = os.listdir(os.path.join(GSVROOT, cityabbr, META_FOLDER))
    meta_file = META_FILE.format(cityabbr=cityabbr)

    # Compare the formatted file name (not the unformatted META_FILE template,
    # which can never appear in the directory listing).
    if meta_file in metafiles:
        print(f"CITY {city} meta file already exists")
        continue

    print("Meta file missing, now processing: ", city)

    # Step 1: check remaining data to add to the path.
    citysummary = GSVSummary(city)
    gsvpath = pd.read_csv(os.path.join(citysummary.metafolder, "gsv_path.csv"))
    print(gsvpath.head(1))
    path_sample = gsvpath['path'].values[0]
    if "/data/" in path_sample:
        print(path_sample)
        print("need to correct the path")
        # NOTE(review): the corrected frame is neither saved nor passed on in
        # this cell; confirm whether merge_meta() reads gsv_path.csv itself.
        gsvpath['path'] = gsvpath['path'].apply(
            lambda x: x.replace("./data/", "/lustre1/g/geog_pyloo/05_timemachine/GSV/")
        )
    else:
        print("Path is already correct")
    print(gsvpath.shape[0], ": original")

    # Step 2: merge the meta file and save the file.
    metadf_update = citysummary.merge_meta(sel=True)
    print("final saved file number of rows: ", metadf_update.shape[0])

    # Trigger a garbage-collection pass before moving on to the next city.
    py_gc.collect()


  0%|          | 0/5 [00:00<?, ?it/s]

Meta file missing, now processing:  milan
                                                path                  panoid
0  ./data/gsv_rgb/milan/img_rgb/4_1/1/3/TM0lPn6MA...  TM0lPn6MAec85C0miWWYbQ
./data/gsv_rgb/milan/img_rgb/4_1/1/3/TM0lPn6MAec85C0miWWYbQ_0.jpg
need to correct the path
4646129 : original
Total number of panos: 4646129


 20%|██        | 1/5 [00:38<02:33, 38.48s/it]

final saved file number of rows:  179964
Meta file missing, now processing:  bangkok
                                                path                  panoid
0  ./data/gsv_rgb/bangkok/img_rgb/8_1/9/1/6qce74X...  6qce74XwW42RGAWKneaFXQ
./data/gsv_rgb/bangkok/img_rgb/8_1/9/1/6qce74XwW42RGAWKneaFXQ_0.jpg
need to correct the path
9848301 : original
Total number of panos: 9848301


 40%|████      | 2/5 [02:11<03:30, 70.28s/it]

final saved file number of rows:  1147829
Meta file missing, now processing:  toronto
                                                path                  panoid
0  ./data/gsv_rgb/toronto/img_rgb/3_1/8/e/wILWhGp...  wILWhGpMSITo6VPxWQmJ1g
./data/gsv_rgb/toronto/img_rgb/3_1/8/e/wILWhGpMSITo6VPxWQmJ1g_0.jpg
need to correct the path
3226919 : original
Total number of panos: 3226919


 60%|██████    | 3/5 [02:36<01:39, 49.97s/it]

final saved file number of rows:  104557
Meta file missing, now processing:  kualalumpur
                                                path                  panoid
0  ./data/gsv_rgb/kualalumpur/img_rgb/7_1/3/f/mDH...  mDHR3xS7nQAjgsTOlOymwQ
./data/gsv_rgb/kualalumpur/img_rgb/7_1/3/f/mDHR3xS7nQAjgsTOlOymwQ_0.jpg
need to correct the path
2252334 : original
Total number of panos: 2252334


 80%|████████  | 4/5 [02:54<00:37, 37.38s/it]

final saved file number of rows:  208340
Meta file missing, now processing:  jakarta
                                                path                  panoid
0  ./data/gsv_rgb/jakarta/img_rgb/b_1/2/6/uQIcL69...  uQIcL699DH6eXfNnzPPQvA
./data/gsv_rgb/jakarta/img_rgb/b_1/2/6/uQIcL699DH6eXfNnzPPQvA_0.jpg
need to correct the path
6701389 : original
Total number of panos: 6701389


100%|██████████| 5/5 [03:49<00:00, 45.97s/it]

final saved file number of rows:  659418





In [18]:
# `metadf_update` is whatever the meta-file loop above assigned last, i.e. the
# merged frame of the final city that loop processed.
metadf_update.shape

(854825, 13)