# Goal: Put the GSV image files into the train, test, and val folders

In [4]:
import networkx as nx
import osmnx as ox
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import Point
from shapely.geometry import Polygon
from fiona.crs import from_epsg
import os
from scipy import spatial
import shutil
from math import sin, cos, sqrt, atan2, radians
import random
import shapely

import multiprocessing as mp
from haversine import haversine, Unit
from tqdm import tqdm
import glob
import h3
import sys
sys.path.append("../")

from utils.gsvload import GSVSummary as gsv

In [5]:
# SECURITY: a Google Cloud API key used to be hard-coded on this line. Secrets
# must not live in source control, so the key is now read from the
# GCLOUD_API_KEY environment variable (empty string when unset). The key that
# was committed previously should be rotated.
gcloudapi = os.environ.get("GCLOUD_API_KEY", "")
# Service-account JSON file handed to gspread for authentication.
serviceaccount = "../../google_drive_personal.json"
import gspread
# from oauth2client.service_account import ServiceAccountCredentials
# Module-level gspread client; read_url() looks this name up at call time.
gc = gspread.service_account(filename=serviceaccount)
def read_url(url, SHEET_NAME):
    """Load one worksheet of a Google Sheets document into a DataFrame.

    Uses the module-level gspread client ``gc``.

    Args:
        url: Browser URL of the spreadsheet, e.g.
            ``https://docs.google.com/spreadsheets/d/<id>/edit``.
        SHEET_NAME: Title of the worksheet (tab) to open.

    Returns:
        A ``(df_spread, worksheet)`` tuple: a ``pandas.DataFrame`` built from
        ``worksheet.get_all_records()`` and the gspread worksheet object.
    """
    import re  # local import: ``re`` is not part of the notebook's import cell

    # The spreadsheet key is the path segment right after "/d/". Looking for
    # that marker (instead of a fixed position) also handles links such as
    # ".../spreadsheets/u/0/d/<id>/edit" and ".../d/<id>?usp=sharing".
    found = re.search(r"/d/([A-Za-z0-9_-]+)", url)
    if found:
        SHEET_ID = found.group(1)
    else:
        # Fall back to the previous positional lookup for any other URL shape.
        SHEET_ID = url.split('/')[5]

    spreadsheet = gc.open_by_key(SHEET_ID)
    worksheet = spreadsheet.worksheet(SHEET_NAME)
    rows = worksheet.get_all_records()
    df_spread = pd.DataFrame(rows)
    return df_spread, worksheet

# Worksheet (tab) to read, and the spreadsheet that contains it.
SHEETNAME = "select_city"
url = "https://docs.google.com/spreadsheets/d/1o5gFmZPUoDwrrbfE6M26uJF3HnEZll02ivnOxP6K6Xw/edit?usp=sharing"
# city_meta: the tab's records as a DataFrame.
# other_worksheet: the gspread worksheet object returned alongside it.
city_meta, other_worksheet = read_url(url, SHEETNAME)

In [None]:
# city_meta_1 = city_meta[city_meta['City']!='Gaborone'].reset_index(drop=True)
# city_meta_2 = city_meta[city_meta['City']=='Gaborone'].reset_index(drop=True)
# city_meta_1['label'] = city_meta_1.index
# city_meta_2['label'] = 34
# city_meta = pd.concat([city_meta_1, city_meta_2]).reset_index(drop=True)

In [None]:
# Lookup table built from two city_meta columns: 'City' value -> 'label' value.
city_meta_label = {
    name: lab for name, lab in zip(city_meta['City'], city_meta['label'])
}
city_meta_label

In [None]:
# save the label: send the header row followed by every data row of city_meta
# to the worksheet.
other_worksheet.update(
    [city_meta.columns.values.tolist(), *city_meta.values.tolist()]
)

In [None]:
# Inspect the merged metadata of a single city.
city = "Boston"
# Lower-case city name with spaces removed.
cityabbrlower = city.replace(" ", "").lower()
citysummary = gsv(city)
selmeta = citysummary.merge_meta()
selmeta

In [None]:
# Filtering thresholds.
sthres = 15000      # minimum accepted value of the 'size' column
dis_thred = 30000   # maximum accepted value of the 'dist_hav' column (presumably metres; confirm)
total_h3_8 = 3000   # not used in this cell
# Keep rows that pass both thresholds. The distance bound now uses dis_thred
# instead of repeating the literal, so changing the constant changes the filter.
selmeta = selmeta[(selmeta['size'] >= sthres) & (selmeta['dist_hav'] <= dis_thred)].reset_index(drop=True)
# train_sel = selmeta[selmeta['data_group']=='train'].sample(n=20000, random_state=1)
# test_sel = selmeta[selmeta['data_group']=='test'].sample(n=1000, random_state=1)
# val_sel = selmeta[selmeta['data_group']=='val'].sample(n=1000, random_state=1)
# Number of remaining rows per H3 resolution-8 cell, smallest cells first.
h3summary = selmeta.groupby('h3_res8').size().reset_index(name='counts').sort_values(by='counts', ascending=True)
h3summary.shape

In [None]:
# Check how many h3_8 cells exist for each city.
# Within each h3_8, we choose 80% (h3_9) for training, 10% for testing, 10% for validation.
# However, we need to avoid unbalanced data, so we also count the h3_9 cells per city.
# Collected per city, in iteration order:
h3_9_count = []   # number of distinct resolution-9 cells
h3_8_count = []   # number of distinct resolution-8 cells

# Thresholds of the size/distance filter that is currently commented out in
# the loop body. They are loop-invariant, so they are set once here.
sthres = 15000
dis_thred = 30000

for city in tqdm(city_meta['City'].unique()):
    cityabbrlower = city.lower().replace(" ", "")
    citysummary = gsv(city)
    selmeta = citysummary.merge_meta(sel = False)

    # Look the city's row(s) up once and read both centre coordinates from it.
    city_rows = city_meta[city_meta['City'] == city]
    center_lat = city_rows['center_lat'].values[0]
    center_lng = city_rows['center_lng'].values[0]
    center = (center_lat, center_lng)

    # get distance to center (haversine, unit 'm'); iterate the two coordinate
    # columns directly instead of building a row object per record
    selmeta['dist_hav'] = [
        haversine((lat, lon), center, unit = 'm')
        for lat, lon in zip(selmeta['lat'], selmeta['lon'])
    ]

    # selmeta = selmeta[(selmeta['size']>=sthres)&(selmeta['dist_hav']<=30000)].reset_index(drop=True)

    # H3 cell id of every record at resolutions 8 and 9
    for res in [8, 9]:
        selmeta[f'h3_res{res}'] = [
            h3.geo_to_h3(lat, lon, res)
            for lat, lon in zip(selmeta['lat'], selmeta['lon'])
        ]
    h3_9_count.append(selmeta['h3_res9'].nunique())
    h3_8_count.append(selmeta['h3_res8'].nunique())
    # Write the augmented table to <metafolder>/<cityabbrlower>_meta.csv
    selmeta.to_csv(os.path.join(citysummary.metafolder, f'{cityabbrlower}_meta.csv'), index=False)

In [None]:
# Store the collected H3 cell counts (resolution 9, then resolution 8) as new
# city_meta columns, then display the table.
city_meta["h3_9_count"] = h3_9_count
city_meta["h3_8_count"] = h3_8_count
city_meta

In [None]:
# Within each H3 resolution-8 cell, select 5 resolution-9 cells for training, 1 for testing, and 1 for validation.


In [None]:
# save the label: send the header row followed by every data row of city_meta
# to the worksheet.
other_worksheet.update(
    [city_meta.columns.values.tolist(), *city_meta.values.tolist()]
)

In [None]:
# Draw one 'train' row per H3 resolution-9 cell. The draw is seeded
# (random_state=1, the seed used by the other sample() calls in this notebook)
# so that re-running the cell selects the same rows.
train_sel = selmeta[selmeta['data_group']=='train'].groupby('h3_res9').sample(n=1, random_state=1)

In [None]:
# load the labeled df: for every city draw a fixed-size, seeded sample from
# each data_group and collect the samples in `trainset`.

trainset = []

# Rows drawn per data_group for every city.
sample_sizes = {'train': 20000, 'test': 1000, 'val': 1000}

# NOTE(review): the [7:] slice skips the first seven cities; presumably a
# leftover from resuming an interrupted run. Confirm before a full rebuild.
for city in tqdm(list(city_meta['City'].unique())[7:]):
    cityabbrlower = city.lower().replace(" ", "")
    citysummary = gsv(city)
    selmeta = citysummary.merge_meta()
    # randomly select 20000 images from train, 1000 from test, 1000 from val
    selmeta['label'] = city
    # selmeta.rename(columns = {'data_group':'test'}, inplace = True)
    selmeta = selmeta[['path','label', 'data_group']]

    if selmeta.shape[0] < 24000:
        print("City with fewer than 24000 images: ", city)
        continue

    # The total-size check above does not guarantee that each data_group is
    # large enough; sample(n=...) raises when a group is smaller than n, so
    # skip such cities explicitly instead of aborting the whole loop.
    available = selmeta['data_group'].value_counts()
    lacking = [grp for grp, n_req in sample_sizes.items() if available.get(grp, 0) < n_req]
    if lacking:
        print("City with too few images in data_group(s) ", lacking, ": ", city)
        continue

    train_sel = selmeta[selmeta['data_group']=='train'].sample(n=sample_sizes['train'], random_state=1)
    test_sel = selmeta[selmeta['data_group']=='test'].sample(n=sample_sizes['test'], random_state=1)
    val_sel = selmeta[selmeta['data_group']=='val'].sample(n=sample_sizes['val'], random_state=1)
    trainset.extend([train_sel, test_sel, val_sel])


trainset = pd.concat(trainset)

# NOTE(review): the code below runs once, after the loop. It therefore uses
# the `cityabbrlower` of the last city iterated and the `train_sel` of the
# last city that was sampled (these can be different cities if the last one
# was skipped), creates the test/val folders without copying anything into
# them, and relies on train_path / test_path / val_path, which are not defined
# in this cell. Confirm whether it was meant to run inside the loop.

# make a folder under train, test val for the city
folder_train = os.path.join(train_path, cityabbrlower)
folder_test = os.path.join(test_path, cityabbrlower)
folder_val = os.path.join(val_path, cityabbrlower)
for folder in (folder_train, folder_test, folder_val):
    os.makedirs(folder, exist_ok=True)
# copy the images to the folder; the image location is stored in the 'path'
# column (the frame was reduced to 'path', 'label', 'data_group' above, so the
# former 'image_path' lookup raised KeyError)
for image_path in train_sel['path']:
    shutil.copy(image_path, folder_train)

In [None]:
# Rewrite every stored path so that the "./data/" part becomes "/host_dir/08_GSV/data/".
df['path'] = df['path'].map(
    lambda p: p.replace("./data/", "/host_dir/08_GSV/data/")
)

In [None]:
# reassign train test val
def _sample_each_group(frame, keys, **sample_kwargs):
    """Sample every ``keys`` group of ``frame`` separately and stack the results.

    Each group is sampled with ``random_state=1``; ``sample_kwargs`` (``n=`` or
    ``frac=``) is forwarded to ``DataFrame.sample``. Groups are visited in
    groupby order and the result gets a fresh 0..n-1 index. Iterating the
    groups explicitly keeps the grouping columns in the result and avoids
    ``DataFrameGroupBy.apply`` operating on the grouping columns, which is
    deprecated since pandas 2.2.
    """
    parts = [grp.sample(random_state=1, **sample_kwargs) for _, grp in frame.groupby(keys)]
    if not parts:
        # No groups at all: return an empty frame with the same columns.
        return frame.iloc[0:0].copy()
    return pd.concat(parts).reset_index(drop = True)


df = pd.read_csv("./data/classifier_trainset_update.csv")
df['path'] = df['path'].apply(lambda x: x.replace("./data/", "/host_dir/08_GSV/data/"))
print("original: ", df.shape[0])

# train: 1500 rows from every (label, city) group
train = _sample_each_group(df, ['label','city'], n=1500)

print("train: ", train.shape[0])
# everything whose path was not drawn for train
remain = df[~df.path.isin(train.path)].reset_index(drop = True)
print("remain: ",remain.shape[0])
# test: half of the remainder of every (label, city) group; val: the other half
test = _sample_each_group(remain, ['label','city'], frac=0.5)
print(test.shape[0])
val = remain[~remain.path.isin(test.path)].reset_index(drop = True)
print(val.shape[0])
train['data_group'] = 'train'
test['data_group'] = 'test'
val['data_group'] = 'val'
df = pd.concat([train, test, val])
df = df.reset_index(drop=True)
df

In [None]:
# trainset['city'] = trainset['label']
# trainset['label'] = trainset['city'].apply(lambda x: city_meta_label[x])
# Write df to CSV without the index column.
df.to_csv(
    "./data/classifier_trainset_update_1.csv",
    index=False,
)
# trainset

In [8]:
# Load the classifier train-set table from the cluster file system and preview
# its first rows.
df = pd.read_csv(
    "/lustre1/g/geog_pyloo/05_timemachine/GSV/classifier_trainset_update.csv"
)
df.head()

Unnamed: 0,path,label,data_group,city,size
0,./data/gsv_rgb/accra/img_rgb/4_1/7/f/IJ--mMcyT...,11,train,Accra,37960
1,./data/gsv_rgb/accra/img_rgb/b_1/1/8/-tP18xo2B...,11,train,Accra,32807
2,./data/gsv_rgb/accra/img_rgb/4_1/4/4/ApH9sVZH3...,11,train,Accra,32206
3,./data/gsv_rgb/accra/img_rgb/5_1/e/d/kgodz5sI0...,11,train,Accra,42600
4,./data/gsv_rgb/accra/img_rgb/9_1/3/1/LDUY2d_O4...,11,train,Accra,32506
