In [14]:
import os
import pandas as pd
import numpy as np
from glob import glob
import gspread
import h3
from tqdm import tqdm
import gc

# Folder of input parquet files (read by get_cross) and folder the hex-level
# staging parquet is written to.
CURATED_FOLDER_LONG = "/lustre1/g/geog_pyloo/05_timemachine/_curated/c_seg_longitudinal_year"
CURATED_TARGET = "/lustre1/g/geog_pyloo/05_timemachine/_curated/c_seg_hex"

# exist_ok=True makes the cell safe to re-run and removes the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(CURATED_TARGET, exist_ok=True)

# Year-group label -> calendar years that are pooled under that label.
# NOTE(review): the "2021-2023" group also contains 2024. The label is left
# unchanged because it is written into the exported data; confirm whether it
# should be renamed together with any downstream consumers.
column_map = {
    "<2014": [2007, 2008, 2009, 2010, 2011, 2012, 2013],
    # "2014-2015":[2014, 2015],
    "2014-2017": [2014, 2015, 2016, 2017],
    "2018-2020": [2018, 2019, 2020],
    "2021-2023": [2021, 2022, 2023, 2024],
}

def load_class():
    """Download the ``object150`` worksheet and return it as a DataFrame.

    Authenticates against Google Sheets with the service-account key file at
    ``../../google_drive_personal.json`` (resolved relative to the current
    working directory), opens the hard-coded spreadsheet and converts every
    record of the ``object150`` worksheet into one DataFrame row.

    Returns:
        pandas.DataFrame: one row per worksheet record.
    """
    import gspread

    key_file = "../../google_drive_personal.json"
    sheet_url = "https://docs.google.com/spreadsheets/d/1o5gFmZPUoDwrrbfE6M26uJF3HnEZll02ivnOxP6K6Xw/edit?usp=sharing"
    sheet_name = "object150"

    # Named `client` (not `gc`) so the garbage-collector module imported at the
    # top of the notebook is not shadowed inside this function.
    client = gspread.service_account(filename=key_file)

    # The spreadsheet key is the sixth "/"-separated piece of the share URL.
    spreadsheet_key = sheet_url.split("/")[5]
    worksheet = client.open_by_key(spreadsheet_key).worksheet(sheet_name)

    return pd.DataFrame(worksheet.get_all_records())

def construct_cat(df_seg, obj_meta, year_groups=None):
    """Collapse per-year, per-object columns into category totals per year group.

    Steps:
      1. Every column of ``df_seg`` whose name is an ``obj_meta["id"]`` is
         assigned the matching ``obj_meta["category"]``; other columns keep
         their own name.
      2. ``year`` is replaced by a ``year_group`` label taken from
         ``year_groups``; years that belong to no group are labelled "other".
      3. Values are summed over (city_lower, hex_id, img_count, year_group)
         and over all columns that share a category.

    Args:
        df_seg: DataFrame containing ``city_lower``, ``hex_id``, ``img_count``
            and ``year`` plus the value columns. It is not modified.
        obj_meta: DataFrame with ``id`` and ``category`` columns. The ``id``
            values must have the same type as the column names of ``df_seg``
            for the renaming to match.
        year_groups: optional mapping ``label -> iterable of years``. Defaults
            to the module-level ``column_map``. If a year is listed under
            several labels the first label wins.

    Returns:
        tuple: ``(df_seg_update, variables_remain)`` where ``df_seg_update``
        has the columns ``year_group, city_lower, hex_id, img_count`` followed
        by one float column per category, and ``variables_remain`` is the
        sorted list of categories found in ``df_seg`` excluding "other".

    Notes:
        * Every column other than the four keys and ``year`` is summed and
          emitted, so a non-category column left in ``df_seg`` (e.g. the
          ``res`` column that ``get_cross`` does not drop) shows up as an
          extra summed column. NOTE(review): confirm whether that is wanted.
        * ``img_count`` is a grouping key, so rows of the same hexagon and
          year group with different ``img_count`` values stay separate.
          NOTE(review): confirm this is intended.
        * "other" is only excluded from ``variables_remain``; the column is
          not removed from the returned frame.
    """
    group_keys = ["city_lower", "hex_id", "img_count", "year_group"]
    pivot_keys = ["year_group", "city_lower", "hex_id", "img_count"]

    if year_groups is None:
        year_groups = column_map

    # Derive the id -> category lookup from the metadata that was passed in
    # instead of relying on a global defined in another notebook cell.
    id_to_category = dict(zip(obj_meta["id"].values, obj_meta["category"].values))
    known_categories = set(obj_meta["category"].unique())

    mapped_names = [id_to_category.get(col, col) for col in df_seg.columns]
    variables = {name for name in mapped_names if name in known_categories}
    print("Variables original: ", len(variables))
    to_drop = ["other"]
    # Sorted so the returned list is deterministic (set order is arbitrary).
    variables_remain = sorted((v for v in variables if v not in to_drop), key=str)
    print("Variables kept: ", len(variables_remain))

    # year -> label lookup; setdefault keeps the first label for a year,
    # matching the "first matching group" rule.
    year_to_group = {}
    for label, years in year_groups.items():
        for year in years:
            year_to_group.setdefault(year, label)

    unmapped = [y for y in df_seg["year"].drop_duplicates().tolist() if y not in year_to_group]
    if unmapped:
        print("Years without a year group (labelled 'other'): ", unmapped)
    year_group = df_seg["year"].map(lambda y: year_to_group.get(y, "other"))

    # drop() returns a new frame, so the caller's DataFrame stays untouched.
    frame = df_seg.drop(columns=["year"]).assign(year_group=year_group.to_numpy())
    summed = frame.groupby(group_keys).sum().reset_index()

    # Go to long format while the column labels are still unique, then map the
    # labels to categories. This avoids creating duplicate column names and
    # stacking them.
    df_long = summed.melt(id_vars=group_keys, var_name="category", value_name="value")
    df_long["category"] = df_long["category"].map(lambda col: id_to_category.get(col, col))
    df_long["value"] = df_long["value"].fillna(0).astype(float)

    df_seg_update = (
        df_long.groupby(pivot_keys + ["category"])["value"]
        .sum()
        .unstack("category")
        .reset_index()
    )
    return df_seg_update, variables_remain
def get_cross(curated_cross, obj_meta, res):
    """Build one category-level table from every parquet file in a folder.

    Each ``*.parquet`` file in ``curated_cross`` is read, tagged with a
    ``city_lower`` value taken from the file name (the part before the first
    "."), restricted to rows whose ``res`` column equals ``res`` and passed
    through ``construct_cat``. The per-file results are concatenated.

    Args:
        curated_cross: folder containing the input parquet files.
        obj_meta: metadata frame forwarded to ``construct_cat``.
        res: value the ``res`` column must equal for a row to be kept.

    Returns:
        pandas.DataFrame: the concatenated per-file results with a fresh
        0..n-1 index.

    Raises:
        ValueError: if the folder contains no ``*.parquet`` file.

    Notes:
        * ``pd.concat`` aligns on column names, so a file whose result lacks a
          column present in other files gets NaN in that column.
          NOTE(review): decide whether those should be filled with 0.
        * The ``res`` column is not dropped before ``construct_cat`` is
          called, so it is summed there like any other value column.
    """
    # glob returns files in arbitrary order; sorting makes the row order of
    # the result reproducible between runs.
    segfiles = sorted(glob(curated_cross + "/*.parquet"))
    if not segfiles:
        # Same exception type pd.concat would raise on an empty list, but with
        # a message that points at the actual cause.
        raise ValueError(f"No .parquet files found in {curated_cross!r}")

    per_file = []
    for f in tqdm(segfiles):
        temp = pd.read_parquet(f)
        temp["city_lower"] = os.path.basename(f).split(".")[0]
        temp_filter = temp[temp["res"] == res].reset_index(drop=True)
        temp_update, _ = construct_cat(temp_filter, obj_meta)
        print(temp_update.shape)
        per_file.append(temp_update)

    return pd.concat(per_file).reset_index(drop=True)

In [2]:
# ---------------------------------------------------------------------------
# Export staging files for later analysis
# ---------------------------------------------------------------------------
# Load the object metadata sheet and report its size.
obj_meta = load_class()
print("Loaded: ", len(obj_meta.index))

unique_categories = obj_meta["category"].unique()
n_cat = len(unique_categories)
print("Number of categories: ", n_cat)

# Ids are cast to str before the lookup is built, so its keys are strings.
obj_meta["id"] = obj_meta["id"].astype(str)
ADE_CATEGORIES_DICT = {
    seg_id: category
    for seg_id, category in zip(obj_meta["id"].values, obj_meta["category"].values)
}

Loaded:  150
Number of categories:  31


In [18]:
# Aggregate every file at the chosen `res` value and write the staging file.
res = 8
df_seg = get_cross(CURATED_FOLDER_LONG, obj_meta, res)

# File name records the number of categories and the `res` value used.
staging_path = CURATED_TARGET + f"/c_seg_long_cat={n_cat}_res={res}.parquet"
df_seg.to_parquet(staging_path, index=False)

  1%|          | 1/127 [00:00<00:15,  8.03it/s]

Variables original:  30
Variables kept:  29
(1003, 35)
Variables original:  31
Variables kept:  30


  2%|▏         | 2/127 [00:00<00:34,  3.63it/s]

(4756, 36)
Variables original:  31
Variables kept:  30


  2%|▏         | 3/127 [00:00<00:40,  3.03it/s]

(4524, 36)
Variables original:  31
Variables kept:  30


  3%|▎         | 4/127 [00:01<00:48,  2.56it/s]

(6006, 36)
Variables original:  31
Variables kept:  30


  4%|▍         | 5/127 [00:02<01:04,  1.89it/s]

(10133, 36)
Variables original:  31
Variables kept:  30


  6%|▋         | 8/127 [00:02<00:34,  3.43it/s]

(5218, 36)
Variables original:  30
Variables kept:  29
(344, 35)
Variables original:  31
Variables kept:  30
(185, 36)
Variables original:  31
Variables kept:  30


  7%|▋         | 9/127 [00:03<00:54,  2.16it/s]

(12743, 36)
Variables original:  31
Variables kept:  30


  8%|▊         | 10/127 [00:05<01:32,  1.26it/s]

(24625, 36)
Variables original:  31
Variables kept:  30


  9%|▊         | 11/127 [00:05<01:24,  1.37it/s]

(6955, 36)
Variables original:  31
Variables kept:  30


  9%|▉         | 12/127 [00:06<01:13,  1.57it/s]

(4774, 36)
Variables original:  31
Variables kept:  30


 10%|█         | 13/127 [00:07<01:14,  1.52it/s]

(8931, 36)
Variables original:  31
Variables kept:  30


 12%|█▏        | 15/127 [00:07<00:50,  2.20it/s]

(5387, 36)
Variables original:  31
Variables kept:  30
(1161, 36)


 13%|█▎        | 16/127 [00:07<00:42,  2.61it/s]

Variables original:  31
Variables kept:  30
(2184, 36)
Variables original:  31
Variables kept:  30


 14%|█▍        | 18/127 [00:08<00:28,  3.78it/s]

(233, 36)
Variables original:  31
Variables kept:  30
(1604, 36)
Variables original:  31
Variables kept:  30


 15%|█▍        | 19/127 [00:08<00:37,  2.85it/s]

(7600, 36)
Variables original:  31
Variables kept:  30


 16%|█▌        | 20/127 [00:08<00:33,  3.17it/s]

(2419, 36)
Variables original:  31
Variables kept:  30
(100, 36)
Variables original:  31
Variables kept:  30


 19%|█▉        | 24/127 [00:09<00:20,  4.99it/s]

(3863, 36)
Variables original:  31
Variables kept:  30
(169, 36)
Variables original:  31
Variables kept:  30
(176, 36)


 20%|█▉        | 25/127 [00:09<00:20,  4.97it/s]

Variables original:  31
Variables kept:  30
(1313, 36)
Variables original:  31
Variables kept:  30


 21%|██▏       | 27/127 [00:10<00:31,  3.19it/s]

(10261, 36)
Variables original:  31
Variables kept:  30
(1162, 36)


 23%|██▎       | 29/127 [00:10<00:21,  4.50it/s]

Variables original:  31
Variables kept:  30
(421, 36)
Variables original:  31
Variables kept:  30
(594, 36)
Variables original:  31
Variables kept:  30


 24%|██▎       | 30/127 [00:11<00:23,  4.15it/s]

(3525, 36)
Variables original:  31
Variables kept:  30
(298, 36)
Variables original:  31
Variables kept:  30


 25%|██▌       | 32/127 [00:11<00:20,  4.62it/s]

(3156, 36)
Variables original:  30
Variables kept:  29
(114, 35)
Variables original:  31
Variables kept:  30


 27%|██▋       | 34/127 [00:11<00:17,  5.29it/s]

(1951, 36)
Variables original:  31
Variables kept:  30


 29%|██▉       | 37/127 [00:12<00:14,  6.18it/s]

(3129, 36)
Variables original:  31
Variables kept:  30
(309, 36)
Variables original:  30
Variables kept:  29
(82, 35)


 30%|██▉       | 38/127 [00:12<00:15,  5.64it/s]

Variables original:  31
Variables kept:  30
(3154, 36)
Variables original:  31
Variables kept:  30
(368, 36)
Variables original:  31
Variables kept:  30


 31%|███▏      | 40/127 [00:12<00:14,  5.84it/s]

(1992, 36)
Variables original:  31
Variables kept:  30


 33%|███▎      | 42/127 [00:13<00:16,  5.09it/s]

(2934, 36)
Variables original:  31
Variables kept:  30
(1431, 36)


 35%|███▍      | 44/127 [00:13<00:13,  6.24it/s]

Variables original:  31
Variables kept:  30
(1007, 36)
Variables original:  30
Variables kept:  29
(468, 35)


 35%|███▌      | 45/127 [00:13<00:11,  6.94it/s]

Variables original:  31
Variables kept:  30
(607, 36)
Variables original:  31
Variables kept:  30


 36%|███▌      | 46/127 [00:13<00:13,  5.95it/s]

(1054, 36)
Variables original:  30
Variables kept:  29


 37%|███▋      | 47/127 [00:14<00:14,  5.60it/s]

(1944, 35)
Variables original:  31
Variables kept:  30


 38%|███▊      | 48/127 [00:14<00:14,  5.33it/s]

(2258, 36)
Variables original:  31
Variables kept:  30


 39%|███▊      | 49/127 [00:15<00:44,  1.76it/s]

(21578, 36)
Variables original:  31
Variables kept:  30


 40%|████      | 51/127 [00:16<00:30,  2.48it/s]

(5014, 36)
Variables original:  30
Variables kept:  29
(396, 35)
Variables original:  31
Variables kept:  30


 41%|████      | 52/127 [00:16<00:28,  2.64it/s]

(4263, 36)
Variables original:  31
Variables kept:  30


 42%|████▏     | 53/127 [00:17<00:30,  2.43it/s]

(5964, 36)
Variables original:  31
Variables kept:  30


 43%|████▎     | 55/127 [00:17<00:23,  3.08it/s]

(3806, 36)
Variables original:  31
Variables kept:  30
(1369, 36)
Variables original:  31
Variables kept:  30


 45%|████▍     | 57/127 [00:18<00:25,  2.80it/s]

(9572, 36)
Variables original:  31
Variables kept:  30
(1093, 36)


 46%|████▌     | 58/127 [00:18<00:21,  3.24it/s]

Variables original:  31
Variables kept:  30
(2052, 36)
Variables original:  31
Variables kept:  30


 47%|████▋     | 60/127 [00:20<00:36,  1.84it/s]

(24786, 36)
Variables original:  31
Variables kept:  30
(1354, 36)


 48%|████▊     | 61/127 [00:20<00:28,  2.29it/s]

Variables original:  31
Variables kept:  30
(1485, 36)
Variables original:  31
Variables kept:  30


 49%|████▉     | 62/127 [00:21<00:32,  1.98it/s]

(8438, 36)
Variables original:  31
Variables kept:  30


 50%|████▉     | 63/127 [00:22<00:42,  1.52it/s]

(13189, 36)
Variables original:  31
Variables kept:  30
(132, 36)
Variables original:  31
Variables kept:  30


 53%|█████▎    | 67/127 [00:23<00:20,  2.90it/s]

(7102, 36)
Variables original:  31
Variables kept:  30
(387, 36)
Variables original:  31
Variables kept:  30
(402, 36)
Variables original:  30
Variables kept:  29
(3577, 35)


 54%|█████▍    | 69/127 [00:23<00:18,  3.14it/s]

Variables original:  31
Variables kept:  30
(3866, 36)


 55%|█████▌    | 70/127 [00:24<00:16,  3.48it/s]

Variables original:  31
Variables kept:  30
(1805, 36)
Variables original:  30
Variables kept:  29
(247, 35)


 57%|█████▋    | 72/127 [00:24<00:11,  4.75it/s]

Variables original:  31
Variables kept:  30
(768, 36)
Variables original:  31
Variables kept:  30


 58%|█████▊    | 74/127 [00:25<00:22,  2.39it/s]

(16945, 36)
Variables original:  31
Variables kept:  30
(825, 36)
Variables original:  31
Variables kept:  30
(506, 36)
Variables original:  31
Variables kept:  30


 61%|██████    | 77/127 [00:26<00:17,  2.92it/s]

(7108, 36)
Variables original:  31
Variables kept:  30
(1283, 36)


 61%|██████▏   | 78/127 [00:26<00:13,  3.53it/s]

Variables original:  31
Variables kept:  30
(161, 36)
Variables original:  30
Variables kept:  29


 62%|██████▏   | 79/127 [00:27<00:12,  3.83it/s]

(2096, 35)
Variables original:  31
Variables kept:  30


 63%|██████▎   | 80/127 [00:27<00:11,  3.96it/s]

(2460, 36)
Variables original:  31
Variables kept:  30


 64%|██████▍   | 81/127 [00:27<00:13,  3.40it/s]

(4336, 36)
Variables original:  30
Variables kept:  29


 65%|██████▌   | 83/127 [00:28<00:10,  4.29it/s]

(1595, 35)
Variables original:  31
Variables kept:  30
(521, 36)


 66%|██████▌   | 84/127 [00:28<00:08,  5.04it/s]

Variables original:  31
Variables kept:  30
(424, 36)
Variables original:  31
Variables kept:  30


 68%|██████▊   | 86/127 [00:28<00:07,  5.66it/s]

(1225, 36)
Variables original:  30
Variables kept:  29
(1001, 35)


 69%|██████▊   | 87/127 [00:28<00:06,  6.19it/s]

Variables original:  30
Variables kept:  29
(841, 35)
Variables original:  31
Variables kept:  30


 69%|██████▉   | 88/127 [00:29<00:11,  3.45it/s]

(7396, 36)
Variables original:  31
Variables kept:  30


 71%|███████   | 90/127 [00:30<00:12,  2.92it/s]

(8390, 36)
Variables original:  31
Variables kept:  30
(1424, 36)


 72%|███████▏  | 91/127 [00:30<00:09,  3.67it/s]

Variables original:  31
Variables kept:  30
(423, 36)
Variables original:  30
Variables kept:  29
(75, 35)
Variables original:  31
Variables kept:  30


 74%|███████▍  | 94/127 [00:30<00:07,  4.41it/s]

(5220, 36)
Variables original:  31
Variables kept:  30
(526, 36)
Variables original:  31
Variables kept:  30


 75%|███████▍  | 95/127 [00:31<00:07,  4.04it/s]

(3600, 36)
Variables original:  31
Variables kept:  30


 76%|███████▌  | 96/127 [00:31<00:08,  3.86it/s]

(3614, 36)
Variables original:  31
Variables kept:  30


 77%|███████▋  | 98/127 [00:31<00:07,  4.02it/s]

(3611, 36)
Variables original:  31
Variables kept:  30
(1494, 36)


 78%|███████▊  | 99/127 [00:32<00:06,  4.45it/s]

Variables original:  31
Variables kept:  30
(1101, 36)
Variables original:  31
Variables kept:  30


 79%|███████▊  | 100/127 [00:32<00:06,  4.46it/s]

(2199, 36)
Variables original:  31
Variables kept:  30


 80%|███████▉  | 101/127 [00:32<00:06,  4.10it/s]

(3014, 36)
Variables original:  31
Variables kept:  30


 81%|████████  | 103/127 [00:33<00:06,  3.60it/s]

(6227, 36)
Variables original:  31
Variables kept:  30
(1086, 36)
Variables original:  31
Variables kept:  30


 82%|████████▏ | 104/127 [00:33<00:07,  3.07it/s]

(5977, 36)
Variables original:  31
Variables kept:  30


 83%|████████▎ | 106/127 [00:34<00:05,  3.78it/s]

(2359, 36)
Variables original:  31
Variables kept:  30
(1546, 36)


 84%|████████▍ | 107/127 [00:34<00:05,  3.79it/s]

Variables original:  31
Variables kept:  30
(3038, 36)
Variables original:  31
Variables kept:  30


 85%|████████▌ | 108/127 [00:34<00:05,  3.31it/s]

(4460, 36)
Variables original:  31
Variables kept:  30


 87%|████████▋ | 110/127 [00:35<00:04,  4.07it/s]

(1540, 36)
Variables original:  31
Variables kept:  30
(1653, 36)


 87%|████████▋ | 111/127 [00:35<00:04,  3.92it/s]

Variables original:  31
Variables kept:  30
(1963, 36)
Variables original:  31
Variables kept:  30


 89%|████████▉ | 113/127 [00:35<00:02,  5.46it/s]

(378, 36)
Variables original:  31
Variables kept:  30
(634, 36)
Variables original:  31
Variables kept:  30


 91%|█████████ | 115/127 [00:39<00:09,  1.31it/s]

(48297, 36)
Variables original:  30
Variables kept:  29
(977, 35)
Variables original:  31
Variables kept:  30


 91%|█████████▏| 116/127 [00:40<00:10,  1.08it/s]

(16799, 36)
Variables original:  31
Variables kept:  30


 92%|█████████▏| 117/127 [00:41<00:08,  1.23it/s]

(6946, 36)
Variables original:  31
Variables kept:  30


 93%|█████████▎| 118/127 [00:41<00:06,  1.48it/s]

(3419, 36)
Variables original:  31
Variables kept:  30


 94%|█████████▎| 119/127 [00:41<00:04,  1.78it/s]

(3002, 36)
Variables original:  31
Variables kept:  30


 94%|█████████▍| 120/127 [00:42<00:04,  1.49it/s]

(13009, 36)
Variables original:  31
Variables kept:  30


 95%|█████████▌| 121/127 [00:43<00:03,  1.70it/s]

(4427, 36)
Variables original:  31
Variables kept:  30


 96%|█████████▌| 122/127 [00:43<00:02,  2.03it/s]

(3244, 36)
Variables original:  31
Variables kept:  30


 98%|█████████▊| 124/127 [00:43<00:00,  3.12it/s]

(2301, 36)
Variables original:  30
Variables kept:  29
(189, 35)
Variables original:  31
Variables kept:  30


100%|██████████| 127/127 [00:44<00:00,  2.87it/s]

(3981, 36)
Variables original:  31
Variables kept:  30
(750, 36)
Variables original:  31
Variables kept:  30
(683, 36)





In [17]:
# Writes `df_seg` to the same path, with the same arguments, as the export at
# the end of the cell that calls get_cross; needs df_seg, n_cat and res to be
# defined already.
staging_path = CURATED_TARGET + f"/c_seg_long_cat={n_cat}_res={res}.parquet"
df_seg.to_parquet(staging_path, index=False)