In [None]:
#|default_exp items

# Items

> Utilities for fetching and parsing the items listed in a monthly item catalog

In [None]:
#| eval: false
#| hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export

import requests
import json
from fastcore.all import L
import pandas as pd
from typing import Any
from urllib.parse import urlparse
from pathlib import Path
import numpy as np
import re
from shapely.geometry import Polygon, box
import geopandas as gpd

In [None]:
#| hide
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth',120)

In [None]:
#| export
from ntlights_damage_assessment.catalogs import *

In [None]:
%%time
#| eval: false
catalogs = get_item_catalogs()

CPU times: user 20.4 ms, sys: 6.03 ms, total: 26.4 ms
Wall time: 1.59 s


In [None]:
#| eval: false
catalog1805 = catalogs[catalogs.yearmonth == '201805']

In [None]:
#| eval: false
href1805 = catalog1805.iloc[0].href; href1805

'https://globalnightlight.s3.amazonaws.com/npp_201805/npp_201805_catalog.json'

In [None]:
#| eval: false
baseurl = catalog1805.iloc[0].baseurl; baseurl

'https://globalnightlight.s3.amazonaws.com'

In [None]:
#| eval: false
folder1805 = catalog1805.iloc[0].folder; folder1805

'npp_201805'

In [None]:
%%time
#| eval: false
data1805 = get_data(href1805)

CPU times: user 64.7 ms, sys: 6.24 ms, total: 71 ms
Wall time: 2.44 s


In [None]:
#| eval: false
df1805 = make_df(data1805,rel='item')

In [None]:
#| export

def get_item_href(href, folder, baseurl):
    """Build an item URL from a catalog-relative `href`.

    The first character of `href` is dropped (the '.' of a './name' link) and
    the remainder is appended to `baseurl`/`folder`.
    """
    # Keep everything after the first character, including the '/' separator.
    relative_path = href[1:]
    return f'{baseurl}/{folder}{relative_path}'

In [None]:
assert get_item_href('./SVDNB_npp_d20180501_t0004098_e0009502_b33718_c20180501060951522600_noac_ops.rade9.co.json',
              '201805',
              'https://globalnightlight.s3.amazonaws.com') == 'https://globalnightlight.s3.amazonaws.com/201805/SVDNB_npp_d20180501_t0004098_e0009502_b33718_c20180501060951522600_noac_ops.rade9.co.json'

In [None]:
#| export
def split_href_type(href, first=True):
    """Split a relative `href` at its first dot, after skipping its first two characters.

    Returns the text before the first dot when `first` is true, otherwise the
    text after it (an empty string when there is no dot).
    """
    # href[2:] skips the two leading characters (the './' of a './name' link).
    stem, _, suffix = href[2:].partition('.')
    if first:
        return stem
    return suffix
    

In [None]:
assert split_href_type('./SVDNB_npp_d20180501_t0004098_e0009502_b33718_c20180501060951522600_noac_ops.rade9.co.json') == 'SVDNB_npp_d20180501_t0004098_e0009502_b33718_c20180501060951522600_noac_ops'


In [None]:
assert split_href_type('./SVDNB_npp_d20180501_t0004098_e0009502_b33718_c20180501060951522600_noac_ops.rade9.co.json', first=False) == 'rade9.co.json'


In [None]:
#| export
def transform_items_df(items_df, folder, baseurl):
    """Replace the `rel`/`type` columns of `items_df` with columns derived from `href`.

    `items_df` is modified in place and the same object is returned. Added
    columns: `item_href`, `stem`, `suffix`, `base_url` and `folder`.
    """
    # Dropped in place, so the caller's frame loses these columns too.
    items_df.drop(columns=['rel', 'type'], inplace=True)
    hrefs = items_df.href
    items_df['item_href'] = hrefs.apply(
        lambda link: get_item_href(link, folder=folder, baseurl=baseurl)
    )
    items_df['stem'] = hrefs.apply(lambda link: split_href_type(link))
    items_df['suffix'] = hrefs.apply(lambda link: split_href_type(link, first=False))
    # Scalars are broadcast to every row.
    items_df['base_url'] = baseurl
    items_df['folder'] = folder
    return items_df

In [None]:
items_df = pd.DataFrame(
    data=dict(rel=['item'],
              type=['application/json'],
              href=['./SVDNB_npp_d20180501_t0004098_e0009502_b33718_c20180501060951522600_noac_ops.rade9.co.json']
             ))

In [None]:
items_df = transform_items_df(items_df,'201805','https://globalnightlight.s3.amazonaws.com') 

In [None]:
assert set(['base_url', 'folder', 'href', 'item_href', 'stem', 'suffix']) == set(items_df.columns.values)

In [None]:
#| export
# Regex that splits an item stem into eight capture groups, using the markers
# `_d`, `_t`, `_e`, `_b` and `_c` followed by two further `_`-separated parts:
#   1: text before `_d`   2: after `_d`   3: after `_t`   4: after `_e`
#   5: after `_b`         6: after `_c`   7: next `_`-free token   8: the rest
# `split_stem_components` stores the groups, in order, as product_id,
# start_date, first_scantime, end_scantime, orbital_nbr, create_datetime,
# data_origin and data_domain.
PAT = r'(.*[^_]+)_d([^_]*)_t([^_]*)_e([^_]*)_b([^_]*)_c([^_]*)_([^_]*)_(.*)$'

In [None]:
#| export

def find_stem_components(stem):
    """Split an item `stem` into the eight components captured by `PAT`.

    Returns a list of exactly eight items: the capture groups of `PAT`, in
    order. When `stem` does not match `PAT`, or fewer than eight groups are
    available, the missing positions are filled with empty strings.
    """
    n_components = 8
    matcher = re.match(PAT, stem)
    results = list(matcher.groups()) if matcher is not None else []
    # Pad from `results` itself. The previous code padded from an undefined
    # name (`groups`), so every non-matching stem raised NameError.
    results += [''] * (n_components - len(results))
    return results[:n_components]
        

In [None]:
assert find_stem_components('SVDNB_npp_d20180901_t0002302_e0008088_b35463_c20180901040811139620_nobc_ops') == ['SVDNB_npp',
 '20180901',
 '0002302',
 '0008088',
 '35463',
 '20180901040811139620',
 'nobc',
 'ops']

In [None]:
#| export

def make_vflag_file(o):
    """Return the `.vflag.co.tif` file name for record `o`.

    `o` is indexed by the keys `start_date`, `first_scantime`, `end_scantime`
    and `orbital_nbr` (e.g. a dict or a DataFrame row).
    """
    start_date = o['start_date']
    first_scantime = o['first_scantime']
    end_scantime = o['end_scantime']
    orbital_nbr = o['orbital_nbr']
    return f"npp_d{start_date}_t{first_scantime}_e{end_scantime}_b{orbital_nbr}.vflag.co.tif"

In [None]:
items_df2 = pd.DataFrame(dict(start_date=['20150504'], first_scantime=['1335358'], end_scantime=['1341162'], orbital_nbr=['18219']))

In [None]:
assert (items_df2.apply(make_vflag_file,axis=1) == pd.Series(['npp_d20150504_t1335358_e1341162_b18219.vflag.co.tif'])).all(axis=None)

In [None]:
#| export

def make_vflag_href(o):
    """Return the vflag file URL for record `o`.

    Joins `o['base_url']`, `o['folder']` and `o['vflag_file']` with '/'.
    """
    base_url = o['base_url']
    folder = o['folder']
    vflag_file = o['vflag_file']
    return f"{base_url}/{folder}/{vflag_file}"

In [None]:
items_df3 = pd.DataFrame(dict(base_url=['https://globalnightlight.s3.amazonaws.com'], folder=['npp_201504'], vflag_file=['npp_d20150504_t1335358_e1341162_b18219.vflag.co.tif']))

In [None]:
assert (items_df3.apply(make_vflag_href, axis=1) == pd.Series(['https://globalnightlight.s3.amazonaws.com/npp_201504/npp_d20150504_t1335358_e1341162_b18219.vflag.co.tif'])).all(axis=None)

In [None]:
#| export

def split_stem_components(df):
    """Add the stem components and the vflag file name/URL as columns of `df`.

    Each value of `df['stem']` is split with `find_stem_components`; the eight
    parts become the columns `product_id`, `start_date`, `first_scantime`,
    `end_scantime`, `orbital_nbr`, `create_datetime`, `data_origin` and
    `data_domain`. `vflag_file` and `vflag_href` are then computed row by row.
    `df` is modified in place and returned.
    """
    per_row_parts = df['stem'].map(find_stem_components)
    # Transpose rows-of-parts into one tuple per component. Unpacking into
    # exactly eight names fails before any column is written if the count is off.
    (product_ids, start_dates, first_scantimes, end_scantimes,
     orbital_nbrs, create_datetimes, data_origins, data_domains) = zip(*per_row_parts)

    df['product_id'] = product_ids
    df['start_date'] = start_dates
    df['first_scantime'] = first_scantimes
    df['end_scantime'] = end_scantimes
    df['orbital_nbr'] = orbital_nbrs
    df['create_datetime'] = create_datetimes
    df['data_origin'] = data_origins
    df['data_domain'] = data_domains

    # vflag_href reads the vflag_file column, so vflag_file must be set first.
    df['vflag_file'] = df.apply(make_vflag_file, axis=1)
    df['vflag_href'] = df.apply(make_vflag_href, axis=1)
    return df

In [None]:
items_df4 = pd.DataFrame(
    dict(stem=['SVDNB_npp_d20180901_t0002302_e0008088_b35463_c20180901040811139620_nobc_ops'],
         base_url=['https://globalnightlight.s3.amazonaws.com'], 
         folder=['npp_201809']
         ))         

In [None]:
items_df4 = split_stem_components(items_df4)

In [None]:
assert set(items_df4.columns.values) == set(['base_url',
 'create_datetime',
 'data_domain',
 'data_origin',
 'end_scantime',
 'first_scantime',
 'folder',
 'orbital_nbr',
 'product_id',
 'start_date',
 'stem',
 'vflag_file',
 'vflag_href'])

In [None]:
#| export
def get_monthly_items(href, folder, baseurl):
    """Return the items DataFrame for the catalog at `href`.

    The result of `get_data(href)` is turned into a DataFrame of `rel='item'`
    entries with `make_df`, then passed through `transform_items_df` and
    `split_stem_components`.
    """
    catalog_data = get_data(href)
    items = make_df(catalog_data, rel='item')
    return split_stem_components(transform_items_df(items, folder, baseurl))

In [None]:
%%time
#| eval: false
items1805 = get_monthly_items(href1805,folder1805, baseurl)

CPU times: user 177 ms, sys: 7.41 ms, total: 184 ms
Wall time: 2.59 s


In [None]:
#| eval: false
# NOTE(review): 3600 is presumably the item count seen for the 201805 catalog
# when this notebook was last run; it depends on the remote catalog — confirm.
assert len(items1805) == 3600

In [None]:
#| eval: false
assert set(items1805.columns.values) == set(['href','item_href','suffix', *items_df4.columns.values])