# STAC for the Challenge

Build the STAC catalog from both a CSV listing and the local folder structure.

In [574]:
#import the required libraries:

from satstac import Catalog
from satstac import Collection
from satstac import Item
import rasterio
import shapely
import sys,os,os.path
import json
import pandas as pd
import numpy as np
import geopandas as gpd

In [575]:
# STAC version string reused by the root catalog and by every collection built below.
stac_version = "0.6.2"

# Root folder of the challenge data; the root catalog file sits directly inside it.
stac_challenge_folder = "./STAC_challenge_4"
catalog_address = "/".join((stac_challenge_folder, "catalog.json"))

# Raster lookup keyed by "<area>_<n>": URL of the GeoTIFF used as the raster asset
# of the items created from the local folders.
tiffiles = {
    "accra_1": "https://oin-hotosm.s3.amazonaws.com/5b694a0f4b87366cc0f0fa70/0/5b694a0f4b8736ebfff0fa71.tif",
    "accra_2": "https://oin-hotosm.s3.amazonaws.com/5bb9323e9ed15b0006d24f33/0/5bb9323e9ed15b0006d24f34.tif",
    "accra_3": "https://oin-hotosm.s3.amazonaws.com/5be9bb18080ac000051474fd/0/5be9bb18080ac000051474fe.tif",
    "monrovia_1": "https://oin-hotosm.s3.amazonaws.com/5c08c2ec6918390006b7a8a1/0/5c08c2ec6918390006b7a8a2.tif",
    "monrovia_2": "https://oin-hotosm.s3.amazonaws.com/5b83a514c8e197000a93403e/0/5b83a514c8e197000a93403f.tif",
    "monrovia_3": "https://oin-hotosm.s3.amazonaws.com/5bcdce33b9e5f20005f7da3e/0/5bcdce33b9e5f20005f7da3f.tif",
    "monrovia_4": "https://oin-hotosm.s3.amazonaws.com/5b8180e87343a943f0347d18/0/5b8180e87343a991c8347d19.tif",
    "pointe-noire_1": "https://oin-hotosm.s3.amazonaws.com/5c30a233be6ca30005c74da8/1/5c30a233be6ca30005c74daa.tif",
    "pointe-noire_2": "https://oin-hotosm.s3.amazonaws.com/5c30a233be6ca30005c74da8/0/5c30a233be6ca30005c74da9.tif",
}

In [576]:
# Load the CSV listing (one row per vector file to turn into an item) and display it.
df = pd.read_csv("/".join((stac_challenge_folder, "data_to_STAC.csv")))
df

Unnamed: 0,collection,id_item,raster_title,raster_href,vector_title,additional_keywords,vector_storage,vector_address
0,accra,accra_1_buildings-item,accra_1,https://oin-hotosm.s3.amazonaws.com/5b694a0f4b...,accra_1_buildings,ad_keyword_1;ad_keyword_2,local,accra/accra_1/accra_1_buildings.geojson
1,accra,accra_1_drains-item,accra_1,https://oin-hotosm.s3.amazonaws.com/5b694a0f4b...,accra_1_drains,,local,accra/accra_1/accra_1_drains.geojson
2,accra,accra_1_roads-item,accra_1,https://oin-hotosm.s3.amazonaws.com/5b694a0f4b...,accra_1_roads,,local,accra/accra_1/accra_1_roads.geojson
3,accra,accra_2_buildings-item,accra_2,https://oin-hotosm.s3.amazonaws.com/5bb9323e9e...,accra_2_buildings,,local,accra/accra_2/accra_2_buildings.geojson
4,monrovia,monrovia_2_buildings-item,monrovia_2,https://oin-hotosm.s3.amazonaws.com/5b83a514c8...,monrovia_2_buildings,,local,monrovia/monrovia_2/monrovia_2_buildings.geojson
5,zanzibar,znz_022_buildings-item,znz_022,https://oin-hotosm.s3.amazonaws.com/5ae242fd0b...,znz_022_buildings,,online,https://www.dropbox.com/sh/ct3s1x2a846x3yl/AAD...
6,zanzibar,znz_001_buildings-item,znz_001,https://oin-hotosm.s3.amazonaws.com/5afeda152b...,znz_001_buildings,,online,https://www.dropbox.com/sh/ct3s1x2a846x3yl/AAA...


## Functions for the items:

In [577]:
# Metadata shared by every item. The per-item builders start from `properties_item`
# and overwrite its title / description / collection entries.
basic_keywords_items = ["challenge", "world bank"]
type_item = "Feature"
properties_item = {
    "datetime": "2019-02-26T00:00:00Z",
    "td:title": "test",
    "td:description": "test",
    "td:label_type": "segmentation",
    "td:classes": ["buildings", "drains", "roads"],
}

In [578]:
#get the appropriate extent for a geojson file:

# This one turned out to be surprisingly tricky. I wanted to simply use:
def get_bbox(address):
    """Return the total bounds of every geometry in the vector file at ``address`` as a list.

    Note: ``get_bbox`` is defined again further down in this notebook, so on a
    top-to-bottom run this first version is replaced before anything calls it.
    """
    bounds = gpd.read_file(address)['geometry'].total_bounds
    return list(bounds)

# But one file contains a None geometry, and it turned out to be very difficult to deal with (dropna etc. were leaving a None row, and reset_index was not getting rid of it).
# So I will, at least temporarily, use a very memory-expensive solution:

def get_bbox(address):
    """Return the total bounds of the vector file at ``address``, skipping rows without geometry.

    The rows are copied one by one into a new GeoDataFrame, and ``'None detected'``
    is printed for every row whose geometry is ``None``.

    Note: ``get_bbox`` is defined once more in a later cell of this notebook, so on
    a top-to-bottom run that later definition is the one callers actually use.

    Parameters
    ----------
    address : str
        Path or URL passed to ``geopandas.read_file``.

    Returns
    -------
    list
        The ``total_bounds`` values of the remaining geometries.
    """
    gdf = gpd.read_file(address)
    rows_with_geometry = []
    for _, feature in gdf.iterrows():
        if feature['geometry'] is None:
            print('None detected')
        else:
            rows_with_geometry.append(feature)
    gdf_valid = gpd.GeoDataFrame(rows_with_geometry)
    return list(gdf_valid['geometry'].total_bounds)

In [579]:
def get_bbox(address):
    """Return the bounding box of the vector file at ``address``.

    Rows whose geometry is missing are ignored, so a single null geometry in the
    file does not affect the result.

    Parameters
    ----------
    address : str
        Path or URL passed to ``geopandas.read_file``.

    Returns
    -------
    list
        The four ``total_bounds`` values (minx, miny, maxx, maxy) of the
        remaining geometries.
    """
    geometries = gpd.read_file(address)['geometry']
    # Boolean mask instead of comparing the isna() result with True.
    return list(geometries[~geometries.isna()].total_bounds)

In [580]:
#get the appropriate tags for the items

def get_tags_items(collection,surfoldername, foldername, filename, bbox):
    """Build the STAC item fields for a GeoJSON file stored in the local folder tree.

    The file is read from
    ``stac_challenge_folder/surfoldername/foldername/filename`` and the geometries
    of all its features are merged into one GeoJSON ``MultiPolygon`` or
    ``MultiLineString`` (decided by the first feature that has a geometry).

    Parameters
    ----------
    collection : str
        Collection name. Accepted for the caller's convenience; not used here.
    surfoldername : str
        Folder directly under ``stac_challenge_folder``.
    foldername : str
        Sub-folder of ``surfoldername`` that contains the file.
    filename : str
        File name of the form ``<area>_<n>_<label>.geojson``. ``<area>_<n>``
        selects the raster URL in ``tiffiles``.
    bbox : list
        Bounding box, stored unchanged as the item's ``bbox``.

    Returns
    -------
    tuple
        ``(id_item, type_item, geometry_item, bbox_item, prop, links_item, assets_item)``

    Raises
    ------
    ValueError
        If no feature in the file has a geometry.
    """
    filename_split = filename.split('.')[0]
    name_parts = filename_split.split('_')
    area_name = name_parts[0] + ' ' + name_parts[1]
    label_name = name_parts[2]
    id_item = filename_split

    links_item = [
        {
            "rel": "self",
            "href": stac_challenge_folder + "/" + filename_split + "-item.json"
        },
        {
            "rel": "root",
            "href": '../../catalog.json'
        },
    ]

    assets_item = {
        "raster": {
            "title": "image",
            "href": tiffiles[filename.split('_')[0] + '_' + filename.split('_')[1]],
            "type": "image/vnd.stac.geotiff; cloud-optimized=true"
        },
        "vector": {
            "title": filename_split,
            "href": '../../' + surfoldername + '/' + foldername + '/' + filename,
            "type": "application/geo+json"
        }
    }

    # NOTE(review): the keyword list is assembled but is not part of the returned
    # fields, so it never reaches the item - confirm where it should be stored.
    # It is built from the name parts (the original indexed single characters).
    key = basic_keywords_items.copy()
    key.append(area_name)
    key.append(label_name)

    directory = stac_challenge_folder + '/' + surfoldername + '/' + foldername
    with open(directory + '/' + filename, 'r', encoding='utf-8') as geojson_file:
        data = json.load(geojson_file)

    # Merge every feature into a single Multi* geometry. Single geometries are
    # appended whole and multi geometries are flattened, so the coordinate
    # nesting matches the declared Multi* type.
    geom = []
    multi_type = None
    for feature in data["features"]:
        geometry = feature["geometry"]
        if geometry is None:
            # Features without geometry carry nothing to merge.
            continue
        if multi_type is None:
            multi_type = "MultiPolygon" if "Polygon" in geometry["type"] else "MultiLineString"
        if geometry["type"].startswith("Multi"):
            geom.extend(geometry["coordinates"])
        else:
            geom.append(geometry["coordinates"])
    if multi_type is None:
        raise ValueError('no geometry found in ' + directory + '/' + filename)
    geometry_item = {
        "type": multi_type,
        "coordinates": geom
    }

    bbox_item = bbox

    # Work on a copy: assigning into the module-level dict would make every item
    # share (and overwrite) the same properties object.
    prop = dict(properties_item)
    prop['collection'] = area_name
    prop['td:title'] = area_name
    prop['td:description'] = label_name + ' for ' + area_name
    return id_item, type_item, geometry_item, bbox_item, prop, links_item,assets_item

In [581]:
def _coords_2d(part):
    """Return the vertices of a shapely ring or line as a list of ``[x, y]`` pairs."""
    return [[point[0], point[1]] for point in part.coords]


def _merge_into_multi_geometry(shapes):
    """Merge shapely (Multi)Polygons or (Multi)LineStrings into one GeoJSON ``Multi*`` dict."""
    polygons = []
    lines = []
    for shape in shapes:
        if shape is None or shape.is_empty:
            continue
        # Each geometry is inspected on its own, so files that mix single and
        # multi-part geometries are handled.
        parts = shape.geoms if shape.geom_type.startswith('Multi') else [shape]
        for part in parts:
            if part.geom_type == 'Polygon':
                rings = [_coords_2d(part.exterior)]
                rings.extend(_coords_2d(hole) for hole in part.interiors)
                polygons.append(rings)
            elif part.geom_type in ('LineString', 'LinearRing'):
                lines.append(_coords_2d(part))
            else:
                raise ValueError('unsupported geometry type: ' + part.geom_type)
    if polygons and lines:
        raise ValueError('cannot merge polygons and lines into a single item geometry')
    if polygons:
        return {"type": "MultiPolygon", "coordinates": polygons}
    if lines:
        return {"type": "MultiLineString", "coordinates": lines}
    raise ValueError('no usable geometry found')


def get_tags_items_csv(row):
    """Build the STAC item fields for one row of the CSV listing.

    Parameters
    ----------
    row : pandas.Series
        Row with the entries ``collection``, ``id_item``, ``raster_title``,
        ``raster_href``, ``vector_title``, ``additional_keywords``,
        ``vector_storage`` and ``vector_address``. ``id_item`` is expected to
        look like ``<area>_<n>_<label>-item``.

    Returns
    -------
    tuple
        ``(id_item, type_item, geometry_item, bbox_item, prop, links_item, assets_item)``

    Raises
    ------
    ValueError
        If the vector file has no usable geometry, mixes polygons with lines,
        or contains a geometry type other than (Multi)Polygon / (Multi)LineString.
    """
    collection = row['collection']
    id_item = row['id_item']
    foldername = row['raster_title']
    id_parts = id_item.split('_')
    area_name = id_parts[0] + ' ' + id_parts[1]
    label_name = id_parts[2].split('-')[0]

    links_item = [
        {
            "rel": "self",
            # Points at the .json file the item is saved to by create_item().
            "href": stac_challenge_folder + "/" + 'challenge_collection_' + collection + '/' + foldername + '/' + id_item + '.json'
        },
        {
            "rel": "root",
            "href": '../../catalog.json'
        },
    ]

    # The address of the vector differs depending on whether it is stored online or locally.
    if row['vector_storage'] == 'local':
        vect_href = '../../' + row['vector_address']
        address_file = stac_challenge_folder + '/' + row['vector_address']
    else:
        address_file = row['vector_address']
        vect_href = row['vector_address']

    assets_item = {
        "raster": {
            "title": row['raster_title'],
            "href": row['raster_href'],
            "type": "image/vnd.stac.geotiff; cloud-optimized=true"
        },
        "vector": {
            "title": row['vector_title'],
            "href": vect_href,
            "type": "application/geo+json"
        }
    }

    # NOTE(review): the keyword list (including the CSV's additional keywords) is
    # assembled but is not part of the returned fields - confirm where it belongs.
    key = basic_keywords_items.copy()
    key.append(area_name)
    key.append(id_parts[2])
    if not pd.isnull(row['additional_keywords']):
        key.extend(str(row['additional_keywords']).split(';'))

    bbox_item = get_bbox(address_file)
    data = gpd.read_file(address_file)
    geometry_item = _merge_into_multi_geometry(data['geometry'])

    # Work on a copy: assigning into the module-level dict would make every item
    # share (and overwrite) the same properties object.
    prop = dict(properties_item)
    prop['collection'] = collection
    prop['td:title'] = area_name + ' ' + label_name
    prop['td:description'] = label_name + ' for ' + area_name
    return id_item, type_item, geometry_item, bbox_item, prop, links_item,assets_item

# Ad-hoc inspection code at the end of this cell; nothing in the visible notebook
# reads the `data` / `f` globals it creates.
# The commented-out prefix below would be needed if the row's vector_address were a local relative path:
#stac_challenge_folder+'/'+
# NOTE(review): this runs every time the cell is executed and needs `df` from the CSV cell.
# In the table displayed above, row 5 is stored online, so this presumably fetches
# the file over the network - confirm whether this check should stay.
data = gpd.read_file(df.iloc[5]['vector_address']) 

# Prints every geometry, but only when the type name of the FIRST geometry contains
# 'polygon' and does not contain 'Multi'.
for f in data['geometry']:
    if('polygon' in str(type(data['geometry'][0])) and 'Multi' not in str(type(data['geometry'][0]))):
        print('polygon',list(f));
       

In [582]:
# create an Item object with JSON
def create_corresponding_item(id_item, type_item, geometry_item, bbox_item, properties_item, links_item,assets_item):
    """Wrap the given fields in a satstac ``Item`` and return it.

    The fields are stored under the keys ``id``, ``type``, ``geometry``,
    ``bbox``, ``properties``, ``links`` and ``assets``, in that order.
    """
    fields = dict(
        id=id_item,
        type=type_item,
        geometry=geometry_item,
        bbox=bbox_item,
        properties=properties_item,
        links=links_item,
        assets=assets_item,
    )
    return Item(fields)

## Functions for the collections:

In [583]:
# Naming pieces: id prefix, title suffix and description prefix of every collection.
id_collection_basic = "challenge_collection_"
basic_title_col = " AoI collection"
basic_description_col = "The data available for the challenge formatted in a STAC collection for the "

# Metadata shared by all collections.
license_col = "CC-BY-4.0"  # open, proprietary?
version_col = "1.0"
basic_keywords_col = ["challenge", "world bank"]
providers_col = [
    {
        "name": "WB/OCA",
        "roles": ["producer", "licensor"],
        "url": "https://opencitiesproject.org",
    },
]

# Default extent: a wide spatial box and a temporal range with an open end.
extent_col = {
    "spatial": [-180.0, -56.0, 180.0, 83.0],
    "temporal": ["2015-06-23T00:00:00Z", None],
}

In [584]:
def get_tags_collection(foldername):
    """Assemble the fields of the collection for the area ``foldername``.

    Returns a 10-tuple ``(stac_version, id, title, description, keywords,
    version, license, providers, extent, links)``, i.e. the positional
    arguments of ``create_corresponding_collection``.
    """
    id_collection = id_collection_basic + foldername
    self_href = '/'.join((stac_challenge_folder, id_collection, 'catalog.json'))

    keywords = basic_keywords_col + [foldername]

    # A fresh dict on every call, so the returned extent is not shared between collections.
    extent = {
        "spatial": [-180.0, -56.0, 180.0, 83.0],
        "temporal": ["2015-06-23T00:00:00Z", None],
    }

    links = [
        {"rel": "self", "href": self_href},
        {"rel": "root", "href": '../catalog.json'},
    ]

    return (
        stac_version,
        id_collection,
        foldername + basic_title_col,
        basic_description_col + foldername + ' AoI',
        keywords,
        version_col,
        license_col,
        providers_col,
        extent,
        links,
    )

In [585]:
def create_corresponding_collection(stac_version, id_collection, title, description, keywords, version, license, providers, extent, links):
    """Build a satstac ``Collection`` from the given fields, print its id and return it.

    ``version`` is stored twice, under both ``"collection version"`` and ``"version"``.
    """
    fields = [
        ("stac_version", stac_version),
        ("id", id_collection),
        ("title", title),
        ("description", description),
        ("collection version", version),
        ("keywords", keywords),
        ("license", license),
        ("version", version),
        ("providers", providers),
        ("extent", extent),
        ("links", links),
    ]
    col = Collection(dict(fields))
    print(col.id)
    return col

## Creating the catalog

In [586]:
# Root catalog metadata. `stac_version` and `catalog_address` come from the
# configuration cell near the top of the notebook.
catalog_id = 'challenge_catalog'
catalog_title = 'Challenge OpenML'
catalog_description = 'Data for the ML challenge, in the STAC format'

# "self" and "root" both point at the catalog file itself.
# NOTE(review): the href is '../' + catalog_address, which resolves one level above
# the notebook folder - confirm this is the intended location.
catalog_links = [
    {"rel": rel, "href": '../' + catalog_address}
    for rel in ("self", "root")
]

In [587]:
# Create the root catalog, write it to `catalog_address` and echo what was stored.
catalog_json = dict(
    stac_version=stac_version,
    id=catalog_id,
    title=catalog_title,
    description=catalog_description,
    links=catalog_links,
)
catalog = Catalog(catalog_json)
catalog.save_as(catalog_address)
for _attr in ('id', 'filename', 'path', 'data'):
    print(_attr + ':', getattr(catalog, _attr))

id: challenge_catalog
filename: ./STAC_challenge_4/catalog.json
path: ./STAC_challenge_4
data: {'stac_version': '0.6.2', 'id': 'challenge_catalog', 'title': 'Challenge OpenML', 'description': 'Data for the ML challenge, in the STAC format', 'links': [{'rel': 'self', 'href': '.././STAC_challenge_4/catalog.json'}, {'rel': 'root', 'href': '.././STAC_challenge_4/catalog.json'}]}


## Functions that break up the creation of the collections and items:

In [588]:
def create_collection(collection):
    """Create the collection for ``collection``, save it and register its name.

    The collection is written to
    ``stac_challenge_folder/challenge_collection_<collection>/catalog.json`` and
    ``collection`` is appended to the notebook-level ``id_collections`` list.

    Returns
    -------
    tuple
        ``(col, col_address)``: the collection object and the path it was saved to.
    """
    tags = get_tags_collection(collection)
    col = create_corresponding_collection(*tags)
    col_address = '/'.join((stac_challenge_folder, "challenge_collection_" + collection, 'catalog.json'))
    col.save_as(col_address)
    id_collections.append(collection)
    return col, col_address

In [589]:
#the creation of an item differs depending on if the metadata comes from the csv or from the local organization
def create_item(csv, row, collection):
    """Create one item, save it under the collection's folder and return it.

    Parameters
    ----------
    csv : bool
        Truthy when ``row`` is a row of the CSV listing; otherwise ``row`` is the
        list ``[folder, file, bbox]`` describing a locally stored file.
    row : pandas.Series or list
        See ``csv``.
    collection : str
        Collection name, used to build the folder the item is saved in.
    """
    collection_dir = stac_challenge_folder + '/' + "challenge_collection_" + collection + '/'
    if csv:
        tags = get_tags_items_csv(row)
        item_address = collection_dir + row['raster_title'] + '/' + row['id_item'] + '.json'
    else:
        folder, file = row[0], row[1]
        tags = get_tags_items(collection, collection, folder, file, row[2])
        item_address = collection_dir + folder + '/' + file.split('.')[0] + '-item.json'
    item = create_corresponding_item(*tags)
    item.save_as(item_address)
    return item

In [590]:
#create the corresponding bbox
def get_bbox_row(csv,row):
    """Compute the bounding box of the vector file described by ``row``.

    Parameters
    ----------
    csv : bool
        Truthy when ``row`` is a row of the CSV listing. Otherwise ``row`` is the
        list ``[stac_challenge_folder, collection, folder, file]``.
    row : pandas.Series or list
        See ``csv``. For CSV rows, ``vector_storage == 'local'`` means
        ``vector_address`` is relative to ``stac_challenge_folder``; any other
        value is treated as a directly readable address, as in ``get_tags_items_csv``.

    Returns
    -------
    tuple
        ``(bbox, m1, m2, M1, M2)`` where the last four values are ``bbox[0]`` to ``bbox[3]``.
    """
    if csv:
        if row['vector_storage'] == 'local':
            address = stac_challenge_folder + '/' + row['vector_address']
        else:
            # Covers 'online' and, rather than leaving the address undefined,
            # any other storage value.
            address = row['vector_address']
    else:
        address = row[0] + '/' + row[1] + '/' + row[2] + '/' + row[3]
    bbox = get_bbox(address)
    m1, m2, M1, M2 = bbox[0], bbox[1], bbox[2], bbox[3]
    return bbox, m1,m2,M1,M2

In [591]:
#adapt the bbox of the collection so that the new item's bbox fits in it
def adapt_bbox_col(m1_col,m2_col,M1_col,M2_col,m1,m2,M1,M2):
    """Widen the collection box ``(m1_col, m2_col, M1_col, M2_col)`` to contain ``(m1, m2, M1, M2)``.

    The two lower bounds only ever shrink and the two upper bounds only ever
    grow; the widened four values are returned as a tuple.
    """
    lower_1 = min(m1_col, m1)
    lower_2 = min(m2_col, m2)
    upper_1 = max(M1_col, M1)
    upper_2 = max(M2_col, M2)
    return lower_1, lower_2, upper_1, upper_2

In [592]:
#Create and add the items that are listed in the csv to the collection
def create_and_add_items_to_collection_from_csv(df_col,m1_col,m2_col,M1_col,M2_col, col, collection):
    """Create an item for every row of ``df_col`` and add it to the collection ``col``.

    Parameters
    ----------
    df_col : pandas.DataFrame
        Rows of the CSV listing that belong to this collection.
    m1_col, m2_col, M1_col, M2_col : float
        Current bounds of the collection.
    col : satstac Collection
        Collection that receives the items.
    collection : str
        Collection name, forwarded to ``create_item``.

    Returns
    -------
    tuple
        ``(m1_col, m2_col, M1_col, M2_col)`` widened to contain the bbox of every
        item added here. Arguments are passed by value, so the caller only sees
        the widened bounds through this return value.

    Side effects: prints each processed file name and appends each ``id_item``
    to the notebook-level ``id_items`` list.
    """
    print('\n\n from csv: \n \n')
    for _, row in df_col.iterrows():
        print(str(row['vector_title']) + '.geojson')
        _, m1, m2, M1, M2 = get_bbox_row(True, row)
        m1_col, m2_col, M1_col, M2_col = adapt_bbox_col(m1_col, m2_col, M1_col, M2_col, m1, m2, M1, M2)
        item = create_item(True, row, collection)
        col.add_item(item, path=row['raster_title'] + '/', filename=row['id_item'])
        id_items.append(row['id_item'])
    return m1_col, m2_col, M1_col, M2_col

In [593]:
#Create and add the items that have not been previously added and are stored locally to the collection 
def add_remaining_items_from_local_to_collection(col,m1_col,m2_col,M1_col,M2_col, collection_name=None):
    """Add the locally stored files of a collection that have no item yet.

    Parameters
    ----------
    col : satstac Collection
        Collection that receives the items.
    m1_col, m2_col, M1_col, M2_col : float
        Current bounds of the collection.
    collection_name : str, optional
        Name of the collection folder under ``stac_challenge_folder``. When
        omitted, the notebook-level variable ``collection`` is used, which is
        what existing callers rely on.

    Returns
    -------
    tuple
        ``(m1_col, m2_col, M1_col, M2_col)``, widened whenever the per-folder
        helper reports updated bounds.
    """
    print('\n\n from folder: \n \n')
    if collection_name is None:
        # Backward compatibility: fall back to the notebook's loop variable.
        collection_name = collection
    if collection_name in os.listdir(stac_challenge_folder):
        collection_dir = stac_challenge_folder + '/' + collection_name
        for folder in os.listdir(collection_dir):
            # Skip .DS_Store and any other plain file: only folders hold vector files.
            if folder == '.DS_Store' or not os.path.isdir(collection_dir + '/' + folder):
                continue
            updated = create_add_local_item(collection_name, folder, col, m1_col, m2_col, M1_col, M2_col)
            if updated is not None:
                m1_col, m2_col, M1_col, M2_col = updated
    return m1_col, m2_col, M1_col, M2_col

In [594]:
def create_add_local_item(collection,folder, col,m1_col,m2_col,M1_col,M2_col):
    """Create and add an item for every file of ``collection/folder`` that has none yet.

    A file is skipped when it is ``.DS_Store`` or when ``<file stem>-item`` is
    already listed in the notebook-level ``id_items``.

    Parameters
    ----------
    collection, folder : str
        Folder names below ``stac_challenge_folder``.
    col : satstac Collection
        Collection that receives the items.
    m1_col, m2_col, M1_col, M2_col : float
        Current bounds of the collection.

    Returns
    -------
    tuple
        ``(m1_col, m2_col, M1_col, M2_col)`` widened to contain the bbox of every
        item added here, so the caller can keep the collection extent up to date.
    """
    for file_2 in os.listdir(stac_challenge_folder + '/' + collection + '/' + folder):
        item_id = file_2.split('.')[0] + '-item'
        if file_2 == '.DS_Store' or item_id in id_items:
            continue
        print(item_id + '.json')
        bbox_item, m1, m2, M1, M2 = get_bbox_row(False, [stac_challenge_folder, collection, folder, file_2])
        m1_col, m2_col, M1_col, M2_col = adapt_bbox_col(m1_col, m2_col, M1_col, M2_col, m1, m2, M1, M2)
        item = create_item(False, [folder, file_2, bbox_item], collection)
        col.add_item(item, path=folder + '/', filename=item_id)
        id_items.append(item_id)
    return m1_col, m2_col, M1_col, M2_col
    

In [595]:
#create the collections that were not listed in csv, then create their items, and add them to the collection
def create_and_add_collections_from_local():
    """Create collections for local folders that are not listed in the CSV.

    Every directory in ``stac_challenge_folder`` that is not a generated
    ``challenge_collection*`` folder, not the CSV, not a ``.json`` file and not
    already in ``id_collections`` becomes a collection. Its sub-folders are
    scanned for vector files, an item is created for each one, and the
    collection's spatial extent is set to the bounds gathered along the way.

    Uses the notebook-level ``catalog``, ``id_collections`` and ``cols``.
    """
    print('\n \n add collection stored locally but not listed in the csv \n \n')
    for collection in os.listdir(stac_challenge_folder):
        collection_dir = stac_challenge_folder + '/' + collection
        is_candidate = (
            'challenge_collection' not in collection
            and collection != '.DS_Store'
            and 'data_to_STAC' not in collection
            and '.json' not in collection
        )
        if not is_candidate or collection in id_collections:
            continue
        if not os.path.isdir(collection_dir):
            # A stray file in the root folder cannot be a collection.
            continue

        print('collection from folder:', collection)
        col, col_address = create_collection(collection)
        catalog.add_catalog(col)

        bounds = None  # running (m1, m2, M1, M2) across ALL folders of this collection
        for folder in os.listdir(collection_dir):
            folder_dir = collection_dir + '/' + folder
            if folder == '.DS_Store' or not os.path.isdir(folder_dir):
                continue
            files = [name for name in os.listdir(folder_dir) if name != '.DS_Store']
            if not files:
                continue
            # The first file of the folder seeds (or widens) the running bounds.
            _, m1, m2, M1, M2 = get_bbox_row(False, [stac_challenge_folder, collection, folder, files[0]])
            if bounds is None:
                bounds = (m1, m2, M1, M2)
            else:
                bounds = adapt_bbox_col(bounds[0], bounds[1], bounds[2], bounds[3], m1, m2, M1, M2)
            updated = create_add_local_item(collection, folder, col, bounds[0], bounds[1], bounds[2], bounds[3])
            if updated is not None:
                bounds = tuple(updated)

        if bounds is not None:
            # Without any vector file the collection keeps its default extent.
            col.data['extent']['spatial'] = list(bounds)
        col.save_as(col_address)
        cols[collection] = col
    

## Create the collections and items:

In [596]:
# Registries filled while building: collection names, item ids, and name -> collection object.
id_collections = []
id_items = []
cols = {}

for collection in df['collection'].unique():
    print('collection:', collection)
    col, col_address = create_collection(collection)
    catalog.add_catalog(col)
    df_col = df[df['collection'] == collection]

    # Seed the collection bounds with the bbox of the first listed item.
    bbox, m1, m2, M1, M2 = get_bbox_row(True, df_col.iloc[0])
    m1_col, m2_col, M1_col, M2_col = bbox[0], bbox[1], bbox[2], bbox[3]

    # Both helpers receive the bounds by value, so the widened bounds are only
    # visible through what they return. When nothing is returned the current
    # bounds are kept.
    widened = create_and_add_items_to_collection_from_csv(df_col, m1_col, m2_col, M1_col, M2_col, col, collection)
    if widened is not None:
        m1_col, m2_col, M1_col, M2_col = widened
    widened = add_remaining_items_from_local_to_collection(col, m1_col, m2_col, M1_col, M2_col)
    if widened is not None:
        m1_col, m2_col, M1_col, M2_col = widened

    bbox_col = [m1_col, m2_col, M1_col, M2_col]
    col.data['extent']['spatial'] = bbox_col
    col.save_as(col_address)
    cols[collection] = col

# Folders that exist locally but are not listed in the CSV, then the final catalog write.
create_and_add_collections_from_local()
catalog.save_as(catalog_address)


collection: accra
challenge_collection_accra


 from csv: 
 

accra_1_buildings.geojson
accra_1_drains.geojson
accra_1_roads.geojson
accra_2_buildings.geojson


 from folder: 
 

accra_2_roads-item.json
accra_2_drains-item.json
accra_3_roads-item.json
accra_3_drains-item.json
accra_3_buildings-item.json
collection: monrovia
challenge_collection_monrovia


 from csv: 
 

monrovia_2_buildings.geojson


 from folder: 
 

monrovia_3_roads-item.json
monrovia_3_drains-item.json
monrovia_3_buildings-item.json
monrovia_4_roads-item.json
monrovia_4_buildings-item.json
monrovia_4_drains-item.json
monrovia_2_drains-item.json
monrovia_2_roads-item.json
monrovia_1_drains-item.json
monrovia_1_buildings-item.json
monrovia_1_roads-item.json
collection: zanzibar
challenge_collection_zanzibar


 from csv: 
 

znz_022_buildings.geojson
znz_001_buildings.geojson


 from folder: 
 


 
 add collection stored locally but not listed in the csv 
 

collection from folder: pointe-noire
challenge_collection_poi

challenge_catalog