# Geospatial metadata for the DRC_CPF dataset

## Author
- Kamwoo Lee (klee16@worldbank.org)

## Contents
1. [Load required packages and summary data](#load_packages_data)
2. [Metadata information](#metadata_information)
3. [General description](#general_description)
4. [Data quality info](#data_quality_info)
5. [Feature catalogue](#feature_catalogue)
6. [Upload to NADA catalog](#upload)

# 1. Load required packages and summary data <a name="load_packages_data"></a>

In [62]:
import datetime
import inspect
import json
import os

import numpy as np
import pandas as pd
import pynada as nada
from tqdm.notebook import tqdm

In [2]:
# Folder layout, relative to the notebook's working directory. Each value keeps
# its trailing slash because other cells build file paths by plain string
# concatenation (for example SUMMARY_DIR + "dataset_summary_v1.xlsx").
DATA_DIR = 'DRC_CPF/'                          # zipped geodatabase that is uploaded as a resource
SUMMARY_DIR = '02_CPF_DRC_data_exploration/'   # summary workbook and overview images
RESULT_DIR = '03_CPF_DRC_metadata/'            # metadata JSON written by this notebook

In [167]:
# Load the four sheets of the summary workbook.
#
# The workbook is opened once and every sheet is parsed from that single handle.
# `index=` and `encoding=` are not `pd.read_excel` parameters (current pandas
# rejects unknown keywords with a TypeError), so they are no longer passed.
summary_workbook_path = SUMMARY_DIR + "dataset_summary_v1.xlsx"
with pd.ExcelFile(summary_workbook_path) as summary_workbook:
    # "dataset_info" is read without a header row and entirely as text; its
    # first column becomes the dict keys and the next column the values.
    dataset_info = (
        pd.read_excel(summary_workbook, sheet_name="dataset_info", dtype=str, header=None)
        .set_index(0)
        .transpose()
        .to_dict(orient='records')[0]
    )
    # Blank cells are replaced by '' so later cells can compare against the
    # empty string instead of NaN.
    layer_summaries = pd.read_excel(summary_workbook, sheet_name="layer_summaries").fillna('')
    field_summaries = pd.read_excel(summary_workbook, sheet_name="field_summaries").fillna('')
    controlled_vocabulary = pd.read_excel(summary_workbook, sheet_name="controlled_vocabulary").fillna('')

In [189]:
# Display the layer summary table as a visual check of what was loaded.
layer_summaries

Unnamed: 0,dir,layer_name,layer_label,description,data_type,crs_name,crs_code,num_dimensions,vector_shape_type,vector_object_count,raster_width,raster_height,table_num_rows,min_lon,max_lon,min_lat,max_lat,source_name,source_url,data_process_summary
0,,DRC_grid,,,FeatureClass,GCS_WGS_1984,4326,14,Polyline,177,,,,12.4,29.26,-12.2621,-1.65132,,,
1,,COD_ADM1_WSF_Summary,,,Table,,,35,,,,,48,,,,,,,
2,,COD_CITY_WSF_Summary,,,Table,,,36,,,,,477,,,,,,,
3,,location__1_2021_01_28_16_08,,,Table,,,21,,,,,410,,,,,,,
4,,PDSS_facilities,,,Table,,,7,,,,,4002,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112,WB_projects,OLD_WB_project_locations_PROVINCe,,,FeatureClass,GCS_WGS_1984,3857,21,Point,11703,,,,12.2974,31.1694,-12.9762,4.87881,,,
113,WB_projects,OLD_WB_project_distribution_province_level,,,FeatureClass,GCS_WGS_1984,3857,14,Polygon,26,,,,12.1995,31.3118,-13.4557,5.39233,,,
114,WB_projects,v2021_project_portfolio_PROVINCE,,,FeatureClass,GCS_WGS_1984,3857,26,Point,13321,,,,12.2974,31.1694,-12.9762,4.87881,,,
115,WB_projects,v2021_project_distribution_province_level,,,FeatureClass,GCS_WGS_1984,3857,27,Polygon,26,,,,12.1995,31.3118,-13.4557,5.39233,,,


# 2. Metadata information <a name="metadata_information"></a>

In [172]:
# Top-level arguments of the catalog entry. These names are passed unchanged to
# nada.create_geospatial_dataset in the upload section.
dataset_id = dataset_info['dataset ID']
repository_id = "central"  # Collection ID that owns the document
published = 1  # Status: 0=draft, 1=published
overwrite = "yes"  # Overwrite document if already exists? Valid values "yes" "no"

# Information about the metadata record itself (who produced it and when).
metadata_information = {
    "title": dataset_info['dataset title'],
    "idno": dataset_id,  # same identifier as the catalog entry
    "producers": [
        {
            "name": dataset_info['owner name'],
            "abbr": "DECAT"
        }
    ],
    "production_date": dataset_info['creation date'],
    "version": "1.0"
}

# NOTE(review): this used to be the unfilled schema template -- a single
# "origin_description" whose harvest_date, base_url, identifier, date_stamp and
# metadata_namespace were all the literal text "string" -- and those placeholder
# values were sent to the catalog verbatim. It is left empty until real
# harvesting details exist.
provenance = []

# Free-text tags attached to the catalog entry.
tags = [
    {
        "tag": "DRC"
    },
    {
        "tag": "CPF"
    }
]

# Optional sections that are not populated for this dataset.
lda_topics = []
embeddings = []
additional = {}

# 3. General description <a name="general_description"></a>

In [174]:
# spatialRepresentationInfo
# Total number of vector objects per geometry type, summed over all layers.
# The layer summary's shape types are translated to the keys that the
# description's "geometricObjects" list looks up.
geometry_type_by_shape_type = {
    'Point': 'point',
    'Polyline': 'curve',
    'Polygon': 'surface',
}

# Every geometry type starts at 0 so the three keys always exist, even when the
# dataset has no layer of a given shape.
geometric_object_counts = dict.fromkeys(geometry_type_by_shape_type.values(), 0)
for shape_type, object_count in zip(layer_summaries['vector_shape_type'],
                                    layer_summaries['vector_object_count']):
    geometry_type = geometry_type_by_shape_type.get(shape_type)
    if geometry_type is None:
        # Tables and any other shape type are not counted.
        continue
    geometric_object_counts[geometry_type] += int(object_count)

In [175]:
# referenceSystemInfo
# One entry per distinct EPSG code used by the layers. Codes are converted to
# int before de-duplicating, so the same code stored in two forms is listed
# once, and they are sorted so the output does not depend on set iteration
# order. Layers without a code (blank cell, loaded as '') are skipped.
epsg_codes = sorted({int(code) for code in layer_summaries['crs_code'] if code != ''})
reference_systems = [
    {
        "code": str(code),
        "codeSpace": "EPSG"
    }
    for code in epsg_codes
]

In [176]:
# geographicBoundingBox
# The dataset extent is taken from the summary row(s) of one reference layer.
# NOTE(review): presumably this layer is the national boundary -- confirm.
bounding_box_layer_name = "OCHA_adm0_asof20190911"
OCHA_adm0 = layer_summaries[layer_summaries['layer_name'] == bounding_box_layer_name]
if OCHA_adm0.empty:
    # Fail with a message that names the missing layer instead of the opaque
    # "min() arg is an empty sequence".
    raise ValueError(
        "Layer '" + bounding_box_layer_name + "' was not found in layer_summaries; "
        "cannot derive the geographic bounding box."
    )

# float() turns the bounds into plain Python numbers before they are embedded
# in the metadata that is serialised to JSON.
southBoundLatitude = float(OCHA_adm0['min_lat'].min())
westBoundLongitude = float(OCHA_adm0['min_lon'].min())
northBoundLatitude = float(OCHA_adm0['max_lat'].max())
eastBoundLongitude = float(OCHA_adm0['max_lon'].max())

In [177]:
def _owner_party():
    """Build the responsible-party entry for the dataset owner.

    A new dict is returned on every call so the four places that use it
    (contact, cited responsible party, point of contact, distributor) never
    share one mutable object.
    """
    return {
        "organisationName": dataset_info['owner name'],
        "contactInfo": {
            "address": {
                # NOTE(review): the key spelling ("elctronic...") is kept exactly
                # as before; presumably it is what the catalog schema expects --
                # confirm before correcting it.
                "elctronicMailAddress": dataset_info['owner email']
            }
        },
        "role": "owner"
    }


# Main descriptive section of the geospatial metadata record. Values come from
# the "dataset_info" sheet and from the summaries computed in the cells above.
description = {
    "idno": dataset_id,
    "language": "ENG",
    "characterSet": {
        # Spelled like identificationInfo.characterSet further down (was "utf-8").
        "codeListValue": "utf8"
    },
    "hierarchyLevel": [
        "dataset"
    ],
    "contact": [
        _owner_party()
    ],
    "dateStamp": dataset_info['release date'],
    "metadataStandardName": "ISO 19115-1, ISO 19110, ISO/TS 19139",
    "dataSetURI": "http://microdatalibqa.worldbank.org/index.php",
    "spatialRepresentationInfo": [
        {
            "vectorSpatialRepresentation": {
                "topologyLevel": "geometryOnly",
                # A geometry type with no layers is reported as 0 instead of
                # raising KeyError.
                "geometricObjects": [
                    {
                        "geometricObjectType": geometry_type,
                        "geometricObjectCount": geometric_object_counts.get(geometry_type, 0)
                    }
                    for geometry_type in ('point', 'curve', 'surface')
                ]
            }
        }
    ],
    "referenceSystemInfo": reference_systems,
    "identificationInfo": [
        {
            "citation": {
                "title": dataset_info['dataset title'],
                "date": [
                    {
                        "date": dataset_info['creation date'],
                        "type": "creation"
                    },
                    {
                        "date": dataset_info['release date'],
                        "type": "released"
                    }
                ],
                "edition": "v.1",
                "editionDate": dataset_info['release date'],
                "identifier": {
                    "code": dataset_id
                },
                "citedResponsibleParty": [
                    _owner_party()
                ],
                "presentationForm": [
                    "mapDigital"
                ],
                "series": {
                    "name": "Geospatial Dataset for Country Partnership Framework"
                }
            },
            "abstract": dataset_info['abstract'],
            "purpose": dataset_info['purpose'],
            "pointOfContact": [
                _owner_party()
            ],
            "resourceMaintenance": [
                {
                    "maintenanceAndUpdateFrequency": "asNeeded"
                }
            ],
            # NOTE(review): these file names differ from the overview images that
            # are uploaded as resources near the end of the notebook (for example
            # 'vector_point_layers_DRC.jpg') -- confirm which set is correct.
            "graphicOverview": [
                {
                    "fileName": "Point_layers_DRC.jpg",
                    "fileDescription": "Point layers (DRC) overview",
                    "fileType": "image/jpeg"
                },
                {
                    "fileName": "MultiLineString_layers.jpg",
                    "fileDescription": "MultiLineString layers overview",
                    "fileType": "image/jpeg"
                },
                {
                    "fileName": "MultiPolygon_layers_DRC.jpg",
                    "fileDescription": "MultiPolygon layers (DRC) overview",
                    "fileType": "image/jpeg"
                },
                {
                    "fileName": "Point_layers_WLD.jpg",
                    "fileDescription": "Point layers (WLD) overview",
                    "fileType": "image/jpeg"
                },
                {
                    "fileName": "MultiPolygon_layers_WLD.jpg",
                    "fileDescription": "MultiPolygon layers (WLD) overview",
                    "fileType": "image/jpeg"
                }
            ],
            "resourceFormat": [
                {
                    "name": "OpenFileGDB",
                    "specification": "ESRI - GeoDatabase"
                }
            ],
            "descriptiveKeywords": [
                {
                    "type": "theme",
                    "keyword": "Inclusive Growth (131)",
                    "thesaurusName": "World Bank Theme Taxonomy and Definitions",
                },
                {
                    "type": "place",
                    "keyword": dataset_info['country name'],
                    "thesaurusName": "ISO 3166-1"
                }
            ],
            "resourceConstraints": [
                {
                    "legalConstraints": {
                        "useLimitation": [
                            "unrestricted"
                        ],
                        "accessConstraints": [
                            "unrestricted"
                        ],
                        "useConstraints": [
                            "unrestricted"
                        ]
                    }
                }
            ],
            "extent": {
                "geographicElement": [
                    {
                        "geographicBoundingBox": {
                            "southBoundLatitude": southBoundLatitude,
                            "westBoundLongitude": westBoundLongitude,
                            "northBoundLatitude": northBoundLatitude,
                            "eastBoundLongitude": eastBoundLongitude
                        },
                        "geographicDescription": dataset_info['country name']
                    }
                ],
            },
            "spatialRepresentationType": "vector",
            "language": [
                "English"
            ],
            "characterSet": [
                {
                    "codeListValue": "utf8"
                }
            ],
            # "society" was listed twice; one entry is enough.
            "topicCategory": [
                "society"
            ],
            "supplementalInformation": "",
        }
    ],
    "distributionInfo": {
        "distributionFormat": [
            {
                "name": "OpenFileGDB",
                "specification": "ESRI - GeoDatabase"
            }
        ],
        "distributor": [
            _owner_party()
        ]
    }
}

# 4. Data quality info <a name="data_quality_info"></a>

In [178]:
# No data quality / lineage information is recorded for this dataset yet, so
# the section is an empty list.
#
# When it is filled in, each entry is a dict with a "scope" and a "lineage";
# the lineage has a "statement" and a list of "processStep" items, and each
# step has "description", "rationale", "dateTime", a list of "processor"
# parties and a list of "source" entries (a "description" plus a
# "sourceCitation").
dataQualityInfo = []

# 5. Feature catalogue <a name="feature_catalogue"></a>

In [191]:
# Field types that map to a numeric XML Schema type; every other field type is
# described as text.
xs_type_by_field_type = {
    'Integer': "xs:int",
    'Double': "xs:decimal",
    'Geometry': "xs:decimal",
}


def _listed_values(field_summary, vocabulary_in_field):
    """Return the 'listedValue' entries for one field, or None if it has none.

    Controlled-vocabulary rows take precedence. Otherwise, a field with at most
    10 distinct values lists those values with empty definitions.
    """
    if len(vocabulary_in_field) > 0:
        return [
            {
                "label": value_info['value_name'],
                "definition": value_info['definition']
            }
            for _, value_info in vocabulary_in_field.iterrows()
        ]

    num_unique_values = field_summary['num_unique_values']
    unique_values_repr = field_summary['first_10_unique_values']
    # Blank cells were replaced by '' on load: skip them instead of comparing
    # '' with a number or evaluating an empty string.
    if num_unique_values == '' or unique_values_repr == '':
        return None
    if num_unique_values <= 10:
        # NOTE(review): eval() executes whatever text the workbook holds in this
        # cell. Acceptable only while the workbook is produced by trusted code;
        # consider ast.literal_eval once it is confirmed that every stored
        # value is a plain literal.
        return [
            {
                "label": str(value),
                "definition": ""
            }
            for value in eval(unique_values_repr)
        ]
    return None


def _field_characteristics(field_summary):
    """Describe one field (a row of field_summaries) as a feature attribute."""
    characteristics = {
        'memberName': field_summary['field_name'],
        'definition': field_summary['definition'],
        'cardinality': {"lower": 1, "upper": 1},
        'valueMeasurementUnit': "NA",
        'valueType': xs_type_by_field_type.get(field_summary['type'], "xs:string"),
    }
    vocabulary_in_field = controlled_vocabulary[
        (controlled_vocabulary['layer_name'] == field_summary['layer_name'])
        & (controlled_vocabulary['field_name'] == field_summary['field_name'])
    ]
    listed_values = _listed_values(field_summary, vocabulary_in_field)
    if listed_values is not None:
        characteristics['listedValue'] = listed_values
    return characteristics


# One feature type per feature class / table. The filtered frame was previously
# computed but never used, so every layer -- whatever its data type -- ended up
# in the catalogue.
feature_type_list = []
feature_layer_summaries = layer_summaries[layer_summaries['data_type'].isin(['FeatureClass', 'Table'])]
for _, layer_summary in tqdm(feature_layer_summaries.iterrows(), total=len(feature_layer_summaries)):
    layer_name = layer_summary['layer_name']
    field_summaries_in_layer = field_summaries[field_summaries['layer_name'] == layer_name]
    characteristics_list = [
        _field_characteristics(field_summary)
        for _, field_summary in field_summaries_in_layer.iterrows()
    ]
    feature_type_list.append({
        'typeName': layer_name,
        'definition': layer_summary['description'],
        'isAbstract': False,
        'carrierOfCharacteristics': characteristics_list,
    })



  0%|          | 0/117 [00:00<?, ?it/s]

In [192]:
# Wrap the per-layer feature types in a named catalogue.
# The name previously read "DRC DPF"; the dataset is the DRC CPF (Country
# Partnership Framework) dataset.
feature_catalogue = {
    "name": "Geospatial dataset for DRC CPF - Feature Catalogue",
    "featureType": feature_type_list
}

In [193]:
# Attach the data quality section and the feature catalogue to the description.
description.update({
    'dataQualityInfo': dataQualityInfo,
    'feature_catalogue': feature_catalogue,
})

# 6. Test upload to NADA catalog <a name="upload"></a>

In [194]:
# Assemble the complete metadata record and keep a local JSON copy of it.
metadata = {
    'metadata_information': metadata_information,
    'description': description,
    'provenance': provenance,
    'tags': tags,
    'lda_topics': lda_topics,
    'embeddings': embeddings,
    'additional': additional
}

# Create the output folder on first run so the write below cannot fail with
# FileNotFoundError.
os.makedirs(RESULT_DIR, exist_ok=True)
metadata_path = os.path.join(RESULT_DIR, 'DRC_CPF_metadata.json')
# An explicit encoding keeps the file independent of the platform default.
with open(metadata_path, 'w', encoding='utf-8') as fp:
    json.dump(metadata, fp, indent=4)

In [200]:
# Point pynada at the catalog. The header-less CSV is read from the parent
# folder; the row at position 4 supplies the API URL (first column) and the
# API key (second column).
api_info_row = 4
api_info = pd.read_csv('../API_info.csv', header=None)
api_url, api_key = api_info.iloc[api_info_row, 0], api_info.iloc[api_info_row, 1]
nada.set_api_url(api_url)
nada.set_api_key(api_key)

In [201]:
# Send the assembled metadata to the catalog and print what it returns.
creation_arguments = {
    'dataset_id': dataset_id,
    'repository_id': repository_id,
    'published': published,
    'overwrite': overwrite,
    'metadata_information': metadata_information,
    'description': description,
    'provenance': provenance,
    'tags': tags,
    'lda_topics': lda_topics,
    'embeddings': embeddings,
    'additional': additional,
}
response = nada.create_geospatial_dataset(**creation_arguments)
print(response)

Geospatial dataset successfully added to the catalog.
                                                                  0
id                                                            14572
repositoryid                                                central
type                                                     geospatial
idno                                         DRC_2022_CPF_GEO_v01_M
title             Geospatial Dataset for Country Partnership Fra...
year_start                                                     2022
year_end                                                       2022
nation                                             Congo, Dem. Rep.
authoring_entity                                                   
published                                                         1
created                                                  1650402718
changed                                                  1650402718
varcount                                                      

In [206]:
# Attach the zipped geodatabase itself as a downloadable resource.
# (The file name used to carry an `f` prefix although it has no placeholders.)
nada.add_resource(
    dataset_id=dataset_id,
    dctype="map",
    dcformat="application/zip",
    title='[Geodatabase] Geospatial dataset for CPF DRC, FY22-26',
    filename=DATA_DIR + 'drc_cpf_gdb.zip',
    overwrite='yes'
)

You provided a resource file. Processing...
Uploading the file...
File successfully uploaded.
Resource successfully added to the dataset.


In [202]:
# Link the two CPF reports as URL resources: (title, URL) pairs, added in order.
cpf_report_links = [
    (
        "[Report] Congo, Democratic Republic of - Country Partnership Framework for the Period FY22-26 (Vol. 2) : Executive Summary (English)",
        "https://documents.worldbank.org/en/publication/documents-reports/documentdetail/797881648491150291/executive-summary",
    ),
    (
        "[Report] Congo, Democratic Republic of - Country Partnership Framework for the Period FY22-26 (English)",
        "https://documents.worldbank.org/en/publication/documents-reports/documentdetail/214221646062568502/congo-democratic-republic-of-country-partnership-framework-for-the-period-fy22-26",
    ),
]
for report_title, report_url in cpf_report_links:
    nada.add_resource(
        dataset_id=dataset_id,
        dctype="doc/rep",
        dcformat="application/pdf",
        title=report_title,
        filename=report_url,
        overwrite='yes'
    )

You provided a resource URL. Processing...
Resource successfully added to the dataset.
You provided a resource URL. Processing...
Resource successfully added to the dataset.


In [203]:
# Upload every overview image from SUMMARY_DIR as a "[Visualization]" resource.
graphic_overview_files = [
    'raster_building_desnity_DRC.jpg',
    'raster_GSM_DRC.jpg',
    'vector_point_layers_DRC.jpg',
    'vector_polygon_layers_DRC.jpg',
    'vector_polyline_layers_DRC.jpg',
    'vector_point_layers_WLD.jpg',
    'vector_polygon_layers_WLD.jpg',
]
for image_name in graphic_overview_files:
    nada.add_resource(
        dataset_id=dataset_id,
        dctype="map",
        dcformat="image/jpeg",
        title='[Visualization] ' + image_name,
        filename=SUMMARY_DIR + image_name,
        overwrite='yes'
    )

You provided a resource file. Processing...
Uploading the file...
File successfully uploaded.
Resource successfully added to the dataset.
You provided a resource file. Processing...
Uploading the file...
File successfully uploaded.
Resource successfully added to the dataset.
You provided a resource file. Processing...
Uploading the file...
File successfully uploaded.
Resource successfully added to the dataset.
You provided a resource file. Processing...
Uploading the file...
File successfully uploaded.
Resource successfully added to the dataset.
You provided a resource file. Processing...
Uploading the file...
File successfully uploaded.
Resource successfully added to the dataset.
You provided a resource file. Processing...
Uploading the file...
File successfully uploaded.
Resource successfully added to the dataset.
You provided a resource file. Processing...
Uploading the file...
File successfully uploaded.
Resource successfully added to the dataset.


In [204]:
# Use the polyline overview image as the catalog thumbnail.
thumbnail_path = SUMMARY_DIR + "vector_polyline_layers_DRC.jpg"
nada.upload_thumbnail(dataset_id, thumbnail_path)

Uploading thumbnail...
Thumbnail successfully uploaded.
