# Metadata fra Geodata-info.dk
Denne notebook er tænkt til at hente data fra [geodata-info.dk](https://www.geodata-info.dk) (GDI), så egne geodata kan beriges ud fra den nøgle (Identifikator), GDI tildeler deres datasæt. Det kræver således, at man selv har tagget sine geodata med denne nøgle, og at disse kan udstilles, så data kan beriges med metadata. Vi arbejder med [Mapcentias GeoCloud2](http://www.mapcentia.com/dk/produkt/), som bruger [PostgreSQL](https://www.postgresql.org/), så derfor vil denne notebook være rettet mod at snakke med denne platform.
    
### Om Geodata-info.dk
Geodata-info.dk er den danske geoportal, der gør det muligt for professionelle brugere samt borgere med interesse for geodata at søge efter geodatasæt og geodatatjenester. Geodata-info.dk omfatter desuden den danske søgetjeneste i henhold til INSPIRE-direktivet (Forordning nr. 976/2009 for så vidt angår nettjenesterne).

In [None]:
import requests
import xml.etree.ElementTree as ET
import json
import sys
import pandas as pd
sys.path.append('/python/')
import connections as con

In [None]:
class Geometadata:
    """
    Fetch metadata about datasets from geodata-info.dk.

    A record is downloaded as XML and individual fields are read from it
    with namespace-prefixed ElementTree paths (``gmd:`` / ``gco:``).
    """

    # Prefix -> namespace URI map used to resolve the ``*_path`` attributes.
    NAMESPACES = {
        'gmd': 'http://www.isotc211.org/2005/gmd',
        'gco': 'http://www.isotc211.org/2005/gco',
    }

    def __init__(self):
        # ElementTree paths, relative to the root element of a record.
        self.fileIdentifier_path = "gmd:fileIdentifier/gco:CharacterString"
        self.title_path = "gmd:identificationInfo/gmd:MD_DataIdentification/gmd:citation/gmd:CI_Citation/gmd:title/gco:CharacterString"
        self.abstract_path = "gmd:identificationInfo/gmd:MD_DataIdentification/gmd:abstract/gco:CharacterString"
        self.organisationName_path = "gmd:contact/gmd:CI_ResponsibleParty/gmd:organisationName/gco:CharacterString"

    def get_xml(self, gid, timeout=30):
        """
        Download the XML record for ``gid`` and return its root element.

        gid: record identifier inserted into the API URL.
        timeout: seconds to wait for the server before giving up.

        Raises ``requests.HTTPError`` on an error status, so an error page
        is never parsed as if it were a metadata record.
        """
        url = f"https://geodata-info.dk/srv/api/records/{gid}/formatters/xml"
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()

        # Parse the raw bytes so the XML parser applies the encoding declared
        # in the document instead of the charset requests guessed for r.text.
        return ET.fromstring(r.content)

    def get_property(self, xml, xml_path):
        """
        Return the text of the first element matching ``xml_path``.

        xml: element to search from (normally the record root).
        xml_path: ElementTree path using the ``gmd:`` / ``gco:`` prefixes.

        Returns None when no element matches, so one incomplete record does
        not abort the processing of a whole batch.
        """
        node = xml.find(xml_path, self.NAMESPACES)

        return None if node is None else node.text

    def get_meta_data(self, gid, properties=('fileIdentifier', 'title', 'abstract', 'organisationName')):
        """
        Fetch the record ``gid`` and return the requested fields as a dict.

        properties: iterable of field names, matched case-insensitively
        against fileIdentifier, title, abstract and organisationName.
        Names that are not recognised are ignored.

        The returned dict uses the canonical spelling of each name as key,
        in the order the names were requested. A value is None when the
        record has no such element.
        """
        root = self.get_xml(gid)

        # Lower-cased name -> (key in the result, path to read). The paths
        # are read from the instance here so changed attributes are honoured.
        known = {
            'fileidentifier': ('fileIdentifier', self.fileIdentifier_path),
            'title': ('title', self.title_path),
            'abstract': ('abstract', self.abstract_path),
            'organisationname': ('organisationName', self.organisationName_path),
        }

        props = {}
        for prop in properties:
            entry = known.get(str(prop).lower())
            if entry is None:
                continue
            key, path = entry
            props[key] = self.get_property(root, path)

        return props

# Berig tabeller med metadata fra Geodata-info

In [None]:
# Shared Geometadata instance used by the cells below to fetch metadata.
meta = Geometadata()

In [None]:
# SQL that lists every "_key_" together with the id taken from its tag.
# - jsonb_array_elements_text(tags) yields one row per entry in `tags`.
# - Only values matching '_gdi:' followed by anything are kept. In LIKE the
#   leading '_' matches any single character.
# - RIGHT(value, length(value) - 5) drops the first five characters (the
#   length of '_gdi:'), leaving the remainder as `id`.
# NOTE(review): '%%' is presumably an escaped '%' for the database driver -
# confirm against connections.sql_to_dataframe.
query = """
    select "_key_", RIGHT(value, length(value) - 5) id
    from SETTINGS.GEOMETRY_COLUMNS_JOIN, jsonb_array_elements_text(tags)
    where value like '_gdi:%%'
"""

In [None]:
# Run the query through the project-local `connections` helper, which returns
# the result and an `engine` object.
# NOTE(review): 'production' presumably selects the database connection and
# `df` is presumably a pandas DataFrame - confirm in connections.py.
df, engine = con.sql_to_dataframe('production', query)

In [None]:
# Metadata fields to fetch for every row; also used as the new column names.
properties=['title', 'abstract', 'organisationName']

In [None]:
# For every identifier in df['id'], fetch the requested metadata and store the
# returned values in the columns named by `properties`. One HTTP request is
# made per row.
# NOTE(review): what each cell ends up holding (a single value or the whole
# collection of values) depends on how this pandas version builds a Series
# from dict.values() - verify before relying on the columns.
df[properties] = df['id'].apply(lambda x: pd.Series(meta.get_meta_data(x, properties).values()))

In [None]:
def _unpack_cell(cell, position):
    """Return item ``position`` of ``cell`` if it is a collection, else ``cell`` unchanged."""
    # Strings are iterable too: indexing one would keep a single character of
    # an already finished value. Non-iterables (None, NaN) are left alone.
    if isinstance(cell, (str, bytes)) or not hasattr(cell, '__iter__'):
        return cell
    return list(cell)[position]


# Reduce each metadata column to its own value. A cell may still hold the
# whole collection of fetched values (title, abstract, organisationName in
# that order); in that case the value at the column's position is picked.
# Cells that already hold a plain value are kept as they are, which also
# makes this cell safe to run more than once.
for position, column in enumerate(['title', 'abstract', 'organisationName']):
    df[column] = df[column].apply(_unpack_cell, args=(position,))

# To PostgreSQL

In [None]:
# Write the enriched table to the database. DataFrame.to_sql requires the
# target connection as its second argument; `engine` is the object returned
# together with `df` by con.sql_to_dataframe.
# NOTE(review): assumes `engine` is a SQLAlchemy engine/connection - confirm.
# With the default if_exists='fail' this raises if the table already exists.
df.to_sql('geodatainfo_meta', con=engine)

# Gammelt

In [None]:
def add_metadata(gid, properties, df):
    """
    Fetch metadata for ``gid`` and write every field into ``df`` as a column.

    The fetched dict is printed, then each key becomes a column of ``df``
    whose rows are all set to that field's value. Uses the module-level
    ``meta`` object; ``df`` is modified in place and nothing is returned.
    """
    fetched = meta.get_meta_data(gid, properties)
    print(fetched)

    for column, value in fetched.items():
        df[column] = value

In [None]:
# Scratch cell: download one hard-coded record as XML for manual inspection.
url = "https://geodata-info.dk/srv/api/records/fb9e06dc-9f02-42ac-bf1c-e0d662b81f4f/formatters/xml"
r = requests.request("GET", url)

In [None]:
# Parse the downloaded response body into an ElementTree root element.
root = ET.fromstring(r.text)

In [None]:
# Prefix -> namespace URI map needed to resolve 'gmd:' / 'gco:' in find().
namespaces = {
    'gmd': 'http://www.isotc211.org/2005/gmd',
    'gco': 'http://www.isotc211.org/2005/gco'
}

In [None]:
# Build the path to the abstract text from a reusable prefix.
base = "gmd:identificationInfo/gmd:MD_DataIdentification/"
abs_path = f"{base}gmd:abstract/gco:CharacterString"

In [None]:
# Show the assembled path as the cell output.
abs_path

In [None]:
# Look up the abstract element. The path built in this notebook is named
# `abs_path`; the previously used name `path` is never defined here.
# find() returns None when the record has no such element.
abstact = root.find(abs_path, namespaces)

In [None]:
# Show the text content of the element found above as the cell output.
abstact.text

# HENT DATA WFS

Tabelnavnet (typename) udtrækkes fra WFS-URL'erne i GC2's scheduler-jobs.

In [None]:
# Load the WFS list; the cells below read the URL of each entry from its
# 'url' column.
wfs = pd.read_csv('data/wfs.csv')

In [None]:
def typename(url):
    """
    Return the value of the ``typename`` query parameter in ``url``.

    The parameter name is matched case-insensitively. The value is returned
    exactly as written in the URL (it is not percent-decoded). Returns None
    when the parameter is absent or ``url`` is not a string (e.g. a missing
    value in a DataFrame column).
    """
    if not isinstance(url, str):
        return None

    # Parameters start after the first '?'. Without this, a typename placed
    # directly after the '?' would stay glued to the address and be missed.
    # A bare "a=1&typename=x" string without '?' is still accepted.
    _, has_query, query = url.partition('?')
    if not has_query:
        query = url

    for item in query.split('&'):
        # partition keeps everything after the first '=' and copes with a
        # key that has no '=' at all.
        key, _, value = item.partition('=')
        if key.lower() == 'typename':
            return value

    return None

In [None]:
# Extract the typename parameter from every URL into its own column.
wfs['typename'] = wfs['url'].apply(typename)

In [None]:
# Preview the first rows to check the extracted typenames.
wfs.head()

## FIND uuid for hver typename

In [None]:
def get_uuid(typename):
    """
    Search geodata-info.dk for ``typename`` and return the uuid of the hit.

    The uuid is read from ``metadata -> geonet:info -> uuid`` in the JSON
    response. When the response does not have that shape, the problem is
    printed and None is returned. Network errors and a 30 second timeout
    are raised to the caller.
    """
    url = "https://www.geodata-info.dk/srv/dan/q"

    querystring = {
        "_content_type": "json",
        "any": typename,
        "bucket": "s101",
        "facet.q": "",
        "fast": "index",
        "from": "1",
        "resultType": "details",
        "sortBy": "relevance",
        "to": "20",
    }

    headers = {
        'Cache-Control': "no-cache",
    }

    # The timeout keeps a stalled server from hanging a whole DataFrame.apply.
    response = requests.get(url, headers=headers, params=querystring, timeout=30)
    j = response.json()

    try:
        return j['metadata']['geonet:info']['uuid']
    except (KeyError, TypeError) as e:
        # KeyError: an expected key is missing. TypeError: a level is not a
        # dict (for example a list), so it cannot be indexed by name.
        print(e)
        return None

In [None]:
# Try the lookup on a single typename before applying it to the whole table.
get_uuid('dmp:AA_BES_LINJER')

In [None]:
# Look up a uuid for every typename; get_uuid makes one HTTP request per row.
wfs['uuid'] = wfs.typename.apply(get_uuid)

In [None]:
# Show only the rows that got a uuid. get_uuid returns the None object (not
# the string 'None') when nothing was found, so missing values must be
# filtered with notna(); comparing against 'None' keeps every row.
wfs[wfs.uuid.notna()]

In [None]:
def get_uuid(typename):
    """
    Search geodata-info.dk for ``typename`` and return the decoded JSON reply.

    Unlike the earlier definition of ``get_uuid`` in this notebook, which this
    one replaces, the complete response is returned instead of only the uuid.
    """
    endpoint = "https://www.geodata-info.dk/srv/dan/q"
    no_cache = {'Cache-Control': "no-cache"}

    # Search parameters, kept in the same order as they are sent.
    search = {}
    search["_content_type"] = "json"
    search["any"] = typename
    search["bucket"] = "s101"
    search["facet.q"] = ""
    search["fast"] = "index"
    search["from"] = "1"
    search["resultType"] = "details"
    search["sortBy"] = "relevance"
    search["to"] = "20"

    reply = requests.request("GET", endpoint, headers=no_cache, params=search)
    return reply.json()

In [None]:
# Inspect the complete search response for a single typename.
get_uuid('dmp:AA_BES_LINJER')