In [20]:
import rdflib
import pandas as pd
import numpy as np

g=rdflib.Graph()

# all issued and modified are actually from dcterms and only of date
# dcterms:issued "1999-07-02"^^<http://www.w3.org/2001/XMLSchema#date> ;
# dcterms:modified "2012-06-14"^^<http://www.w3.org/2001/XMLSchema#date> ;

def reformat(predicate):
    """
    # replace http://purl.org/dc/elements/1.1/type with dc:type ...first!
    # "dc:description", "dc:source", "dc:title", "dc:type", "dc:publisher",

    # replace http://purl.org/dc/elements/1.1/ rest with dcterms:
    # replace http://purl.org/dc/terms/ terms of course with dcterms:

    # replace http://www.w3.org/1999/02/22-rdf-syntax-ns# with rdf:
    # replace http://www.w3.org/2004/02/skos/core# with skos:

    # filter only for http://vocab.smart-project.info/papawai/term/ and remove
    # http://www.w3.org/2004/02/skos/core#inCollection skos:inCollection
    """
    if "http://purl.org/dc/elements/1.1/type" in predicate:
        return "dc:type"
    elif "http://purl.org/dc/elements/1.1/description" in predicate:
        return "dc:description"
    elif "http://purl.org/dc/elements/1.1/source" in predicate:
        return "dc:source"
    elif "http://purl.org/dc/elements/1.1/title" in predicate:
        return "dc:title"
    elif "http://purl.org/dc/elements/1.1/publisher" in predicate:
        return "dc:publisher"
    elif "http://purl.org/dc/elements/1.1/" in predicate:
        return predicate.replace("http://purl.org/dc/elements/1.1/", "dcterms:")
    elif "http://purl.org/dc/terms/" in predicate:
        return predicate.replace("http://purl.org/dc/terms/", "dcterms:")
    elif "http://www.w3.org/1999/02/22-rdf-syntax-ns#" in predicate:
        return predicate.replace("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "rdf:")
    elif "http://www.w3.org/2004/02/skos/core#" in predicate:
        return predicate.replace("http://www.w3.org/2004/02/skos/core#", "skos:")

def generate(graph, col_id, hier):
    for s,p,o in graph:
        term_short = "http://vocab.smart-project.info/{}/{}/".format(col_id, hier)
        if term_short in s:
            newPredicate = reformat(p)
            if "skos:inCollection" not in newPredicate and "rdf:type" not in newPredicate:
                term_id = s.replace(term_short, "")
                # print(term_id,newPredicate,o)
                yield {'term': term_id, 'fields': newPredicate, 'data': o}

# g.parse('http://vocab.smart-project.info/spq-papawai/data', format="n3")
# collection = "papawai"
# hierarchy = "term"

# g.parse('http://vocab.smart-project.info/spq-ngmp/data', format="n3")
# collection = "ngmp"
# hierarchy = "phenomenon"

# g.parse('http://vocab.smart-project.info/spq-glossary/data', format="n3")
# collection = "glossary"
# hierarchy = "term"

g.parse('http://vocab.smart-project.info/spq-awahou/data', format="n3")
collection = "awahou"
hierarchy = "term"

rows = list(generate(g, collection, hierarchy))
df = pd.DataFrame(rows)
df.head(20)

Unnamed: 0,data,fields,term
0,GNS SR 2016/13,dcterms:bibliographicCitation,30
1,2016,dcterms:available,50
2,AL,dc:publisher,47
3,Tephra,skos:label,73
4,2016,dcterms:available,5
5,The portion of stream flow that is not runoff ...,dc:description,7
6,AL,dc:publisher,56
7,Vapour phase alteration,dc:title,64
8,An accumulation of groundwater that is above t...,dc:description,55
9,Perched,skos:prefLabel,55


In [21]:
df['fields'].unique()

array(['dcterms:bibliographicCitation', 'dcterms:available',
       'dc:publisher', 'skos:label', 'dc:description', 'dc:title',
       'skos:prefLabel', 'skos:definition', 'dcterms:modified'],
      dtype=object)

In [22]:
from rdflib import Literal


max_field_list = []
    
def new_field_name(row):
    dat = row['data']
    field = row['fields']
    if isinstance(dat, Literal):
            if dat.language is not None:
                field = field + "@" + dat.language
    max_field_list.append(field)
    return field

df['fields'] = df.apply(new_field_name, axis=1)

df.head()

Unnamed: 0,data,fields,term
0,GNS SR 2016/13,dcterms:bibliographicCitation,30
1,2016,dcterms:available,50
2,AL,dc:publisher,47
3,Tephra,skos:label,73
4,2016,dcterms:available,5


In [23]:
grouped = df.groupby('term')

# elem = grouped.get_group('12')
# len(elem['fields'].tolist())
# display(elem)

df_list = []
uniq_set = set(max_field_list)

for name, group in grouped:
    term_id = name
    dc_fields = group['fields'].tolist()
    dc_set = set(dc_fields)
    missing_cols = uniq_set.difference(dc_set)
    data_list = group['data'].tolist()
    data_tuple = [(x) for x in data_list]
    # display(data_tuple)
    this_df_data = [data_tuple] 
    this_df = pd.DataFrame.from_records(this_df_data, columns=dc_fields)
    this_df['term_id'] = int(name)
    for label in missing_cols:
        this_df[label] = np.nan
    this_df.reindex(index=this_df['term_id'])
    df_list.append(this_df)
    # with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    #     display(this_df.head())

display(len(df_list))

cols = list(uniq_set)
cols.append('term_id')
start_df = pd.DataFrame.from_records([], columns=cols)
display('start shape: ' + str(start_df.shape))
display(start_df)

for gdf in df_list:
    start_df = pd.concat([start_df, gdf], join='outer', axis=0, ignore_index=True)
    # display(gdf.head())
    # start_df.append(gdf)

# full_df = pd.concat(df_list, axis=1 join='inner')
full_df = start_df.copy()

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        display(full_df.head(50))

82

'start shape: (0, 10)'

Unnamed: 0,dcterms:bibliographicCitation,dcterms:available,dc:publisher,dcterms:modified,dc:description,skos:prefLabel@en,dc:title,skos:label,skos:definition@en,term_id


Unnamed: 0,dc:description,dc:publisher,dc:title,dcterms:available,dcterms:bibliographicCitation,dcterms:modified,skos:definition@en,skos:label,skos:prefLabel@en,term_id
0,Material that is eroded or is reworked and dep...,AL,Alluvial sediment,2016,GNS SR 2016/13,2016-09-05T05:08:26.478Z,Material that is eroded or is reworked and dep...,Alluvial sediment,Alluvial sediment,1
1,The most probable outcome based on a set of fa...,AL,Best-estimate,2016,GNS SR 2016/13,2016-09-05T05:08:26.658Z,The most probable outcome based on a set of fa...,Best-estimate,Best-estimate,10
2,"An area of land where surface water from rain,...",AL,Catchment,2016,GNS SR 2016/13,2016-09-05T05:08:26.659Z,"An area of land where surface water from rain,...",Catchment,Catchment,11
3,A rock composed of broken fragments of mineral...,AL,Breccia,2016,GNS SR 2016/13,2016-09-05T05:08:26.663Z,A rock composed of broken fragments of mineral...,Breccia,Breccia,12
4,"Organic compounds that contains carbon, chlori...",AL,Chlorofluorocarbons (CFCs),2016,GNS SR 2016/13,2016-09-05T05:08:26.664Z,"Organic compounds that contains carbon, chlori...",Chlorofluorocarbons (CFCs),Chlorofluorocarbons (CFCs),13
5,A large crater at the top of a volcano formed ...,AL,Caldera Basin,2016,GNS SR 2016/13,2016-09-05T05:08:26.664Z,A large crater at the top of a volcano formed ...,Caldera Basin,Caldera Basin,14
6,The lowest quantity of a substance that can be...,AL,Detection limit,2016,GNS SR 2016/13,2016-09-05T05:08:26.670Z,The lowest quantity of a substance that can be...,Detection limit,Detection limit,15
7,"An area of land where surface water from rain,...",AL,Catchment,2016,GNS SR 2016/13,2016-09-05T05:08:26.671Z,"An area of land where surface water from rain,...",Catchment,Catchment,16
8,Oxygen that is dissolved in water. It is an im...,AL,Dissolved Oxygen (DO),2016,GNS SR 2016/13,2016-09-05T05:08:26.673Z,Oxygen that is dissolved in water. It is an im...,Dissolved Oxygen (DO),Dissolved Oxygen (DO),17
9,"Organic compounds that contains carbon, chlori...",AL,Chlorofluorocarbons (CFCs),2016,GNS SR 2016/13,2016-09-05T05:08:26.674Z,"Organic compounds that contains carbon, chlori...",Chlorofluorocarbons (CFCs),Chlorofluorocarbons (CFCs),18


In [24]:
fields = []
for t in cols:
    if "dc:" in t:
        ln = t.replace("dc:", "http://purl.org/dc/elements/1.1/")
        ln = ln.replace("@en", "")
        ln = ln.replace("@mi", "")
        fields.append({'element': t, 'description': ln})
    elif "dcterms:" in t:
        ln = t.replace("dcterms:", "http://purl.org/dc/terms/")
        ln = ln.replace("@en", "")
        ln = ln.replace("@mi", "")
        fields.append({'element': t, 'description': ln})
    elif "skos:" in t:
        ln = t.replace("skos:", "http://www.w3.org/2004/02/skos/core#")
        ln = ln.replace("@en", "")
        ln = ln.replace("@mi", "")
        fields.append({'element': t, 'description': ln})
        
df2 = pd.DataFrame.from_dict(fields)
df2.head()

Unnamed: 0,description,element
0,http://purl.org/dc/terms/bibliographicCitation,dcterms:bibliographicCitation
1,http://purl.org/dc/terms/available,dcterms:available
2,http://purl.org/dc/elements/1.1/publisher,dc:publisher
3,http://purl.org/dc/terms/modified,dcterms:modified
4,http://purl.org/dc/elements/1.1/description,dc:description


In [25]:
writer = pd.ExcelWriter(collection + '.xlsx')
full_df.to_excel(writer,'Terms')
df2.to_excel(writer,'TermsMeaning')
writer.save()
writer.close()