# RDFification Notebook

In [2]:
# Import Libraries
from rdflib import Graph, URIRef, Literal, RDF, XSD, RDFS #basic RDF handling
from rdflib.namespace import Namespace #common namespace
import pandas as pd #for handling csv and csv contents
import unicodedata
import re
import fileinput
import csv


In [3]:
# Define graph 'g' and namespaces
sio = Namespace('http://semanticscience.org/resource/')
esgreen = Namespace('https://w3id.org/esgreen/')
obo = Namespace('http://purl.obolibrary.org/obo/')
#eol = Namespace('https://eol.org/pages/')
wiki = Namespace('http://en.wikipedia.org/wiki/')
g = Graph()
g.bind('sio', sio)
g.bind('esgreen', esgreen)

In [1]:
#Functions for URI's

def prepareUri(uri):
    return esgreen + str(uri).replace(' ', '_').replace('"', '').lower()

def prepareUriWiki(uri):
    return wiki + str(uri).replace(' ', '_').replace('"', '')

### Dataset 1: ArboladoParquesHistoricoSingularesForestales_YYYY



| Original variable name | New variable name | Description                                             | Type   | Use                | SIO Term | Other term |
| ---------------------- | ----------------- | ------------------------------------------------------- | ------ | ------------------ | --------- | --------- |
| PARQUE                 | park              | The unique ID name of the park on which tree is located | `string` | To locate the tree | [Site](https://vemonet.github.io/semanticscience/browse/class-siosite.html) |
| ESPECIE                | scientific_name   | Botanical name for the dominant specie                  | `string` | To group by taxon  | [MaterialEntity](https://vemonet.github.io/semanticscience/browse/class-siomaterialentity.html) | Specie |
| UNIDADES YEAR          | count             | Number of tree from same type                           | `int`    | To count/sum       | [MemberCount](https://vemonet.github.io/semanticscience/browse/class-siomembercount.html) | |


* **An example of expected triples using SIO:**


```rdf
######## Turtle syntax ########

:parque rdf:type sio:Site .
:parque sio:isLocatedIn Parque .
:parque sio:contains :collection-of-especie .

:collection-of-especie rdf:type sio:Collection .
:collection-of-especie sio:hasMember :especie .
:collection-of-especie sio:has-attribute :unidades .
:unidades a sio:memberCount .
:unidades sio:has-value "unidades_year" .
:unidades sio:has-unit obo:UO_0000189 .
:unidades sio.measuredAt, Literal(datatype=XSD.date)))

<!-- map UniqueIdentifier with WIKI and gbif database -->

:especie a :habitatSpecies.
:especie sio:UniqueIdentifier :ESPECIE-code . 
:especie RDFS.label :ESPECIE .
:especie RDFS.seeAlso :WIKI-ESPECIE .
```

**Data Preprocessing**

In [12]:
# harmonize name files
file_dates = [ '_2019', '_2020', '2017' ]

# for loop to harmonize variable names per date
for file_date in file_dates:
    # Read in the csv file
    df = pd.read_csv(f'data/inputs/preprocessing/ArboladoParquesHistoricoSingularesForestales{file_date}.csv', sep = ';')
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')] # remove unnamed cols
    print(df.head())
    if file_date == '2017':
        count_col = 'n_de_ejemplares'
        especie_col = 'especie'
        #print(f'if',count_col, especie_col, file_date)
    else:
        file_date = file_date.replace('_', '')
        count_col = 'unidades_' + file_date
        especie_col = 'especie'

                     parque                 especie  unidades_2019
0  jardines_del_buen_retiro  aesculus_hippocastanum         6008.0
1  jardines_del_buen_retiro      platanus_x_hybrida          925.0
2  jardines_del_buen_retiro     cercis_siliquastrum          724.0
3  jardines_del_buen_retiro   trachycarpus_fortunei          604.0
4  jardines_del_buen_retiro  cupressus_sempervirens          552.0
                     parque                 especie  unidades_2020
0  jardines_del_buen_retiro  aesculus_hippocastanum         5951.0
1  jardines_del_buen_retiro      platanus_x_hybrida          934.0
2  jardines_del_buen_retiro     cercis_siliquastrum          720.0
3  jardines_del_buen_retiro   trachycarpus_fortunei          532.0
4  jardines_del_buen_retiro        celtis_australis          541.0
                     parque                 especie  n_de_ejemplares
0  jardines_del_buen_retiro  aesculus_hippocastanum            6.377
1  jardines_del_buen_retiro      platanus_hispanica       

In [20]:
file_dates = [ '_2019', '_2020', '2017' ]

for file_date in file_dates:
    # Read in the csv file
    df = pd.read_csv(f'data/inputs/preprocessing/ArboladoParquesHistoricoSingularesForestales{file_date}.csv', sep = ';')
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')] # remove unnamed cols
    #print(df.head())
    if file_date == '2017':
        count_col = 'n_de_ejemplares'
        especie_col = 'especie'
        #print(f'if',count_col, especie_col, file_date)
    else:
        file_date = file_date.replace('_', '')
        count_col = 'unidades_' + file_date
        especie_col = 'especie'
 # Iterate dataframe and generate RDF triples
    for index, row in df.iterrows():
        #a= str((row[especie_col]).capitalize())
        park_uri = URIRef(prepareUri(row['parque']))
        specie_uri = URIRef(prepareUri(row[especie_col]))
        wiki_uri = URIRef(prepareUriWiki(str(row[especie_col]).capitalize()))
        #print(wiki_uri)
        # eol_uri = URIRef(prepareUriEOL(row[especie_col]))

        collection_uri = URIRef(prepareUri(f"collection-{row[especie_col]}-{row['parque']}"))
        count_uri = URIRef(prepareUri(f"count-{file_date}-{row[especie_col]}-{row['parque']}"))
        
        g.add((park_uri, RDF.type, sio.site))
        g.add((park_uri, RDFS.label, Literal(str(row['parque']).lower())))
        #g.add(COORDINATES)

        g.add((collection_uri, RDF.type, sio.Collection))
        g.add((collection_uri, sio.hasMember, specie_uri))
        g.add((specie_uri, RDF.type, obo.FLOPO_0900033))
        g.add((collection_uri, sio.hasAttribute, count_uri))

        g.add((count_uri, RDF.type, sio.MemberCount))
        g.add((count_uri, sio.hasValue, Literal(row[count_col], datatype=XSD.integer)))
        g.add((count_uri, sio.hasUnit, obo.UO_0000189))
        g.add((count_uri, sio.measuredAt, Literal(file_date, datatype=XSD.date)))

        g.add((specie_uri, RDF.type, sio.Specie))
        #g.add((wiki_uri, RDF.type, sio.Specie))
        g.add((specie_uri, RDFS.label, Literal(str(row[especie_col]).lower())))
        g.add((specie_uri, RDFS.seeAlso, wiki_uri))

In [18]:
outputfile = 'outputs/rdflib-output.ttl'
g.serialize(outputfile, format='turtle')
print(outputfile)

outputs/rdflib-output.ttl


## CHALLENGES AND TODO
* Data cleaning: remove latin character and others _(*&(&#))_, unnecessary rows as total and aggregate values. [see data-cleaning.py](data-cleaning.py)
* All entities uses SIO schema but **specie** is not clear yet.
* The issue will be to map each entity with global identifier within biodiversity database (e.g. wikidata API such https://www.wikidata.org/w/api.php?action=wbsearchentities&search=pinus&language=en or https://www.gbif.org/species/2684241). [see data-argumentation.py](data-argumentation.py)
* Inconsistency file and variable names and therefore harmonization of the entity names.
* **Data Argumentation with georeferencing parks and taxo, family and other related terms from scientificname.** [see unique-species.py](unique-species.py)

---

### Dataset 2: ArboladoZonasVerdesDistritosCalles_2020

| Original variable name | New variable name     | Description                                                  | Type   | Use                       |
| ---------------------- | --------------------- | ------------------------------------------------------------ | ------ | ------------------------- |
| Nombre_distrito        | district_name         | The unique name of the district on which tree is located     | ``string`` | To locate the tree        |
| Num_distrito           | district_name         | The unique ID number of the district on which tree is located | `string` | No use        |
| NOMBRE_ESPECIE         | scientific_name       | Botanical name for the dominant specie                       | `string` | To group by taxon         |
| UNIDADES YEAR          | count                 | Number of tree from same type                                | `int`    | To count/sum              |
| Total                  | subTotalCountDistrict | Total amount of tree in each district within a city          | `int`    | No use |

* Expected triples:
```rdf

######## Turtle syntax ########

:distrito rdf:type sio:SpatialRegion .
:distrito sio:isLocatedIn Nombre_distrito .
:distrito sio:contains :collection-of-especie .

:collection-of-especie rdf:type sio:Collection .
:collection-of-especie sio:hasMember :especie .
:collection-of-especie sio:has-attribute :unidades .
:unidades a sio:memberCount .
:unidades sio:has-value "unidades_year" .
:unidades sio:has-unit obo:UO_0000189 .
:unidades sio.measuredAt, Literal(datatype=XSD.date)))

:especie a :habitatSpecies.
:especie sio:UniqueIdentifier :ESPECIE-code .
:especie RDFS.label :ESPECIE .
:especie RDFS.seeAlso :WIKI-ESPECIE .

In [29]:

file_dates = [ '_2019', '_2020', '_2017' ]

for file_date in file_dates:
    # Read in the csv file
    df = pd.read_csv(f'data/inputs/preprocessing/ArboladoZonasVerdesDistritosCalles{file_date}.csv', sep = ';') 
    file_date = file_date.replace('_', '')
    print(file_date)
    if file_date == '2017':
        especie_col = 'especie'
        count_col = 'unidades'
        district_col = 'distrito'
    else:
        #file_date = file_date.replace('_', '')
        count_col = 'n_unidades'
        especie_col = 'nombre_especie'
        district_col = 'nombre_distrito'
        

    # Iterate dataframe and generate RDF triples
    for index, row in df.iterrows():

        district_uri = URIRef(prepareUri(row[district_col]))
        specie_uri = URIRef(prepareUri(row[especie_col]))
        wiki_uri = URIRef(prepareUriWiki(str(row[especie_col]).capitalize()))

        collection_uri = URIRef(prepareUri(f"collection-{row[especie_col]}-{row[district_col]}"))
        count_uri = URIRef(prepareUri(f"count-{file_date}-{row[especie_col]}-{row[district_col]}"))
        
        g.add((district_uri, RDF.type, sio.SpatialRegion))
        g.add((district_uri, RDFS.label, Literal(str(row[district_col]).lower())))

        g.add((collection_uri, RDF.type, sio.Collection))
        g.add((collection_uri, sio.hasMember, specie_uri))
        g.add((collection_uri, sio.hasAttribute, count_uri))

        g.add((count_uri, RDF.type, sio.MemberCount))
        g.add((count_uri, sio.hasValue, Literal(row[count_col], datatype=XSD.integer)))
        g.add((count_uri, sio.hasUnit, obo.UO_0000189))
        g.add((count_uri, sio.measuredAt, Literal(file_date, datatype=XSD.date)))

        g.add((specie_uri, RDF.type, sio.Specie))
        g.add((specie_uri, RDFS.label, Literal(str(row[especie_col]).lower())))
        g.add((specie_uri, RDFS.seeAlso, wiki_uri))


        # g = createAttr(g, 
        #     predType=sio['memberCount'], 
        #     subject=row[especie_col], 
        #     obj=row['PARQUE'], 
        #     extra={'memberCount': row['UNIDADES 2020']}
        # )



# print(g.serialize(format='turtle'))
outputfile = 'outputs/rdflib-output2.ttl'
g.serialize(outputfile, format='turtle')
print(f'finished....' + outputfile)
#print(g.serialize(format='turtle'))

2019
2020
2017
finished....outputs/rdflib-output2.ttl


## CHALLENGES
* Remove unnecessary rows as total, num district and aggregate values.
* 2017 has to be converted into csv file.
* All entities uses SIO schema but the issue will be to map each entity with global identifier within biodiversity database (e.g. wikidata API such https://www.wikidata.org/w/api.php?action=wbsearchentities&search=pinus&language=en or https://www.gbif.org/species/2684241).
* Still data has to be cleaned e.g. extra text in some rows, even if character latin has been removed previously.
* Inconsistency file and variable names.
* Georeferencing still has to happen.
* District a Spatial region or Site SIO class.

---
## EstadoParquesHistoricoSingularesForestales

    
  * `Estado_arbolado_ParquesHistoricoSingularesForestales_YYYY`
  
     | Original variable name           | New variable name | Description                                                  | Type   | Use                            |
      | -------------------------------- | ----------------- | ------------------------------------------------------------ | ------ | ------------------------------ |
      | PARQUE                           | park_name         | The unique name of the park on which tree is located         | ``string`` | To locate the tree             |
      | Altura Promedio (m)              | avgTreeHt         | Average height (m) of all trees in a Park. Calculated as distance from ground level to three top | `int`    | for growth curve or change     |
      | Perimetro Promedio (cm)          | avgTreePerim      | Average circumference of all trees in a Park. Diameter * Pi  | ``int``    | Phenology/allometric equations |
      | Recién Plantado y no consolidado | n_ageNew          | Number of trees which age is 1 to 5 years                    | ``int``    | Phenology/allometric equations |
      | Joven                            | n_ageJuvenile     | Num of trees in juvenile stage                               | `int`    |                                |
      | Maduro                           | n_ageAdult        | Num of trees Achieved max. Optimal development               | `int`    |                                |
      | Viejo                            | n_ageOld          | Num of trees deprecated age stage                            | `int`    |                                |
      | Otros                            | n_others          | Number of trees death and others                             | `int`    |                                |
      | Total General                    | subTotalCountPark | Total amount of trees in each park within a city             | `int`    | To count/agg per district      |
      | Total                            | totalCountPark    | Total amount of tree in all parks within a city              | `int`    | To count/agg the whole city    |

* Expected triples:


######## Turtle syntax ########
```rdf
:PARQUE rdf:type sio:Site .
:PARQUE sio:contains :collection-of-ESPECIE-STATUS .
:collection-of-ESPECIE-STATUS rdf:type sio:Collection .
:collection-of-trees sio:hasAttribute :age .
:age a :subClassOf sio:dimensional-quantity .
:age sio: has-label :lifeCycleInfo .
:lifeCycleInfo sio:has-label "Recien plantado y no consolidado" .
:Recien plantado y no consolidado a sio:memberCount .
:membercount sio:has-value "Recien plantado y no consolidado" .
:lifeCycleInfo sio:has-label "Recien plantado y no consolidado" .
:Recien plantado y no consolidado a sio:memberCount .
:membercount sio:has-value "Recien plantado y no consolidado" .
:lifeCycleInfo sio:has-label "Joven" .
:Joven a sio:memberCount .
:membercount sio:has-value "Joven" .  
:collection-of-trees sio:hasAttribute : Maduro .
:Maduro a sio:mean .
:Maduro sio: has-value "Maduro" .
:collection-of-trees sio:hasAttribute : Viejo .
:Viejo a sio:mean .
:Viejo sio: has-value "Viejo" .

In [32]:
obo = Namespace('http://purl.obolibratory.org/obo/')
#ppo = Namespace('http://purl.obolibratory.org/ppo/')

file_dates = ['2017', '2018', '2019', '2020']

for file_date in file_dates:
    # Read in the csv file
    df = pd.read_csv(f'data/inputs/preprocessing/EstadoParquesHistoricoSingularesForestales_{file_date}.csv', sep = ';')
    # file_date = file_date.replace('_', '')   
    # df = df.loc[:, ~df.columns.str.contains('^Unnamed')] # remove unnamed cols
    # df.columns = df.columns.str.replace(" ", "_")
    #print('before.....',df.columns)
    # cols = df.columns.to_list()
    #statuses = ['Recien-plantado-y-no-consolidado','Joven', 'Maduro', 'Viejo', 'Otros']
    if file_date == '2017':
        # df.columns = ['parque', 'total_arboles', 'altura_media', 'perimetro_medio_(cm)', 'recien_plantado_y_no_consolidado', 'joven', 'maduro', 'viejo', 'otros']
        #for col in df.columns:
        df.rename(columns={'recien_plantado':'recien_plantado_y_no_consolidado', 'altura_media':'altura_promedio_(m)', 'perimetro_medio': 'perimetro_promedio_(cm)'}, inplace=True)
        stats_col = ['altura_promedio_(m)','perimetro_promedio_(cm)']
        statuses = ['recien_plantado_y_no_consolidado','joven', 'maduro', 'viejo', 'otros']
        #print(statuses)
       # stats_col = 'Altura_media'
    #print(file_date, df)
    else:
        stats_col = ['altura_promedio_(m)','perimetro_promedio_(cm)']
        statuses = ['recien_plantado_y_no_consolidado','joven', 'maduro', 'viejo', 'otros']
        print(file_date, df.columns)
    print(f'after',df.columns)
#     # print(cols)
#     # for (idx, row) in df.iterrows():
#     #     cols = df.columns
#     #     #cols = cols.replace('�', '')
#     #     #print(cols)
    
    ### Iterate dataframe and generate RDF triples
    for index, row in df.iterrows():
        park_uri = URIRef(prepareUri(row['parque']))

        status_uris = {}
        # for age_status in statuses:
        #     status_uris[age_status] = URIRef(prepareUri(age_status))
        print(status_uris)
        collection_uri = URIRef(prepareUri(f"collection-of-trees-LocatedIn-{row['parque']}"))
        stats_uri = URIRef(prepareUri(f"measures-{file_date}-{row['parque']}"))

        
        g.add((park_uri, RDF.type, sio.site))
        g.add((park_uri, RDFS.label, Literal(str(row['parque']).lower())))
        g.add((collection_uri, RDF.type, sio.Collection))
        print(statuses)
        for age_status in statuses:
            # age_status_uri = URIRef(prepareUri(age_status))
            age_status_uri = URIRef(str(collection_uri) + '-life-Status-' + age_status.lower())
            print(age_status_uri)
            g.add((age_status_uri, RDF.type, sio.LifeStatus))
            #g.add((sio.LifeStatus, RDF.type, ppo.LifeStatus))
            g.add((age_status_uri, sio.hasQuality, Literal(age_status)))
            g.add((age_status_uri, RDF.type, sio.MemberCount))
            g.add((age_status_uri, sio.hasValue, Literal(row[age_status], datatype=XSD.integer)))
            g.add((age_status_uri, sio.hasUnit, obo.UO_0000189))
            g.add((age_status_uri, sio.measuredAt, Literal(file_date, datatype=XSD.date)))


        for statistics in stats_col:
            #print('toto')
            # age_status_uri = URIRef(prepareUri(age_status))
            statistics_uri = URIRef(str(stats_uri) + '-SpatialQuantity-' + statistics.lower())
            print(statistics_uri)
            g.add((statistics_uri, RDF.type, sio.DimensionalQuantity))
            g.add((statistics_uri, sio.hasQuality, Literal(age_status)))
            g.add((statistics_uri, sio.hasValue, Literal(row[age_status], datatype=XSD.float)))
            if stats_col == 'altura_promedio_(m)':
                g.add((statistics_uri, sio.hasUnit, obo.UO_0000008)) #mts
            else:
                g.add((statistics_uri, sio.hasUnit, obo.UO_0000007)) #cmts
            g.add((statistics_uri, sio.measuredAt, Literal(file_date, datatype=XSD.date)))
        

        # g.add((status_uri, RDF.type, sio.LifeStatus))
        # g.add((status_uri, RDFS.label, Literal(str(row[age_status]).lower())))



# print(g.serialize(format='turtle'))
#g.serialize('outputs/rdflib-output3.ttl', format='turtle')
outputfile = 'outputs/rdflib-output3.ttl'
g.serialize(outputfile, format='turtle')
print(f'finished....' + outputfile)


after Index(['parque', 'total_arboles', 'altura_promedio_(m)',
       'perimetro_promedio_(cm)', 'recien_plantado_y_no_consolidado', 'joven',
       'maduro', 'viejo', 'otros', 'Unnamed: 9', 'Unnamed: 10'],
      dtype='object')
{}
['recien_plantado_y_no_consolidado', 'joven', 'maduro', 'viejo', 'otros']
https://w3id.org/esgreen/collection-of-trees-locatedin-jardines_del_buen_retiro-life-Status-recien_plantado_y_no_consolidado
https://w3id.org/esgreen/collection-of-trees-locatedin-jardines_del_buen_retiro-life-Status-joven
https://w3id.org/esgreen/collection-of-trees-locatedin-jardines_del_buen_retiro-life-Status-maduro
https://w3id.org/esgreen/collection-of-trees-locatedin-jardines_del_buen_retiro-life-Status-viejo
https://w3id.org/esgreen/collection-of-trees-locatedin-jardines_del_buen_retiro-life-Status-otros
https://w3id.org/esgreen/measures-2017-jardines_del_buen_retiro-SpatialQuantity-altura_promedio_(m)
https://w3id.org/esgreen/measures-2017-jardines_del_buen_retiro-SpatialQuant

## CHALLENGES
* Remove unnecessary rows as total, num district and aggregate values.
* Missing measurements for 2017 - it has only counts.
* LifeCycle status properties can be described with LifeStatus but is binomial (either dead/alive): https://vemonet.github.io/semanticscience/browse/class-siolifestatus.html. **perhaps status descriptor**?
* Still data has to be cleaned e.g. extra text in some rows, even if character latin has been removed previously.
* Inconsistency file and variable names.
* Georeferencing still has to happen.
* Otros has to be defined or removed.
* SpatialQuantity namespace is to valid.

---
## EstadoZonasVerdesDistritosCalles
  
  * `EstadoZonasVerdesDistritosCalles_YYYY`
  
     | Original variable name                   | New variable name     | Description                                                  | Type   | Use                            |
      | ---------------------------------------- | --------------------- | ------------------------------------------------------------ | ------ | ------------------------------ |
      | NOMBRE DISTRITO                          | area_name             | Name of the area/district on which tree is located           | `string` | To locate the park             |
      | Num_DISTRITO                             | area_code             | The unique ID name of the park on which tree is located      | `int`    |                                |
      | Recién Plantado y no consolidado (RPyNC) | n_ageNew              | Number of trees which age is 1 to 5 years                    | `int`    | Phenology/allometric equations |
      | Altura Media (Hmedia)_RRLyNC             | avgTreeHt_New         |                                                              |        |                                |
      | Joven (J)                                | n_ageJuvenile         | Num of trees in juvenile stage                               | `int`    |                                |
      | Hmedia_J                                 | avgTreeHt_Juvenile    | Average height of all J trees in a Park. Calculated as distance from ground level to three top (m) | `int`    | for growth curve or change     |
      | Maduro (M)                               | n_ageAdult            | Num of trees Achieved max. Optimal development               | `int`    |                                |
      | Hmedia_M                                 | avgTreeHt_Adult       | Average height of all M trees in a Park. Calculated as distance from ground level to three top | `int`    | for growth curve or change     |
      | Viejo (V)                                | n_ageOld              | Num of trees deprecated age stage                            | `int`    |                                |
      | HMedia_V                                 | avgTreeHt_Juvenile    | Average height of all J trees in a Park. Calculated as distance from ground level to three top | `int`    | for growth curve or change     |
      | Otros                                    | n_others              | Number of trees death and others                             | `int`    |                                |
      | Hmedia_O                                 | avgTreeHt_Others      | Average height of all O trees in a Park. Calculated as distance from ground level to three top | `int`    | for growth curve or change     |
      | Total General                            | subTotalCountDistrict | Total amount of trees in each district within a city         | `int`    | To count/agg per district      |
      
      ```ttl
      # EstadoZonasVerdesDistritosCalles_YYYY.csv
      ######## Turtle syntax ########
      
      :NOMBRE_DISTRITO rdf:type :District .
      District sio: similarTo :SpatialRegion .
      :barajas sio:contains :collection-of-trees .
      :collection-of-trees a sio:Collection .
      :collection-of-trees sio:hasAttribute :age .
      :age a :subClassOf sio:dimensional-quantity.
      :age sio: has-label :lifeCycleInfo .
      :lifeCycleInfo sio:has-value "New" .
      :lifeCycleInfo a sio:memberCount .
      :cph_count sio:has-value "1039" .
      :collection-of-trees sio:hasAttribute : averageHeight-barajas .  
      :averageHeight-barajas a sio:mean .
      :averageHeight-barajas sio: has-value "4.19"
      ```