# **Inference from ontology - StaDiOS**

* Install the dependencies to be used in the analysis

In [None]:
# SPARQL Endpoint interface to Python
%pip install sparqlwrapper
from SPARQLWrapper import SPARQLWrapper, JSON

# Pandas depencies
import pandas as pd
from pandas import json_normalize

# Export dependencies and file modification
import json
#from google.colab import files
#from IPython.display import FileLink

# Graphical representations and interaction options
%pip install matplotlib
import matplotlib.pyplot as plt

# Calculation of coordinate distances
%pip install geopy
from geopy.distance import geodesic as gd
import re
import operator

*  What treatments/follow-up tests do X and Y manifestations of Z disease have in common?
* As the ontology grows, different manifestations may have commonalities that we had not noticed at first

In [2]:
sparql = SPARQLWrapper("http://localhost:3030/StaDiOS/sparql", agent="SparqlWrapper - StaDiOS analysis")

sparql.setQuery("""

PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX std: <http://www.semanticweb.org/storh/ontologies/2022/11/StaDiOS#>

SELECT ?disease ?development ?manifestation ?treatment
WHERE {
    ?disease std:hasDiseaseDevelopments ?development .
    ?development std:hasDevelopmentManifestations ?manifestation .
    ?manifestation std:hasManifestationTreatmentStrategy ?treatment .
} 

""")

# Ask for the result in JSON format
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
results = json_normalize(results["results"]["bindings"])

# Simplify column names and data types
simplified_table = results.filter(regex='value')
simplified_table = simplified_table.rename(columns = lambda col: col.replace(".value", ""))
simplified_table = simplified_table.apply(lambda row: row.replace({'http://www.semanticweb.org/storh/ontologies/2022/11/StaDiOS#':''}, regex=True))

grouped_dataframe  = simplified_table.groupby(['treatment'])
grouped_dataframe = grouped_dataframe.filter(lambda x: ~x['treatment'].str.contains('Default').any())
grouped_dataframe = grouped_dataframe.sort_values('treatment', ascending=False)
grouped_dataframe.reset_index(drop=True)

Unnamed: 0,disease,development,manifestation,treatment
0,PBD_ProfoundBiotinidaseDeficiency,PBD_NaturalDevelopment,PBD_SkinProblemsManifestation,PBD_FakeTreatmentStrategy
1,PBD_ProfoundBiotinidaseDeficiency,PBD_NaturalDevelopment,PBD_MentalDelayManifestation,PBD_FakeTreatmentStrategy
2,PBD_ProfoundBiotinidaseDeficiency,PBD_NaturalDevelopment,PBD_SeizureManifestation,PBD_FakeTreatmentStrategy
3,PBD_ProfoundBiotinidaseDeficiency,PBD_NaturalDevelopment,PBD_SkinProblemsManifestation,PBD_BiotinTreatmentStrategy
4,PBD_ProfoundBiotinidaseDeficiency,PBD_NaturalDevelopment,PBD_HypotoniaManifestation,PBD_BiotinTreatmentStrategy
5,PBD_ProfoundBiotinidaseDeficiency,PBD_NaturalDevelopment,PBD_MentalDelayManifestation,PBD_BiotinTreatmentStrategy
6,PBD_ProfoundBiotinidaseDeficiency,PBD_NaturalDevelopment,PBD_SeizureManifestation,PBD_BiotinTreatmentStrategy
7,PBD_ProfoundBiotinidaseDeficiency,PBD_NaturalDevelopment,PBD_VisionLossManifestation,PBD_BiotinTreatmentStrategy
8,PBD_ProfoundBiotinidaseDeficiency,PBD_NaturalDevelopment,PBD_HearingProblemsManifestation,PBD_BiotinTreatmentStrategy


# **Inference from external ontologies - StaDiOS**

* Obtain all parameters whose country of origin is within the European Union
* A study or studies of a given disease may have parameters with different geographic locations.

In [3]:
sparql = SPARQLWrapper("http://localhost:3030/StaDiOS/sparql", agent="SparqlWrapper - StaDiOS analysis")

sparql.setQuery("""

PREFIX std: <http://www.semanticweb.org/storh/ontologies/2022/11/StaDiOS#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>

SELECT ?disease ?development ?parameterType ?parameter ?studyIdentifier ?parameterCountry ?countryLabel WHERE {

   ?parameter std:hasCountry ?parameterCountry;
              std:hasStudyIdentifier ?studyIdentifier;
              rdf:type ?parameterType .
   ?parameterType a owl:Class.

   SERVICE <https://query.wikidata.org/sparql> {
      { 
         wd:Q458 wdt:P527  ?country .
         ?country rdfs:label ?countryLabel .
         FILTER (lang(?countryLabel)="en")
      }
   }
   FILTER CONTAINS(?countryLabel,?parameterCountry)
}

""")

# Ask for the result in JSON format
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
results = json_normalize(results["results"]["bindings"])

# Simplify column names and data types
simplified_table = results.filter(regex='value')
simplified_table = simplified_table.rename(columns = lambda col: col.replace(".value", ""))
simplified_table = simplified_table.apply(lambda row: row.replace({'http://www.semanticweb.org/storh/ontologies/2022/11/StaDiOS#':''}, regex=True))

simplified_table

Unnamed: 0,parameterType,parameter,studyIdentifier,parameterCountry,countryLabel
0,Utility,PBD_SkinProblemsManifestationUtility,PBD_001,Spain,Spain
1,Utility,PBD_HypotoniaManifestationUtility,PBD_001,Spain,Spain
2,Utility,PBD_HearingProblemsManifestationUtility,PBD_001,Spain,Spain
3,Utility,PBD_MentalDelayManifestationUtility,PBD_001,Spain,Spain
4,Utility,PBD_BasePopulationUtility,PBD_001,Spain,Spain
5,Utility,PBD_SeizureManifestationUtility,PBD_001,Spain,Spain
6,Utility,PBD_VisionLossManifestationUtility,PBD_001,Spain,Spain
7,Cost,PBD_SkinProblemsManifestationCost,PBD_001,Spain,Spain
8,Cost,PBD_HearingProblemsManifestationCost,PBD_001,Spain,Spain
9,Cost,PBD_ScreeningCost,PBD_001,Spain,Spain


* Get parameters with their countries and coordinates
* All the parameters of a study of a given disease should have a geographical context. Can we obtain its coordinates?

In [4]:
sparql = SPARQLWrapper("http://localhost:3030/StaDiOS/sparql", agent="SparqlWrapper on Jupyter example")

sparql.setQuery("""

PREFIX std: <http://www.semanticweb.org/storh/ontologies/2022/11/StaDiOS#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>

SELECT DISTINCT ?disease ?development ?parameterType ?parameter ?studyIdentifier ?countryLabel ?coordinates
WHERE 
{
  ?disease a std:Disease;
            std:hasDiseaseDevelopments ?development.
  ?parameter std:hasCountry ?parameterCountry;
             std:hasStudyIdentifier ?studyIdentifier;
             rdf:type ?parameterType.
  ?parameterType a owl:Class.
  
  SERVICE <https://query.wikidata.org/sparql> 
  {
      { 
          ?item ((wdt:P31)/(wdt:P279*)) wd:Q6256;
          rdfs:label ?countryLabel;
          wdt:P625 ?coordinates.
          FILTER (lang(?countryLabel)="en")
      }
   }
  FILTER CONTAINS(?countryLabel,?parameterCountry)
}

""")

# Ask for the result in JSON format
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
results = json_normalize(results["results"]["bindings"])

# Simplify column names and data types
simplified_table = results.filter(regex='value')
simplified_table = simplified_table.rename(columns = lambda col: col.replace(".value", ""))
simplified_table = simplified_table.apply(lambda row: row.replace({'http://www.semanticweb.org/storh/ontologies/2022/11/StaDiOS#':''}, regex=True))
simplified_table

Unnamed: 0,disease,development,parameterType,parameter,studyIdentifier,countryLabel,coordinates
0,PBD_ProfoundBiotinidaseDeficiency,PBD_NaturalDevelopment,Utility,PBD_SkinProblemsManifestationUtility,PBD_001,Spain,Point(-3.5 40.2)
1,PBD_ProfoundBiotinidaseDeficiency,PBD_NaturalDevelopment,Utility,PBD_HypotoniaManifestationUtility,PBD_001,Spain,Point(-3.5 40.2)
2,PBD_ProfoundBiotinidaseDeficiency,PBD_NaturalDevelopment,Utility,PBD_HearingProblemsManifestationUtility,PBD_001,Spain,Point(-3.5 40.2)
3,PBD_ProfoundBiotinidaseDeficiency,PBD_NaturalDevelopment,Utility,PBD_MentalDelayManifestationUtility,PBD_001,Spain,Point(-3.5 40.2)
4,PBD_ProfoundBiotinidaseDeficiency,PBD_NaturalDevelopment,Utility,PBD_BasePopulationUtility,PBD_001,Spain,Point(-3.5 40.2)
5,PBD_ProfoundBiotinidaseDeficiency,PBD_NaturalDevelopment,Utility,PBD_SeizureManifestationUtility,PBD_001,Spain,Point(-3.5 40.2)
6,PBD_ProfoundBiotinidaseDeficiency,PBD_NaturalDevelopment,Utility,PBD_VisionLossManifestationUtility,PBD_001,Spain,Point(-3.5 40.2)
7,PBD_ProfoundBiotinidaseDeficiency,PBD_NaturalDevelopment,Cost,PBD_SkinProblemsManifestationCost,PBD_001,Spain,Point(-3.5 40.2)
8,PBD_ProfoundBiotinidaseDeficiency,PBD_NaturalDevelopment,Cost,PBD_HearingProblemsManifestationCost,PBD_001,Spain,Point(-3.5 40.2)
9,PBD_ProfoundBiotinidaseDeficiency,PBD_NaturalDevelopment,Cost,PBD_ScreeningCost,PBD_001,Spain,Point(-3.5 40.2)


* Obtain the parameters closest to a parameter type and country of origin.
* Given a disease, a reference study, a geographic location and a type of parameter we obtain those parameters that are geographically closest to us.

In [5]:
def get_nearest_param(dataframe, disease, study_identifier, country_label, parameter_type):

    # We are left with the available countries and parameters
    distance_list = list()
    aux_table = dataframe[dataframe['disease'] == disease]
    aux_table = aux_table.reset_index()

    # From a country and study of origin we obtain its coordinates
    origin_coordinates = aux_table[(aux_table['countryLabel'] == country_label) & \
                                   (aux_table['studyIdentifier'] == study_identifier)]
    origin_coordinates = re.findall (r'([^( )]+)(?!.*\()', origin_coordinates.iloc[0]['coordinates'])
    origin = (origin_coordinates[0], origin_coordinates[1])
    
    # We calculate the distances of the rest of the parameters with respect to the point of origin
    for index,row in aux_table.iterrows():
        if row['studyIdentifier'] != study_identifier and row['parameterType'] == parameter_type:
            coordinates = re.findall (r'([^( )]+)(?!.*\()', row['coordinates'])
            destination = (coordinates[0], coordinates[1])
            distance = gd(destination, origin).km
            #list_element = []
            distance_list.append({'distance': distance, 'distanceUnit': 'km', 'studyIdentifier': row['studyIdentifier'], 
                                  'parameterType': row['parameterType'], 'parameterName': row['parameter'], 
                                  'countryLabel': row['countryLabel']})
    
    return distance_list

distances = list(get_nearest_param(simplified_table, 'PBD_ProfoundBiotinidaseDeficiency', 'PBD_002', 'Germany', 'Cost'))
sorted_distances = sorted(distances, key=lambda p: p['distance'])
distances_df = pd.DataFrame(sorted_distances)
distances_df

Unnamed: 0,distance,distanceUnit,studyIdentifier,parameterType,parameterName,countryLabel
0,1913.828777,km,PBD_001,Cost,PBD_SkinProblemsManifestationCost,Spain
1,1913.828777,km,PBD_001,Cost,PBD_HearingProblemsManifestationCost,Spain
2,1913.828777,km,PBD_001,Cost,PBD_ScreeningCost,Spain
3,1913.828777,km,PBD_001,Cost,PBD_MentalDelayManifestationCost,Spain
4,1913.828777,km,PBD_001,Cost,PBD_HypotoniaManifestationCost,Spain
5,1913.828777,km,PBD_001,Cost,PBD_VisionLossManifestationCost,Spain
6,1913.828777,km,PBD_001,Cost,PBD_BiotinTreatmentCost,Spain
7,1913.828777,km,PBD_001,Cost,PBD_ClinicalDiagnosisCost,Spain
8,1913.828777,km,PBD_001,Cost,PBD_FollowUpCost,Spain
9,1913.828777,km,PBD_001,Cost,PBD_SeizureManifestationCost,Spain
