# In [1]:
#1. Library imports
import os

import certifi
import pandas as pd #To build the result tables returned by the query functions
import requests #To make easy HTTP request. Useful to interact with APIs.
from SPARQLWrapper import SPARQLWrapper, JSON #To send SPARQL queries and to return results in JSON format

#2. Security
# Both environment variables below are set to the CA bundle path reported by certifi.
certifi_path = certifi.where()
os.environ.update(
    {
        "SSL_CERT_FILE": certifi_path,
        "REQUESTS_CA_BUNDLE": certifi_path,
    }
)

#3. Functions
## 3.1 Configuring endpoint, establishing credentials and defining output format
def configure_sparql(endpoint_url, user=None, password=None):
    """Create a SPARQLWrapper client for ``endpoint_url`` that returns JSON.

    Credentials are applied only when both ``user`` and ``password`` are
    truthy; otherwise the client is left unauthenticated.

    Returns:
        The configured ``SPARQLWrapper`` instance.
    """
    client = SPARQLWrapper(endpoint_url)
    has_credentials = bool(user and password)
    if has_credentials:
        client.setCredentials(user, password)
    client.setReturnFormat(JSON)
    return client

## 3.2 Query execution 
def execute_query(sparql, query):
    """Run ``query`` on the given SPARQL client and return the converted result.

    Any exception raised while setting or running the query is reported on
    standard output and ``None`` is returned instead of propagating it.
    """
    try:
        sparql.setQuery(query)
        converted = sparql.queryAndConvert()
    except Exception as error:
        print(f"Error al ejecutar la consulta SPARQL: {error!s}")
        return None
    else:
        return converted
    
## 3.3 Inparanoid query
def query_InParanoid(sparql, uniprot_id, species1, species2):
    """Look up InParanoid orthologs of a protein between two species.

    Builds a SPARQL query restricted to the
    ``http://semantics.inf.um.es/inparanoid`` graph, runs it through
    ``execute_query`` and tabulates the returned bindings.

    Args:
        sparql: SPARQL client the query is executed on.
        uniprot_id: Value bound to ``?iduniprot1`` (label of the query protein).
        species1: Value bound to ``?species1`` (species label of that protein).
        species2: Value bound to ``?species2`` (species label of the ortholog).

    Returns:
        pandas.DataFrame with the columns "UniProt ID 1", "Species 1",
        "UniProt ID 2" and "Species 2"; the query caps results with LIMIT 100.
        When the query fails or matches nothing the frame is empty but still
        carries those four columns, so downstream code can rely on them.
    """
    # NOTE: the three arguments are interpolated verbatim into quoted SPARQL
    # literals; a value containing a double quote would break the query.
    query = f"""
    PREFIX orthology: <http://purl.org/net/orth#>
    PREFIX orth: <http://purl.org/net/orth#>
    PREFIX obo: <http://purl.obolibrary.org/obo/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX resource: <http://semanticscience.org/resource/>

    SELECT DISTINCT ?iduniprot1 ?iduniprot2 ?species1 ?species2
    WHERE {{
      GRAPH <http://semantics.inf.um.es/inparanoid> {{
        ?cluster a orth:OrthologsCluster ;
                 orthology:hasHomologousMember ?node1 ;
                 orthology:hasHomologousMember ?node2 .
        ?node1 rdfs:label ?geneid1 ;
               resource:SIO_010078 ?uniprot1 .
        ?uniprot1 a orthology:Protein ;
                  rdfs:label ?iduniprot1 ;
                  obo:RO_0002162 ?taxid1 .
        ?taxid1 rdfs:label ?species1 .
        ?node2 rdfs:label ?geneid2 ;
               resource:SIO_010078 ?uniprot2 .
        ?uniprot2 a orthology:Protein ;
                  rdfs:label ?iduniprot2 ;
                  obo:RO_0002162 ?taxid2 .
        ?taxid2 rdfs:label ?species2 .
        FILTER (?node1 != ?node2 && ?species1 != ?species2)
        VALUES (?iduniprot1 ?species1 ?species2) {{("{uniprot_id}" "{species1}" "{species2}")}}
      }}
    }}
    LIMIT 100
    """

    # Output column -> SPARQL variable it is read from (order fixes column order).
    column_vars = {
        "UniProt ID 1": "iduniprot1",
        "Species 1": "species1",
        "UniProt ID 2": "iduniprot2",
        "Species 2": "species2",
    }

    results = execute_query(sparql, query)
    # execute_query returns None on failure; treat that as "no rows".
    bindings = results["results"]["bindings"] if results else []
    data = [
        {
            column: binding.get(variable, {}).get("value", "")
            for column, variable in column_vars.items()
        }
        for binding in bindings
    ]
    # Passing the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(data, columns=list(column_vars))

## 3.4 OMA query
def query_OMA(sparql, uniprot_id, species1, species2):
    """Look up OMA orthologs of a protein between two species.

    Builds a SPARQL query over ``orth:OrthologsCluster`` members, runs it
    through ``execute_query`` and tabulates the returned bindings.

    Args:
        sparql: SPARQL client the query is executed on.
        uniprot_id: Value bound to ``?name_prot1`` (identifier of the query
            protein's UniProt cross-reference).
        species1: Value bound to ``?name_species1`` (scientific name of the
            query protein's taxon).
        species2: Value bound to ``?name_species2`` (scientific name of the
            ortholog's taxon).

    Returns:
        pandas.DataFrame with the columns "UniProt ID 1", "Species 1",
        "UniProt ID 2" and "Species 2". When the query fails or matches
        nothing the frame is empty but still carries those four columns, so
        downstream code can rely on them.
    """
    # NOTE: the three arguments are interpolated verbatim into quoted SPARQL
    # literals; a value containing a double quote would break the query.
    query = f"""
    PREFIX obo:<http://purl.obolibrary.org/obo/>
    PREFIX orth:<http://purl.org/net/orth#>
    PREFIX lscr:<http://purl.org/lscr#>
    PREFIX up:<http://purl.uniprot.org/core/>
    PREFIX dct: <http://purl.org/dc/terms/>

    SELECT DISTINCT ?name_prot1 ?name_species1 ?name_prot2 ?name_species2
    WHERE {{
        ?cluster a orth:OrthologsCluster ;
            orth:hasHomologousMember ?node1 ;
            orth:hasHomologousMember ?node2 .
        ?node1 orth:hasHomologousMember* ?protein1 .
        ?node2 orth:hasHomologousMember* ?protein2 .
        ?protein1 a orth:Protein ;
            orth:organism ?taxid1 ;
            lscr:xrefUniprot ?iduniprot1 .
        ?taxid1 obo:RO_0002162 ?taxon1 .
        ?taxon1 up:scientificName ?name_species1 .
        ?iduniprot1 dct:identifier ?name_prot1 .
        ?protein2 a orth:Protein ;
            orth:organism ?taxid2 ;
            lscr:xrefUniprot ?iduniprot2 .
        ?taxid2 obo:RO_0002162 ?taxon2 .
        ?taxon2 up:scientificName ?name_species2 .
        ?iduniprot2 dct:identifier ?name_prot2 .
        FILTER (?node1 != ?node2 && ?taxid1 != ?taxid2)
        VALUES (?name_species1 ?name_species2 ?name_prot1) {{ 
            ("{species1}" "{species2}" "{uniprot_id}")  
        }}
    }}
    """

    # Output column -> SPARQL variable it is read from (order fixes column order).
    column_vars = {
        "UniProt ID 1": "name_prot1",
        "Species 1": "name_species1",
        "UniProt ID 2": "name_prot2",
        "Species 2": "name_species2",
    }

    results = execute_query(sparql, query)
    # execute_query returns None on failure; treat that as "no rows".
    bindings = results["results"]["bindings"] if results else []
    data = [
        {
            column: binding.get(variable, {}).get("value", "")
            for column, variable in column_vars.items()
        }
        for binding in bindings
    ]
    # Passing the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(data, columns=list(column_vars))

## 3.5 Curation of proteins
def is_curated(protein_id, timeout=10):
    """Tell whether a UniProtKB entry is marked as reviewed (Swiss-Prot).

    Fetches ``https://rest.uniprot.org/uniprotkb/<protein_id>.json`` and checks
    whether the entry's ``entryType`` field contains "reviewed (Swiss-Prot)".

    Args:
        protein_id: Identifier placed in the request URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True when ``entryType`` contains "reviewed (Swiss-Prot)". False in
        every other case, including when the entry cannot be retrieved
        (non-200 status, network error, timeout or unparsable body): an
        entry that cannot be accessed is assumed not to be curated.
    """
    url = f"https://rest.uniprot.org/uniprotkb/{protein_id}.json"
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            return False
        data = response.json()
    except (requests.RequestException, ValueError):
        # Network failures and invalid JSON count as "entry not accessible".
        return False

    # `or ""` also covers an explicit null entryType in the payload.
    entry_type = data.get("entryType") or ""
    return "reviewed (Swiss-Prot)" in entry_type
    
## 3.6 Assigning a score to each result
def assign_score(row, total_databases):
    """Compute the score of one result row.

    The score is half of the percentage ``row['Count'] / total_databases``
    plus a 50-point bonus when ``row['Curated']`` is a boolean true.

    Args:
        row: Mapping-like object (dict, pandas Series, ...) with the keys
            'Count' and 'Curated'.
        total_databases: Number the count is divided by.

    Returns:
        ``0.5 * (Count / total_databases * 100) + (50 if curated else 0)``.

    Raises:
        ZeroDivisionError: If ``total_databases`` is 0.
    """
    score_count = (row['Count'] / total_databases) * 100

    curated = row['Curated']
    # A NumPy boolean scalar (which pandas can return for a boolean column) is
    # never the `True` singleton, so `is True` alone would silently drop the
    # bonus. Unwrap 0-dimensional NumPy values to the plain Python object first.
    if getattr(curated, "ndim", None) == 0 and hasattr(curated, "item"):
        curated = curated.item()

    # 50 points if the protein is curated, 0 otherwise.
    score_curated = 50 if curated is True else 0
    return 0.5 * score_count + score_curated

## 3.7 Check name species
def check_species(species):
    """Check that ``species`` is written in the form ``Genus species``.

    The name is valid when it consists of exactly two whitespace-separated
    words, the first one capitalised (upper-case initial followed by
    lower-case letters) and the second one entirely lower-case.

    Args:
        species: Species name to validate.

    Returns:
        True when the name is valid. Otherwise an error message is printed and
        False is returned, so the caller can decide whether to ask again.
    """
    parts_species = species.split()
    if len(parts_species) == 2:
        genus, species_name = parts_species
        # genus[1:] is empty for a one-letter genus, and "".islower() is False,
        # so single-letter genus names are rejected as well.
        if genus[0].isupper() and genus[1:].islower() and species_name.islower():
            return True
    print("Error: The entry must be in format Genus species (for example, Homo sapiens). Try again")
    return False
    
## 3.8 Get human UniProt IDs
def get_human_uniprot_ids(timeout=60):
    """Download the accession column of UniProtKB entries for organism 9606.

    Requests the UniProt ``stream`` endpoint with
    ``query=organism_id:9606&format=tsv&fields=accession`` and returns the
    accessions found after the header line.

    Args:
        timeout: Seconds to wait for the server before giving up.

    Returns:
        List of accession strings. An empty list, after printing an error
        message, when the request fails or the status code is not 200.
    """
    url = "https://rest.uniprot.org/uniprotkb/stream?query=organism_id:9606&format=tsv&fields=accession"
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        print(f"Error fetching data: {error}")
        return []

    if response.status_code != 200:
        print(f"Error fetching data: {response.status_code}")
        return []

    # splitlines() copes with both "\n" and "\r\n" and does not depend on the
    # body ending in a newline; the first line is the TSV header.
    lines = response.text.splitlines()
    return [line.strip() for line in lines[1:] if line.strip()]

