### ASSIGNMENT 5: SPARQL QUERIES

#### Andrea Álvarez Pérez

In [1]:
%endpoint https://sparql.uniprot.org/sparql
%format JSON

## Query 1: How many protein records are in UniProt? 

PREFIX up: <http://purl.uniprot.org/core/>

# Counts every distinct resource typed as up:Protein.
SELECT (COUNT(DISTINCT ?entry) AS ?numProt)
WHERE {
    ?entry a up:Protein
}
## Answer at date 23/12/2021: 360157660

## Query 2: How many Arabidopsis thaliana protein records are in UniProt? 

PREFIX up: <http://purl.uniprot.org/core/>
PREFIX taxon: <http://purl.uniprot.org/taxonomy/>

# Counts the distinct up:Protein resources whose up:organism is taxon 3702
# (Arabidopsis thaliana, according to the query heading).
SELECT (COUNT(DISTINCT ?entry) AS ?numProt)
WHERE {
    ?entry up:organism taxon:3702 .
    ?entry a up:Protein .
}
## Answer at date 23/12/2021: 136782

## Query 3: Retrieve pictures of Arabidopsis thaliana from UniProt.

PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX up: <http://purl.uniprot.org/core/>

# Returns the scientific name and depiction of every resource that has both
# a foaf:depiction and an up:scientificName, keeping only names that match
# the case-insensitive pattern below.
SELECT ?name ?image
WHERE {
    ?organism up:scientificName ?name ;
              foaf:depiction ?image .

    # '^' anchors the match at the start of the name; the '.' between the two
    # words matches any single character, not only a space.
    FILTER regex(?name, '^Arabidopsis.thaliana', 'i')
}

## Query 4: What is the description of the enzyme activity of UniProt Protein Q9SZZ8?

PREFIX up: <http://purl.uniprot.org/core/>
PREFIX uniprotkb: <http://purl.uniprot.org/uniprot/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

# Returns the label of each activity attached to the enzyme of protein Q9SZZ8.
# Every prefix used in the pattern (up:, uniprotkb:, rdfs:) is declared
# explicitly so the query does not rely on endpoint-specific default prefixes.
SELECT ?description
WHERE
{
    # Protein entry -> enzyme classification
    uniprotkb:Q9SZZ8 a up:Protein ;
                     up:enzyme ?enzyme .

    # Enzyme -> activity -> human-readable label of that activity
    ?enzyme up:activity ?activity .
    ?activity rdfs:label ?description .
}
## Answer at date 23/12/2021: 
## Beta-carotene + 4 reduced ferredoxin [iron-sulfur] cluster + 2 H(+) + 2 O(2) = zeaxanthin + 4 oxidized ferredoxin [iron-sulfur] cluster + 2 H(2)O

## Query 5: Retrieve the proteins ids, and date of submission, for proteins that have been added to UniProt this year (HINT Google for “SPARQL FILTER by date”)

PREFIX up: <http://purl.uniprot.org/core/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

# Lists the identifier and creation date of every protein entry whose
# up:created date falls in 2021 (the year the assignment was answered).
SELECT ?prot_id ?date
WHERE
{
    ?protein a up:Protein ;
             up:created ?date .

    # Half-open range [2021-01-01, 2022-01-01): without the upper bound the
    # query would also return entries created in later years when re-run.
    FILTER (?date >= "2021-01-01"^^xsd:date && ?date < "2022-01-01"^^xsd:date)

    # STR turns the protein IRI into a string.
    # STRAFTER keeps only what follows the UniProt entry namespace, which is
    # the same text the former SUBSTR(..., 33) extracted (the namespace is
    # 32 characters long) without depending on a hard-coded offset.
    # BIND assigns the extracted identifier to ?prot_id.
    BIND(STRAFTER(STR(?protein), "http://purl.uniprot.org/uniprot/") AS ?prot_id)
}

## Query 6: How many species are in the UniProt taxonomy?

PREFIX up: <http://purl.uniprot.org/core/>

# Counts the distinct up:Taxon resources whose up:rank is up:Species.
SELECT (COUNT(DISTINCT ?taxon) AS ?numSpecies)
WHERE {
    ?taxon up:rank up:Species .
    ?taxon a up:Taxon .
}
## Answer at date 23/12/2021: 2029846

## Query 7: How many species have at least one protein record? (this might take a long time to execute, so do this one last!)

PREFIX up: <http://purl.uniprot.org/core/>

# Counts the distinct organisms of rank up:Species that are referenced by at
# least one up:Protein through up:organism. The count is wrapped in STR, so
# it is returned as a plain string rather than as a typed integer.
SELECT (STR(COUNT(DISTINCT ?taxon)) AS ?numSpecies)
WHERE {
    ?taxon up:rank up:Species .
    ?entry up:organism ?taxon .
    ?entry a up:Protein .
}
## Answer at date 23/12/2021: 1057158

## Query 8: Find the AGI codes and gene names for all Arabidopsis thaliana proteins that have a protein function annotation description that mentions “pattern formation”

PREFIX up: <http://purl.uniprot.org/core/>
PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

# Returns gene name, locus name (AGI code) and function description for
# taxon 3702 proteins whose function annotation matches the pattern below.
# Every prefix used in the pattern is declared explicitly.
SELECT DISTINCT ?name ?agi ?description
WHERE {
  ?prot a up:Protein ;
        up:organism taxon:3702 ;
        up:annotation ?annotation ;
        up:encodedBy ?gene .

  # The locus name is bound to ?agi, the variable that SELECT projects;
  # binding it under a different name leaves the ?agi column unbound.
  ?gene skos:prefLabel ?name ;
        up:locusName ?agi .

  ?annotation a up:Function_Annotation ;
              rdfs:comment ?description .

  # Keep only descriptions mentioning "pattern formation" (case-insensitive;
  # the '.' matches any single character between the two words).
  FILTER regex(?description, 'pattern.formation', 'i')
}


# MetaNetX metabolic networks for metagenomics database

## Query 9: What is the MetaNetX Reaction identifier (starts with “mnxr”) for the UniProt Protein uniprotkb:Q18A79?

PREFIX mnx: <https://rdf.metanetx.org/schema/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX uniprotkb: <http://purl.uniprot.org/uniprot/>

# Returns the rdfs:label of each reaction reachable from UniProt entry Q18A79
# through the chain peptXref -> pept -> cata -> reac.
# NOTE(review): no %endpoint switch is visible before this query; it
# presumably has to run against https://rdf.metanetx.org/sparql - confirm.
SELECT ?reac_id
WHERE {
    # Peptide cross-referenced to the UniProt entry
    ?peptide mnx:peptXref uniprotkb:Q18A79 .

    # Catalyst that involves that peptide
    ?catalyst mnx:pept ?peptide .

    # Node linking the catalyst to a reaction
    ?link mnx:cata ?catalyst .
    ?link mnx:reac ?reaction .

    ?reaction rdfs:label ?reac_id .
}

# FEDERATED QUERY - UniProt and MetaNetX

## Query 10: What is the official Gene ID (UniProt calls this a “mnemonic”) and the
## MetaNetX Reaction identifier (mnxr…..) for the protein that has “Starch synthase” catalytic
## activity in Clostridium difficile (taxon 272563)?


PREFIX up: <http://purl.uniprot.org/core/>
PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX mnx: <https://rdf.metanetx.org/schema/>

# Returns the UniProt mnemonic and the MetaNetX reaction label for taxon
# 272563 proteins whose enzyme name mentions "starch synthase".
# All prefixes used below are declared explicitly so the query is
# self-contained.
SELECT ?upGeneId ?reac_id
WHERE {
  # Gene ID (mnemonic), organism, enzyme class and annotations of the protein
  ?upProtein up:mnemonic ?upGeneId ;
             up:organism taxon:272563 ;
             up:enzyme ?enzyme ;
             up:annotation ?a .

  # Catalytic activity annotation -> catalyzed reaction
  ?a a up:Catalytic_Activity_Annotation ;
     up:catalyticActivity ?ca .
  ?ca up:catalyzedReaction ?rhea .

  # Filter catalytic activity for "starch synthase".
  # NOTE(review): assumes enzyme resources carry their name in
  # skos:prefLabel - confirm against the UniProt core schema.
  ?enzyme skos:prefLabel ?enzymeName .
  FILTER regex(?enzymeName, 'starch synthase', 'i')

  # Federated query: access to MetaNetX for reaction ID
  SERVICE <https://rdf.metanetx.org/sparql> {
    ?mnxr mnx:reacXref ?rhea .
    ?reac mnx:mnxr ?mnxr .
    ?reac rdfs:label ?reac_id .
  }
}

