This notebook gives a list of basic SPARQL queries which can be run on the chamuça Hindi lexicon. First we will need to import Graph from rdflib and create a Graph object.

In [10]:
from rdflib import Graph


In [11]:
g=Graph()
g.parse("chamuca_hi_lex.rdf", format="turtle")

<Graph identifier=N7d993d275af1434aa5c09c053811c916 (<class 'rdflib.graph.Graph'>)>

Basic queries: First up, whether a string has source Dalgado

In [8]:
query = """
PREFIX lexinfo: <http://www.lexinfo.net/ontology/2.0/lexinfo#>
PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>

SELECT ?entry ?etymology
WHERE {
  ?entry a ontolex:LexicalEntry ;
         lexinfo:etymology ?etymology .
  FILTER regex(?etymology, "Source: Dalgado", "i")
}
"""



A query to extract separate etymologies for an entry: first entry

In [14]:
query1 = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX lexinfo: <http://www.lexinfo.net/ontology/2.0/lexinfo#>
PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>

SELECT ?entry (STR(?etym1) AS ?individualEtymology)
WHERE {
  ?entry lexinfo:etymology ?etymText .
  FILTER CONTAINS(?etymText, "|")
  BIND(STRDT(?etymText, xsd:string) AS ?etymString)

  # Extract first etymology
  BIND(STRBEFORE(?etymString, "|") AS ?etym1)
}
"""

query2= """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX lexinfo: <http://www.lexinfo.net/ontology/2.0/lexinfo#>
PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>

SELECT ?entry (STR(?etym1) AS ?individualEtymology)
WHERE {
  ?entry lexinfo:etymology ?etymText .
  FILTER CONTAINS(?etymText, "|")
  BIND(STRDT(?etymText, xsd:string) AS ?etymString)

  # Extract first etymology
  BIND(STRAFTER(?etymString, "|") AS ?etym1)
}
"""

In [16]:
for s, o in g.query(query2):
    print(f"{s} --> {o}")

http://lari-datasets.ilc.cnr.it/chamuca_hi_lex#अंबर_entry -->  fa عنبر ‘ambergris’ (source:  Wiktionary) | pal ‘ambergris’ mbl > ar عَنْبَر (Source: Dalgado) .
http://lari-datasets.ilc.cnr.it/chamuca_hi_lex#अक्तूबर_entry -->  en October (Source: McGregor) .
http://lari-datasets.ilc.cnr.it/chamuca_hi_lex#अनीसून_entry -->  grc ἄνισον ‘anise (Pimpinella anisum)’ (Source: Dalgado) | grc ἄνισον (Source: Wiktionary) >  la anisum ‘anise (Pimpinella anisum)’  (Source: Dalgado) .
http://lari-datasets.ilc.cnr.it/chamuca_hi_lex#अस्तबल_entry -->  ar اسطبل (Source: McGregor) .
http://lari-datasets.ilc.cnr.it/chamuca_hi_lex#अस्पताल_entry -->  en hospital (Source: McGregor) .
http://lari-datasets.ilc.cnr.it/chamuca_hi_lex#आमीन_entry -->  ar آمين ‘amen’ (Source: Dalgado) .
http://lari-datasets.ilc.cnr.it/chamuca_hi_lex#कंदील_entry -->  lat candēla ‘a light made of wax or tallow' > grc κανδήλη ‘oil lamp' > ar قنديل  ‘lamp' (Source: Dalgado) .
http://lari-datasets.ilc.cnr.it/chamuca_hi_lex#कप्तान_entry 

http://lari-datasets.ilc.cnr.it/chamuca_hi_lex#अंबर_entry --> pt ambar (Source: Dalgado) 
http://lari-datasets.ilc.cnr.it/chamuca_hi_lex#अक्तूबर_entry --> pt outubro (Source: Wiktionary) 
http://lari-datasets.ilc.cnr.it/chamuca_hi_lex#अनीसून_entry --> pt ? anis (Source: Dalgado) 
http://lari-datasets.ilc.cnr.it/chamuca_hi_lex#अस्तबल_entry --> pt estábulo (Source: McGregor) 
http://lari-datasets.ilc.cnr.it/chamuca_hi_lex#अस्पताल_entry --> pt espital (Source: Wiktionary) 
http://lari-datasets.ilc.cnr.it/chamuca_hi_lex#आमीन_entry --> pt amen (Source: Dalgado) 
http://lari-datasets.ilc.cnr.it/chamuca_hi_lex#कंदील_entry --> pt ? candil (Source: Dalgado) 
http://lari-datasets.ilc.cnr.it/chamuca_hi_lex#कप्तान_entry --> pt capitão (Source: Dalgado) 
http://lari-datasets.ilc.cnr.it/chamuca_hi_lex#कर्नल_entry --> pt coronel (Source: Dalgado) 
http://lari-datasets.ilc.cnr.it/chamuca_hi_lex#कलापट्टी_entry --> pt calafate (Source: Dalgado) 
http://lari-datasets.ilc.cnr.it/chamuca_hi_lex#क़मीज़_entr

In [16]:
query = """
PREFIX lexinfo: <http://www.lexinfo.net/ontology/2.0/lexinfo#>
PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>

SELECT ?entry ?matchedStep
WHERE {
?entry a ontolex:LexicalEntry ;
        lexinfo:etymology ?etymology .

# Extract just the step with "Source: Dalgado", stopping at next " |" or at period
BIND(REPLACE(?etymology, ".*?(\\b[^|]*?Source: Dalgado[^|]*?)(\\s*\\|.*|\\s*\\.)$", "$1") AS ?matchedStep)

FILTER CONTAINS(?matchedStep, "Source: Dalgado")
}
"""

In [26]:
query = """
PREFIX lexinfo: <http://www.lexinfo.net/ontology/2.0/lexinfo#>
PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>

SELECT ?entry ?etymology
WHERE {
  ?entry a ontolex:LexicalEntry ;
         lexinfo:etymology ?etymology .
  FILTER regex(?etymology, "[^|]*Source: Dalgado", "i")
}
"""

In [28]:
query ="""
PREFIX lexinfo: <http://www.lexinfo.net/ontology/2.0/lexinfo#>
PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>

SELECT ?entry ?matchedStep
WHERE {
  ?entry a ontolex:LexicalEntry ;
         lexinfo:etymology ?etymology .

  # Extract just the step with "Source: Dalgado", stopping at next " |" or at period
  BIND(REPLACE(?etymology, ".*?(\\b[^|]*?Source: Dalgado[^|]*?)(\\s*\\|.*|\\s*\\.)$", "$1") AS ?matchedStep)
  
  FILTER CONTAINS(?matchedStep, "Source: Dalgado")
}
"""

In [4]:
print(g.query(query))

for p in g.query(query):
    print(p)

<rdflib.plugins.sparql.processor.SPARQLResult object at 0x0000023EBCB9A990>
(rdflib.term.URIRef('http://lari-datasets.ilc.cnr.it/chamuca_hi_lex#अँग्रेज़_entry'), rdflib.term.Literal('pt inglês (Source: Dalgado) .'))
(rdflib.term.URIRef('http://lari-datasets.ilc.cnr.it/chamuca_hi_lex#अंग्रेज़_entry'), rdflib.term.Literal('pt inglês (Source: Dalgado) .'))
(rdflib.term.URIRef('http://lari-datasets.ilc.cnr.it/chamuca_hi_lex#अंबर_entry'), rdflib.term.Literal('pt ambar (Source: Dalgado) | fa عنبر ‘ambergris’ (source:  Wiktionary) | pal ‘ambergris’ mbl > ar عَنْبَر (Source: Dalgado) .'))
(rdflib.term.URIRef('http://lari-datasets.ilc.cnr.it/chamuca_hi_lex#अचार_entry'), rdflib.term.Literal('pt achar (Source: Dalgado) .'))
(rdflib.term.URIRef('http://lari-datasets.ilc.cnr.it/chamuca_hi_lex#अनन्नास_entry'), rdflib.term.Literal('tpn naná ‘pineapple’ (Source: Wiktionary) > pt ananás ‘pineapple’ (Source: Dalgado) .'))
(rdflib.term.URIRef('http://lari-datasets.ilc.cnr.it/chamuca_hi_lex#अनीसून_entry'),