In [11]:
import sys
from SPARQLWrapper import SPARQLWrapper, JSON
import json
import string
import pandas as pd

In [13]:
# SPARQL endpoint of the Wikidata Query Service; the query helpers pass it to
# `get_results`.
endpoint_url = "https://query.wikidata.org/sparql"


# Upper-case language code -> Wikidata item id (QID) of that language.
# The QID is interpolated as `wd:<QID>` into the `dct:language` triple of the
# generated queries.
language_to_QID = {
    "AKK": "Q35518",  # Akkadian
    "AR": "Q13955",   # Arabic
    "CS": "Q9056",    # Czech
    "DE": "Q188",     # German
    "EN": "Q1860",    # English
    "FR": "Q150",     # French
    "HE": "Q9288",    # Hebrew
    "HIT": "Q35668",  # Hittite
    "IT": "Q652",     # Italian
    "RU": "Q7737",    # Russian
    "SUX": "Q36790",  # Sumerian
    "TR": "Q256",     # Turkish
    "LA": "Q397",     # Latin
}

def getShortLang(lang) :
  """
  Build the SPARQL query that lists every lexeme form of one language,
  without any filter on the first character of the form.

  Parameters:
  lang (str): A key of `language_to_QID` (for example "SUX").

  Returns:
  str: The query text. Its SELECT clause exposes ?lexeme, ?lexemeLabel, ?form,
  ?formlabel, ?lexical_category and ?lexical_categoryLabel.

  Raises:
  ValueError: If `lang` is not a key of `language_to_QID`.
  """
  # Fail loudly rather than interpolating the text "wd:None" into the query.
  if lang not in language_to_QID :
    raise ValueError(
      f"Unsupported language code {lang!r}; expected one of {sorted(language_to_QID)}"
    )

  # The query is assembled from three pieces so that only the middle one needs
  # interpolation; the outer pieces contain literal SPARQL braces.
  query_head = """
  # tool: ordia
  # title: List of lexemes for a language
  SELECT
    ?lexeme ?lexemeLabel ?form ?formlabel
    ?lexical_category ?lexical_categoryLabel
  WITH {
    SELECT ?lexeme (GROUP_CONCAT(DISTINCT ?lexemeLab; SEPARATOR = " // ") AS ?lexemeLabel) ?lexical_category ?form (GROUP_CONCAT(DISTINCT ?formLab; SEPARATOR = " // ") AS ?formlabel) WHERE {
      ?lexeme a ontolex:LexicalEntry ; """

  language_clause = f"""
              dct:language wd:{language_to_QID[lang]} ;
              wikibase:lemma ?lexemeLab ."""

  query_tail = """
      OPTIONAL {
        ?lexeme wikibase:lexicalCategory ?lexical_category .
      }
    ?lexeme ontolex:lexicalForm ?form .
    ?form ontolex:representation ?formLab .
    }Group by ?lexeme ?lexical_category ?form
  } AS %results
  WHERE {
    INCLUDE %results
    OPTIONAL {
      ?lexical_category rdfs:label ?lexical_categoryLabel .
      FILTER (LANG(?lexical_categoryLabel) = "en")
    }
    # SERVICE does not work!?
    # SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
  }
    """
  return query_head + language_clause + query_tail

In [15]:
# Preview the generated query for Sumerian ("SUX").
print(getShortLang("SUX"))


  # tool: ordia
  # title: List of lexemes for a language
  SELECT
    ?lexeme ?lexemeLabel ?form ?formlabel
    ?lexical_category ?lexical_categoryLabel
  WITH {
    SELECT ?lexeme (GROUP_CONCAT(DISTINCT ?lexemeLab; SEPARATOR = " // ") AS ?lexemeLabel) ?lexical_category ?form (GROUP_CONCAT(DISTINCT ?formLab; SEPARATOR = " // ") AS ?formlabel) WHERE {
      ?lexeme a ontolex:LexicalEntry ; 
              dct:language wd:Q36790 ;
              wikibase:lemma ?lexemeLab .
      OPTIONAL {
        ?lexeme wikibase:lexicalCategory ?lexical_category .
      }
    ?lexeme ontolex:lexicalForm ?form .
    ?form ontolex:representation ?formLab .
    }Group by ?lexeme ?lexical_category ?form
  } AS %results
  WHERE {
    INCLUDE %results
    OPTIONAL {
      ?lexical_category rdfs:label ?lexical_categoryLabel .
      FILTER (LANG(?lexical_categoryLabel) = "en")
    }
    # SERVICE does not work!?
    # SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
  }
    


In [16]:
def get_results(endpoint_url, query) :
    """
    Send `query` to the SPARQL endpoint at `endpoint_url` and return the
    converted response.

    Parameters:
    endpoint_url (str): URL of the SPARQL endpoint.
    query (str): SPARQL query text.

    Returns:
    The value produced by `SPARQLWrapper.query().convert()` with the return
    format set to JSON (callers in this file index it like a dict).
    """
    major, minor = sys.version_info[0], sys.version_info[1]
    # TODO adjust user agent; see https://w.wiki/CX6
    client = SPARQLWrapper(endpoint_url, agent=f"WDQS-example Python/{major}.{minor}")
    client.setQuery(query)
    client.setReturnFormat(JSON)
    response = client.query()
    return response.convert()

def queryFunc(lang, char) :
    """
    Build the SPARQL query that lists the lexeme forms of one language whose
    representation starts with `char`.

    Parameters:
    lang (str): A key of `language_to_QID` (for example "EN").
    char: Prefix the form representation must start with. It is converted to
        text and escaped before being placed inside the double-quoted SPARQL
        string literal of the STRSTARTS filter.

    Returns:
    str: The query text. Its SELECT clause exposes ?lexeme, ?lexemeLabel, ?form,
    ?formlabel, ?lexical_category and ?lexical_categoryLabel.

    Raises:
    ValueError: If `lang` is not a key of `language_to_QID`.
    """
    # Fail loudly rather than interpolating the text "wd:None" into the query.
    if lang not in language_to_QID :
        raise ValueError(
            f"Unsupported language code {lang!r}; expected one of {sorted(language_to_QID)}"
        )

    # Escape what would otherwise terminate or corrupt the quoted literal.
    # Backslash must be handled first so the escapes added below are not
    # themselves doubled.
    prefix = (
        str(char)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )

    queryOne = '''
# tool: ordia
# title: List of lexemes for a language
SELECT
  ?lexeme ?lexemeLabel ?form ?formlabel
  ?lexical_category ?lexical_categoryLabel
WITH {
  SELECT ?lexeme (GROUP_CONCAT(DISTINCT ?lexemeLab; SEPARATOR = " // ") AS ?lexemeLabel) ?lexical_category ?form (GROUP_CONCAT(DISTINCT ?formLab; SEPARATOR = " // ") AS ?formlabel) WHERE {'''
    queryTwo = f'''
    ?lexeme a ontolex:LexicalEntry ;
            dct:language wd:{language_to_QID[lang]} ;
            wikibase:lemma ?lexemeLab .
    FILTER(STRSTARTS(?formLab, "'''

    queryFour = '''"))
    OPTIONAL {
      ?lexeme wikibase:lexicalCategory ?lexical_category .
    }
   ?lexeme ontolex:lexicalForm ?form .
   ?form ontolex:representation ?formLab .
  }Group by ?lexeme ?lexical_category ?form
} AS %results
WHERE {
  INCLUDE %results
  OPTIONAL {
    ?lexical_category rdfs:label ?lexical_categoryLabel .
    FILTER (LANG(?lexical_categoryLabel) = "en")
  }
  # SERVICE does not work!?
  # SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
}
'''
    return queryOne + queryTwo + prefix + queryFour

In [18]:
def query_To_DF_With_Char(lang, char):
  """
  Run the first-character-filtered lexeme query and return its rows as a
  data frame.

  Parameters:
  lang (str): Language code, forwarded to `queryFunc`.
  char (str): Prefix the form representation must start with, forwarded to
      `queryFunc`.

  Returns:
  pandas.core.frame.DataFrame: Whatever `query_To_DF` returns for that query.
  """
  # Delegate instead of repeating the binding-to-row conversion here.
  return query_To_DF(queryFunc(lang, char))

def query_To_DF(query):
  """
  Run a SPARQL SELECT query against `endpoint_url` and return its rows as a
  data frame.

  Parameters:
  query (str): SPARQL query text, passed to `get_results`.

  Returns:
  pandas.core.frame.DataFrame: One row per result binding, holding the
  "value" of each bound variable. When the response carries a "head"/"vars"
  list, it defines the columns (and their order), so an empty result set still
  has the expected columns and unbound variables show up as missing values.
  Without that list the columns are inferred from the rows.
  """
  response = get_results(endpoint_url, query)

  # Each binding maps variable name -> {"type": ..., "value": ...}; keep only
  # the value.
  rows = [
      {variable: cell.get("value") for variable, cell in binding.items()}
      for binding in response["results"]["bindings"]
  ]

  # None makes pandas infer the columns from the rows.
  columns = (response.get("head") or {}).get("vars")
  return pd.DataFrame(rows, columns=columns)

In [27]:
def get_forms(lang) :
    """
    This function takes a language and returns all the Wikidata form data as a data frame.
    The columns come from the query's SELECT variables: "lexeme", "lexemeLabel", "form", "formlabel", "lexical_category", and "lexical_categoryLabel"
    (the two lexical-category variables are OPTIONAL in the query, so they may be missing or empty).

    For SUX, AKK, TR, HIT and AR a single unfiltered query is issued. For every
    other language one query is issued per candidate first character and the
    per-character frames are concatenated, so forms that start with a character
    outside the candidate set are not returned.

    Parameters:
    param1 (str): Must be the ISO code for the following languages: Akkadian (AKK), Arabic (AR), Czech (CS), German (DE), English (EN), French (FR), Hebrew (HE), Hittite (HIT), Italian (IT), Russian (RU), Sumerian (SUX), Turkish (TR), Latin (LA)

    Returns:
    pandas.core.frame.DataFrame

    Raises:
    ValueError: If `lang` is not a key of `language_to_QID`.
    """

    # Reject unknown codes up front instead of firing one request per
    # character for a language the queries cannot resolve.
    if lang not in language_to_QID :
        raise ValueError(
            f"Unsupported language code {lang!r}; expected one of {sorted(language_to_QID)}"
        )

    alphabet = string.ascii_lowercase + string.ascii_uppercase
    accents = 'àâäæçéèêëîïôœùûüÿÀÂÄÆÇÉÈÊËÎÏÔŒÙÛÜŸ'
    special_chars = 'äöüßÄÖÜŒéàăŪñãõ'
    numbers = '1234567890'
    symbols = "-`å¨'.,/¿¡@€£¢$!?&~#"

    # Default candidate set, used for every language without its own set.
    romance_chars = alphabet + accents + special_chars + numbers + symbols

    hebrew_alphabet = [
        'א', 'ב', 'ג', 'ד', 'ה', 'ו', 'ז', 'ח', 'ט', 'י',
        'כ', 'ך', 'ל', 'מ', 'ם', 'נ', 'ן', 'ס', 'ע', 'פ',
        'ף', 'צ', 'ץ', 'ק', 'ר', 'ש', 'ת',
        '׳', '״', 'ֽ', '׀', '—', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0'
    ]

    # The digraph "ch" is written out here, which only repeats "c" and "h";
    # the repeats are dropped by the de-duplication below.
    czech_alphabet = (
        "aábcčdďeéěfghichíjklmnňoópqrřsštťuúůvwxyzž" +
        "AÁBCČDĎEÉĚFGHICHÍJKLMNOPQRSŘŠTŤUÚŮVWXYZŽ" +
        "1234567890-`å¨'.,/¿¡@€£¢$!?&~#"
    )

    # These languages are fetched with one unfiltered query (presumably their
    # result sets are small enough for a single request -- TODO confirm).
    if lang in ("SUX", "AKK", "TR", "HIT", "AR") :
        return query_To_DF(getShortLang(lang))

    # Exactly one candidate set per language. The original used a second
    # independent `if`, so Czech also ran the whole default set.
    if lang == "CS" :
        candidates = czech_alphabet
    elif lang == "HE" :
        candidates = hebrew_alphabet
    else :
        candidates = romance_chars

    # The sets overlap internally (e.g. accents vs special_chars). Querying a
    # character twice would return the same rows twice, so de-duplicate while
    # keeping first-seen order.
    first_chars = list(dict.fromkeys(candidates))

    frames = [query_To_DF_With_Char(lang, first_char) for first_char in first_chars]

    # ignore_index gives the combined frame one continuous index, like the
    # single-query path, instead of repeating each frame's 0..n labels.
    return pd.concat(frames, ignore_index=True)