In [51]:
import os.path
import textwrap
import requests

def generate_sparql_query(fullName, property_labels_to_ids, language='en'):
    """
    Query WikiData for the properties of the given person listed in the given property map.
    All properties that are simple values without a label must have an "_id" suffix, all date
    properties must begin with "date"
    :param fullName: 
    :param property_labels_to_ids: 
    :param language: 
    :return: 
    """
    propSelection = ""
    for label, pid in property_labels_to_ids.items():
        if label.endswith("_id") or label.startswith("image"):
            propSelection += f"""
                OPTIONAL {{ ?item wdt:{pid} ?{label}. }}"""
        elif label.startswith("date"):  
            # Dates, fetched directly but need special handling for formatting if desired
            propSelection += f"""
                OPTIONAL {{ ?item wdt:{pid} ?{label}. }}"""
        else:
            propSelection += f"""
                OPTIONAL {{ ?item wdt:{pid} ?{label}Id .
                   ?{label}Id rdfs:label ?{label} FILTER(LANG(?{label}) = "{language}") .
                   SERVICE wikibase:label {{ bd:serviceParam wikibase:language "{language}". }} }}"""

    query = textwrap.dedent(f"""
    SELECT DISTINCT ?item ?itemLabel {"".join([f"(SAMPLE(?{label}) AS ?{label})" for label in property_labels_to_ids])}
    WHERE {{
          ?item wdt:P31 wd:Q5; rdfs:label "{fullName}"@{language}.
          {textwrap.dedent(propSelection)}
    }}
    GROUP BY ?item ?itemLabel 
    """)
    return query

def construct_image_url(filename):
    return f"https://commons.wikimedia.org/wiki/Special:FilePath/{requests.utils.quote(filename)}"


def get_wikipedia_links(qid, languages):
    """
    Fetch Wikipedia links for a given Wikidata QID and a list of languages.

    Parameters:
    - qid (str): The QID of the Wikidata item.
    - languages (list): A list of language codes (e.g., ['en', 'de']).

    Returns:
    - dict: A dictionary with languages as keys and Wikipedia URLs as values.
    """
    url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbgetentities",
        "ids": qid,
        "props": "sitelinks",
        "format": "json"
    }

    response = requests.get(url, params=params)
    data = response.json()

    links = {}
    if "entities" in data and qid in data["entities"]:
        sitelinks = data["entities"][qid].get("sitelinks", {})
        for lang in languages:
            sitekey = f"{lang}wiki"
            if sitekey in sitelinks:
                links[lang] = sitelinks[sitekey]["url"]
            else:
                links[lang] = None  # Or use '' to represent absence of link

    return links


def query_wikidata(fullName, property_map, language='en'):
    SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
    query = generate_sparql_query(fullName, property_map, language)
    headers = {'User-Agent': 'Mozilla/5.0', 'Accept': 'application/json'}
    response = requests.get(SPARQL_ENDPOINT, headers=headers, params={'query': query, 'format': 'json'})

    if response.status_code != 200:
        response.raise_for_status()
        
    results = response.json()['results']['bindings']
    
    if not results:
        return None
    
    # Initialize with fullName to ensure it appears first
    data = {
        'fullName': fullName
    }
    
    # use first result
    result = results[0]
    
    # iterate over fields
    for label in property_map:
        if label in result:
            value = result[label]['value']
            data[label] = value
        else:
            data[label] = None
            
    # add item URI
    data['item'] = os.path.basename(result['item']['value'])
        
    return data


def get_person_info_from_wikidata(names, property_map, language='en'):
    all_data = []
    for fullName in names:
        data = query_wikidata(fullName, property_map, language)
        if data:
            all_data.append(data)
    if all_data:
        # Ensure fullName appears first by reordering columns based on property_labels_to_ids keys
        columns_order = ['fullName', 'item'] + list(property_map.keys())
        df = pd.DataFrame(all_data, columns=columns_order)
    else:
        df = pd.DataFrame(columns=['fullName'] + list(property_map.keys()))
    return df

In [52]:
# Now calling the updated function with the 'language' parameter
property_labels_to_ids = {
    'sexOrGender': 'P21',
    'image': 'P18',
    'countryOfCitizenship': 'P27',
    'givenName': 'P735',
    'familyName': 'P734',
    'dateOfBirth': 'P569',
    'dateOfDeath': 'P570',
    'occupation': 'P106',
    'fieldOfWork': 'P101',
    'employer': 'P108',
    'viaf_id': 'P214',
    'isni_id': 'P213',
    'gnd_id': 'P227'
}

scholars = [
    "Hans Kelsen",
    "Hugo Sinzheimer",
    "Karl Renner",
    "Ernst Fraenkel",
    "Franz Leopold Neumann",
    "Otto Kahn-Freund",
    "Otto Kirchheimer",
    "Herrmann Kantorowicz",
    "Ludwig Bendix",
    "Arthur Nussbaum",
    "Theodor Geiger",
    "Erhard Blankenburg",
    "Wolfgang Kaupen",
    "Rüdiger Lautmann",
    "Thilo Ramm",
    "Rudolf Wiethölter",
    "Niklas Luhmann",
    "Gunther Teubner",
    "Volkmar Gessner"
]
df = get_person_info_from_wikidata(scholars, property_labels_to_ids)
df

Unnamed: 0,fullName,item,sexOrGender,image,countryOfCitizenship,givenName,familyName,dateOfBirth,dateOfDeath,occupation,fieldOfWork,employer,viaf_id,isni_id,gnd_id
0,Hans Kelsen,Q84165,male,http://commons.wikimedia.org/wiki/Special:File...,Cisleithania,Hans,Kelsen,1881-10-11T00:00:00Z,1973-04-19T00:00:00Z,judge,international law,Charles University,31998356,0000000121266076,118561219.0
1,Hugo Sinzheimer,Q86043,male,http://commons.wikimedia.org/wiki/Special:File...,Germany,Hugo,Sinzheimer,1875-04-12T00:00:00Z,1945-09-16T00:00:00Z,lawyer,,Goethe University Frankfurt,27864307,0000000109619641,118614711.0
2,Karl Renner,Q11726,male,http://commons.wikimedia.org/wiki/Special:File...,Cisleithania,Karl,Renner,1870-12-14T00:00:00Z,1950-12-31T00:00:00Z,lawyer,politics,Austrian Federal Government,61669459,0000000121358165,118599739.0
3,Ernst Fraenkel,Q86812,male,,Germany,Ernst,Fraenkel,1898-12-26T00:00:00Z,1975-03-28T00:00:00Z,lawyer,,Free University Berlin,27108403,0000000110230959,118534602.0
4,Franz Leopold Neumann,Q112562068,male,,,Leopold,Neumann,,,printer,publishing,,637163874508945722514,,
5,Otto Kahn-Freund,Q121832,male,http://commons.wikimedia.org/wiki/Special:File...,Germany,Otto,,1900-11-17T00:00:00Z,1979-08-16T00:00:00Z,judge,,University of Oxford,76317591,0000000109168959,118559362.0
6,Otto Kirchheimer,Q214397,male,,Germany,Otto,Kirchheimer,1905-11-11T00:00:00Z,1965-11-22T00:00:00Z,jurist,,Office of Strategic Services,32042801,0000000081110244,118562371.0
7,Ludwig Bendix,Q28053205,male,,,Ludwig,Bendix,1857-10-28T00:00:00Z,1923-09-28T00:00:00Z,university teacher,,,88720482,0000000061811334,1023309920.0
8,Arthur Nussbaum,Q103088,male,http://commons.wikimedia.org/wiki/Special:File...,United States of America,Arthur,Nussbaum,1877-01-31T00:00:00Z,1964-11-22T00:00:00Z,lawyer,law,Columbia University,5180962,0000000120988288,117071676.0
9,Theodor Geiger,Q96410,male,,Germany,Theodor,Geiger,1891-11-09T00:00:00Z,1952-06-16T00:00:00Z,university teacher,,Technical University of Braunschweig,56667946,0000000109038951,118538187.0


In [53]:
df.to_csv("scholars.csv", index=False)

In [44]:
import pandas as pd
from datetime import datetime

# Assuming df is your existing DataFrame

# Convert dateOfBirth and dateOfDeath to just the year, handle NaT/NaN appropriately
df['Year'] = pd.to_datetime(df['dateOfBirth'], errors='coerce').dt.year.astype('Int64')
df['End Year'] = pd.to_datetime(df['dateOfDeath'], errors='coerce').dt.year.astype('Int64')

# Create 'Display Date' as "dateOfBirth - dateOfDeath"
df['Display Date'] = df['Year'].astype(str).replace('<NA>','')  + ' - ' + df['End Year'].astype(str).replace('<NA>','')

# Create 'Headline' as "fullName (dateOfBirth - dateOfDeath)"
df['Headline'] = df['fullName'] + ' (' + df['Display Date'] + ')'

# Create 'Text' column by combining occupation, fieldOfWork, employer
df['Text'] = df[['occupation', 'fieldOfWork']].apply(lambda x: '<br>'.join(x.dropna()), axis=1)

# Use the image directly; assuming the URLs are already correctly formed in the 'image' column
df['Media'] = df['image']

# Add a "Group" column with the value "actors" for all rows
df['Group'] = 'actors'

# fix date columns
df['Display Date'] = df['Display Date'].fillna('')  # Ensure no NaNs in Display Date
df['Headline'] = df['Headline'].fillna('')  # Ensure no NaNs in Headline
df['Text'] = df['Text'].fillna('')  # Ensure no NaNs in Text
df['Media'] = df['Media'].fillna('')  # Ensure no NaNs in Media

# Now select and order the DataFrame according to the TimelineJS template requirements
columns = "Year	Month	Day	Time	End Year	End Month	End Day	End Time	Display Date	Headline	Text	Media	Media Credit	Media Caption	Media Thumbnail	Type	Group	Background	Link".split("\t")
for col in columns:
    if col not in df:
        df[col] = ''
timeline_df = df[columns]

timeline_df.to_excel("timeline_data.xlsx", index=False)
