In [45]:
import requests
import pandas as pd

def generate_sparql_query(fullName, property_labels_to_ids, language='en'):
    """
    Query WikiData for the properties of the given person listed in the given property map.
    All properties that are simple values without a label must have an "_id" suffix, all date
    properties must begin with "date"
    :param fullName: 
    :param property_labels_to_ids: 
    :param language: 
    :return: 
    """
    propSelection = ""
    for label, pid in property_labels_to_ids.items():
        if label.endswith("_id") or label.startswith("image"):
            propSelection += f"OPTIONAL {{ ?item wdt:{pid} ?{label}. }}"
        elif label.startswith("date"):  
            # Dates, fetched directly but need special handling for formatting if desired
            propSelection += f"OPTIONAL {{ ?item wdt:{pid} ?{label}. }}"            
        else:
            propSelection += f"""OPTIONAL {{ ?item wdt:{pid} ?{label}Id .
                                   ?{label}Id rdfs:label ?{label} FILTER(LANG(?{label}) = "{language}") .
                                   SERVICE wikibase:label {{ bd:serviceParam wikibase:language "{language}". }} }}"""

    # Include English and German Wikipedia URLs
    wikipediaURLs = """
    OPTIONAL { ?article schema:about ?item; schema:inLanguage "en"; schema:isPartOf <https://en.wikipedia.org/>. BIND(CONCAT(STR(?article)) AS ?englishWikipedia) }
    OPTIONAL { ?article schema:about ?item; schema:inLanguage "de"; schema:isPartOf <https://de.wikipedia.org/>. BIND(CONCAT(STR(?article)) AS ?germanWikipedia) }
    """

    query = f"""
    SELECT DISTINCT ?itemLabel {"".join([f"(SAMPLE(?{label}) AS ?{label})" for label in property_labels_to_ids])} ?englishWikipedia ?germanWikipedia WHERE {{
      ?item wdt:P31 wd:Q5; rdfs:label "{fullName}"@{language}.
      {propSelection}
      {wikipediaURLs}
    }}
    GROUP BY ?itemLabel ?englishWikipedia ?germanWikipedia
    """
    return query

def construct_image_url(filename):
    return f"https://commons.wikimedia.org/wiki/Special:FilePath/{requests.utils.quote(filename)}"


def query_wikidata(fullName, property_labels_to_ids, language='en'):
    SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
    query = generate_sparql_query(fullName, property_labels_to_ids, language)
    headers = {'User-Agent': 'Mozilla/5.0', 'Accept': 'application/json'}
    response = requests.get(SPARQL_ENDPOINT, headers=headers, params={'query': query, 'format': 'json'})

    if response.status_code == 200:
        results = response.json()['results']['bindings']
        data = {'fullName': fullName}  # Initialize with fullName to ensure it appears first
        if results:
            for label in property_labels_to_ids:
                if f"{label}" in results[0]:
                    value = results[0][f"{label}"]['value']
                    #if label.startswith("image"):
                    #    value = construct_image_url(value)
                    data[label] = value
                else:
                    data[label] = None
            return data
    return None

def properties_to_dataframe(names, property_labels_to_ids, language='en'):
    all_data = []
    for fullName in names:
        data = query_wikidata(fullName, property_labels_to_ids, language)
        if data:
            all_data.append(data)
    if all_data:
        # Ensure fullName appears first by reordering columns based on property_labels_to_ids keys
        columns_order = ['fullName'] + list(property_labels_to_ids.keys())
        df = pd.DataFrame(all_data, columns=columns_order)
    else:
        df = pd.DataFrame(columns=['fullName'] + list(property_labels_to_ids.keys()))
    return df



# Now calling the updated function with the 'language' parameter
property_labels_to_ids = {
    'sexOrGender': 'P21',
    'image': 'P18',
    'countryOfCitizenship': 'P27',
    'givenName': 'P735',
    'familyName': 'P734',
    'dateOfBirth': 'P569',
    'dateOfDeath': 'P570',
    'occupation': 'P106',
    'fieldOfWork': 'P101',
    'employer': 'P108',
    'viaf_id': 'P214',
    'isni_id': 'P213',
    'gnd_id': 'P227'
}



In [46]:
scholars = [
    "Hans Kelsen",
    "Hugo Sinzheimer",
    "Karl Renner",
    "Ernst Fraenkel",
    "Franz Leopold Neumann",
    "Otto Kahn-Freund",
    "Otto Kirchheimer",
    "Herrmann Kantorowicz",
    "Ludwig Bendix",
    "Arthur Nussbaum",
    "Theodor Geiger",
    "Erhard Blankenburg",
    "Wolfgang Kaupen",
    "Rüdiger Lautmann",
    "Thilo Ramm",
    "Rudolf Wiethölter",
    "Niklas Luhmann",
    "Gunther Teubner"
]
df = properties_to_dataframe(scholars, property_labels_to_ids)
df

Unnamed: 0,fullName,sexOrGender,image,countryOfCitizenship,givenName,familyName,dateOfBirth,dateOfDeath,occupation,fieldOfWork,employer,viaf_id,isni_id,gnd_id
0,Hans Kelsen,male,http://commons.wikimedia.org/wiki/Special:File...,Czechoslovakia,Hans,Kelsen,1881-10-11T00:00:00Z,1973-04-19T00:00:00Z,lawyer,law,Charles University,31998356,0000000121266076,118561219.0
1,Hugo Sinzheimer,male,http://commons.wikimedia.org/wiki/Special:File...,Germany,Hugo,Sinzheimer,1875-04-12T00:00:00Z,1945-09-16T00:00:00Z,lawyer,,University of Amsterdam,27864307,0000000109619641,118614711.0
2,Karl Renner,male,http://commons.wikimedia.org/wiki/Special:File...,First Republic of Austria,Karl,Renner,1870-12-14T00:00:00Z,1950-12-31T00:00:00Z,lawyer,politics,Austrian Federal Government,61669459,0000000121358165,118599739.0
3,Ernst Fraenkel,male,,Germany,Ernst,Fraenkel,1891-04-05T00:00:00Z,1971-08-18T00:00:00Z,university teacher,,Goethe University Frankfurt,50078162,,121259854.0
4,Franz Leopold Neumann,male,,,Leopold,Neumann,,,printer,publishing,,637163874508945722514,,
5,Otto Kahn-Freund,male,http://commons.wikimedia.org/wiki/Special:File...,Germany,Otto,,1900-11-17T00:00:00Z,1979-08-16T00:00:00Z,judge,,University of Oxford,76317591,0000000109168959,118559362.0
6,Otto Kirchheimer,male,,Germany,Otto,Kirchheimer,1905-11-11T00:00:00Z,1965-11-22T00:00:00Z,jurist,,Office of Strategic Services,32042801,0000000081110244,118562371.0
7,Ludwig Bendix,male,,Germany,Ludwig,Bendix,1877-06-28T00:00:00Z,1954-01-03T00:00:00Z,lawyer,,,74647579,0000000081553379,118702033.0
8,Arthur Nussbaum,male,http://commons.wikimedia.org/wiki/Special:File...,Germany,Arthur,Nussbaum,1877-01-31T00:00:00Z,1964-11-22T00:00:00Z,lawyer,international law,Frederick William University Berlin,5180962,0000000120988288,117071676.0
9,Theodor Geiger,male,,Germany,Theodor,Geiger,1891-11-09T00:00:00Z,1952-06-16T00:00:00Z,sociologist,,Technical University of Braunschweig,56667946,0000000109038951,118538187.0


In [22]:
df.to_csv("scholars.csv", index=False)

In [44]:
import pandas as pd
from datetime import datetime

# Assuming df is your existing DataFrame

# Convert dateOfBirth and dateOfDeath to just the year, handle NaT/NaN appropriately
df['Year'] = pd.to_datetime(df['dateOfBirth'], errors='coerce').dt.year.astype('Int64')
df['End Year'] = pd.to_datetime(df['dateOfDeath'], errors='coerce').dt.year.astype('Int64')

# Create 'Display Date' as "dateOfBirth - dateOfDeath"
df['Display Date'] = df['Year'].astype(str).replace('<NA>','')  + ' - ' + df['End Year'].astype(str).replace('<NA>','')

# Create 'Headline' as "fullName (dateOfBirth - dateOfDeath)"
df['Headline'] = df['fullName'] + ' (' + df['Display Date'] + ')'

# Create 'Text' column by combining occupation, fieldOfWork, employer
df['Text'] = df[['occupation', 'fieldOfWork']].apply(lambda x: '<br>'.join(x.dropna()), axis=1)

# Use the image directly; assuming the URLs are already correctly formed in the 'image' column
df['Media'] = df['image']

# Add a "Group" column with the value "actors" for all rows
df['Group'] = 'actors'

# fix date columns
df['Display Date'] = df['Display Date'].fillna('')  # Ensure no NaNs in Display Date
df['Headline'] = df['Headline'].fillna('')  # Ensure no NaNs in Headline
df['Text'] = df['Text'].fillna('')  # Ensure no NaNs in Text
df['Media'] = df['Media'].fillna('')  # Ensure no NaNs in Media

# Now select and order the DataFrame according to the TimelineJS template requirements
columns = "Year	Month	Day	Time	End Year	End Month	End Day	End Time	Display Date	Headline	Text	Media	Media Credit	Media Caption	Media Thumbnail	Type	Group	Background	Link".split("\t")
for col in columns:
    if col not in df:
        df[col] = ''
timeline_df = df[columns]

timeline_df.to_excel("timeline_data.xlsx", index=False)
