In [13]:
import pandas as pd

# Load the CSV mapping of country names to Wikidata URIs.
# The file is read without a header row: column 0 is the country name and
# column 1 is the Wikidata URI.
country_to_wikidata_df = pd.read_csv('countries.csv', header=None, names=['Country', 'WikidataURI'])

# pandas reads an empty cell as NaN. A row with a missing name or URI cannot be
# linked, and keeping it would emit the bogus IRI <nan> into the output file.
complete_mapping_rows = country_to_wikidata_df.dropna(subset=['Country', 'WikidataURI'])

# Convert the complete rows to a dictionary for fast lookup
country_to_wikidata = dict(zip(complete_mapping_rows['Country'], complete_mapping_rows['WikidataURI']))

BIRTH_PLACE_PREDICATE = "schema:birthPlace"


def _add_birth_place_uri(line, uri_by_country):
    """Return `line` with the Wikidata URI added next to a birthPlace literal.

    A line is rewritten only when the `schema:birthPlace` predicate is
    immediately followed (whitespace aside) by a double-quoted literal whose
    text is a key of `uri_by_country`. In that case the URI is inserted in
    front of the literal as an additional object of the same predicate:

        schema:birthPlace "France" .
        -> schema:birthPlace <URI>, "France" .

    Using a Turtle object list keeps the statement valid whatever follows the
    literal (`;`, `.`, `,`, a language tag or a datatype). Emitting a separate
    `schema:birthPlace <URI> ;` line after the original would leave a
    predicate without a subject whenever the original line closed the subject
    block with `.`.

    Parameters:
        line: one line of the Turtle file, including its line ending.
        uri_by_country: mapping from literal text to the URI to insert.

    Returns:
        The rewritten line, or `line` unchanged when nothing applies.
    """
    predicate_at = line.find(BIRTH_PLACE_PREDICATE)
    if predicate_at == -1:
        return line

    object_at = predicate_at + len(BIRTH_PLACE_PREDICATE)
    opening_quote_at = line.find('"', object_at)
    # Anything other than whitespace between the predicate and the quote means
    # the quote does not open this predicate's first object (for example the
    # object is already an IRI, or the predicate is a longer name).
    if opening_quote_at == -1 or line[object_at:opening_quote_at].strip():
        return line

    closing_quote_at = line.find('"', opening_quote_at + 1)
    if closing_quote_at == -1:
        return line

    birth_place_literal = line[opening_quote_at + 1:closing_quote_at]
    wikidata_uri = uri_by_country.get(birth_place_literal)
    if wikidata_uri is None:
        return line

    return f'{line[:opening_quote_at]}<{wikidata_uri}>, {line[opening_quote_at:]}'


# Read the TTL file as text
with open("sports_data.ttl", "r", encoding="utf-8") as ttl_file:
    ttl_lines = ttl_file.readlines()

# Write the updated TTL: every line is copied through, and birthPlace literals
# with a known country gain the matching Wikidata URI as a second object.
with open("updated.ttl", "w", encoding="utf-8") as updated_ttl_file:
    updated_ttl_file.writelines(
        _add_birth_place_uri(line, country_to_wikidata) for line in ttl_lines
    )

print("Updated TTL saved to 'updated.ttl'")


Updated TTL saved to 'updated.ttl'


In [11]:
# Re-read updated.ttl and report every schema:birthPlace literal that has no
# entry in the country -> Wikidata mapping, i.e. could not be linked to a URI.
# Uses the `country_to_wikidata` dictionary built from countries.csv.
with open("updated.ttl", "r", encoding="utf-8") as updated_ttl_file:
    updated_ttl_lines = updated_ttl_file.readlines()

# Collect the distinct unmatched literals
unmatched_country_names = set()

for line in updated_ttl_lines:
    # Only lines that carry the birthPlace predicate together with a quoted literal
    if "schema:birthPlace" not in line or '"' not in line:
        continue

    # The literal is taken as the text between the first pair of double quotes
    birth_place_literal = line.split('"')[1]

    if birth_place_literal not in country_to_wikidata:
        unmatched_country_names.add(birth_place_literal)

# Print the count, then the names in sorted order. Iterating the set directly
# would list them in an order that can change between kernel sessions, which
# makes the saved notebook output irreproducible.
print(f"Number of unmatched country names: {len(unmatched_country_names)}")
print("Unmatched country names:")
for country_name in sorted(unmatched_country_names):
    print(country_name)

Number of unmatched country names: 0
Unmatched country names:
