In [None]:
import pandas as pd

# Build a country-name -> Wikidata URI lookup.
# countries.csv is read without a header row as (Country, WikidataURI).
country_to_wikidata_df = pd.read_csv('countries.csv', header=None, names=['Country', 'WikidataURI'])

country_to_wikidata = dict(zip(country_to_wikidata_df['Country'], country_to_wikidata_df['WikidataURI']))

with open("sports_data.ttl", "r", encoding="utf-8") as ttl_file:
    ttl_lines = ttl_file.readlines()

# Copy the data to updated.ttl, adding a second schema:birthPlace triple that
# points at the Wikidata entity whenever the quoted literal is a known country.
with open("updated.ttl", "w", encoding="utf-8") as updated_ttl_file:
    for line in ttl_lines:
        wikidata_uri = None

        # Only birthPlace lines carrying a quoted literal are candidates;
        # the literal is taken to be the text between the first pair of quotes.
        if "schema:birthPlace" in line and '"' in line:
            birth_place_literal = line.split('"')[1]
            wikidata_uri = country_to_wikidata.get(birth_place_literal)

        if wikidata_uri is None:
            # Nothing to link: copy the line through unchanged
            updated_ttl_file.write(line)
            continue

        statement = line.rstrip()
        if statement.endswith('.'):
            # The literal closes the subject block. Appending a ';'-terminated
            # triple after the '.' would leave a predicate without a subject
            # (invalid Turtle), so keep the block open and close it after the URI.
            updated_ttl_file.write(statement[:-1].rstrip() + ' ;\n')
            updated_ttl_file.write(f'    schema:birthPlace <{wikidata_uri}> .\n')
        else:
            updated_ttl_file.write(line)
            updated_ttl_file.write(f'    schema:birthPlace <{wikidata_uri}> ;\n')

print("Updated TTL saved to 'updated.ttl'")

Updated TTL saved to 'updated.ttl'


In [11]:
# Sanity check: re-read the enriched file and list every quoted birthPlace
# value that has no entry in the country -> Wikidata mapping.
with open("updated.ttl", "r", encoding="utf-8") as updated_ttl_file:
    updated_ttl_lines = updated_ttl_file.readlines()

# The literal is taken to be the text between the first pair of quotes
birth_place_literals = (
    ttl_line.split('"')[1]
    for ttl_line in updated_ttl_lines
    if "schema:birthPlace" in ttl_line and '"' in ttl_line
)

unmatched_country_names = {
    literal for literal in birth_place_literals if literal not in country_to_wikidata
}

print(f"Number of unmatched country names: {len(unmatched_country_names)}")
print("Unmatched country names:")
for country_name in unmatched_country_names:
    print(country_name)

Number of unmatched country names: 0
Unmatched country names:


In [15]:
# Gather every distinct quoted schema:roleName value from the original data
with open("sports_data.ttl", "r", encoding="utf-8") as ttl_file:
    ttl_lines = ttl_file.readlines()

# The position is taken to be the text between the first pair of quotes
positions = {
    ttl_line.split('"')[1]
    for ttl_line in ttl_lines
    if "schema:roleName" in ttl_line and '"' in ttl_line
}

print("Positions:")
for position in positions:
    print(position)

Positions:
Centre-Back
Right-Back
Centre-Forward
Goalkeeper
Defence
Left Winger
Central Midfield
Midfield
Defensive Midfield
Right Winger
Right Midfield
Left Midfield
Attacking Midfield
Offence
Left-Back


In [16]:
# Build a position-name -> Wikidata URI lookup.
# positions.csv is read without a header row as (Position, WikidataURI).
position_to_wikidata_df = pd.read_csv('positions.csv', header=None, names=['Position', 'WikidataURI'])

position_to_wikidata = dict(zip(position_to_wikidata_df['Position'], position_to_wikidata_df['WikidataURI']))

with open("updated.ttl", "r", encoding="utf-8") as updated_ttl_file:
    ttl_lines = updated_ttl_file.readlines()

# Copy the data to updated2.ttl, adding a second schema:roleName triple that
# points at the Wikidata entity whenever the quoted literal is a known position.
with open("updated2.ttl", "w", encoding="utf-8") as updated_ttl_file:
    for line in ttl_lines:
        wikidata_uri = None

        # Only roleName lines carrying a quoted literal are candidates;
        # the literal is taken to be the text between the first pair of quotes.
        if "schema:roleName" in line and '"' in line:
            position_literal = line.split('"')[1]
            wikidata_uri = position_to_wikidata.get(position_literal)

        if wikidata_uri is None:
            # Nothing to link: copy the line through unchanged
            updated_ttl_file.write(line)
            continue

        statement = line.rstrip()
        if statement.endswith('.'):
            # The literal closes the subject block. Appending a ';'-terminated
            # triple after the '.' would leave a predicate without a subject
            # (invalid Turtle), so keep the block open and close it after the URI.
            updated_ttl_file.write(statement[:-1].rstrip() + ' ;\n')
            updated_ttl_file.write(f'    schema:roleName <{wikidata_uri}> .\n')
        else:
            updated_ttl_file.write(line)
            updated_ttl_file.write(f'    schema:roleName <{wikidata_uri}> ;\n')

print("Updated TTL saved to 'updated2.ttl'")

Updated TTL saved to 'updated2.ttl'


In [19]:
# Sanity check: re-read the position-enriched file and list every quoted
# roleName value that has no entry in the position -> Wikidata mapping.
with open("updated2.ttl", "r", encoding="utf-8") as updated_ttl_file:
    updated_ttl_lines = updated_ttl_file.readlines()

unmatched_positions = set()

for line in updated_ttl_lines:
    # Only roleName lines carrying a quoted literal are inspected
    if "schema:roleName" in line and '"' in line:
        # The position is taken to be the text between the first pair of quotes
        position_literal = line.split('"')[1]

        if position_literal not in position_to_wikidata:
            unmatched_positions.add(position_literal)

print(f"Number of unmatched positions: {len(unmatched_positions)}")
print("Unmatched positions:")
# Actually list the unmatched values (the header used to be printed with nothing
# after it); sorted so the report is stable between runs.
for position in sorted(unmatched_positions):
    print(position)

Number of unmatched positions: 0
Unmatched positions:


In [12]:
# Fetch the names of all schema:SportsTeam resources from the local SPARQL
# endpoint and print them. Matching these names against Wikidata URIs is done
# in a later cell.

import requests

sparqlEndpoint = "http://localhost:3030/LinkedFootball/query"

query = " PREFIX schema: <https://schema.org/> SELECT ?name WHERE { ?team a schema:SportsTeam ; schema:name ?name . } "

# A timeout keeps the cell from hanging forever if the endpoint is not running
response = requests.post(
    sparqlEndpoint,
    data={'query': query, 'format': 'json'},
    timeout=30,
)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON
response.raise_for_status()

data = response.json()

teams = set()

# Each binding row holds the ?name variable selected by the query
for result in data['results']['bindings']:
    teams.add(result['name']['value'])

print("Teams:")
for team in teams:
    print(team)


Teams:
Liverpool FC
VfB Stuttgart
Parma Calcio 1913
Holstein Kiel
Juventus FC
FC Porto
Valencia CF
CF Estrela da Amadora
CD Nacional
Bologna FC 1909
UD Las Palmas
Leicester City FC
CA Osasuna
SC Farense
Ipswich Town FC
Deportivo Alavés
Brighton & Hove Albion FC
Villarreal CF
CD Leganés
Sevilla FC
CD Santa Clara
SS Lazio
Real Valladolid CF
RCD Mallorca
Fulham FC
Athletic Club
AS Roma
Cagliari Calcio
TSG 1899 Hoffenheim
Crystal Palace FC
Boavista FC
Sporting Clube de Portugal
Real Sociedad de Fútbol
RB Leipzig
Rayo Vallecano de Madrid
Vitória SC
AC Monza
SV Werder Bremen
FC Augsburg
1. FC Heidenheim 1846
Venezia FC
Borussia Dortmund
ACF Fiorentina
AC Milan
FC Internazionale Milano
Newcastle United FC
Sporting Clube de Braga
RCD Espanyol de Barcelona
Moreirense FC
Empoli FC
SSC Napoli
FC Bayern München
Getafe CF
West Ham United FC
Real Madrid CF
SC Freiburg
Everton FC
Udinese Calcio
Tottenham Hotspur FC
Manchester City FC
FC Famalicão
Torino FC
Brentford FC
AFC Bournemouth
FC St. Pauli 19

In [15]:
import pandas as pd
import difflib

# Read the team table (no header row; columns are League, Team, WikidataURI)
teams_wikidata_df = pd.read_csv(
    'teams_uris.csv',
    header=None,
    names=['League', 'Team', 'WikidataURI'],
)
# Index the URIs by team name; if a team name repeats, the last row wins
teams_wikidata = {
    team_name: uri
    for team_name, uri in zip(teams_wikidata_df['Team'], teams_wikidata_df['WikidataURI'])
}

# Similarity helper used for fuzzy team-name matching
def similar(a, b):
    """Return the difflib.SequenceMatcher similarity ratio of sequences a and b."""
    matcher = difflib.SequenceMatcher(None, a, b)
    return matcher.ratio()

# Map each team name from the dataset to the Wikidata URI of its most similar
# name in teams_wikidata; teams without a good enough match map to None.
results = {}

# A candidate is accepted only when its similarity ratio is strictly above this
min_ratio = 0.5

for team in teams:
    # Score every candidate exactly once and keep the best (name, score) pair.
    # The default covers an empty mapping, where max() would otherwise raise.
    best_match, best_score = max(
        ((candidate, similar(candidate, team)) for candidate in teams_wikidata.keys()),
        key=lambda scored: scored[1],
        default=(None, 0.0),
    )
    if best_score > min_ratio:
        print(f"{team} -> {teams_wikidata[best_match]}")
        results[team] = teams_wikidata[best_match]
    else:
        print(f"{team} -> No match found")
        results[team] = None

# The `results` dictionary now contains the matches
print("\nFinal Results:")
for team, uri in results.items():
    print(f"{team}: {uri}")


Liverpool FC -> http://www.wikidata.org/entity/Q1130849
VfB Stuttgart -> http://www.wikidata.org/entity/Q4512
Parma Calcio 1913 -> http://www.wikidata.org/entity/Q2693
Holstein Kiel -> http://www.wikidata.org/entity/Q157828
Juventus FC -> http://www.wikidata.org/entity/Q1422
FC Porto -> http://www.wikidata.org/entity/Q128446
Valencia CF -> http://www.wikidata.org/entity/Q10333
CF Estrela da Amadora -> http://www.wikidata.org/entity/Q838134
CD Nacional -> http://www.wikidata.org/entity/Q2641
Bologna FC 1909 -> http://www.wikidata.org/entity/Q1893
UD Las Palmas -> http://www.wikidata.org/entity/Q11979
Leicester City FC -> http://www.wikidata.org/entity/Q19481
CA Osasuna -> http://www.wikidata.org/entity/Q10286
SC Farense -> http://www.wikidata.org/entity/Q744353
Ipswich Town FC -> http://www.wikidata.org/entity/Q9653
Deportivo Alavés -> http://www.wikidata.org/entity/Q223620
Brighton & Hove Albion FC -> http://www.wikidata.org/entity/Q19453
Villarreal CF -> http://www.wikidata.org/entity

In [16]:
with open("updated2.ttl", "r", encoding="utf-8") as ttl_file:
    ttl_lines = ttl_file.readlines()

# Copy the data to updated3.ttl, adding a schema:sameAs triple after every
# schema:name line whose quoted literal was matched to a Wikidata URI.
with open("updated3.ttl", "w", encoding="utf-8") as updated_ttl_file:
    for line in ttl_lines:
        wikidata_uri = None

        # Only schema:name lines carrying a quoted literal are candidates;
        # the name is taken to be the text between the first pair of quotes.
        if "schema:name" in line and '"' in line:
            team_literal = line.split('"')[1]
            # `results` maps unmatched teams to None, so .get covers both
            # "not a known team" and "team without a match"
            wikidata_uri = results.get(team_literal)

        if wikidata_uri is None:
            # Nothing to link: copy the line through unchanged
            updated_ttl_file.write(line)
            continue

        statement = line.rstrip()
        if statement.endswith('.'):
            # The name closes the subject block. Appending a ';'-terminated
            # triple after the '.' would leave a predicate without a subject
            # (invalid Turtle), so keep the block open and close it after the URI.
            updated_ttl_file.write(statement[:-1].rstrip() + ' ;\n')
            updated_ttl_file.write(f'    schema:sameAs <{wikidata_uri}> .\n')
        else:
            updated_ttl_file.write(line)
            updated_ttl_file.write(f'    schema:sameAs <{wikidata_uri}> ;\n')

print("Updated TTL saved to 'updated3.ttl'")

In [18]:
# Add the xsd:gYear datatype to the foundingDate property of all the teams in the data

import re

# Read the TTL file. The encoding is given explicitly because the earlier cells
# write these files as UTF-8 and the platform default may differ.
with open("updated3.ttl", "r", encoding="utf-8") as file:
    ttl_data = file.read()

# Match `schema:foundingDate "<4 digits>"` and capture the year.
# The negative lookahead skips literals that already carry a datatype, so an
# already typed value is not given a second `^^` suffix.
founding_date_pattern = r'schema:foundingDate\s+"(\d{4})"(?!\^\^)'

# Rewrite each match as a gYear-typed literal
updated_ttl_data = re.sub(
    founding_date_pattern,
    r'schema:foundingDate "\1"^^<http://www.w3.org/2001/XMLSchema#gYear>',
    ttl_data
)

# Save the updated TTL file, again as UTF-8
with open("updated4.ttl", "w", encoding="utf-8") as file:
    file.write(updated_ttl_data)

print("TTL file updated with gYear datatype for foundingDate")

TTL file updated with gYear datatype for foundingDate
