## Populate the Spotify Ontology

In [15]:
# required libraries
import pandas as pd
import os
from pathlib import Path
# Load the required libraries
from rdflib import Graph, Literal, RDF, URIRef, Namespace
# rdflib knows about some namespaces, like FOAF
from rdflib.namespace import FOAF, XSD
# CHECK DATE 
import datetime
from rdflib.namespace import OWL, RDFS
import matplotlib.pyplot as plt
import networkx as nx



In [16]:
# --- Paths used throughout the notebook ---
# Project root: the parent of the directory the notebook is launched from.
_launch_dir = Path(os.path.abspath(os.getcwd()))
path = str(_launch_dir.parent.absolute())

# Input CSV holding the Spotify chart data.
_chart_csv_parts = ('INDEX', 'spotify', 'dataset', 'chart.csv')
spotify = os.path.join(path, *_chart_csv_parts)

# Output folder for the serialized Turtle files. The trailing slash is kept
# because the saving cells append the file name by plain string concatenation.
savePath = f"{path}/INDEX/spotify/dataset/rdf/"



In [17]:
# rdflib has no predefined namespace objects for these two vocabularies,
# so both are declared by hand from their base IRIs.
COUNTRIES_BASE_IRI = "http://eulersharp.sourceforge.net/2003/03swap/countries#"
SPOTIFY_BASE_IRI = "http://www.dei.unipd.it/GraphDatabases/SpotifyOntology#"

CNS = Namespace(COUNTRIES_BASE_IRI)  # countries vocabulary
SP = Namespace(SPOTIFY_BASE_IRI)     # Spotify ontology



## Country

In [39]:
# Read the chart CSV once; the Artist and Song sections reuse this frame.
spotify_data = pd.read_csv(spotify)

# One row per distinct value of the 'region' column, renamed to 'country_name'.
# The surviving rows keep their original row labels from spotify_data
# (the index is not reset). 'country_id' is a 1-based running number.
Countries = (
    spotify_data[['region']]
    .drop_duplicates()
    .rename(columns={'region': 'country_name'})
    .assign(country_id=lambda frame: range(1, len(frame) + 1))
)




In [40]:
# Fresh graph that will receive the Country individuals
g = Graph()

# Register short prefixes so the Turtle output is readable
# (prefixed names instead of full IRIs).
for prefix, namespace in (
    ("foaf", FOAF),
    ("xsd", XSD),
    ("countries", CNS),
    ("sp", SP),
):
    g.bind(prefix, namespace)


In [41]:
%%time 
#measure execution time

# Emit one sp:Country individual per row of the Countries frame.
# The URI suffix is the row's DataFrame index label (the label the row had in
# spotify_data), NOT the 'country_id' column.
for index, country_name in Countries['country_name'].items():
    Country = SP["country" + str(index)]

    # Type assertion plus the region name as an xsd:string literal
    g.add((Country, RDF.type, SP.Country))
    g.add((Country, SP['region'], Literal(country_name, datatype=XSD.string)))


CPU times: total: 0 ns
Wall time: 11.5 ms


In [42]:
%%time
# Serialize the current graph as Turtle and write it to <savePath>country.ttl
country_ttl_path = savePath + 'country.ttl'
print("--- saving serialization ---")
with open(country_ttl_path, mode='w', encoding='utf-8') as ttl_out:
    turtle_text = g.serialize(format='turtle')
    ttl_out.write(turtle_text)


--- saving serialization ---
CPU times: total: 0 ns
Wall time: 7.19 ms


## Artist

In [35]:
# Start a new, empty graph for the Artist individuals
# (the chart data itself is already in memory as spotify_data).
g = Graph()

# Register short prefixes so the Turtle output is readable.
# NOTE(review): the Spotify namespace is bound to "so" here, while the Country
# graph uses "sp" and the Song graph uses "spotify" — confirm which is intended.
for prefix, namespace in (
    ("foaf", FOAF),
    ("xsd", XSD),
    ("countries", CNS),
    ("so", SP),
):
    g.bind(prefix, namespace)

In [37]:
# Distinct values of the 'artist' column, in order of first appearance
artists = spotify_data['artist'].unique()

# IDs are 1-based: artist1/person1, artist2/person2, ...
for artist_number, artist in enumerate(artists, start=1):
    Artist = SP[f"artist{artist_number}"]
    Person = SP[f"person{artist_number}"]  # companion node, one per artist

    # The artist itself, with its name as an xsd:string literal
    g.add((Artist, RDF.type, SP.Artist))
    g.add((Artist, FOAF.name, Literal(artist, datatype=XSD.string)))

    # Artist --isMemberOf--> Person, and the Person typed as foaf:Person
    g.add((Artist, SP['isMemberOf'], Person))
    g.add((Person, RDF.type, FOAF.Person))
    

In [38]:
%%time
# Serialize the current graph as Turtle and write it to <savePath>artist.ttl
artist_ttl_path = savePath + 'artist.ttl'
print("--- saving serialization ---")
with open(artist_ttl_path, mode='w', encoding='utf-8') as ttl_out:
    turtle_text = g.serialize(format='turtle')
    ttl_out.write(turtle_text)

--- saving serialization ---
CPU times: total: 562 ms
Wall time: 3.27 s


## Song

In [18]:
# Start a new, empty graph for the Song individuals and register the two
# custom prefixes used in its Turtle output.
g = Graph()
for prefix, namespace in (("spotify", SP), ("countries", CNS)):
    g.bind(prefix, namespace)


In [31]:
# Region -> Country URI lookup.
# The Country section mints its URIs as SP["country" + <row label>], where the
# row label is the spotify_data index of the region's FIRST occurrence
# (drop_duplicates keeps the first row and its label). The same labels are
# recomputed here so that popularIn points at those exact Country nodes,
# instead of at a per-song "country<row number>" URI in the CNS namespace that
# no other graph defines.
first_seen_regions = spotify_data['region'].drop_duplicates().dropna()
country_mapping = {
    region: SP[f"country{row_label}"]
    for row_label, region in first_seen_regions.items()
}

# Dictionary to map artist names to URIs
artist_mapping = {}

# Process the songs: one Song individual per row of the chart data.
# Only the three columns that are used are iterated; itertuples avoids
# building a Series per row as iterrows() does.
chart_rows = spotify_data[['url', 'region', 'artist']].itertuples(index=True, name=None)
for index, url, region, artist_name in chart_rows:
    # Create a unique URI for each song (ID starts from 1)
    Song = SP[f"song{index + 1}"]

    # Add triples for the song
    g.add((Song, RDF.type, SP.Song))  # Declare as a Song
    g.add((Song, SP['songUrl'], Literal(url, datatype=XSD.string)))  # Add song URL

    # Add the relationship "popularIn" (Song -> Country).
    # Rows with a missing region have no Country node to point at and are
    # left without this link.
    Country = country_mapping.get(region)
    if Country is not None:
        g.add((Song, SP['popularIn'], Country))

    # Add the relationship "PerformedBy" (Song -> Artist).
    # Artists are numbered in order of first appearance, starting at 1.
    if artist_name not in artist_mapping:
        # First time this artist is seen: mint its URI and remember it
        Artist = SP[f"artist{len(artist_mapping) + 1}"]
        artist_mapping[artist_name] = Artist

        # Add triples for the artist
        # NOTE(review): the Artist section uses foaf:name and links isMemberOf
        # to a per-artist "person<N>" node; here the name predicate is sp:name
        # and isMemberOf targets the foaf:Person class itself. Left unchanged —
        # confirm against the ontology which form is intended.
        g.add((Artist, RDF.type, SP.Artist))
        g.add((Artist, SP['name'], Literal(artist_name, datatype=XSD.string)))
        g.add((Artist, SP['isMemberOf'], URIRef("http://xmlns.com/foaf/0.1/Person")))

    # Link the song to the artist
    g.add((Song, SP['PerformedBy'], artist_mapping[artist_name]))


In [32]:
%%time
# Serialize the current graph as Turtle and write it to <savePath>song.ttl
song_ttl_path = savePath + 'song.ttl'
print("--- saving serialization ---")
with open(song_ttl_path, mode='w', encoding='utf-8') as ttl_out:
    turtle_text = g.serialize(format='turtle')
    ttl_out.write(turtle_text)

--- saving serialization ---
CPU times: total: 15.8 s
Wall time: 27.3 s


## Chart

## RankedRecorded 