## Populate the Spotify Ontology

In [45]:
# required libraries
import pandas as pd
import os
from pathlib import Path
# Load the required libraries
from rdflib import Graph, Literal, RDF, URIRef, Namespace
# rdflib knows about some namespaces, like FOAF
from rdflib.namespace import FOAF, XSD
# CHECK DATE 
import datetime
from rdflib.namespace import OWL, RDFS
import matplotlib.pyplot as plt
import networkx as nx



In [46]:
# Locations used throughout the notebook, all resolved relative to the
# parent of the current working directory.
_working_dir = os.path.abspath(os.getcwd())
path = str(Path(_working_dir).parent.absolute())

# Input: the Spotify chart CSV
_dataset_dir = os.path.join(path, 'INDEX', 'spotify', 'dataset')
spotify = os.path.join(_dataset_dir, 'chart.csv')

# Output folder for the serialized graphs. The trailing slash matters:
# the saving cells append the file name by plain string concatenation.
savePath = ''.join([path, '/INDEX/spotify/dataset/rdf/'])



In [47]:
# Namespaces that rdflib does not ship with: the external countries
# vocabulary and this project's Spotify ontology.
COUNTRIES_BASE_IRI = "http://eulersharp.sourceforge.net/2003/03swap/countries#"
SPOTIFY_BASE_IRI = "http://www.dei.unipd.it/GraphDatabases/SpotifyOntology#"

CNS = Namespace(COUNTRIES_BASE_IRI)
SP = Namespace(SPOTIFY_BASE_IRI)



## Country

In [48]:
# Read the chart CSV once; the following sections all work on this DataFrame
spotify_data = pd.read_csv(spotify)

# One row per distinct region, exposed as 'country_name', with a running
# 'country_id' that starts at 1
Countries = (
    spotify_data[['region']]
    .drop_duplicates()
    .rename(columns={'region': 'country_name'})
    .assign(country_id=lambda frame: range(1, len(frame) + 1))
)




In [49]:
# Fresh graph for the Country individuals
g = Graph()

# Register readable prefixes so the Turtle output uses them instead of full IRIs
for prefix, namespace in (
    ("foaf", FOAF),
    ("xsd", XSD),
    ("countries", CNS),
    ("sp", SP),
):
    g.bind(prefix, namespace)


In [50]:
%%time 
#measure execution time

# Region name -> country URI, filled in order of first appearance
country_mapping = {}

for region_name in spotify_data['region'].unique():
    # A region that already has a URI needs no further work
    if region_name in country_mapping:
        continue

    # Next free running number; the first country is country1
    next_number = len(country_mapping) + 1
    Country = CNS[f"country{next_number}"]
    country_mapping[region_name] = Country

    # Type the individual and attach the region name as an xsd:string literal
    g.add((Country, RDF.type, SP.Country))
    g.add((Country, SP['region'], Literal(region_name, datatype=XSD.string)))


CPU times: total: 0 ns
Wall time: 13.6 ms


In [51]:
%%time
# Write the country graph to disk in Turtle syntax
print("--- saving serialization ---")
target_file = savePath + 'country.ttl'
with open(target_file, mode='w', encoding='utf-8') as handle:
    turtle_text = g.serialize(format='turtle')
    handle.write(turtle_text)


--- saving serialization ---
CPU times: total: 0 ns
Wall time: 7.96 ms


## Artist

In [53]:
# Start a new, empty graph for the Artist individuals
g = Graph()

# Prefixes used when this graph is serialized
artist_graph_prefixes = {
    "foaf": FOAF,
    "xsd": XSD,
    "countries": CNS,
    "so": SP,
}
for prefix, namespace in artist_graph_prefixes.items():
    g.bind(prefix, namespace)

In [54]:
# Distinct artist names taken from the chart data
artists = spotify_data['artist'].unique()

for number, artist_name in enumerate(artists, start=1):
    # artistN and personN share the same running number (first ID is 1)
    Artist = SP[f"artist{number}"]
    Person = SP[f"person{number}"]

    # The artist individual and its name
    g.add((Artist, RDF.type, SP.Artist))
    g.add((Artist, FOAF.name, Literal(artist_name, datatype=XSD.string)))

    # Each artist is tied through isMemberOf to its own foaf:Person individual
    g.add((Artist, SP['isMemberOf'], Person))
    g.add((Person, RDF.type, FOAF.Person))
    

In [55]:
%%time
# Write the artist graph to disk in Turtle syntax
print("--- saving serialization ---")
target_file = savePath + 'artist.ttl'
with open(target_file, mode='w', encoding='utf-8') as handle:
    turtle_text = g.serialize(format='turtle')
    handle.write(turtle_text)

--- saving serialization ---
CPU times: total: 219 ms
Wall time: 998 ms


## Song

In [18]:
# Start a new, empty graph for the Song individuals
g = Graph()

# Prefixes used when this graph is serialized
for prefix, namespace in (("spotify", SP), ("countries", CNS)):
    g.bind(prefix, namespace)


In [31]:
# Build the Song triples: one Song individual per row of the chart data,
# linked to the country of its region and to the artist that performs it.

# Region name -> country URI, numbered by order of first appearance.
# This is the numbering the Country section uses (country1, country2, ...),
# so popularIn points at the individuals written to country.ttl. Deriving
# the country number from the row index instead would give every row its
# own, mostly non-existent, country.
country_uris = {
    region: CNS[f"country{number}"]
    for number, region in enumerate(spotify_data['region'].unique(), start=1)
}

# Dictionary to map artist names to URIs (numbered by first appearance)
artist_mapping = {}

# NOTE(review): the Artist section names artists with FOAF.name and links
# isMemberOf to a per-artist SP["personN"] individual, whereas this cell uses
# SP['name'] and the foaf:Person class IRI. Left as it was; confirm which
# form the ontology expects and align the two sections.
foaf_person = URIRef("http://xmlns.com/foaf/0.1/Person")

# Walk the needed columns in parallel; this avoids the per-row Series that
# iterrows() builds and yields the same index/values.
rows = zip(
    spotify_data.index,
    spotify_data['url'],
    spotify_data['region'],
    spotify_data['artist'],
)

for index, url, region, artist_name in rows:
    # Create a unique URI for each song (ID starts from 1)
    Song = SP[f"song{index + 1}"]

    # Add triples for the song
    g.add((Song, RDF.type, SP.Song))
    g.add((Song, SP['songUrl'], Literal(url, datatype=XSD.string)))

    # Relationship "popularIn" (Song -> Country); skipped when the row's
    # region has no country URI (e.g. a missing value)
    Country = country_uris.get(region)
    if Country is not None:
        g.add((Song, SP['popularIn'], Country))

    # Relationship "PerformedBy" (Song -> Artist); describe the artist the
    # first time its name is seen
    if artist_name not in artist_mapping:
        Artist = SP[f"artist{len(artist_mapping) + 1}"]
        artist_mapping[artist_name] = Artist

        g.add((Artist, RDF.type, SP.Artist))
        g.add((Artist, SP['name'], Literal(artist_name, datatype=XSD.string)))
        g.add((Artist, SP['isMemberOf'], foaf_person))

    # Link the song to the artist
    g.add((Song, SP['PerformedBy'], artist_mapping[artist_name]))


In [32]:
%%time
# Write the song graph to disk in Turtle syntax
print("--- saving serialization ---")
target_file = savePath + 'song.ttl'
with open(target_file, mode='w', encoding='utf-8') as handle:
    turtle_text = g.serialize(format='turtle')
    handle.write(turtle_text)

--- saving serialization ---
CPU times: total: 15.8 s
Wall time: 27.3 s


## Chart

In [61]:
# Start a new, empty graph for the Chart classes
g = Graph()

# Prefixes used when this graph is serialized
chart_graph_prefixes = {"countries": CNS, "spotify": SP}
for prefix, namespace in chart_graph_prefixes.items():
    g.bind(prefix, namespace)

In [62]:
# Chart and its two subclasses
Chart = URIRef(SP.Chart)
Top200 = URIRef(SP.Top200)
Viral50 = URIRef(SP.Viral50)

# Declare the three classes and the subclass hierarchy
g.add((Chart, RDF.type, RDFS.Class))
g.add((Top200, RDF.type, RDFS.Class))
g.add((Viral50, RDF.type, RDFS.Class))
g.add((Top200, RDFS.subClassOf, Chart))
g.add((Viral50, RDFS.subClassOf, Chart))

# Country URIs country1..countryN, one per distinct region. The numbering
# follows the position of each region in spotify_data['region'].unique(),
# which is how the Country section numbers them. They are computed once
# here; looking the position up again for every region inside the loop is
# quadratic and produces exactly the same URIs.
region_count = len(spotify_data['region'].unique())
all_country_uris = [CNS[f"country{number}"] for number in range(1, region_count + 1)]

# Add publishedIn relationships: every chart type is linked to every country
for chart_type in ['Top200', 'Viral50']:
    chart_instance = URIRef(SP[chart_type])
    for Country in all_country_uris:
        g.add((chart_instance, SP['publishedIn'], Country))


In [63]:
%%time
# Write the chart graph to disk in Turtle syntax
print("--- saving serialization ---")
target_file = savePath + 'chart.ttl'
with open(target_file, mode='w', encoding='utf-8') as handle:
    turtle_text = g.serialize(format='turtle')
    handle.write(turtle_text)

--- saving serialization ---
CPU times: total: 0 ns
Wall time: 2.37 s


## RankedRecorded 