## Populate the Spotify Ontology

In [22]:
# required libraries
import pandas as pd
import os
from pathlib import Path
# Load the required libraries
from rdflib import Graph, Literal, RDF, URIRef, Namespace
# rdflib knows about some namespaces, like FOAF
from rdflib.namespace import FOAF, XSD
# CHECK DATE 
import datetime
from rdflib.namespace import OWL, RDFS
import matplotlib.pyplot as plt
import networkx as nx



In [5]:
# parameters and URLs
path = str(Path(os.path.abspath(os.getcwd())).parent.absolute())

# #spotify codes

spotify = os.path.join(path, 'INDEX', 'spotify', 'dataset', 'chart.csv')





# # saving folder
savePath =  path + '/INDEX/spotify/dataset/rdf/'
print (savePath)


c:\wamp64\www/INDEX/spotify/dataset/rdf/


In [7]:
# Construct the country and the spotify ontology namespaces not known by RDFlib
CNS = Namespace("http://eulersharp.sourceforge.net/2003/03swap/countries#")
SP = Namespace("http://www.dei.unipd.it/GraphDatabases/SpotifyOntology#")



## Country

In [8]:
# Load the CSV file into a pandas DataFrame
spotify_data = pd.read_csv(spotify)
# Extract unique regions as countries
Countries = spotify_data[['region']].drop_duplicates().rename(columns={'region': 'country_name'})

# Add a unique country_id
Countries['country_id'] = range(1, len(Countries) + 1)

print(Countries)


       country_name  country_id
0         Argentina           1
201       Australia           2
313          Brazil           3
402         Austria           4
525         Belgium           5
...             ...         ...
68439       Vietnam          60
68791         Egypt          61
109596        India          62
110196       Israel          63
111912      Morocco          64

[64 rows x 2 columns]


In [9]:
#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("countries", CNS)
g.bind("sp", SP)


In [13]:
%%time 
#measure execution time

#iterate over the country dataframe
for index, row in Countries.iterrows():
    # Create the node to add to the Graph
    # the node has the namespace + the league id as URI
    idU = "country"+str(index)
    Country = URIRef(SP[idU])
    # Add triples using store's add() method.
    g.add((Country, RDF.type, SP.Country))
    g.add((Country, SP['region'], Literal(row['country_name'], datatype=XSD.string)))    
# Correcting the visualization with the provided namespace SP


CPU times: total: 0 ns
Wall time: 6.93 ms


In [14]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
# with open(savePath + 'leagues.ttl', 'w') as file:
with open(savePath + 'country.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))


--- saving serialization ---
CPU times: total: 15.6 ms
Wall time: 14.3 ms


## Artist

In [15]:
# Load the CSV files in memory

# clubs = pd.read_csv(spotify, sep=',', index_col='Arist_id')
#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("countries", CNS)
g.bind("so", SP)

In [16]:
# Extract unique artists from the dataset
unique_artists = spotify_data['artist'].reset_index(drop=True)

# Add a unique artist_id to each artist
unique_artists = unique_artists.reset_index().rename(columns={'index': 'artist_id'})

# Iterate over the unique artists DataFrame
for _, row in unique_artists.iterrows():
    # Create the URI for the Artist using the unique artist_id
    Artist = URIRef(SP[f"artist{row['artist_id']}"])  # Artist node URI

    # Add the Artist as a class in the graph
    g.add((Artist, RDF.type, SP.Artist))  # Declaring it as an Artist
    g.add((Artist, SP['name'], Literal(row['artist'], datatype=XSD.string)))  # Adding the artist's name
    g.add((Artist, SP['artist_id'], Literal(row['artist_id'], datatype=XSD.integer)))  # Adding the artist's ID
   # Create a FOAF Person node
    Person = URIRef(FOAF["Person"])
    g.add((Artist, SP['isMemberOf'], Person))
    

In [19]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'artist.ttl', 'w', encoding='utf-8') as file:
    file.write(g.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 10.8 s
Wall time: 21.9 s


## Chart

In [21]:
# Load the CSV files in memory

# clubs = pd.read_csv(spotify, sep=',', index_col='Arist_id')
#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("countries", CNS)
g.bind("so", SP)

In [None]:
# Initialize RDF graph
Chart = SP.Chart
Top200 = SP.Top200
Viral50 = SP.Viral50
# Add the Chart class and its subclasses
g.add((Chart, RDF.type, OWL.Class))
g.add((Top200, RDF.type, OWL.Class))
g.add((Viral50, RDF.type, OWL.Class))

# Subclass relationships
g.add((Top200, RDFS.subClassOf, Chart))
g.add((Viral50, RDFS.subClassOf, Chart))

# Add properties
associatedWith = SP.associatedWith
publishedIn = SP.publishedIn

## RankedRecorded 