## Populate the Spotify Ontology

In [1]:
# required libraries
import pandas as pd
import os
from pathlib import Path
# Load the required libraries
from rdflib import Graph, Literal, RDF, URIRef, Namespace
# rdflib knows about some namespaces, like FOAF
from rdflib.namespace import FOAF, XSD
# CHECK DATE 
import datetime
from rdflib.namespace import OWL, RDFS
import matplotlib.pyplot as plt
import networkx as nx



In [2]:
# Namespaces that rdflib does not ship with (unlike FOAF / XSD imported above).
#   CNS - the eulersharp country vocabulary
#   SP  - the Spotify ontology this notebook populates
# Both base IRIs end in '#', so CNS[name] / SP[name] produce '<base>#name'.
CNS = Namespace("http://eulersharp.sourceforge.net/2003/03swap/countries#")
SP = Namespace("http://www.dei.unipd.it/GraphDatabases/SpotifyOntology#")



In [3]:
# Parameters and locations used by every section of the notebook.

# Project root: the parent of the directory the notebook is started from.
path = str(Path.cwd().parent)

# Input: the Spotify chart CSV.
spotify = os.path.join(path, 'INDEX', 'spotify', 'dataset', 'chart.csv')

# Output folder for the Turtle files. The trailing '' component keeps a trailing
# path separator, because later cells build file names as `savePath + '<name>.ttl'`.
savePath = os.path.join(path, 'INDEX', 'spotify', 'dataset', 'rdf', '')

# Create the folder up front so the first save cell does not fail with
# FileNotFoundError on a fresh checkout.
os.makedirs(savePath, exist_ok=True)


## Country

In [4]:
import pandas as pd
import numpy as np

# Read the chart CSV once; the later sections all work from `spotify_data`.
spotify_data = pd.read_csv(spotify)

# One row per distinct region, exposed under the column name 'country_name'.
Countries = (
    spotify_data.loc[:, ['region']]
    .rename(columns={'region': 'country_name'})
    .drop_duplicates()
)



In [5]:
# Fresh, empty graph for the Country triples.
g = Graph()

# Register prefixes so the Turtle output uses short names instead of full IRIs.
for prefix, namespace in (
    ("foaf", FOAF),
    ("xsd", XSD),
    ("countries", CNS),
    ("sp", SP),
):
    g.bind(prefix, namespace)


In [6]:
%%time 
# Country IDs are sequential and start at 1 (country1, country2, ...).
country_counter = 1

# Walk the distinct country names in DataFrame order.
for country_name in Countries['country_name']:
    # Node for this country, minted in the Spotify namespace.
    Country = SP[f'country{country_counter}']

    # Type the node and attach the region name as a plain string literal.
    g.add((Country, RDF.type, SP.Country))
    g.add((Country, SP['region'], Literal(country_name, datatype=XSD.string)))

    country_counter += 1


CPU times: total: 0 ns
Wall time: 14 ms


In [7]:
%%time
# Serialize the Country graph to Turtle and write it out as UTF-8 text.
print("--- saving serialization ---")
with open(savePath + 'country.ttl', 'w', encoding='utf-8') as ttl_file:
    turtle_text = g.serialize(format='turtle')
    ttl_file.write(turtle_text)


--- saving serialization ---
CPU times: total: 15.6 ms
Wall time: 17.7 ms


## Artist

In [8]:
# Start a new, empty graph for the Artist triples.
g = Graph()

# Register prefixes for a more readable Turtle serialization.
# NOTE(review): the Spotify namespace is bound as "so" here but as "sp" in the
# Country section - confirm which prefix is intended.
for prefix, namespace in (
    ("foaf", FOAF),
    ("xsd", XSD),
    ("countries", CNS),
    ("so", SP),
):
    g.bind(prefix, namespace)

In [9]:
# Distinct artist values, in order of first appearance in the dataset.
artists = spotify_data['artist'].unique()

# Numbering starts at 1: the n-th distinct artist becomes artist<n> / person<n>.
for number, artist_name in enumerate(artists, start=1):
    Artist = SP[f"artist{number}"]
    Person = SP[f"person{number}"]

    # The artist node and its name.
    g.add((Artist, RDF.type, SP.Artist))
    g.add((Artist, FOAF.name, Literal(artist_name, datatype=XSD.string)))

    # Pair the artist with its own foaf:Person node through isMemberOf.
    g.add((Artist, SP['isMemberOf'], Person))
    g.add((Person, RDF.type, FOAF.Person))
    

In [10]:
%%time
# Serialize the Artist graph to Turtle and write it out as UTF-8 text.
print("--- saving serialization ---")
with open(savePath + 'artist.ttl', 'w', encoding='utf-8') as ttl_file:
    turtle_text = g.serialize(format='turtle')
    ttl_file.write(turtle_text)

--- saving serialization ---
CPU times: total: 5.81 s
Wall time: 5.86 s


## Song

In [11]:
# New, empty graph for the Song triples, with prefixes for the Turtle output.
g = Graph()
for prefix, namespace in (("spotify", SP), ("countries", CNS)):
    g.bind(prefix, namespace)


In [12]:
# Artist name -> artist URI. Entries are added in order of first appearance, so
# the numbering matches the artist<n> / person<n> nodes built in the Artist section
# (which numbers `spotify_data['artist'].unique()` the same way).
artist_mapping = {}

# One Song node per dataset row, numbered from the row index (IDs start at 1).
# NOTE(review): if the same track occurs in several rows, each row still gets its
# own song<n> URI; the RankedRecorded section links rank<n> to song<n> on that basis.
for index, song_url, song_title, artist_name in zip(
    spotify_data.index,
    spotify_data['url'],
    spotify_data['title'],
    spotify_data['artist'],
):
    Song = SP[f"song{index + 1}"]

    # The song node with its URL and title.
    g.add((Song, RDF.type, SP.Song))
    g.add((Song, SP['songUrl'], Literal(song_url, datatype=XSD.string)))
    g.add((Song, SP['songName'], Literal(song_title, datatype=XSD.string)))

    # Look the artist up; mint and describe it the first time it is seen.
    Artist = artist_mapping.get(artist_name)
    if Artist is None:
        artist_number = len(artist_mapping) + 1
        Artist = SP[f"artist{artist_number}"]
        artist_mapping[artist_name] = Artist

        g.add((Artist, RDF.type, SP.Artist))
        g.add((Artist, FOAF.name, Literal(artist_name, datatype=XSD.string)))

        # Link to the artist's own person individual (as the Artist section does),
        # not to the foaf:Person class itself.
        Person = SP[f"person{artist_number}"]
        g.add((Artist, SP['isMemberOf'], Person))
        g.add((Person, RDF.type, FOAF.Person))

    # "PerformedBy" relationship (Song -> Artist).
    g.add((Song, SP['PerformedBy'], Artist))


In [13]:
%%time
# Serialize the Song graph to Turtle and write it out as UTF-8 text.
print("--- saving serialization ---")
with open(savePath + 'song.ttl', 'w', encoding='utf-8') as ttl_file:
    turtle_text = g.serialize(format='turtle')
    ttl_file.write(turtle_text)

--- saving serialization ---
CPU times: total: 2min 28s
Wall time: 2min 31s


## Chart

In [14]:
# New, empty graph for the Chart triples, with prefixes for the Turtle output.
g = Graph()
for prefix, namespace in (("countries", CNS), ("spotify", SP)):
    g.bind(prefix, namespace)

In [15]:
# Chart and its two subclasses, all in the Spotify namespace.
Chart = SP.Chart
Top200 = SP.Top200
Viral50 = SP.Viral50

# Declare the three classes and the subclass hierarchy.
g.add((Chart, RDF.type, RDFS.Class))
g.add((Top200, RDF.type, RDFS.Class))
g.add((Viral50, RDF.type, RDFS.Class))
g.add((Top200, RDFS.subClassOf, Chart))
g.add((Viral50, RDFS.subClassOf, Chart))

# publishedIn: link each chart type to every country.
# The Country section mints its nodes in the Spotify namespace as country<n>,
# numbered from 1 in order of first appearance of the region. The same namespace
# (SP, not CNS) and the same numbering are used here so the links resolve to
# those nodes.
regions = spotify_data['region'].unique()  # computed once, order of appearance
for chart_instance in (Top200, Viral50):
    for country_number in range(1, len(regions) + 1):
        Country = SP[f"country{country_number}"]
        g.add((chart_instance, SP['publishedIn'], Country))

In [16]:
%%time
# Serialize the Chart graph to Turtle and write it out as UTF-8 text.
print("--- saving serialization ---")
with open(savePath + 'chart.ttl', 'w', encoding='utf-8') as ttl_file:
    turtle_text = g.serialize(format='turtle')
    ttl_file.write(turtle_text)

--- saving serialization ---
CPU times: total: 31.2 ms
Wall time: 20.7 ms


## RankedRecorded 

In [17]:
# Rename the 'rank' column to 'RankedRecord' in place (this mutates spotify_data).
# pandas ignores a missing 'rank' label by default, so re-running the cell is harmless.
spotify_data.rename(columns={"rank": "RankedRecord"}, inplace=True)

In [18]:
# Create an RDF graph and bind the Spotify namespace prefix.
# NOTE(review): the rank-processing cell that follows creates its own Graph and
# binds the same prefix, so this graph is replaced before any triple is added.
g = Graph()
g.bind("spotify", SP)

In [20]:
from datetime import datetime
# Dataset trend labels -> trendStatus individuals in the Spotify namespace.
trend_status_values = {
    "MOVE_UP": SP.MOVE_UP,
    "MOVE_DOWN": SP.MOVE_DOWN,
    "SAME_POSITION": SP.SAME_POSITION,
    "NEW_ENTRY": SP.NEW_ENTRY,
}

# Chart classes keyed by the lower-cased value of the 'chart' column, so that
# "Top200" and "top200" are treated alike everywhere below.
chart_classes = {
    "top200": SP.Top200,
    "viral50": SP.Viral50,
}

# Rename the rank column to RankedRecord (a no-op if it was already renamed).
spotify_data = spotify_data.rename(columns={"rank": "RankedRecord"})

# Start from an empty graph so re-running this cell never duplicates triples.
g = Graph()
g.bind("spotify", SP)

skipped_unknown_chart = 0

# One Rank node per dataset row, numbered from the row index (IDs start at 1).
for index, rank_value, raw_date, raw_chart, streams, trend in zip(
    spotify_data.index,
    spotify_data['RankedRecord'],
    spotify_data['date'],
    spotify_data['chart'],
    spotify_data['streams'],
    spotify_data['trend'],
):
    # Validate the row BEFORE adding anything, so a skipped row never leaves a
    # half-described Rank node behind in the graph.
    try:
        # Parse the date and convert it to ISO 8601 (adjust the format if needed).
        formatted_date = datetime.strptime(raw_date, '%m/%d/%Y').isoformat()
    except (TypeError, ValueError) as e:
        # TypeError covers a missing (non-string) date value.
        print(f"Skipping invalid date: {raw_date}. Error: {e}")
        continue

    # str() keeps a missing chart value from raising on .lower().
    chart_type = str(raw_chart).lower()
    chart_class = chart_classes.get(chart_type)
    if chart_class is None:
        skipped_unknown_chart += 1
        continue

    Rank = SP[f"rank{index + 1}"]

    # The rank node, its position and its date.
    g.add((Rank, RDF.type, SP.Rank))
    g.add((Rank, SP['hasRank'], Literal(int(rank_value), datatype=XSD.integer)))
    g.add((Rank, SP['Date'], Literal(formatted_date, datatype=XSD.dateTime)))

    # Stream counts are only recorded for Top200 rows that actually have one.
    # The comparison uses the lower-cased chart value, like the class lookup above.
    if chart_type == "top200" and pd.notna(streams):
        g.add((Rank, SP['stream_count'], Literal(int(streams), datatype=XSD.integer)))

    # trendStatus, when the label is one of the known values.
    trend_status = trend_status_values.get(trend)
    if trend_status is not None:
        g.add((Rank, SP['trendStatus'], trend_status))

    # associatedWithChart (Top200 or Viral50).
    g.add((Rank, SP['associatedWithChart'], chart_class))

    # AssignToSong: Song URIs follow the same row-based numbering.
    g.add((Rank, SP['AssignToSong'], SP[f"song{index + 1}"]))

# Report rows dropped for an unrecognised chart instead of skipping them silently.
if skipped_unknown_chart:
    print(f"Skipped {skipped_unknown_chart} rows with an unrecognised chart value")

In [21]:
%%time
# Serialize the Rank graph to Turtle and write it out as UTF-8 text.
print("--- saving serialization ---")
with open(savePath + 'RankedRecorded.ttl', 'w', encoding='utf-8') as ttl_file:
    turtle_text = g.serialize(format='turtle')
    ttl_file.write(turtle_text)

--- saving serialization ---
CPU times: total: 1min 52s
Wall time: 1min 56s
