In [5]:
%pip install torch-geometric

Note: you may need to restart the kernel to use updated packages.


In [6]:
import torch
import torch_geometric
from torch_geometric.data import Data
import pandas as pd
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch.nn import Linear
import csv
import random

In [7]:
# Open the CSV file and process line by line
with open('/kaggle/input/spotify-playlist/spotify_dataset.csv', mode='r', encoding='utf-8') as file:
    reader = csv.reader(file)
    valid_rows = []

    # Read all rows and filter only those with exactly 4 columns
    all_rows = list(reader)
    valid_rows = [row for row in all_rows if len(row) == 4]  # Ensure only rows with 4 columns are kept

# Sample a thousandth of the dataset randomly
dataset_size = len(valid_rows) // 1000
sampled_rows = random.sample(valid_rows, dataset_size)

# Convert the sampled rows back to a DataFrame
df = pd.DataFrame(sampled_rows, columns=["user_id", "artistname", "trackname", "playlistname"])

In [8]:
# Create mappings for users, tracks, and playlists
user_mapping = {user: idx for idx, user in enumerate(df['user_id'].unique())}
track_mapping = {track: idx + len(user_mapping) for idx, track in enumerate(df['trackname'].unique())}
artist_mapping = {artist: idx + len(user_mapping) + len(track_mapping) for idx, artist in enumerate(df['artistname'].unique())}
playlist_mapping = {playlist: idx + len(user_mapping) + len(track_mapping) + len(artist_mapping) for idx, playlist in enumerate(df['playlistname'].unique())}

In [9]:
# Creating the nodes for the graph
user_nodes = df['user_id'].map(user_mapping).values
track_nodes = df['trackname'].map(track_mapping).values
artist_nodes = df['artistname'].map(artist_mapping).values
playlist_nodes = df['playlistname'].map(playlist_mapping).values

In [10]:
# Create edge_index (i.e., relationships between users, tracks, artists, and playlists)
# Edges are bidirectional between user and track, track and artist, track and playlist
user_to_track_edges = torch.tensor([user_nodes, track_nodes], dtype=torch.long)
track_to_artist_edges = torch.tensor([track_nodes, artist_nodes], dtype=torch.long)
track_to_playlist_edges = torch.tensor([track_nodes, playlist_nodes], dtype=torch.long)

In [11]:
# Combine all edges
edge_index = torch.cat([user_to_track_edges, track_to_artist_edges, track_to_playlist_edges], dim=1)

In [12]:
# Create node features (random features here; you can use specific features like genre, track length, etc.)
num_nodes = len(user_mapping) + len(track_mapping) + len(artist_mapping) + len(playlist_mapping)
num_features = 32  # Can be adjusted
node_features = torch.randn(num_nodes, num_features)

In [13]:
# Build the target ratings or interactions for training (could be interaction count or binary like/dislike)
# Here, we are assuming interaction as edge weights, which could be 1 for presence of edge
edge_weights = torch.ones(edge_index.shape[1], dtype=torch.float)

# Create Data object
data = Data(x=node_features, edge_index=edge_index, edge_attr=edge_weights)

In [14]:
# Define the GNN model using Graph Convolutional Network (GCN)
class GNNModel(torch.nn.Module):
    def __init__(self, num_features, hidden_channels):
        super(GNNModel, self).__init__()
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.linear = Linear(hidden_channels, 1)
    
    def forward(self, x, edge_index, edge_attr):
        # Apply first graph convolution layer
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training)

        # Apply second graph convolution layer
        x = self.conv2(x, edge_index)
        x = F.relu(x)

        # Predict ratings or recommendation score
        x = self.linear(x)
        return x

In [15]:
# Training function
def train_model(model, data, epochs=50):
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    model.train()
    
    for epoch in range(epochs):
        optimizer.zero_grad()
        out = model(data.x, data.edge_index, data.edge_attr)
        
        # For simplicity, we use Mean Squared Error loss
        loss = F.mse_loss(out[data.edge_index[0]], data.edge_attr)
        loss.backward()
        optimizer.step()
        
        if (epoch + 1) % 10 == 0:
            print(f'Epoch {epoch+1:03d}, Loss: {loss:.4f}')

In [16]:
# Making recommendations for a user
def get_recommendations(model, data, user_idx, top_k=5):
    model.eval()
    with torch.no_grad():
        embeddings = model.conv2(
            model.conv1(data.x, data.edge_index, data.edge_attr),
            data.edge_index
        )
        
        # Get the user embedding
        user_embedding = embeddings[user_idx]
        
        # Get track embeddings
        track_indices = torch.tensor(list(track_mapping.values()))
        track_embeddings = embeddings[track_indices]
        
        # Calculate similarity between user embedding and track embeddings
        similarity = F.cosine_similarity(user_embedding.unsqueeze(0), track_embeddings)
        
        # Get top-k most similar tracks
        top_k_indices = similarity.argsort(descending=True)[:top_k]
        
        recommended_tracks = [list(track_mapping.keys())[idx] for idx in top_k_indices]
        return recommended_tracks

In [17]:
# Initialize model
model = GNNModel(num_features=num_features, hidden_channels=64)
# Train the model
train_model(model, data)

  loss = F.mse_loss(out[data.edge_index[0]], data.edge_attr)


Epoch 010, Loss: 0.1559
Epoch 020, Loss: 0.0580
Epoch 030, Loss: 0.0382
Epoch 040, Loss: 0.0341
Epoch 050, Loss: 0.0256


In [None]:
torch.save(model, "song_recommender.pth")

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12890 entries, 0 to 12889
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   user_id       12890 non-null  object
 1   artistname    12890 non-null  object
 2   trackname     12890 non-null  object
 3   playlistname  12890 non-null  object
dtypes: object(4)
memory usage: 402.9+ KB


In [19]:
df.head()

Unnamed: 0,user_id,artistname,trackname,playlistname
0,6863084e5600ab4d3e450c05bb2276a9,Panteon Rococo,Esta Noche,Ska para entrenar ;)
1,6e2a411c2dbc921b68039b51120f7674,Elliott Smith,Pictures Of Me,Elliott Smith – New Moon
2,493b618910bba7e7a680644b0cf7d0b9,Bing Crosby,Chicago Style - Version 2,Chicago - My Kind of Town
3,6f6aeb84163c8307e81529349a3084ad,NEEDTOBREATHE,Let Us Love,Gatemusikanter
4,dd98c584b4982545b715b8a42ea01f0e,Muse,Madness,10.12


In [20]:
user_id = '11da254d9d1948488318e3ea286bf484'  # Example user_id
user_idx = user_mapping.get(user_id)
if user_idx is not None:
    recommended_tracks = get_recommendations(model, data, user_idx)
    print(f"Recommended tracks for user {user_id}: {recommended_tracks}")
else:
    print(f"User {user_id} not found.")

Recommended tracks for user 11da254d9d1948488318e3ea286bf484: ['Blood Mantra', 'Epitome VI', 'Artifact #1', 'The Eye Of Ra', 'Age Of Reason']
