In [7]:
import cudf
import cupy as cp
import numpy as np

# Load user profiles
user_profiles = cudf.read_parquet('/workspace/data/user_profiles.parquet')

# Load vectorized song features
vectorized_features = cudf.read_parquet('/workspace/data/vectorized_features.parquet')

print("User Profiles shape:", user_profiles.shape)
print("Vectorized Features shape:", vectorized_features.shape)


User Profiles shape: (33282, 2)
Vectorized Features shape: (122915, 24)


In [8]:
print(user_profiles.columns)
print(vectorized_features.columns)


Index(['user_id', 'user_profile'], dtype='object')
Index(['index', 'song_id', 'artist_name', 'track_name', 'track_id',
       'popularity', 'year', 'genre', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature',
       'genreIndex', 'genreVec', 'features'],
      dtype='object')


In [16]:
user_profiles_list = [np.array(vec) for vec in user_profiles_pd['user_profile'].tolist()]
user_profiles_np = np.vstack(user_profiles_list)

song_features_list = [np.array(vec) for vec in vectorized_features_pd['features'].tolist()]
song_features_np = np.vstack(song_features_list)


In [17]:
print("user_profiles_np dtype:", user_profiles_np.dtype)
print("user_profiles_np shape:", user_profiles_np.shape)

print("song_features_np dtype:", song_features_np.dtype)
print("song_features_np shape:", song_features_np.shape)


user_profiles_np dtype: object
user_profiles_np shape: (33282, 1)
song_features_np dtype: object
song_features_np shape: (122915, 1)


In [18]:
print(type(user_profiles_pd['user_profile'].iloc[0]))
print(user_profiles_pd['user_profile'].iloc[0])


<class 'dict'>
{'type': 0, 'size': 96, 'indices': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 20, 22, 26, 44, 46, 49, 75], 'values': [50.833333333333336, 2011.5, 0.6750833333333334, 0.6044166666666666, 4.0, -6.994416666666666, 0.6666666666666666, 0.09823333333333334, 0.22144650000000002, 0.0003362416666666667, 0.24514166666666667, 0.4035583333333333, 112.62966666666667, 238644.66666666666, 3.9166666666666665, 0.16666666666666666, 0.08333333333333333, 0.25, 0.16666666666666666, 0.08333333333333333, 0.08333333333333333, 0.16666666666666666]}


In [19]:
print(type(vectorized_features_pd['features'].iloc[0]))
print(vectorized_features_pd['features'].iloc[0])


<class 'dict'>
{'type': 0, 'size': 96, 'indices': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 64], 'values': [19.0, 2016.0, 0.415, 0.605, 7.0, -11.157, 1.0, 0.0575, 0.00116, 0.838, 0.471, 0.193, 100.059, 79500.0, 4.0, 1.0]}


In [26]:
import numpy as np

def sparse_dict_to_dense(vec_dict, size=96):
    dense = np.zeros(size, dtype=np.float32)
    if vec_dict is None:
        return dense
    indices = vec_dict.get("indices")
    values = vec_dict.get("values")
    if indices is None or values is None:
        return dense
    for idx, val in zip(indices, values):
        dense[idx] = val
    return dense



In [27]:
user_profiles_list = [sparse_dict_to_dense(vec) for vec in user_profiles_pd['user_profile']]
user_profiles_np = np.vstack(user_profiles_list)

song_features_list = [sparse_dict_to_dense(vec) for vec in vectorized_features_pd['features']]
song_features_np = np.vstack(song_features_list)


In [28]:
print("user_profiles_np shape:", user_profiles_np.shape, "dtype:", user_profiles_np.dtype)
print("song_features_np shape:", song_features_np.shape, "dtype:", song_features_np.dtype)


user_profiles_np shape: (33282, 96) dtype: float32
song_features_np shape: (122915, 96) dtype: float32


In [None]:
Define cosine similarity function in CuPy



In [30]:
def cosine_similarity_gpu(a, b):
    """
    Compute cosine similarity between two CuPy matrices:
    - a: (n_users, n_features)
    - b: (n_songs, n_features)
    Returns: (n_users, n_songs) similarity matrix
    """
    a_norm = a / cp.linalg.norm(a, axis=1, keepdims=True)
    b_norm = b / cp.linalg.norm(b, axis=1, keepdims=True)
    return cp.matmul(a_norm, b_norm.T)


In [45]:
def recommend_top_n_with_scores(user_profiles_cp, song_features_cp, top_n=10, batch_size=1000):
    all_recommendations = []

    num_users = user_profiles_cp.shape[0]

    for start in range(0, num_users, batch_size):
        end = min(start + batch_size, num_users)
        user_batch = user_profiles_cp[start:end]

        # Cosine similarity for this batch
        sim_scores = cosine_similarity_gpu(user_batch, song_features_cp)

        # Get top-N song indices per user
        top_n_indices = cp.argsort(sim_scores, axis=1)[:, -top_n:][:, ::-1]

        # Also get the similarity scores for these top indices
        rows = cp.arange(sim_scores.shape[0])[:, None]
        top_n_scores = sim_scores[rows, top_n_indices]

        # Convert to CPU and save
        top_n_indices_cpu = top_n_indices.get()
        top_n_scores_cpu = top_n_scores.get()

        for user_idx in range(top_n_indices_cpu.shape[0]):
            for rank in range(top_n):
                all_recommendations.append({
                    "user_batch_idx": start + user_idx,
                    "rank": rank + 1,
                    "song_idx": top_n_indices_cpu[user_idx, rank],
                    "score": top_n_scores_cpu[user_idx, rank]
                })

        print(f"Processed users {start} to {end}")

    return all_recommendations



In [None]:
top_recommendations = recommend_top_n(user_profiles_cp, song_features_cp, top_n=10, batch_size=500)


In [47]:
song_ids = vectorized_features_pd['song_id'].values

user_recs_mapped = [
    [song_ids[idx] for idx in recs]
    for recs in top_recommendations
]


In [52]:
recs_with_scores = recommend_top_n_with_scores(user_profiles_cp, song_features_cp, top_n=10, batch_size=500)

recommendations_df = pd.DataFrame([
    {
        "user_id": user_profiles_pd['user_id'].iloc[rec['user_batch_idx']],
        "rank": rec['rank'],
        "song_id": vectorized_features_pd['song_id'].iloc[rec['song_idx']],
        "cosine_similarity": rec['score']
    }
    for rec in recs_with_scores
])


Processed users 0 to 500
Processed users 500 to 1000
Processed users 1000 to 1500
Processed users 1500 to 2000
Processed users 2000 to 2500
Processed users 2500 to 3000
Processed users 3000 to 3500
Processed users 3500 to 4000
Processed users 4000 to 4500
Processed users 4500 to 5000
Processed users 5000 to 5500
Processed users 5500 to 6000
Processed users 6000 to 6500
Processed users 6500 to 7000
Processed users 7000 to 7500
Processed users 7500 to 8000
Processed users 8000 to 8500
Processed users 8500 to 9000
Processed users 9000 to 9500
Processed users 9500 to 10000
Processed users 10000 to 10500
Processed users 10500 to 11000
Processed users 11000 to 11500
Processed users 11500 to 12000
Processed users 12000 to 12500
Processed users 12500 to 13000
Processed users 13000 to 13500
Processed users 13500 to 14000
Processed users 14000 to 14500
Processed users 14500 to 15000
Processed users 15000 to 15500
Processed users 15500 to 16000
Processed users 16000 to 16500
Processed users 16500

In [53]:
recommendations_df.head(15)

Unnamed: 0,user_id,rank,song_id,cosine_similarity
0,ae56dfcc5fc26733fc1c44b3a11f98ec27f6be32,1,SOOUGOQ12AAF3B25B3,1.0
1,ae56dfcc5fc26733fc1c44b3a11f98ec27f6be32,2,SOJJEZD12AB017EFE4,1.0
2,ae56dfcc5fc26733fc1c44b3a11f98ec27f6be32,3,SODHUTJ12AB0187CFB,1.0
3,ae56dfcc5fc26733fc1c44b3a11f98ec27f6be32,4,SOYSCMP12AB0182A7E,1.0
4,ae56dfcc5fc26733fc1c44b3a11f98ec27f6be32,5,SOAGNKM12AB0182086,1.0
5,ae56dfcc5fc26733fc1c44b3a11f98ec27f6be32,6,SOOBGFE12A8C1318D5,1.0
6,ae56dfcc5fc26733fc1c44b3a11f98ec27f6be32,7,SOOBLXM12A67FFB375,1.0
7,ae56dfcc5fc26733fc1c44b3a11f98ec27f6be32,8,SONQICW12AB0183F1C,1.0
8,ae56dfcc5fc26733fc1c44b3a11f98ec27f6be32,9,SOETCSV12A8C139490,1.0
9,ae56dfcc5fc26733fc1c44b3a11f98ec27f6be32,10,SOCTHJZ12AC46890BA,1.0


In [67]:
import cupy as cp
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
user_profiles_np = scaler.fit_transform(user_profiles_np)
song_features_np = scaler.fit_transform(song_features_np)

user_profiles_cp = cp.asarray(user_profiles_np)
song_features_cp = cp.asarray(song_features_np)


In [68]:
import cupy as cp
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize


# Cosine similarity function on GPU
def cosine_similarity_gpu(a, b, eps=1e-8):
    a_norm = a / (cp.linalg.norm(a, axis=1, keepdims=True) + eps)
    b_norm = b / (cp.linalg.norm(b, axis=1, keepdims=True) + eps)
    return cp.matmul(a_norm, b_norm.T)

# Batch recommend top-N with similarity scores
def recommend_top_n_with_scores(user_profiles_cp, song_features_cp, top_n=10, batch_size=500):
    all_recommendations = []
    num_users = user_profiles_cp.shape[0]

    for start in range(0, num_users, batch_size):
        end = min(start + batch_size, num_users)
        user_batch = user_profiles_cp[start:end]

        sim_scores = cosine_similarity_gpu(user_batch, song_features_cp)

        # Get top-N indices and their similarity scores
        top_n_indices = cp.argsort(sim_scores, axis=1)[:, -top_n:][:, ::-1]
        rows = cp.arange(sim_scores.shape[0])[:, None]
        top_n_scores = sim_scores[rows, top_n_indices]

        top_n_indices_cpu = top_n_indices.get()
        top_n_scores_cpu = top_n_scores.get()

        for user_idx in range(top_n_indices_cpu.shape[0]):
            for rank in range(top_n):
                all_recommendations.append({
                    "user_batch_idx": start + user_idx,
                    "rank": rank + 1,
                    "song_idx": top_n_indices_cpu[user_idx, rank],
                    "score": top_n_scores_cpu[user_idx, rank]
                })

        print(f"Processed users {start} to {end}")

    return all_recommendations

# Assuming you have your data loaded and preprocessed:
# user_profiles_cp : (num_users, vector_size) CuPy array of user profiles
# song_features_cp : (num_songs, vector_size) CuPy array of song features
# user_profiles_pd : original pandas DataFrame with 'user_id'
# vectorized_features_pd : original pandas DataFrame with 'song_id'

# Run recommendation
top_n = 10
batch_size = 500

recommendations = recommend_top_n_with_scores(user_profiles_cp, song_features_cp, top_n=top_n, batch_size=batch_size)

# Build DataFrame from recommendations
recommendations_df = pd.DataFrame([
    {
        "user_id": user_profiles_pd['user_id'].iloc[rec['user_batch_idx']],
        "rank": rec['rank'],
        "song_id": vectorized_features_pd['song_id'].iloc[rec['song_idx']],
        "cosine_similarity": rec['score']
    }
    for rec in recommendations
])

# Save to CSV
recommendations_df.to_csv("top_recommendations_with_scores.csv", index=False)

print("Sample recommendations with scores:")
print(recommendations_df.head())



Processed users 0 to 500
Processed users 500 to 1000
Processed users 1000 to 1500
Processed users 1500 to 2000
Processed users 2000 to 2500
Processed users 2500 to 3000
Processed users 3000 to 3500
Processed users 3500 to 4000
Processed users 4000 to 4500
Processed users 4500 to 5000
Processed users 5000 to 5500
Processed users 5500 to 6000
Processed users 6000 to 6500
Processed users 6500 to 7000
Processed users 7000 to 7500
Processed users 7500 to 8000
Processed users 8000 to 8500
Processed users 8500 to 9000
Processed users 9000 to 9500
Processed users 9500 to 10000
Processed users 10000 to 10500
Processed users 10500 to 11000
Processed users 11000 to 11500
Processed users 11500 to 12000
Processed users 12000 to 12500
Processed users 12500 to 13000
Processed users 13000 to 13500
Processed users 13500 to 14000
Processed users 14000 to 14500
Processed users 14500 to 15000
Processed users 15000 to 15500
Processed users 15500 to 16000
Processed users 16000 to 16500
Processed users 16500

In [43]:
import os
os.environ["JAVA_HOME"] = "C:/Program Files/Java/jdk-17"

In [69]:
print(recommendations_df.head(20))


                                     user_id  rank             song_id  \
0   ae56dfcc5fc26733fc1c44b3a11f98ec27f6be32     1  SOPWTKW12A8C13D998   
1   ae56dfcc5fc26733fc1c44b3a11f98ec27f6be32     2  SOGCZWI12AF72A6745   
2   ae56dfcc5fc26733fc1c44b3a11f98ec27f6be32     3  SOVHFGH12AC468B574   
3   ae56dfcc5fc26733fc1c44b3a11f98ec27f6be32     4  SOYWQVR12A8C145A35   
4   ae56dfcc5fc26733fc1c44b3a11f98ec27f6be32     5  SOPLQNV12AB018300F   
5   ae56dfcc5fc26733fc1c44b3a11f98ec27f6be32     6  SOOSSHK12A8AE46F9D   
6   ae56dfcc5fc26733fc1c44b3a11f98ec27f6be32     7  SOJSRTF12A8C1397B5   
7   ae56dfcc5fc26733fc1c44b3a11f98ec27f6be32     8  SOQFOIL12A8AE4838F   
8   ae56dfcc5fc26733fc1c44b3a11f98ec27f6be32     9  SODIHII12A8C13A37D   
9   ae56dfcc5fc26733fc1c44b3a11f98ec27f6be32    10  SOYOVVB12A8C13F2D1   
10  2aa4c5267c2dec1f724b97cd343b9005ff0aec52     1  SOYRALT12AB018619A   
11  2aa4c5267c2dec1f724b97cd343b9005ff0aec52     2  SOBPCAB12A6D4FAD71   
12  2aa4c5267c2dec1f724b97cd343b9005ff

In [56]:
print(cp.linalg.norm(user_profiles_cp, axis=1))  # Should not be zero anywhere
print(cp.linalg.norm(song_features_cp, axis=1))  # Should not be zero anywhere


[238653.19 176395.73 270432.66 ... 206209.98 170029.7  239733.48]
[ 79525.62 294539.84 166532.14 ... 214931.48 136814.98 179518.31]
