In [2]:
#import all libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import re
import scipy.sparse as sp
from scipy.sparse import csr_matrix
!pip install implicit
import implicit
from implicit.als import AlternatingLeastSquares
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [5]:
# Load dataset
df_samp = pd.read_csv("playlist.csv")
df_samp.head()

Unnamed: 0,user_id,"""artistname""","""trackname""","""playlistname""",Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010,,,,,
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010,,,,,
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page,7 Years Too Late,HARD ROCK 2010,,,,,
3,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010,,,,,
4,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010,,,,,


In [8]:
# Filter out columns that start with 'Unnamed'
df = df_samp.loc[:, ~df_samp.columns.str.startswith('Unnamed')]
df.info()

# Strip whitespace and replace double quotes in column names of the filtered DataFrame
df.columns = df.columns.str.strip().str.replace('"', '')

df2 = df.dropna()
df2.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 4 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   user_id          1048575 non-null  object
 1    "artistname"    1046373 non-null  object
 2    "trackname"     1048566 non-null  object
 3    "playlistname"  1048501 non-null  object
dtypes: object(4)
memory usage: 32.0+ MB


In [7]:
# Combine 'artistname' and 'trackname' into a single feature 'songname'
df2['songname'] = df2['artistname'] + '  ' + df2['trackname']


In [9]:
# Clean 'songname' and 'playlistname' column by converting to lowercase and removing non-alphabetic characters
df2['songname'] = df2['songname'].astype(str).str.lower().apply(lambda x: re.sub('[^a-zA-Z\s]', '', x))
df2['playlistname'] = df2['playlistname'].astype(str).str.lower().apply(lambda x: re.sub('[^a-zA-Z\s]', '', x))

In [10]:
# Creating unique
unique_songname = df2['songname'].unique()
df_unique_songname = pd.DataFrame(unique_songname, columns=['songname'])

In [11]:
tfidf_song = TfidfVectorizer(stop_words='english')
tfidf_matrix_songs = tfidf_song.fit_transform(df_unique_songname['songname'])

In [12]:
# Define the get_song_recommendations function
def get_song_recommendations(user_id, nn_model, tfidf_matrix_songs, songname_to_index, df_unique_songname, top_n=10):
    user_songs = df2[df2['user_id'] == user_id]['songname'].values
    if len(user_songs) == 0:
        return []
    
    user_songs_indices = [songname_to_index[song] for song in user_songs if song in songname_to_index]
    if len(user_songs_indices) == 0:
        return []

    user_profile = np.mean(tfidf_matrix_songs[user_songs_indices], axis=0)
    user_profile = np.asarray(user_profile)  # Convert to a numpy array
    similarity_scores = cosine_similarity(user_profile.reshape(1, -1), tfidf_matrix_songs)
    similarity_scores = similarity_scores.flatten()
    
    top_indices = similarity_scores.argsort()[-top_n:][::-1]
    recommended_songs = df_unique_songname.iloc[top_indices]['songname'].values
    
    return recommended_songs

In [13]:
# Define the get_als_recommendations function
def get_als_recommendations(user_id, als_model, user_item_matrix, user_id_mapping, df_unique_songname, top_n=10):
    user_id_code = user_id_mapping.get(user_id)
    if user_id_code is None:
        raise ValueError(f"User ID '{user_id}' not found in user_id_mapping.")
    
    # Get recommendations (item IDs and scores) for the user
    recommended_items, _ = als_model.recommend(user_id_code, user_item_matrix[user_id_code], N=top_n)
    
    # Convert item IDs back to song names
    recommendations = [df_unique_songname.iloc[item_id]['songname'] for item_id in recommended_items]
    
    return recommendations

In [14]:
# Define the hybrid recommendation function
def hybrid_recommendations(user_id, nn_model, tfidf_matrix_songs, songname_to_index, als_model, user_item_matrix, user_id_mapping, df_unique_songname, top_n=10):
    # Get recommendations from CBF
    cbf_rec = get_song_recommendations(user_id, nn_model, tfidf_matrix_songs, songname_to_index, df_unique_songname, top_n=top_n)
    
    # Get recommendations from CF
    cf_rec = get_als_recommendations(user_id, als_model, user_item_matrix, user_id_mapping, df_unique_songname, top_n=top_n)
    
    # Combine recommendations based on certain conditions
    if len(cf_rec) >= 5:
        return cf_rec
    else:
        return cbf_rec

In [15]:
# Correct extract_playlist_features function
def extract_playlist_features(playlist, tfidf_matrix_songs, songname_to_index):
    playlist_features = []
    for song in playlist:
        song_index = songname_to_index.get(song)
        if song_index is not None:
            # Transform the song name to its TF-IDF vector
            playlist_features.append(tfidf_matrix_songs[song_index])  
    return np.array(playlist_features)

In [16]:
# Train ANN Model
nn_model = NearestNeighbors(n_neighbors=10, algorithm='brute', metric='cosine')
nn_model.fit(tfidf_matrix_songs)


In [19]:
# Creating a mapping from songname to index in unique songname
songname_to_index = {songname: index for index, songname in enumerate(df_unique_songname['songname'])}
df2['songname_index'] = df2['songname'].map(songname_to_index)

In [20]:
# Verify the mapping
print("Mapping sample:")
print(df2[['songname', 'songname_index']].head())


Mapping sample:
                                            songname  songname_index
0  elvis costello  the angels wanna wear my red s...               0
1  elvis costello  the attractions  whats so funn...               1
2                      tiffany page   years too late               2
3  elvis costello  the attractions  accidents wil...               3
4                             elvis costello  alison               4


In [21]:
# Prepare data for XGBoost
df2['play_count'] = np.random.randint(1, 100, df2.shape[0])  # Simulated play count
df2['user_id_code'] = df2['user_id'].astype('category').cat.codes
df_xgboost = df2[['user_id_code', 'songname_index', 'play_count']]
X = df_xgboost[['user_id_code', 'songname_index']]
y = df_xgboost['play_count']

In [22]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [23]:
# Convert the data to DMatrix format for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)


In [24]:

# Define XGBoost parameters
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'max_depth': 6,
    'eta': 0.1
}

In [25]:
# Train the XGBoost model
xgb_model = xgb.train(params, dtrain, num_boost_round=100)


In [26]:
# Predict and evaluate the model
y_pred = xgb_model.predict(dtest)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("XGBoost RMSE:", rmse)

XGBoost RMSE: 28.51262499441583


In [27]:
# Classification report for XGBoost (using a threshold for play_count)
y_test_bin = (y_test >= 50).astype(int)  # Example threshold for binary classification
y_pred_bin = (y_pred >= 50).astype(int)
print("XGBoost Classification Report:")
print(classification_report(y_test_bin, y_pred_bin))
print("XGBoost Accuracy:", accuracy_score(y_test_bin, y_pred_bin))

XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.48      0.49    103557
           1       0.50      0.52      0.51    105702

    accuracy                           0.50    209259
   macro avg       0.50      0.50      0.50    209259
weighted avg       0.50      0.50      0.50    209259

XGBoost Accuracy: 0.4998255750051372


In [28]:
# Prepare user-item interaction matrix for ALS
user_item_interaction = df2[['user_id', 'songname_index']]
user_item_interaction['user_id_code'] = user_item_interaction['user_id'].astype('category').cat.codes
user_item_interaction['songname_index'] = user_item_interaction['songname_index'].astype('category').cat.codes
user_item_matrix = csr_matrix((np.ones(len(user_item_interaction)), 
                               (user_item_interaction['user_id_code'], user_item_interaction['songname_index'])))
print("User-Item Interaction Matrix Shape:", user_item_matrix.shape)

User-Item Interaction Matrix Shape: (1561, 466571)


In [29]:
# Create user_id_mapping
user_id_mapping = user_item_interaction[['user_id', 'user_id_code']].drop_duplicates().set_index('user_id')['user_id_code'].to_dict()


In [30]:
# Initialize the ALS model
als_model = AlternatingLeastSquares(factors=50, regularization=0.01, iterations=15)


In [31]:
# Train the ALS model on the user-item interaction matrix
als_model.fit(user_item_matrix)


  0%|          | 0/15 [00:00<?, ?it/s]

In [32]:
# Evaluate ALS model
# Example user and item to evaluate
user_id = 0
recommended_items = als_model.recommend(user_id, user_item_matrix[user_id], N=10)
print("ALS Recommended Items for user", user_id, ":", recommended_items)

ALS Recommended Items for user 0 : (array([  969,   680,  3479,    69,   851,   624,   907, 54271,  6116,
        2645], dtype=int32), array([9.3634735e-05, 9.1380854e-05, 9.1074828e-05, 9.0661350e-05,
       8.6942680e-05, 8.5427338e-05, 8.3240360e-05, 8.2954706e-05,
       8.2225240e-05, 8.0618214e-05], dtype=float32))


In [33]:
# Function to calculate precision@k
def precision_at_k(recommended_items, actual_items, k):
    recommended_items = [item[0] for item in recommended_items[:k]]
    actual_items = actual_items.indices
    intersect = len(set(recommended_items) & set(actual_items))
    return intersect / k

In [34]:
# Function to calculate recall@k
def recall_at_k(recommended_items, actual_items, k):
    recommended_items = [item[0] for item in recommended_items[:k]]
    actual_items = actual_items.indices
    intersect = len(set(recommended_items) & set(actual_items))
    return intersect / len(actual_items)

In [35]:
# Get actual items for the user
actual_items = user_item_matrix[user_id]

In [36]:
# Calculate precision and recall at k
k = 10
precision = precision_at_k(recommended_items, actual_items, k)
recall = recall_at_k(recommended_items, actual_items, k)
print("Precision@10:", precision)
print("Recall@10:", recall)


Precision@10: 0.0
Recall@10: 0.0


In [37]:
# Example user ID for recommendations
example_user_id = df2['user_id'].iloc[0]

In [38]:
# Get hybrid recommendations
hybrid_recs = hybrid_recommendations(example_user_id, nn_model, tfidf_matrix_songs, songname_to_index, als_model, user_item_matrix, user_id_mapping, df_unique_songname)
print("Hybrid Recommendations for user", example_user_id, ":", hybrid_recs)


Hybrid Recommendations for user 9cc0cfd4d7d7885102480dd99e7a90d6 : ['nirvana  smells like teen spirit', 'the rolling stones  gimme shelter', 'nirvana  come as you are', 'dire straits  sultans of swing', 'the white stripes  seven nation army', 'creedence clearwater revival  fortunate son', 'the smashing pumpkins  ', 'radiohead  creep', 'pixies  where is my mind', 'lynyrd skynyrd  sweet home alabama']
