In [5]:
import numpy as np
import pandas as pd
import warnings # mange error
warnings.simplefilter(action='ignore', category=FutureWarning)
from scipy import sparse #include datastructure store large matrrices that have zeros
import random
import lightfm # library used for building recommendation systems,  imports specific functionalities from LightFM library.
from lightfm import LightFM, cross_validation
from lightfm.evaluation import precision_at_k, auc_score
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
p = 0.50  # to randomly select 50% of the rows

df_playlist = pd.read_csv('/content/drive/MyDrive/Spotify/spotify_dataset.csv', error_bad_lines=False, warn_bad_lines=False, skiprows=lambda i: i>0 and random.random() > p)
df_playlist.head()

Unnamed: 0,user_id,"""artistname""","""trackname""","""playlistname"""
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,Lissie,All Be Okay,HARD ROCK 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,Paul McCartney,Band On The Run,HARD ROCK 2010
3,9cc0cfd4d7d7885102480dd99e7a90d6,Paul McCartney,"Blackbird - Live at CitiField, NYC - Digital A...",HARD ROCK 2010
4,9cc0cfd4d7d7885102480dd99e7a90d6,The Breakers,Dance The Go-Go,HARD ROCK 2010


In [8]:
df_playlist.shape

(6449672, 4)

In [9]:
df_playlist.columns = df_playlist.columns.str.replace('"', '')
df_playlist.columns = df_playlist.columns.str.replace('name', '')
df_playlist.columns = df_playlist.columns.str.replace(' ', '')
df_playlist.columns

Index(['user_id', 'artist', 'track', 'playlist'], dtype='object')

In [10]:
df_playlist = df_playlist.groupby('artist').filter(lambda x : len(x)>=50) # make a playlist include by artist that greater than 50 freq

In [11]:
df_playlist = df_playlist[df_playlist.groupby('user_id').artist.transform('nunique') >= 10]

In [12]:
size = lambda x: len(x)
df_freq = df_playlist.groupby(['user_id', 'artist']).agg('size').reset_index().rename(columns={0:'freq'})[['user_id', 'artist', 'freq']].sort_values(['freq'], ascending=False)
df_freq.head()

Unnamed: 0,user_id,artist,freq
1392996,defced0ece4ce946160b0d2698142eac,Vitamin String Quartet,1670
247118,26b51e580277e131f87e4c7ee4c0887a,Vitamin String Quartet,1632
412664,414050deadb38aafd8d4ad22ca634055,Vitamin String Quartet,1307
1358866,d993ff8f2de226e2c6803e47a22e9d7e,Lata Mangeshkar,1172
10916,014e695cc6df96011b90a5beb3206012,Ilaiyaraaja,1141


In [13]:
df_artist = pd.DataFrame(df_freq["artist"].unique())
df_artist = df_artist.reset_index()
df_artist = df_artist.rename(columns={'index':'artist_id', 0:'artist'})
df_artist.head()

Unnamed: 0,artist_id,artist
0,0,Vitamin String Quartet
1,1,Lata Mangeshkar
2,2,Ilaiyaraaja
3,3,Grateful Dead
4,4,Peggy Lee


In [14]:
df_artist.shape

(14321, 2)

In [32]:
df_freq  = pd.merge(df_freq , df_artist, how='inner', on='artist')
df_freq.tail()

Unnamed: 0,user_id,artist,freq,artist_id_x,artist_id_y,artist_id_x.1,artist_id_y.1
1592616,68cd249185bacf63ede2ef7ead7d2a6c,Moein,1,14317,14317,14317,14317
1592617,82988b8fb41b233433d94b86b88b44bd,Chief Commander Ebenezer Obey,1,14318,14318,14318,14318
1592618,8413993862ec3d06d3e58b73c1f91beb,Mahasti,1,14319,14319,14319,14319
1592619,946cb1bb44338964e3d097632bf9dd81,The Gevatron - Israeli Kibbutz Folk Singers,1,14320,14320,14320,14320
1592620,4e352e4d5505778a08c8dab506b21c29,The Gevatron - Israeli Kibbutz Folk Singers,1,14320,14320,14320,14320


In [16]:
#stores the ratings between the user and the song
#normalize the rating 1-10 not in binary
def create_interaction_matrix(df,user_col, item_col, rating_col, norm= False, threshold = None):
    '''
    Function to create an interaction matrix dataframe from transactional type interactions
    Required Input -
        - df = Pandas DataFrame containing user-item interactions
        - user_col = column name containing user's identifier
        - item_col = column name containing item's identifier
        - rating col = column name containing user feedback on interaction with a given item
        - norm (optional) = True if a normalization of ratings is needed
        - threshold (required if norm = True) = value above which the rating is favorable
    Expected output -
        - Pandas dataframe with user-item interactions ready to be fed in a recommendation algorithm
    '''
    interactions = df.groupby([user_col, item_col])[rating_col] \
            .sum().unstack().reset_index(). \
            fillna(0).set_index(user_col)
    if norm:
        interactions = interactions.applymap(lambda x: 1 if x > threshold else 0)
    return interactions

In [17]:
# https://github.com/aayushmnit/cookbook/blob/master/recsys.py
#takes the interactions datasets and generates dictionary ( key = interactions index , value = user id)
# mapping between the indices of users in the interaction matrix and user id
def create_user_dict(interactions):
    '''
    Function to create a user dictionary based on their index and number in interaction dataset
    Required Input -
        interactions - dataset create by create_interaction_matrix
    Expected Output -
        user_dict - Dictionary type output containing interaction_index as key and user_id as value
    '''
    user_id = list(interactions.index)
    user_dict = {}
    counter = 0
    for i in user_id:
        user_dict[i] = counter
        counter += 1
    return user_dict

In [18]:
# https://github.com/aayushmnit/cookbook/blob/master/recsys.py
#the keys are item IDs, and the values are item names.
def create_item_dict(df,id_col,name_col):
    '''
    Function to create an item dictionary based on their item_id and item name
    Required Input -
        - df = Pandas dataframe with Item information
        - id_col = Column name containing unique identifier for an item
        - name_col = Column name containing name of the item
    Expected Output -
        item_dict = Dictionary type output containing item_id as key and item_name as value
    '''
    item_dict ={}
    for i in range(df.shape[0]):
        item_dict[(df.loc[i,id_col])] = df.loc[i,name_col]
    return item_dict

In [19]:
# https://github.com/aayushmnit/cookbook/blob/master/recsys.py
# n_components reducing dim to 30 d

#The runMF function trains a recommendation model using matrix factorization techniques from LightFM,
#learning user and item embeddings in a lower-dimensional space
#based on user-item interactions for making personalized recommendations.

def runMF(interactions, n_components=30, loss='warp', k=15, epoch=30,n_jobs = 4):
    '''
    Function to run matrix-factorization algorithm
    Required Input -
        - interactions = dataset create by create_interaction_matrix
        - n_components = number of embeddings you want to create to define Item and user
        - loss = loss function other options are logistic, brp
        - epoch = number of epochs to run
        - n_jobs = number of cores used for execution
    Expected Output  -
        Model - Trained model
    '''

    #uncommented for train test split
#     x = sparse.csr_matrix(interactions.values)
    model = LightFM(no_components= n_components, loss=loss,k=k)
    model.fit(x,epochs=epoch,num_threads = n_jobs)
    return model

In [20]:
# https://github.com/aayushmnit/cookbook/blob/master/recsys.py
def sample_recommendation_user(model, interactions, user_id, user_dict,
                               item_dict,threshold = 0,nrec_items = 10, show = True):
    '''
    Function to produce user recommendations
    Required Input -
        - model = Trained matrix factorization model
        - interactions = dataset used for training the model
        - user_id = user ID for which we need to generate recommendation
        - user_dict = Dictionary type input containing interaction_index as key and user_id as value
        - item_dict = Dictionary type input containing item_id as key and item_name as value
        - threshold = value above which the rating is favorable in new interaction matrix
        - nrec_items = Number of output recommendation needed
    Expected Output -
        - Prints list of items the given user has already bought
        - Prints list of N recommended items  which user hopefully will be interested in
    '''
    n_users, n_items = interactions.shape
    user_x = user_dict[user_id]
    scores = pd.Series(model.predict(user_x,np.arange(n_items)))
    scores.index = interactions.columns
    scores = list(pd.Series(scores.sort_values(ascending=False).index))

    known_items = list(pd.Series(interactions.loc[user_id,:] \
                                 [interactions.loc[user_id,:] > threshold].index) \
								 .sort_values(ascending=False))

    scores = [x for x in scores if x not in known_items]
    return_score_list = scores[0:nrec_items]
    known_items = list(pd.Series(known_items).apply(lambda x: item_dict[x]))
    scores = list(pd.Series(return_score_list).apply(lambda x: item_dict[x]))
    if show == True:
        print("Known Likes:")
        counter = 1
        for i in known_items:
            print(str(counter) + '- ' + i)
            counter+=1

        print("\n Recommended Items:")
        counter = 1
        for i in scores:
            print(str(counter) + '- ' + i)
            counter+=1
    return return_score_list

In [21]:
interactions = create_interaction_matrix(df = df_freq, user_col = "user_id", item_col = 'artist_id', rating_col = 'freq', norm= False, threshold = None)
interactions.head()

artist_id,0,1,2,3,4,5,6,7,8,9,...,14311,14312,14313,14314,14315,14316,14317,14318,14319,14320
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00055176fea33f6e027cd3302289378b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0007f3dd09c91198371454c608d47f22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000b0f32b5739f052b9d40fcc5c41079,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000c11a16c89aa4b14b328080f5954ee,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00123e0f544dee3ab006aa7f1e5725a7,0.0,0.0,0.0,19.0,0.0,1.0,0.0,0.0,80.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0




In [22]:
interactions.shape

(12924, 14321)

In [23]:
user_dict = create_user_dict(interactions=interactions)
artists_dict = create_item_dict(df = df_artist, id_col = 'artist_id', name_col = 'artist')
x = sparse.csr_matrix(interactions.values)
train, test = lightfm.cross_validation.random_train_test_split(x, test_percentage=0.2, random_state=None)

In [24]:
%time
model = runMF(interactions = train,
                 n_components = 30,
                 loss = 'warp',
                 k = 15,
                 epoch = 30,
                 n_jobs = 4)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 8.34 µs


In [25]:
train_auc = auc_score(model, train, num_threads=4).mean()
print('Train AUC: %s' % train_auc)

Train AUC: 0.9669841


In [26]:
test_auc = auc_score(model, test, train_interactions=train, num_threads=4).mean()
print('Test AUC: %s' % test_auc)

Test AUC: 0.96763295


In [27]:
train_precision = precision_at_k(model, train, k=10).mean()
test_precision = precision_at_k(model, test, k=10, train_interactions=train).mean()
print('train Precision %.2f, test Precision %.2f.' % (train_precision, test_precision))

train Precision 0.40, test Precision 0.19.


In [28]:
# returns a list of recommended item indices
# it displays known items and the recommended items in a formatted manner.
rec_list = sample_recommendation_user(model = model,
                                      interactions = interactions,
                                      user_id = '9cc0cfd4d7d7885102480dd99e7a90d6',
                                      user_dict = user_dict,
                                      item_dict = artists_dict,
                                      threshold = 0,
                                      nrec_items = 10,
                                      show = True)

Known Likes:
1- Spector
2- Miles Kane
3- Lissie
4- Crosby, Stills & Nash
5- Noel Gallagher's High Flying Birds
6- Noah And The Whale
7- Joshua Radin
8- Elbow
9- Crowded House
10- Biffy Clyro
11- Tom Petty And The Heartbreakers
12- Madness
13- Elvis Costello
14- Pearl Jam
15- Paul McCartney
16- Bruce Springsteen

 Recommended Items:
1- The Rolling Stones
2- Kings Of Leon
3- Bob Dylan
4- Mumford & Sons
5- David Bowie
6- U2
7- The Who
8- R.E.M.
9- Radiohead
10- Coldplay


In [29]:
rec_list = sample_recommendation_user(model = model,
                                      interactions = interactions,
                                      user_id = 'ffe32d5412269f3041c58cbf0dde3306',
                                      user_dict = user_dict,
                                      item_dict = artists_dict,
                                      threshold = 0,
                                      nrec_items = 10,
                                      show = True)

Known Likes:
1- The Contours
2- Shalamar
3- KC & The Sunshine Band
4- Tony! Toni! Toné!
5- Martha Reeves & The Vandellas
6- CHVRCHES
7- The Damnwells
8- The Hold Steady
9- Phosphorescent
10- Diana Ross
11- The Supremes
12- George Benson
13- Mary J. Blige
14- Frankie Valli & The Four Seasons
15- Otis Redding
16- Earth, Wind & Fire
17- Radiohead
18- U2
19- The Isley Brothers
20- Stevie Wonder
21- Bruce Springsteen
22- Ray Charles

 Recommended Items:
1- Marvin Gaye
2- Aretha Franklin
3- The Temptations
4- Al Green
5- Prince
6- Fleetwood Mac
7- Smokey Robinson & The Miracles
8- Michael Jackson
9- James Brown
10- David Bowie
