# Content Based Recommendation System

In [1]:
from google.cloud import storage
import os
from io import BytesIO
import numpy as np
import pandas as pd
import nltk
import gensim
from nltk.corpus import stopwords
from gensim import corpora,models
from gensim.models import LdaModel, LsiModel
import warnings
from sklearn.metrics.pairwise import cosine_similarity
warnings.filterwarnings("ignore")

In [1]:
# Load the lyrics data
# Show every column when a frame is displayed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)
# Long-format table: one row per (track_id, word) pair with that word's count
# (see the preview below).
data = pd.read_csv('gs://fion_bucket1/lyrics.csv')
print(data.shape)
data.head()

(19045332, 6)


Unnamed: 0.1,Unnamed: 0,track_id,mxm_tid,word,count,is_test
0,0,TRAAAAV128F421A322,4623710,i,6,0
1,1,TRAAAAV128F421A322,4623710,the,4,0
2,2,TRAAAAV128F421A322,4623710,you,2,0
3,3,TRAAAAV128F421A322,4623710,to,2,0
4,4,TRAAAAV128F421A322,4623710,and,5,0


## Stopwords Removal

In [2]:
# stop words
# nltk.download('stopwords')
stoplist = stopwords.words('english')

In [3]:
# Flag stopwords (the flagged rows are filtered out in a later cell).
# Series.isin does the membership test in one vectorised pass; the previous
# row-wise apply(axis=1) called a Python lambda and scanned the stoplist
# once for each of the ~19M rows.
data['stop'] = data['word'].isin(stoplist)

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,track_id,mxm_tid,word,count,is_test,stop
0,0,TRAAAAV128F421A322,4623710,i,6,0,True
1,1,TRAAAAV128F421A322,4623710,the,4,0,True
2,2,TRAAAAV128F421A322,4623710,you,2,0,True
3,3,TRAAAAV128F421A322,4623710,to,2,0,True
4,4,TRAAAAV128F421A322,4623710,and,5,0,True


In [5]:
# Keep only the rows whose word was not flagged as a stopword.
lyric = data.loc[~data['stop']]

In [6]:
lyric

Unnamed: 0.1,Unnamed: 0,track_id,mxm_tid,word,count,is_test,stop
22,22,TRAAAAV128F421A322,4623710,like,2,0,False
23,23,TRAAAAV128F421A322,4623710,de,1,0,False
27,27,TRAAAAV128F421A322,4623710,got,1,0,False
28,28,TRAAAAV128F421A322,4623710,would,1,0,False
31,31,TRAAAAV128F421A322,4623710,seem,1,0,False
...,...,...,...,...,...,...,...
19045327,19045327,TRZZZZD128F4236844,2466899,easili,1,1,False
19045328,19045328,TRZZZZD128F4236844,2466899,disast,1,1,False
19045329,19045329,TRZZZZD128F4236844,2466899,frown,1,1,False
19045330,19045330,TRZZZZD128F4236844,2466899,teas,1,1,False


In [7]:
lyric_nostop = lyric.iloc[:, 1:5]

In [8]:
lyric_nostop.reset_index(drop=True, inplace=True)

In [9]:
lyric_nostop

Unnamed: 0,track_id,mxm_tid,word,count
0,TRAAAAV128F421A322,4623710,like,2
1,TRAAAAV128F421A322,4623710,de,1
2,TRAAAAV128F421A322,4623710,got,1
3,TRAAAAV128F421A322,4623710,would,1
4,TRAAAAV128F421A322,4623710,seem,1
...,...,...,...,...
12789775,TRZZZZD128F4236844,2466899,easili,1
12789776,TRZZZZD128F4236844,2466899,disast,1
12789777,TRZZZZD128F4236844,2466899,frown,1
12789778,TRZZZZD128F4236844,2466899,teas,1


## Before further processing, keep only the track IDs that appear in the listening history, artist_similarity, and lyrics data.

In [10]:
artist_sim = pd.read_csv('gs://fion_bucket1/artist_similarity.csv')

In [11]:
track_meta = pd.read_csv('gs://lj-bucket1/tracks_metadata.csv')

In [12]:
artist_sim

Unnamed: 0,target,new
0,AR002UA1187B9A637D,"['ARQDOR81187FB3B06C', 'AROHMXJ1187B989023', '..."
1,AR003FB1187B994355,"['ARYACSL1187FB51611', 'ARYLCCQ1187B999F4B', '..."
2,AR006821187FB5192B,"['ARW25O21187B991492', 'ARQKS2U1187FB4CFBA', '..."
3,AR009211187B989185,"['ARJRM4M1187B9B4462', 'ARHINI31187B995C1D', '..."
4,AR009SZ1187B9A73F4,"['ARY8CFI1187B98D5E3', 'ARO03MT1187B9A8F2D', '..."
...,...,...
44455,ARZZXJY1187B99E2BB,"['AREJ5K11187B993F5F', 'AR5AXVN1187B9A2761', '..."
44456,ARZZXT51187FB4627E,"['ARVIDW81187FB5AAC3', 'ARC4AJX1187FB3C6BC', '..."
44457,ARZZYRB1187B99D0B6,"['AR4XV7Y1187FB41004', 'ARLELZZ1187B993920', '..."
44458,ARZZYRH11C8A416A12,"['ARLSFWF12086C152F4', 'AR0VU8Y11C8A422C79', '..."


In [13]:
track_meta.head()

Unnamed: 0,track_id,title,song_id,release,artist_id,artist_mbid,artist_name,duration,artist_familiarity,artist_hotttnesss,year
0,TRMMMYQ128F932D901,Silent Night,SOQMMHC12AB0180CB8,Monster Ballads X-Mas,ARYZTJS1187B98C555,357ff05d-848a-44cf-b608-cb34b5701ae5,Faster Pussy cat,252.05506,0.649822,0.394032,2003
1,TRMMMCH128F425532C,Si Vos Querés,SOBNYVR12A8C13558C,De Culo,ARNWYLR1187B9B2F9C,12be7648-7094-495f-90e6-df4189d68615,Yerba Brava,145.05751,0.448501,0.372349,2003
2,TRMMMNS128F93548E1,L'antarctique,SOYGNWH12AB018191E,Des cobras des tarentules,AR59BSJ1187FB4474F,891fccfc-24c1-4bfd-bf49-c736e59e443f,3 Gars Su'l Sofa,68.96281,0.555014,0.352949,2007
3,TRMMMXI128F4285A3F,N Gana,SOGPCJI12A8C13CCA0,Afropea 3 - Telling Stories To The Sea,ARBAMQB1187FB3C650,0bb5e108-b41d-46cd-969e-69d34d1acdfe,Waldemar Bastos,273.18812,0.54369,0.373679,0
4,TRMMMKI128F931D80D,006,SOSDCFG12AB0184647,Lena 20 År,ARSB5591187B99A848,fba3e876-68f1-4a1f-99d9-c604480202ba,Lena Philipsson,262.26893,0.529819,0.410229,1998


In [14]:
artist_track = artist_sim.merge(track_meta[['track_id', 'artist_id']], how='inner', left_on='target', right_on='artist_id')

In [15]:
artist_track

Unnamed: 0,target,new,track_id,artist_id
0,AR003FB1187B994355,"['ARYACSL1187FB51611', 'ARYLCCQ1187B999F4B', '...",TRWDPFR128F93594A6,AR003FB1187B994355
1,AR003FB1187B994355,"['ARYACSL1187FB51611', 'ARYLCCQ1187B999F4B', '...",TRQCGVN128F93594B6,AR003FB1187B994355
2,AR003FB1187B994355,"['ARYACSL1187FB51611', 'ARYLCCQ1187B999F4B', '...",TRQVTFP12903CCEE67,AR003FB1187B994355
3,AR003FB1187B994355,"['ARYACSL1187FB51611', 'ARYLCCQ1187B999F4B', '...",TRLVUGC128F935949B,AR003FB1187B994355
4,AR003FB1187B994355,"['ARYACSL1187FB51611', 'ARYLCCQ1187B999F4B', '...",TRSJHZL128F93594B0,AR003FB1187B994355
...,...,...,...,...
385147,ARZZXT51187FB4627E,"['ARVIDW81187FB5AAC3', 'ARC4AJX1187FB3C6BC', '...",TRJUDKX128F1467BC0,ARZZXT51187FB4627E
385148,ARZZXT51187FB4627E,"['ARVIDW81187FB5AAC3', 'ARC4AJX1187FB3C6BC', '...",TROCCZL128F1467BC8,ARZZXT51187FB4627E
385149,ARZZXT51187FB4627E,"['ARVIDW81187FB5AAC3', 'ARC4AJX1187FB3C6BC', '...",TRKNGSD128E0791BDE,ARZZXT51187FB4627E
385150,ARZZXT51187FB4627E,"['ARVIDW81187FB5AAC3', 'ARC4AJX1187FB3C6BC', '...",TRKEFRY128E0791BE2,ARZZXT51187FB4627E


In [16]:
lyric_his = artist_track[['track_id']].merge(lyric_nostop, how='inner', on='track_id')

In [17]:
lyric_his

Unnamed: 0,track_id,mxm_tid,word,count
0,TRMZTST128E0792E44,3272695,know,19
1,TRMZTST128E0792E44,3272695,go,1
2,TRMZTST128E0792E44,3272695,see,1
3,TRMZTST128E0792E44,3272695,got,5
4,TRMZTST128E0792E44,3272695,feel,1
...,...,...,...,...
7725675,TRAGLKL12903CEDE95,9432841,hurri,2
7725676,TRAGLKL12903CEDE95,9432841,rollin,7
7725677,TRAGLKL12903CEDE95,9432841,fallin,3
7725678,TRAGLKL12903CEDE95,9432841,cheek,4


In [18]:
# Number of distinct tracks left after the join
lyric_his['track_id'].nunique(dropna=False)

142263

## Stemming

In [19]:
# Stemming
# NOTE(review): the words in the preview already look stemmed ('easili',
# 'disast'), so this step may be a no-op for most rows - confirm.
porter = nltk.PorterStemmer()
# Map over the single column instead of building a row object per record
# with apply(axis=1); same result, far less overhead on millions of rows.
lyric_his['porter'] = lyric_his['word'].map(porter.stem)

In [20]:
lyric_his

Unnamed: 0,track_id,mxm_tid,word,count,porter
0,TRMZTST128E0792E44,3272695,know,19,know
1,TRMZTST128E0792E44,3272695,go,1,go
2,TRMZTST128E0792E44,3272695,see,1,see
3,TRMZTST128E0792E44,3272695,got,5,got
4,TRMZTST128E0792E44,3272695,feel,1,feel
...,...,...,...,...,...
7725675,TRAGLKL12903CEDE95,9432841,hurri,2,hurri
7725676,TRAGLKL12903CEDE95,9432841,rollin,7,rollin
7725677,TRAGLKL12903CEDE95,9432841,fallin,3,fallin
7725678,TRAGLKL12903CEDE95,9432841,cheek,4,cheek


## TFIDF Matrix
### trackid as rows, tokens as columns, tf-idf scores as elements

In [22]:
# Expand each (stem, count) pair into the stem repeated `count` times, each
# occurrence followed by a space, so a plain-text vectorizer sees the right
# term frequency. A single zip pass replaces the row-wise apply(axis=1).
lyric_his['words'] = [
    (stem + ' ') * count
    for stem, count in zip(lyric_his['porter'], lyric_his['count'])
]

In [23]:
lyric_his

Unnamed: 0,track_id,mxm_tid,word,count,porter,words
0,TRMZTST128E0792E44,3272695,know,19,know,know know know know know know know know know k...
1,TRMZTST128E0792E44,3272695,go,1,go,go
2,TRMZTST128E0792E44,3272695,see,1,see,see
3,TRMZTST128E0792E44,3272695,got,5,got,got got got got got
4,TRMZTST128E0792E44,3272695,feel,1,feel,feel
...,...,...,...,...,...,...
7725675,TRAGLKL12903CEDE95,9432841,hurri,2,hurri,hurri hurri
7725676,TRAGLKL12903CEDE95,9432841,rollin,7,rollin,rollin rollin rollin rollin rollin rollin rollin
7725677,TRAGLKL12903CEDE95,9432841,fallin,3,fallin,fallin fallin fallin
7725678,TRAGLKL12903CEDE95,9432841,cheek,4,cheek,cheek cheek cheek cheek


In [24]:
# Collapse the long table to one row per track: every repeated-word chunk of
# a track is joined into a single space-separated `lyrics` string.
lyric = (
    lyric_his
    .groupby('track_id')['words']
    .agg(' '.join)
    .reset_index(name='lyrics')
)

In [25]:
lyric

Unnamed: 0,track_id,lyrics
0,TRAAAAV128F421A322,like like de got would seem someon under...
1,TRAAABD128F429CF47,know know know know know time time time la l...
2,TRAAAED128E0783FAB,love love love love love love love love love l...
3,TRAAAEW128F42930C0,like take would wo someth stay burn burn...
4,TRAAAFD128F92F423A,one got never feel way way take would a...
...,...,...
142258,TRZZZUK128F92E3C60,love love see see heart heart gonna gonna ...
142259,TRZZZXA128F428ED56,time la get eye think give dream wo wo ...
142260,TRZZZYV128F92E996D,get get get get get get get get get get get ge...
142261,TRZZZYX128F92D32C6,know know time time time go go go go go go g...


In [13]:
# Tokenization
# nltk.download('punkt')  # one-off download of the tokenizer data
# Map over the text column directly rather than apply(axis=1) over whole rows.
# Note that word_tokenize can split a stem further ('gonna' -> 'gon', 'na').
lyric['lyrics_token'] = lyric['lyrics'].map(nltk.word_tokenize)

In [14]:
lyric.iloc[:, 0:3]

Unnamed: 0,track_id,lyrics,lyrics_token
0,TRAAAAV128F421A322,like like de got would seem someon under...,"[like, like, de, got, would, seem, someon, und..."
1,TRAAABD128F429CF47,know know know know know time time time la l...,"[know, know, know, know, know, time, time, tim..."
2,TRAAAED128E0783FAB,love love love love love love love love love l...,"[love, love, love, love, love, love, love, lov..."
3,TRAAAEW128F42930C0,like take would wo someth stay burn burn...,"[like, take, would, wo, someth, stay, burn, bu..."
4,TRAAAFD128F92F423A,one got never feel way way take would a...,"[one, got, never, feel, way, way, take, would,..."
...,...,...,...
142258,TRZZZUK128F92E3C60,love love see see heart heart gonna gonna ...,"[love, love, see, see, heart, heart, gon, na, ..."
142259,TRZZZXA128F428ED56,time la get eye think give dream wo wo ...,"[time, la, get, eye, think, give, dream, wo, w..."
142260,TRZZZYV128F92E996D,get get get get get get get get get get get ge...,"[get, get, get, get, get, get, get, get, get, ..."
142261,TRZZZYX128F92D32C6,know know time time time go go go go go go g...,"[know, know, time, time, time, go, go, go, go,..."


In [58]:
lyric.iloc[:, 0:3].to_csv('lyric_model.csv', index=False)

In [28]:
len(lyric['lyrics_token'][1])

116

# Everything above is data preprocessing; the notebook can be restarted from here

In [2]:
import ast

# Reload the preprocessed lyrics. `lyrics_token` was written to CSV as the
# text form of a Python list, so it is parsed back into a real list here;
# otherwise the Word2Vec / Dictionary / doc2bow cells below would iterate
# over the characters of a string instead of over tokens.
lyric = pd.read_csv('lyric_model.csv', converters={'lyrics_token': ast.literal_eval})

In [3]:
# TF-IDF features: one row per track, one column per term
from sklearn.feature_extraction.text import TfidfVectorizer  # TfidfTransformer() is an alternative

# The original dataset only provides tokens, so only unigrams can be used.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    use_idf=True,
    smooth_idf=True,
    norm='l2',
    min_df=1,
)

tfidf = tfidf_vectorizer.fit_transform(lyric['lyrics'])

In [5]:
tfidf

<142263x4788 sparse matrix of type '<class 'numpy.float64'>'
	with 7669181 stored elements in Compressed Sparse Row format>

In [6]:
int(142263/4)

35565

## Word2Vec Matrix

In [29]:
# Build the word2vec model from the per-track token lists.
# NOTE(review): the tokens come from expanded word counts, not from the
# original lyric text, so neighbouring tokens are mostly repeats of the same
# word - confirm that a context window is meaningful here.
wv_model = gensim.models.Word2Vec(lyric['lyrics_token'],
                               size=200,     # dimensionality of the word vectors
                               window=1,    # number of neighbouring words taken as context
                               min_count=2) # ignore words whose total frequency is lower than this

In [30]:
def average_word_vectors(words, model, vocabulary, num_features):
    """Return the mean embedding of the in-vocabulary tokens of one document.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single document.
    model : object
        Either a trained model exposing its vectors as ``model.wv`` or any
        mapping-like object that can be indexed by token directly.
    vocabulary : container of str
        Tokens that have a vector; every other token is skipped.
    num_features : int
        Length of each word vector (and of the returned vector).

    Returns
    -------
    numpy.ndarray
        float64 vector of length ``num_features``; all zeros when none of
        the tokens is in ``vocabulary``.
    """
    # Indexing the model object itself is deprecated in favour of the keyed
    # vectors held in `model.wv`; fall back to `model` so plain mappings and
    # bare keyed-vector objects keep working.
    vectors = getattr(model, 'wv', model)

    feature_vector = np.zeros((num_features,), dtype="float64")
    nwords = 0

    for word in words:
        if word in vocabulary:
            nwords += 1
            feature_vector = np.add(feature_vector, vectors[word])

    # Guard against division by zero for documents without any known token.
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)

    return feature_vector
   

def averaged_word_vectorizer(corpus, model, num_features):
    """Build one averaged word vector per document.

    Each tokenized document in ``corpus`` is reduced to a single vector by
    ``average_word_vectors``, using the words listed in
    ``model.wv.index2word`` as the vocabulary. The per-document vectors are
    stacked into one numpy array, in corpus order.
    """
    known_words = set(model.wv.index2word)
    document_vectors = []
    for tokens in corpus:
        document_vectors.append(
            average_word_vectors(tokens, model, known_words, num_features)
        )
    return np.array(document_vectors)

In [31]:
# averaged word vector features from word2vec
avg_wv_train_features = averaged_word_vectorizer(corpus=lyric['lyrics_token'],
                                                 model=wv_model,
                                                 num_features=200)   

  if __name__ == '__main__':


In [32]:
len(avg_wv_train_features)

142263

In [33]:
len(avg_wv_train_features[0])

200

In [61]:
avg_wv_train_features

array([[-0.03820242, -0.05298082,  0.09911916, ..., -0.455909  ,
        -0.08220926,  0.19541479],
       [ 0.0633903 , -0.18454109, -0.18230766, ..., -0.21158542,
        -0.06406712, -0.07335967],
       [ 0.06679964, -0.1390753 , -0.12681464, ..., -0.28250462,
        -0.02071588, -0.01962863],
       ...,
       [-0.00960958, -0.14978793, -0.13793814, ..., -0.24237069,
        -0.05000849, -0.06986092],
       [ 0.13031788, -0.14230888, -0.01359256, ..., -0.33358136,
         0.00664195, -0.00502665],
       [ 0.00928513, -0.12776261,  0.05008366, ..., -0.2427845 ,
        -0.13902934,  0.20370942]])

In [62]:
avg_wv_train_features = pd.DataFrame(avg_wv_train_features)

In [64]:
avg_wv_train_features.to_csv('avg_wv_train_features.csv', index=False)

## Topic Model
### Generate Term Document Matrix

In [15]:
# Generate token dictionary class
dictionary = corpora.Dictionary(lyric['lyrics_token']) 
print(dictionary)

Dictionary(4835 unique tokens: ['arrang', 'captur', 'damn', 'de', 'devast']...)


In [16]:
# List every dictionary token once, ordered by ascending token id
sort_token = sorted(dictionary.items(), key=lambda id_and_token: id_and_token[0])
unique_token = [id_and_token[1] for id_and_token in sort_token]

In [17]:
# Build the bag-of-words corpus: each track becomes a list of
# (token id, term frequency in that track) pairs
lyric['corpus'] = lyric['lyrics_token'].map(dictionary.doc2bow)
print(lyric['corpus'].head())

0    [(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1...
1    [(8, 3), (40, 2), (41, 2), (42, 1), (43, 2), (...
2    [(8, 7), (16, 1), (22, 1), (29, 1), (35, 4), (...
3    [(15, 1), (16, 1), (35, 1), (40, 1), (52, 1), ...
4    [(8, 1), (35, 3), (40, 1), (43, 4), (47, 1), (...
Name: corpus, dtype: object


In [18]:
# Build a dense term-document matrix of term frequencies (held in memory only;
# nothing is written to disk here). The array has one row per dictionary term
# and one column per track, which is why later cells use `matrix.T` to get
# tracks as rows.
# NOTE(review): this is a fully dense array of len(dictionary) x number-of-tracks
# integers - check the memory footprint before running on the full corpus.
matrix = gensim.matutils.corpus2dense(lyric['corpus'], num_terms=len(dictionary), dtype = 'int')

In [19]:
matrix.T
#(all unique word's tf in the row)

array([[1, 2, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [20]:
pd.DataFrame(matrix.T)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4825,4826,4827,4828,4829,4830,4831,4832,4833,4834
0,1,2,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,3,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,7,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142258,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
142259,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
142260,0,0,0,0,0,0,0,0,3,0,...,0,0,0,0,0,0,0,0,0,0
142261,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Transpose the matrix and convert the numpy matrix into pandas data frame
lyric2 = pd.concat([lyric,pd.DataFrame(matrix.T)], axis=1)

In [45]:
# Fit the LDA model.
# LDA training is stochastic; fixing random_state makes the topics (and all
# features derived from them below) reproducible across kernel restarts.
lda = models.LdaModel(lyric2['corpus'], id2word=dictionary, num_topics=10, random_state=42)
# Topic matrix (V matrix): top words of every topic
lda.print_topics(10)

[(0,
  '0.018*"know" + 0.016*"time" + 0.014*"never" + 0.012*"see" + 0.011*"feel" + 0.011*"would" + 0.011*"away" + 0.010*"ca" + 0.010*"one" + 0.010*"go"'),
 (1,
  '0.060*"la" + 0.053*"de" + 0.049*"que" + 0.023*"en" + 0.022*"el" + 0.020*"le" + 0.019*"tu" + 0.017*"te" + 0.016*"un" + 0.016*"mi"'),
 (2,
  '0.055*"ich" + 0.048*"da" + 0.042*"und" + 0.039*"die" + 0.024*"du" + 0.022*"der" + 0.021*"nicht" + 0.019*"ist" + 0.019*"es" + 0.017*"ein"'),
 (3,
  '0.010*"die" + 0.010*"god" + 0.008*"world" + 0.008*"soul" + 0.008*"burn" + 0.007*"us" + 0.007*"blood" + 0.007*"dead" + 0.007*"life" + 0.007*"fire"'),
 (4,
  '0.107*"love" + 0.086*"na" + 0.039*"gon" + 0.033*"wan" + 0.021*"know" + 0.019*"give" + 0.018*"need" + 0.018*"let" + 0.016*"make" + 0.016*"want"'),
 (5,
  '0.030*"de" + 0.028*"que" + 0.025*"e" + 0.021*"eu" + 0.018*"det" + 0.017*"jag" + 0.017*"du" + 0.016*"não" + 0.015*"é" + 0.014*"en"'),
 (6,
  '0.057*"e" + 0.050*"di" + 0.045*"che" + 0.039*"non" + 0.032*"la" + 0.027*"il" + 0.025*"mi" + 0.022

In [46]:
# Generate the U matrix (document-topic weights) for the LDA model
corpus_lda = lda[lyric2['corpus']]  # per-track topic distribution

# Convert corpus_lda to a dense numpy matrix. The width is taken from the
# model itself so it cannot drift from the num_topics used when fitting.
U_matrix_lda = gensim.matutils.corpus2dense(corpus_lda, num_terms=lda.num_topics).T
# (one row per track, one column per topic weight)

# Append the topic weights to the working frame.
# Re-running this cell appends the topic columns again.
lyric2 = pd.concat([lyric2, pd.DataFrame(U_matrix_lda)], axis=1)

In [47]:
print (pd.DataFrame(matrix.T).shape)
print (pd.DataFrame(U_matrix_lda).shape)

(142263, 4835)
(142263, 10)


In [None]:
# save model to disk (no need to use pickle module)
lda.save('lda.model')

In [8]:
# later on, load trained model from file
lda = models.LdaModel.load('lda.model')

# print all topics (the loaded model is bound to `lda`; no `model` name exists)
lda.show_topics(10)

[(0,
  '0.018*"know" + 0.016*"time" + 0.014*"never" + 0.012*"see" + 0.011*"feel" + 0.011*"would" + 0.011*"away" + 0.010*"ca" + 0.010*"one" + 0.010*"go"'),
 (1,
  '0.060*"la" + 0.053*"de" + 0.049*"que" + 0.023*"en" + 0.022*"el" + 0.020*"le" + 0.019*"tu" + 0.017*"te" + 0.016*"un" + 0.016*"mi"'),
 (2,
  '0.055*"ich" + 0.048*"da" + 0.042*"und" + 0.039*"die" + 0.024*"du" + 0.022*"der" + 0.021*"nicht" + 0.019*"ist" + 0.019*"es" + 0.017*"ein"'),
 (3,
  '0.010*"die" + 0.010*"god" + 0.008*"world" + 0.008*"soul" + 0.008*"burn" + 0.007*"us" + 0.007*"blood" + 0.007*"dead" + 0.007*"life" + 0.007*"fire"'),
 (4,
  '0.107*"love" + 0.086*"na" + 0.039*"gon" + 0.033*"wan" + 0.021*"know" + 0.019*"give" + 0.018*"need" + 0.018*"let" + 0.016*"make" + 0.016*"want"'),
 (5,
  '0.030*"de" + 0.028*"que" + 0.025*"e" + 0.021*"eu" + 0.018*"det" + 0.017*"jag" + 0.017*"du" + 0.016*"não" + 0.015*"é" + 0.014*"en"'),
 (6,
  '0.057*"e" + 0.050*"di" + 0.045*"che" + 0.039*"non" + 0.032*"la" + 0.027*"il" + 0.025*"mi" + 0.022

## Visualization of LDA topics using pyLDAvis

In [9]:
import pyLDAvis.gensim

In [23]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda, lyric2['corpus'], dictionary)

In [50]:
lyric = pd.concat([lyric,pd.DataFrame(U_matrix_lda)], axis=1)

In [51]:
lyric.head()

Unnamed: 0,track_id,lyrics,lyrics_token,corpus,0,1,2,3,4,5,6,7,8,9
0,TRAAAAV128F421A322,like like de got would seem someon under...,"[like, like, de, got, would, seem, someon, und...","[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1...",0.200995,0.024763,0.0,0.684794,0.0,0.0,0.0,0.0,0.076936,0.0
1,TRAAABD128F429CF47,know know know know know time time time la l...,"[know, know, know, know, know, time, time, tim...","[(8, 3), (40, 2), (41, 2), (42, 1), (43, 2), (...",0.609247,0.062082,0.0,0.0,0.224295,0.0,0.0,0.077318,0.0,0.022783
2,TRAAAED128E0783FAB,love love love love love love love love love l...,"[love, love, love, love, love, love, love, lov...","[(8, 7), (16, 1), (22, 1), (29, 1), (35, 4), (...",0.577299,0.0,0.0,0.0,0.418489,0.0,0.0,0.0,0.0,0.0
3,TRAAAEW128F42930C0,like take would wo someth stay burn burn...,"[like, take, would, wo, someth, stay, burn, bu...","[(15, 1), (16, 1), (35, 1), (40, 1), (52, 1), ...",0.147659,0.0,0.0,0.6713,0.0,0.0,0.0,0.0,0.170264,0.0
4,TRAAAFD128F92F423A,one got never feel way way take would a...,"[one, got, never, feel, way, way, take, would,...","[(8, 1), (35, 3), (40, 1), (43, 4), (47, 1), (...",0.709216,0.0,0.0,0.27951,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
topic_features = pd.DataFrame(U_matrix_lda)

In [67]:
topic_features.to_csv('topic_features.csv', index=False)

## Calculate Cosine Similarity between Tracks

In [7]:
topic_features = pd.read_csv('topic_features.csv')

In [8]:
topic_features_matrix = topic_features.to_numpy()

In [9]:
avg_wv_train_features = pd.read_csv('avg_wv_train_features.csv')

In [10]:
avg_wv_train_features = avg_wv_train_features.to_numpy()

In [11]:
# Pairwise cosine similarity between TF-IDF track vectors, computed only for
# the first quarter of the rows (int(142263/4) = 35565 of the 142263 tracks).
# Tracks beyond that row have no entry in this matrix.
# NOTE(review): presumably truncated to keep the dense n x n result in memory - confirm.
cosine_sim_tfidf = cosine_similarity(tfidf[:int(142263/4)])
cosine_sim_tfidf

array([[1.        , 0.0109541 , 0.03660467, ..., 0.05770501, 0.0420159 ,
        0.00175765],
       [0.0109541 , 1.        , 0.15849612, ..., 0.07585991, 0.00402382,
        0.12804163],
       [0.03660467, 0.15849612, 1.        , ..., 0.2021549 , 0.        ,
        0.13829145],
       ...,
       [0.05770501, 0.07585991, 0.2021549 , ..., 1.        , 0.00898596,
        0.15232903],
       [0.0420159 , 0.00402382, 0.        , ..., 0.00898596, 1.        ,
        0.0130168 ],
       [0.00175765, 0.12804163, 0.13829145, ..., 0.15232903, 0.0130168 ,
        1.        ]])

In [12]:
# Pairwise cosine similarity between averaged word2vec track vectors, computed
# only for the first quarter of the rows (int(142263/4) = 35565 tracks).
# NOTE(review): presumably truncated to keep the dense n x n result in memory - confirm.
cosine_sim_w2v = cosine_similarity(avg_wv_train_features[:int(142263/4)])
cosine_sim_w2v

array([[1.        , 0.85040102, 0.87387134, ..., 0.89870171, 0.9171138 ,
        0.89589825],
       [0.85040102, 1.        , 0.96339453, ..., 0.94318361, 0.81617376,
        0.95017148],
       [0.87387134, 0.96339453, 1.        , ..., 0.97544656, 0.8456146 ,
        0.96880243],
       ...,
       [0.89870171, 0.94318361, 0.97544656, ..., 1.        , 0.86555047,
        0.9711445 ],
       [0.9171138 , 0.81617376, 0.8456146 , ..., 0.86555047, 1.        ,
        0.87815783],
       [0.89589825, 0.95017148, 0.96880243, ..., 0.9711445 , 0.87815783,
        1.        ]])

In [13]:
# Pairwise cosine similarity between LDA topic-weight vectors, computed only
# for the first quarter of the rows (int(142263/4) = 35565 tracks).
# NOTE(review): presumably truncated to keep the dense n x n result in memory - confirm.
cosine_sim_lda = cosine_similarity(topic_features_matrix[:int(142263/4)])
cosine_sim_lda

array([[1.        , 0.26270198, 0.22657326, ..., 0.15114958, 0.97745607,
        0.27812288],
       [0.26270198, 1.        , 0.95095552, ..., 0.794842  , 0.09690533,
        0.9901594 ],
       [0.22657326, 0.95095552, 1.        , ..., 0.92975042, 0.08462744,
        0.96816644],
       ...,
       [0.15114958, 0.794842  , 0.92975042, ..., 1.        , 0.05645592,
        0.81185149],
       [0.97745607, 0.09690533, 0.08462744, ..., 0.05645592, 1.        ,
        0.11592042],
       [0.27812288, 0.9901594 , 0.96816644, ..., 0.81185149, 0.11592042,
        1.        ]])

## Load Metadata and Similar Artists for Looking Up Track Information

In [14]:
artist_sim = pd.read_csv('gs://fion_bucket1/artist_similarity.csv')
track_meta = pd.read_csv('gs://lj-bucket1/tracks_metadata.csv')

In [15]:
# Turn the stringified list in `new` into a real list of artist ids:
# strip the surrounding brackets, split on commas, then remove the quote
# characters and spaces from every piece.
artist_sim['similar_artist'] = artist_sim['new'].apply(
    lambda raw: [
        piece.replace("'", '').replace(" ", '')
        for piece in raw[1:-1].split(',')
    ]
)

In [16]:
artist_sim

Unnamed: 0,target,new,similar_artist
0,AR002UA1187B9A637D,"['ARQDOR81187FB3B06C', 'AROHMXJ1187B989023', '...","[ARQDOR81187FB3B06C, AROHMXJ1187B989023, ARAGW..."
1,AR003FB1187B994355,"['ARYACSL1187FB51611', 'ARYLCCQ1187B999F4B', '...","[ARYACSL1187FB51611, ARYLCCQ1187B999F4B, AR783..."
2,AR006821187FB5192B,"['ARW25O21187B991492', 'ARQKS2U1187FB4CFBA', '...","[ARW25O21187B991492, ARQKS2U1187FB4CFBA, ARRKD..."
3,AR009211187B989185,"['ARJRM4M1187B9B4462', 'ARHINI31187B995C1D', '...","[ARJRM4M1187B9B4462, ARHINI31187B995C1D, ARI0P..."
4,AR009SZ1187B9A73F4,"['ARY8CFI1187B98D5E3', 'ARO03MT1187B9A8F2D', '...","[ARY8CFI1187B98D5E3, ARO03MT1187B9A8F2D, AR2NW..."
...,...,...,...
44455,ARZZXJY1187B99E2BB,"['AREJ5K11187B993F5F', 'AR5AXVN1187B9A2761', '...","[AREJ5K11187B993F5F, AR5AXVN1187B9A2761, ARXXX..."
44456,ARZZXT51187FB4627E,"['ARVIDW81187FB5AAC3', 'ARC4AJX1187FB3C6BC', '...","[ARVIDW81187FB5AAC3, ARC4AJX1187FB3C6BC, ARBZ6..."
44457,ARZZYRB1187B99D0B6,"['AR4XV7Y1187FB41004', 'ARLELZZ1187B993920', '...","[AR4XV7Y1187FB41004, ARLELZZ1187B993920, ARR0C..."
44458,ARZZYRH11C8A416A12,"['ARLSFWF12086C152F4', 'AR0VU8Y11C8A422C79', '...","[ARLSFWF12086C152F4, AR0VU8Y11C8A422C79, ARS7P..."


## Define a Content-Based Recommendation System Function

In [17]:
# Track ids in the row order of `lyric`. The recommendation functions use the
# index label of a matching entry as the row/column number in the cosine
# similarity matrices, so `lyric` must keep its default 0..n-1 index and the
# same row order the feature matrices were built from.
indices = pd.Series(lyric['track_id'])

In [18]:
# Define a function to get the similar track based on the cosine similarity
def recommend_id(track_id, cosine_sim):
    """Return the ten tracks most similar to ``track_id``.

    Parameters
    ----------
    track_id : str
        Id of the seed track; looked up in the module-level ``indices``.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix whose row/column numbers match the index
        labels of ``indices``.

    Returns
    -------
    pandas.DataFrame
        ``artist_name`` and ``title`` of the recommended tracks, best match
        first. An empty frame is returned when the track is unknown or lies
        outside the rows covered by ``cosine_sim``.

    Side effects
    ------------
    Rebinds the module-level ``score_series`` to the seed's similarity scores
    sorted in descending order (an empty Series when no scores are available),
    so a result left over from an earlier call is never reused.
    """
    global score_series
    score_series = pd.Series(dtype='float64')

    matches = indices[indices == track_id]
    if matches.empty:
        return pd.DataFrame()
    idx = matches.index[0]

    # The similarity matrix may cover only a prefix of the tracks.
    try:
        similarity_row = cosine_sim[idx]
    except IndexError:
        return pd.DataFrame()

    score_series = pd.Series(similarity_row).sort_values(ascending=False)

    # Exclude the seed by label instead of assuming it is ranked first: ties
    # or rounding can place another track ahead of it.
    top10_indexes = list(score_series.drop(idx, errors='ignore').iloc[:10].index)

    recommend_trackid = lyric.iloc[top10_indexes][['track_id']]
    recommend_track = recommend_trackid.merge(
        track_meta[['track_id', 'artist_name', 'title']],
        how='inner',
        on='track_id',
    )[['artist_name', 'title']]
    return recommend_track

def recommend_title(title, artist, cosine_sim):
    """Recommend tracks for a song identified by title and artist name.

    The result stacks, in this order:
      1. the seed track itself,
      2. the tracks returned by ``recommend_id`` for the seed,
      3. up to ten tracks by similar artists, ranked by their similarity to
         the seed in ``cosine_sim``.

    Parameters
    ----------
    title : str
        Exact track title as stored in ``track_meta['title']``.
    artist : str
        Exact artist name as stored in ``track_meta['artist_name']``.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix whose row/column numbers match the index
        labels of the module-level ``indices``.

    Returns
    -------
    pandas.DataFrame
        Columns ``artist_name`` and ``title`` with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If no track in ``track_meta`` matches ``title`` and ``artist``.
    """
    # Match the track id and artist id with the song title and artist name.
    is_seed = (track_meta['title'] == title) & (track_meta['artist_name'] == artist)
    track_input = track_meta.loc[is_seed, ['track_id', 'artist_id']]
    if track_input.empty:
        raise KeyError(f"no track titled {title!r} by {artist!r} in track_meta")
    tid = track_input['track_id'].iloc[0]  # single track id
    aid = track_input['artist_id'].iloc[0]  # single artist id

    parts = [
        track_meta.loc[track_meta['track_id'] == tid, ['artist_name', 'title']],
        # recommended based on cosine similarity
        recommend_id(tid, cosine_sim),
    ]

    # Similar artists of the seed's artist (none when the artist is unlisted).
    said = artist_sim.loc[artist_sim['target'] == aid, 'similar_artist']
    similar_ids = said.values[0] if len(said) else []

    # Similarity of every track to the seed, kept in matrix order so that a
    # track's row number can be used to read its score. This function computes
    # the scores itself rather than reading a module-level variable.
    seed_scores = None
    seed_rows = indices[indices == tid]
    if not seed_rows.empty:
        try:
            seed_scores = pd.Series(cosine_sim[seed_rows.index[0]])
        except IndexError:
            # The seed lies outside the rows covered by the similarity matrix.
            seed_scores = None

    # recommend based on similar artist
    similar_artist = {}
    if seed_scores is not None:
        # track id -> row number; zipping in reverse keeps the first occurrence.
        first_position = dict(zip(indices.values[::-1], indices.index[::-1]))
        for artist_id in similar_ids:
            artist_tracks = track_meta.loc[track_meta['artist_id'] == artist_id, 'track_id'].values
            for track in artist_tracks:
                position = first_position.get(track)
                # Skip tracks without lyrics features or beyond the matrix.
                if position is not None and position < len(seed_scores):
                    similar_artist[track] = seed_scores.iloc[position]

    similar_artist_df = pd.DataFrame(
        list(similar_artist.items()),
        columns=['track_id', 'cosine_smilarity'],
    ).sort_values(by='cosine_smilarity', ascending=False)
    recommend_strack = similar_artist_df[['track_id']][:10].merge(
        track_meta[['track_id', 'artist_name', 'title']],
        how='inner',
        on='track_id',
    )[['artist_name', 'title']]
    parts.append(recommend_strack)

    # The seed part is never empty, so at least one frame always remains.
    recommended = pd.concat([part for part in parts if not part.empty])
    return recommended.reset_index(drop=True)

In [19]:
title = input('Please enter the song name:')

Please enter the song name:A Poor Recipe For Civic Cohesion


In [20]:
artist = input('Please enter the artist name:')

Please enter the artist name:Western Addiction


In [21]:
recommend_tfidf = recommend_title(title, artist, cosine_sim_tfidf)

In [22]:
recommend_w2v = recommend_title(title, artist, cosine_sim_w2v)

In [23]:
recommend_lda = recommend_title(title, artist, cosine_sim_lda)

In [24]:
recommend_tfidf

Unnamed: 0,artist_name,title
0,Western Addiction,A Poor Recipe For Civic Cohesion
1,Starfield,Hosanna
2,Daniel Johnston,Poor You
3,Patty Griffin,Poor Man's House
4,Nick Cave & The Bad Seeds,Straight To You (2010 Digital Remaster)
5,Ultravox!,Rockwrok
6,Milburn,Stockholm Syndrome
7,Muse,Stockholm Syndrome
8,Radney Foster,The Kindness Of Strangers
9,Traveling Wilburys,Poor House (2007 Remastered LP Version)


In [25]:
recommend_w2v

Unnamed: 0,artist_name,title
0,Western Addiction,A Poor Recipe For Civic Cohesion
1,Candiria,Channeling Elements
2,Dying Fetus,Forced Elimination
3,Extol,Paradigms
4,Scritti Politti,Lions After Slumber
5,7L & Esoteric,Axe Hurlers
6,Aesop Rock,Flashflood
7,Organized Konfusion,Releasing Hypnotical Gases
8,Cradle Of Filth,Sweetest Maleficia (Album Version)
9,Cradle Of Filth,Tortured Soul Asylum


In [26]:
recommend_lda

Unnamed: 0,artist_name,title
0,Western Addiction,A Poor Recipe For Civic Cohesion
1,Carcass,Symposium Of Sickness
2,Nasty Savage,Divination
3,Jimmy Needham,The Gospel
4,Winger,Who's The One
5,Job For A Cowboy,Psychological Immorality
6,Shearwater,Black Eyes
7,Fucked Up,Crusades
8,Joe Henry,This Is My Favorite Cage
9,Deathstars,Synthetic Generation


In [27]:
tfidf_w2v = recommend_tfidf.iloc[1:11, :].merge(recommend_w2v.iloc[1:11, :], how='inner', on=['artist_name', 'title'])
print(tfidf_w2v)

Empty DataFrame
Columns: [artist_name, title]
Index: []


In [28]:
tfidf_lda = recommend_tfidf.iloc[1:11, :].merge(recommend_lda.iloc[1:11, :], how='inner', on=['artist_name', 'title'])
print(tfidf_lda)

Empty DataFrame
Columns: [artist_name, title]
Index: []


In [29]:
w2v_lda = recommend_w2v.iloc[1:11, :].merge(recommend_lda.iloc[1:11, :], how='inner', on=['artist_name', 'title'])
print(w2v_lda)

Empty DataFrame
Columns: [artist_name, title]
Index: []
