In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive/.
# NOTE: google.colab is only importable when running on Colab.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
# Make the Drive root the working directory; the CSV files read further down
# are referenced by bare file name and therefore resolve against this path.
import os
os.chdir("/content/drive/My Drive/")

In [22]:
import pandas as pd

In [23]:
import numpy as np
import pandas
#Class for Popularity based Recommender System model
class popularity_recommender_py():
    """Popularity based recommender.

    Every item is scored by the number of training rows that reference it
    (non-null values of the user column), and the ten best scoring items are
    cached. The same ten items are returned for every user; the user id passed
    to ``recommend`` is only used to label the rows.
    """

    def __init__(self):
        self.train_data = None
        self.user_id = None
        self.item_id = None
        self.popularity_recommendations = None

    #Create the popularity based recommender system model
    def create(self, train_data, user_id, item_id):
        """Compute and cache the top 10 most popular items.

        Parameters:
            train_data: pandas DataFrame holding one row per interaction.
            user_id: name of the column identifying the user.
            item_id: name of the column identifying the item.

        The cached frame (``self.popularity_recommendations``) has the columns
        ``[item_id, 'score', 'Rank']``.
        """
        self.train_data = train_data
        self.user_id = user_id
        self.item_id = item_id

        #Get a count of user_ids for each unique item as recommendation score.
        #The rename uses the configured user column (not a literal 'user_id'),
        #so the model also works when the column has a different name.
        train_data_grouped = (
            train_data.groupby([self.item_id])
            .agg({self.user_id: 'count'})
            .reset_index()
            .rename(columns={self.user_id: 'score'})
        )

        #Sort by score (highest first); ties are broken by item name (ascending)
        train_data_sort = train_data_grouped.sort_values(
            ['score', self.item_id], ascending=[False, True]
        )

        #Generate a recommendation rank based upon score; method='first' keeps
        #the sort order above for items with equal scores
        train_data_sort['Rank'] = train_data_sort['score'].rank(ascending=False, method='first')

        #Keep an independent copy of the top 10 recommendations
        self.popularity_recommendations = train_data_sort.head(10).copy()

    #Use the popularity based recommender system model to
    #make recommendations
    def recommend(self, user_id):
        """Return the cached top 10 items labelled with ``user_id``.

        Parameters:
            user_id: value written into the leading ``'user_id'`` column.

        Returns:
            A new DataFrame with columns ``['user_id', item_id, 'score', 'Rank']``.
            The cached recommendations are not modified.
        """
        #Work on a copy so repeated calls never alter the cached model state
        user_recommendations = self.popularity_recommendations.copy()

        #Add user_id column for which the recommendations are being generated
        user_recommendations['user_id'] = user_id

        #Bring user_id column to the front
        other_cols = [col for col in user_recommendations.columns if col != 'user_id']
        return user_recommendations[['user_id'] + other_cols]
    

#Class for Item similarity based Recommender System model
class item_similarity_recommender_py():
    """Item similarity based recommender.

    Two items are considered similar when they share listeners: similarity is
    the Jaccard index of the two items' user sets. A candidate item's score is
    the mean of its similarities to a list of "seed" items (a user's own items,
    or an explicit item list), and the ten best scoring items that are not
    seeds themselves are returned.
    """

    def __init__(self):
        self.train_data = None
        self.user_id = None
        self.item_id = None
        self.cooccurence_matrix = None
        self.songs_dict = None
        self.rev_songs_dict = None
        self.item_similarity_recommendations = None

    #Get unique items (songs) corresponding to a given user
    def get_user_items(self, user):
        """Return the list of distinct items in the training rows of ``user``."""
        user_data = self.train_data[self.train_data[self.user_id] == user]
        user_items = list(user_data[self.item_id].unique())

        return user_items

    #Get unique users for a given item (song)
    def get_item_users(self, item):
        """Return the set of distinct users in the training rows of ``item``."""
        item_data = self.train_data[self.train_data[self.item_id] == item]
        item_users = set(item_data[self.user_id].unique())

        return item_users

    #Get unique items (songs) in the training data
    def get_all_items_train_data(self):
        """Return the list of distinct items, in order of first appearance."""
        all_items = list(self.train_data[self.item_id].unique())

        return all_items

    #Build an item -> set(users) lookup in a single pass over the training data
    def _item_users_index(self):
        """Return ``{item: set of distinct users}`` for every item in the data."""
        grouped = self.train_data.groupby(self.item_id, sort=False)[self.user_id]
        return {item: set(users.unique()) for item, users in grouped}

    #Construct cooccurence matrix
    def construct_cooccurence_matrix(self, user_songs, all_songs):
        """Compute Jaccard similarities between seed items and all items.

        Parameters:
            user_songs: list of seed items (one matrix row each).
            all_songs: list of candidate items (one matrix column each).

        Returns:
            ``np.matrix`` of shape ``(len(user_songs), len(all_songs))`` where
            entry ``[j, i]`` is ``|users(j) & users(i)| / |users(j) | users(i)|``
            and 0 when the two items share no user.
        """
        #One groupby pass replaces a full boolean scan of the training data
        #for every single item.
        item_users = self._item_users_index()
        no_users = set()

        #Users of every seed item; items absent from the data have no users
        user_songs_users = [item_users.get(song, no_users) for song in user_songs]

        #Similarities are filled into a plain array of size
        #len(user_songs) X len(all_songs)
        scores = np.zeros(shape=(len(user_songs), len(all_songs)), dtype=float)

        for i, song in enumerate(all_songs):
            users_i = item_users.get(song, no_users)
            if not users_i:
                #No listeners means no overlap with any seed item
                continue

            for j, users_j in enumerate(user_songs_users):
                n_shared = len(users_i & users_j)
                if n_shared:
                    #Jaccard index; |A u B| = |A| + |B| - |A n B|
                    n_union = len(users_i) + len(users_j) - n_shared
                    scores[j, i] = n_shared / n_union

        #Returned as np.matrix to keep the original return type
        return np.matrix(scores)

    #Use the cooccurence matrix to make top recommendations
    def generate_top_recommendations(self, user, cooccurence_matrix, all_songs, user_songs):
        """Turn a similarity matrix into (at most) ten ranked recommendations.

        Parameters:
            user: value written into the ``user_id`` column of the result.
            cooccurence_matrix: 2-D matrix/array, rows = seed items,
                columns = ``all_songs``.
            all_songs: items corresponding to the matrix columns.
            user_songs: seed items; they are never recommended.

        Returns:
            DataFrame with columns ``['user_id', 'song', 'score', 'rank']``,
            or the integer ``-1`` when nothing can be recommended.
        """
        print("Non zero values in cooccurence_matrix :%d" % np.count_nonzero(cooccurence_matrix))

        #A plain 2-D array view works for np.matrix and ndarray input alike
        scores_2d = np.asarray(cooccurence_matrix, dtype=float)
        seed_count = scores_2d.shape[0]

        records = []
        #With no seed items there is nothing to average; skipping the
        #computation avoids a 0/0 division that would only produce NaNs.
        if seed_count > 0:
            #Mean similarity of every candidate to the seed items
            user_sim_scores = (scores_2d.sum(axis=0) / float(seed_count)).tolist()

            #Sort (score, column index) pairs, best score first
            sort_index = sorted(((score, idx) for idx, score in enumerate(user_sim_scores)), reverse=True)

            seed_items = set(user_songs)
            for score, idx in sort_index:
                if len(records) >= 10:
                    break
                if np.isnan(score) or all_songs[idx] in seed_items:
                    continue
                #rank is the 1-based position among accepted recommendations
                records.append([user, all_songs[idx], score, len(records) + 1])

        #Build the result in one go instead of growing it row by row
        df = pandas.DataFrame(records, columns=['user_id', 'song', 'score', 'rank'])

        #Handle the case where there are no recommendations
        if df.shape[0] == 0:
            print("The current user has no songs for training the item similarity based recommendation model.")
            return -1
        return df

    #Create the item similarity based recommender system model
    def create(self, train_data, user_id, item_id):
        """Store the training frame and the names of its user/item columns."""
        self.train_data = train_data
        self.user_id = user_id
        self.item_id = item_id

    #Shared pipeline behind recommend() and get_similar_items()
    def _recommend_for_items(self, user, user_songs):
        """Score all training items against ``user_songs`` and rank them."""
        #Get all unique items (songs) in the training data
        all_songs = self.get_all_items_train_data()

        print("no. of unique songs in the training set: %d" % len(all_songs))

        #Construct item cooccurence matrix of size len(user_songs) X len(songs)
        cooccurence_matrix = self.construct_cooccurence_matrix(user_songs, all_songs)

        #Use the cooccurence matrix to make recommendations
        return self.generate_top_recommendations(user, cooccurence_matrix, all_songs, user_songs)

    #Use the item similarity based recommender system model to
    #make recommendations
    def recommend(self, user):
        """Recommend up to ten items for ``user`` based on the user's own items.

        Returns a DataFrame, or ``-1`` when no recommendation can be made
        (for example when the user has no rows in the training data).
        """
        #Get all unique songs for this user
        user_songs = self.get_user_items(user)

        print("No. of unique songs for the user: %d" % len(user_songs))

        return self._recommend_for_items(user, user_songs)

    #Get similar items to given items
    def get_similar_items(self, item_list):
        """Return up to ten items most similar to the items in ``item_list``.

        The ``user_id`` column of the result is filled with an empty string.
        Returns ``-1`` when no recommendation can be made.
        """
        return self._recommend_for_items("", item_list)

In [24]:
# Load the listening records (user_id, song_id, listen_count); the file name
# is relative to the current working directory.
song_df_1 = pd.read_csv('triplets_file.csv')
song_df_1.head()

Unnamed: 0,user_id,song_id,listen_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1


In [25]:
# Load the per-song metadata (song_id, title, release, artist_name, year).
song_df_2 = pd.read_csv('song_data.csv')
song_df_2.head()

Unnamed: 0,song_id,title,release,artist_name,year
0,SOQMMHC12AB0180CB8,Silent Night,Monster Ballads X-Mas,Faster Pussy cat,2003
1,SOVFVAK12A8C1350D9,Tanssi vaan,Karkuteillä,Karkkiautomaatti,1995
2,SOGTUKN12AB017F4F1,No One Could Ever,Butter,Hudson Mohawke,2006
3,SOBNYVR12A8C13558C,Si Vos Querés,De Culo,Yerba Brava,2003
4,SOHSBXH12A8C13B0DF,Tangle Of Aspens,Rene Ablaze Presents Winter Sessions,Der Mystic,0


In [26]:
# Attach the song metadata to every listening record. The metadata is first
# reduced to one row per song_id so the left join cannot multiply records.
song_df = song_df_1.merge(
    song_df_2.drop_duplicates(subset=['song_id']),
    how="left",
    on="song_id",
)
song_df.head()

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Thicker Than Water,Jack Johnson,0
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Graduation,Kanye West,2007
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,In Between Dreams,Jack Johnson,2005
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999


In [27]:
# Row counts of the two input frames (listening records, song metadata).
print(len(song_df_1), len(song_df_2))

2000000 1000000


In [28]:
# Row count of the merged frame; it should equal len(song_df_1) because the
# left join was made against de-duplicated metadata.
len(song_df)

2000000

In [29]:
# Build a readable item key "<title>-<artist_name>"; this 'song' column is the
# item column handed to both recommenders below.
song_df['song'] = song_df['title']+"-"+song_df['artist_name']
song_df.head()

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year,song
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Thicker Than Water,Jack Johnson,0,The Cove-Jack Johnson
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976,Entre Dos Aguas-Paco De Lucia
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Graduation,Kanye West,2007,Stronger-Kanye West
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,In Between Dreams,Jack Johnson,2005,Constellations-Jack Johnson
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999,Learn To Fly-Foo Fighters


In [30]:
# Preview the combined song key.
song_df['song'].head()

0            The Cove-Jack Johnson
1    Entre Dos Aguas-Paco De Lucia
2              Stronger-Kanye West
3      Constellations-Jack Johnson
4        Learn To Fly-Foo Fighters
Name: song, dtype: object

In [31]:
# Number of listening records per song. Note this counts rows with a non-null
# listen_count; it does not add up the listen_count values themselves.
song_grouped = song_df.groupby('song')['listen_count'].count().reset_index()
song_grouped.head()

Unnamed: 0,song,listen_count
0,#!*@ You Tonight [Featuring R. Kelly] (Explici...,78
1,#40-DAVE MATTHEWS BAND,338
2,& Down-Boys Noize,373
3,' Cello Song-Nick Drake,103
4,'97 Bonnie & Clyde-Eminem,93


In [32]:
# Share of all listening records held by each song, in percent, ordered from
# the least to the most listened song (ties broken by song name).
grouped_sum = song_grouped['listen_count'].sum()
song_grouped = (
    song_grouped
    .assign(percentage=song_grouped['listen_count'].div(grouped_sum).mul(100))
    .sort_values(['listen_count', 'song'])
)
song_grouped.head()

Unnamed: 0,song,listen_count,percentage
2991,Ghosts (Toxic Avenger Mix)-Ladytron,48,0.0024
5811,No Creo En El Jamas-Juanes,48,0.0024
2147,Don´t Leave Me Now-Amparanoia,50,0.0025
3526,Historia Del Portero-Ricardo Arjona,51,0.00255
7072,Scared-Three Days Grace,51,0.00255


In [None]:
## POPULARITY-BASED RECOMMENDER (the same top 10 songs for every user)

In [33]:
# Instantiate the popularity model; it holds no data until create() is called.
pr = popularity_recommender_py()

In [34]:
# Score every song by its number of listening records and cache the top 10.
pr.create(song_df, 'user_id', 'song')

In [35]:
# Top 10 popular songs, labelled with the user id found in row 5 of song_df.
pr.recommend(song_df['user_id'][5])

Unnamed: 0,user_id,song,score,Rank
7127,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Sehr kosmisch-Harmonia,8277,1.0
9084,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Undo-Björk,7032,2.0
2068,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Dog Days Are Over (Radio Edit)-Florence + The ...,6949,3.0
9877,b80344d063b5ccb3212f76538f3d9e43d87dca9e,You're The One-Dwight Yoakam,6412,4.0
6774,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Revelry-Kings Of Leon,6145,5.0
7115,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Secrets-OneRepublic,5841,6.0
3613,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Horn Concerto No. 4 in E flat K495: II. Romanc...,5385,7.0
2717,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Fireflies-Charttraxx Karaoke,4795,8.0
3485,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Hey_ Soul Sister-Train,4758,9.0
8847,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Tive Sim-Cartola,4548,10.0


In [36]:
# Same call for the user id in row 10. The songs are identical by design (the
# model is not personalised); only the user_id label depends on the argument.
pr.recommend(song_df['user_id'][10])

Unnamed: 0,user_id,song,score,Rank
7127,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Sehr kosmisch-Harmonia,8277,1.0
9084,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Undo-Björk,7032,2.0
2068,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Dog Days Are Over (Radio Edit)-Florence + The ...,6949,3.0
9877,b80344d063b5ccb3212f76538f3d9e43d87dca9e,You're The One-Dwight Yoakam,6412,4.0
6774,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Revelry-Kings Of Leon,6145,5.0
7115,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Secrets-OneRepublic,5841,6.0
3613,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Horn Concerto No. 4 in E flat K495: II. Romanc...,5385,7.0
2717,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Fireflies-Charttraxx Karaoke,4795,8.0
3485,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Hey_ Soul Sister-Train,4758,9.0
8847,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Tive Sim-Cartola,4548,10.0


In [None]:
## ITEM-SIMILARITY RECOMMENDER (Jaccard similarity of the songs' listeners)

In [37]:
# Instantiate the item-similarity model and register the training frame;
# create() only stores references, all scoring happens at recommend time.
ir = item_similarity_recommender_py()
ir.create(song_df, 'user_id', 'song')

In [38]:
# Distinct songs present in the training rows of the user from row 5.
user_items = ir.get_user_items(song_df['user_id'][5])

In [39]:
# List the user's songs, one per line.
for user_item in user_items:
  print(user_item)

The Cove-Jack Johnson
Entre Dos Aguas-Paco De Lucia
Stronger-Kanye West
Constellations-Jack Johnson
Learn To Fly-Foo Fighters
Apuesta Por El Rock 'N' Roll-Héroes del Silencio
Paper Gangsta-Lady GaGa
Stacked Actors-Foo Fighters
Sehr kosmisch-Harmonia
Heaven's gonna burn your eyes-Thievery Corporation feat. Emiliana Torrini
Let It Be Sung-Jack Johnson / Matt Costa / Zach Gill / Dan Lebowitz / Steve Adams
I'll Be Missing You (Featuring Faith Evans & 112)(Album Version)-Puff Daddy
Love Shack-The B-52's
Clarity-John Mayer
I?'m A Steady Rollin? Man-Robert Johnson
The Old Saloon-The Lonely Island
Behind The Sea [Live In Chicago]-Panic At The Disco
Champion-Kanye West
Breakout-Foo Fighters
Ragged Wood-Fleet Foxes
Mykonos-Fleet Foxes
Country Road-Jack Johnson / Paula Fuga
Oh No-Andrew Bird
Love Song For No One-John Mayer
Jewels And Gold-Angus & Julia Stone
83-John Mayer
Neon-John Mayer
The Middle-Jimmy Eat World
High and dry-Jorge Drexler
All That We Perceive-Thievery Corporation
The Christmas 

In [40]:
# Item-similarity top 10 for the same user; songs the user already has are
# never part of the result.
ir.recommend(song_df['user_id'][5])

No. of unique songs for the user: 45
no. of unique songs in the training set: 9953
Non zero values in cooccurence_matrix :268460


Unnamed: 0,user_id,song,score,rank
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Quiet Houses-Fleet Foxes,0.034172,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Meadowlarks-Fleet Foxes,0.033473,2
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Heard Them Stirring-Fleet Foxes,0.032683,3
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Great Indoors-John Mayer,0.032123,4
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Tiger Mountain Peasant Song-Fleet Foxes,0.03174,5
5,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Sun It Rises-Fleet Foxes,0.031253,6
6,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Your Protector-Fleet Foxes,0.030409,7
7,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Oliver James-Fleet Foxes,0.030237,8
8,b80344d063b5ccb3212f76538f3d9e43d87dca9e,Belle-Jack Johnson,0.028708,9
9,b80344d063b5ccb3212f76538f3d9e43d87dca9e,If I Could-Jack Johnson,0.02835,10


In [41]:
# Songs most similar to a single seed song, independent of any user (the
# user_id column of the result is filled with an empty string).
ir.get_similar_items(['U Smile-Justin Bieber'])

no. of unique songs in the training set: 9953
Non zero values in cooccurence_matrix :8617


Unnamed: 0,user_id,song,score,rank
0,,Love Me-Justin Bieber,0.281805,1
1,,Eenie Meenie-Sean Kingston and Justin Bieber,0.274634,2
2,,Somebody To Love-Justin Bieber,0.272999,3
3,,That Should Be Me-Justin Bieber,0.242199,4
4,,One Less Lonely Girl-Justin Bieber,0.230318,5
5,,One Time-Justin Bieber,0.22771,6
6,,Stuck In The Moment-Justin Bieber,0.214938,7
7,,Down To Earth-Justin Bieber,0.21016,8
8,,Never Let You Go-Justin Bieber,0.185267,9
9,,Overboard-Justin Bieber / Jessica Jarrell,0.177628,10


In [None]:
## I DON'T LISTEN TO JUSTIN BIEBER, IT WAS JUST FOR TESTING PURPOSES