In [1]:
import pandas
import numpy as np
import tensorflow as tf
try:
    # scikit-learn >= 0.18; the old `sklearn.cross_validation` module was removed in 0.20
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split



### Loading Datasets
The Million Song Dataset (MSD) is used for this project.

In [2]:
# =========================================
# Data Loading
# Read userid-songid-listen_count triplets
# =========================================

# The local copies are tried first; the hosted copies are only a fallback
# because downloading them from an external source can take a while.
triplets_file = './dataset/10000.txt'
songs_metadata_file = './dataset/song_data.csv'
triplets_url = 'https://static.turi.com/datasets/millionsong/10000.txt'
songs_metadata_url = 'https://static.turi.com/datasets/millionsong/song_data.csv'

try:
    song_df = pandas.read_table(triplets_file, header=None)
    metadata_df = pandas.read_csv(songs_metadata_file)
except IOError:
    # Only a missing/unreadable local file triggers the download; any other
    # failure (e.g. a parse error) is allowed to surface instead of being hidden.
    song_df = pandas.read_table(triplets_url, header=None)
    metadata_df = pandas.read_csv(songs_metadata_url)

song_df.columns = ['user_id', 'song_id', 'listen_count']

# Left join keeps every triplet; metadata is de-duplicated on song_id first so
# the join cannot multiply triplet rows.
dataset_df = pandas.merge(song_df, metadata_df.drop_duplicates(['song_id']), on='song_id', how='left')

# taking only the first 10000 rows; .copy() gives data_df its own data so the
# column assignment below does not raise SettingWithCopyWarning
data_df = dataset_df.head(10000).copy()

# =========================================
# Data Pre-processing
# =========================================
# Merging title and artist into one "song" column (song title - Artist),
# then dropping the metadata columns that are no longer needed
data_df['song'] = data_df['title'].map(str) + ' - ' + data_df['artist_name']
data_df = data_df.drop(['title', 'release', 'artist_name', 'year'], axis=1)
users = data_df['user_id'].unique()
songs = data_df['song_id'].unique()
print('Number of unique users: ' + str(len(users)))
print('Number of unique songs:' + str(len(songs)))
print(data_df.head(10))

# =========================================
# Train Data Preparation
# =========================================
# 80/20 split; random_state is fixed so the split is reproducible
train_data, test_data = train_test_split(data_df, test_size = 0.20, random_state=0)

Number of unique users: 365
Number of unique songs:5175
                                    user_id             song_id  listen_count  \
0  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAKIMP12A8C130995             1   
1  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBBMDR12A8C13253B             2   
2  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBXHDL12A81C204C0             1   
3  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBYHAJ12A6701BF1D             1   
4  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SODACBL12A8C13C273             1   
5  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SODDNQT12A6D4F5F7E             5   
6  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SODXRTY12AB0180F3B             1   
7  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOFGUAY12AB017B0A8             1   
8  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOFRQTD12A81C233C0             1   
9  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOHQWYZ12A6D4FA701             1   

                                                song

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### Popularity Based Recommender
This recommendation is based on the popularity of the item (listen_count in this case).

In [3]:
import Recommender.Popularity as Recommender_Popularity

# Popularity recommender: created from the training split (train_data), with
# 'song' and 'listen_count' passed as the column names to use.
# NOTE(review): the recorded output lists 10 rows for recommend(12) -- confirm
# what the argument means inside Popularity.recommend.
Recommender0 = Recommender_Popularity.Popularity()
Recommender0.create(train_data, 'song', 'listen_count')
Recommender0.recommend(12)

Unnamed: 0,song,listen_count
0,You're The One - Dwight Yoakam,399
1,Waiting For A Dream - Rufus Wainwright,247
2,Undo - Björk,151
3,Times - Tenth Avenue North,140
4,Catch You Baby (Steve Pitron & Max Sanna Radio...,137
5,Revelry - Kings Of Leon,136
6,Fools - The Dodos,128
7,Sehr kosmisch - Harmonia,115
8,Secrets - OneRepublic,77
9,Numb (Album Version) - Disturbed,77


### Collaborative Filtering Recommender
This section uses collaborative filtering to recommend songs to a user.

#### Item-Item Collaborative Filtering using a Co-occurrence Matrix

This method works by constructing a co-occurrence matrix of shape (user_songs × all_songs). In this example, the recommendations are songs that are not in the user's listening list (the songs the user has already listened to).

In [4]:
import Recommender.CF_Item_Item as Recommender_CF_II

# Item-item collaborative filtering, built from the 10000-row frame using the
# user_id and song columns
Recommender1 = Recommender_CF_II.CF_Item_Item()
Recommender1.create(data_df, 'user_id', 'song')

# Pick one user and fetch the songs recorded for that user
user_id = users[5]
user_items = Recommender1.getUserItems(user_id)

wide_rule = "------------------------------------------------------------------------------------"
narrow_rule = "----------------------------------------------------------------------"

print(wide_rule)
print("Song that has been played by the user userid: {}:".format(user_id))
print(wide_rule)
for played_song in user_items:
    print(played_song)

print(narrow_rule)
print("Recommendation process going on:")
print(narrow_rule)

# Last expression of the cell: the recommendation table is displayed
Recommender1.recommend(user_id, 10)

------------------------------------------------------------------------------------
Song that has been played by the user userid: 4bd88bfb25263a75bbdd467e74018f4ae570e5df:
------------------------------------------------------------------------------------
Somebody To Love - Justin Bieber
Ghosts 'n' Stuff (Original Instrumental Mix) - Deadmau5
My Dad's Gone Crazy - Eminem / Hailie Jade
Missing You - John Waite
Ya Nada Queda - Kudai
The Real Slim Shady - Eminem
Forgive Me - Leona Lewis
Say My Name - Destiny's Child
Just Lose It - Eminem
16 Candles - The Crests
Without Me - Eminem
Push It - Salt-N-Pepa
Speechless - Lady GaGa
----------------------------------------------------------------------
Recommendation process going on:
----------------------------------------------------------------------
No. of unique items (songs) for the user: 13
No. of unique items (songs) in the training set: 5151
Non zero values in cooccurence_matrix :2813


Unnamed: 0,user_id,song,score,rank
0,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Superman - Eminem / Dina Rae,0.096418,1
1,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Mockingbird - Eminem,0.08991,2
2,4bd88bfb25263a75bbdd467e74018f4ae570e5df,U Smile - Justin Bieber,0.061363,3
3,4bd88bfb25263a75bbdd467e74018f4ae570e5df,I'm Back - Eminem,0.057932,4
4,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Here Without You - 3 Doors Down,0.055542,5
5,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Teach Me How To Dougie - California Swag District,0.054701,6
6,4bd88bfb25263a75bbdd467e74018f4ae570e5df,American Idiot [feat. Green Day & The Cast Of ...,0.054508,7
7,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Monster - Lady GaGa,0.052736,8
8,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Hellbound - J-Black & Masta Ace,0.052564,9
9,4bd88bfb25263a75bbdd467e74018f4ae570e5df,You Found Me (Album Version) - The Fray,0.052564,10


#### Collaborative Filtering using k-NearestNeighbors


In [5]:
import Recommender.CF_kNN as Recommender_CF_kNN

# k-NN collaborative filtering, created from the 10000-row frame (data_df)
# with the user_id / song / listen_count column names.
Recommender2 = Recommender_CF_kNN.CF_kNN()
Recommender2.create(data_df, 'user_id', 'song', 'listen_count')

print("\n----------------------------------------------------------------------")
print("Recommendation with random song as a reference")
print("Recommendation process going on:")
print("----------------------------------------------------------------------")

# No `item` argument is passed here; per the banner printed above, a random
# reference song is presumably picked inside CF_kNN.recommend -- TODO confirm.
# `user_id` comes from the item-item cell, which must have been run first.
Recommender2.recommend(user_id, 10)

training...

----------------------------------------------------------------------
Recommendation with random song as a reference
Recommendation process going on:
----------------------------------------------------------------------
Recommendations for Easier - Grizzly Bear:





Unnamed: 0,user_id,song,distance,rank
0,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Swing - Zero 7,0.0,1
1,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Black Sands - Bonobo,0.0,2
2,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Salt Water Sound - Zero 7,0.0,3
3,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Easier - Grizzly Bear,0.0,4
4,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Risingson - Massive Attack,0.0,5
5,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Whoo! Alright - Yeah...Uh Huh. - The Rapture,0.0,6
6,4bd88bfb25263a75bbdd467e74018f4ae570e5df,I'm Not Alone - Calvin Harris,0.0,7
7,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Never Met The Gooch - Kid Dynamite,0.0,8
8,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Meeting Paris Hilton (Album) - CSS,0.0,9
9,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Cold Summer - A Love Ends Suicide,0.0,10


In [6]:
print("\n----------------------------------------------------------------------")
print("Recommendation with query song as a reference")
print("Recommendation process going on:")
print("----------------------------------------------------------------------")

# Same recommender as the previous cell, but with an explicit reference song.
# NOTE(review): the recorded output prints "fuzzy ratio query results", so the
# title is presumably fuzzy-matched inside CF_kNN.recommend -- confirm.
Recommender2.recommend(user_id, 10, item='Somebody To Love - Justin Bieber')


----------------------------------------------------------------------
Recommendation with query song as a reference
Recommendation process going on:
----------------------------------------------------------------------
fuzzy ratio query results:
[('One Time - Justin Bieber', 75, 3161), ('Somebody To Love - Justin Bieber', 100, 3868), ('Somebody To Love - Queen', 79, 3869)]

Recommendations for Somebody To Love - Justin Bieber:



Unnamed: 0,user_id,song,distance,rank
0,4bd88bfb25263a75bbdd467e74018f4ae570e5df,U Smile - Justin Bieber,0.364605,1
1,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Teach Me How To Dougie - California Swag District,0.487478,2
2,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Girls Just Wanna Have Fun - Miley Cyrus,0.527134,3
3,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Be Somebody - Kings Of Leon,0.527134,4
4,4bd88bfb25263a75bbdd467e74018f4ae570e5df,My Dad's Gone Crazy - Eminem / Hailie Jade,0.533568,5
5,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Push It - Salt-N-Pepa,0.541086,6
6,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Dream On - Aerosmith,0.5514,7
7,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Without Me - Eminem,0.564752,8
8,4bd88bfb25263a75bbdd467e74018f4ae570e5df,All The Right Moves - OneRepublic,0.565315,9
9,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Time To Pretend - MGMT,0.565645,10


#### Collaborative Filtering using Matrix Factorization (SVD - Singular Value Decomposition)

In [7]:
import Recommender.CF_Matrix_Factorization as Recommender_CF_MF

# SVD-based recommender. Unlike the item-item and k-NN cells (which pass
# data_df), this one is created from the training split only (train_data).
Recommender3 = Recommender_CF_MF.SVD()
Recommender3.create(train_data, 'user_id', 'song', 'listen_count')
# 10 is presumably the number of results to return for the given song title
# -- TODO confirm against SVD.recommend.
Recommender3.recommend(10, item='Time To Pretend - MGMT')

Int64Index([3963], dtype='int64')


Unnamed: 0,song,corr
3963,Time To Pretend - MGMT,1.0
875,Dental Care - Owl City,0.996354
4255,What You Know - Two Door Cinema Club,0.995999
2499,My Happy Ending - Avril Lavigne,0.995539
4436,You'd Be So Nice To Come Home To - Julie London,0.995333
2117,Lean Back [Remix feat. Lil Jon_ Eminem_ Mase &...,0.995302
1464,Half Alive - Secondhand Serenade,0.995302
1561,Hero (Album Version) - Skillet,0.995302
483,Bottom of a Bottle (Explicit Album Version) - ...,0.995302
1158,Falling In Love Again - Eagle-Eye Cherry,0.995302


### Content-Based Music Recommender System

In [8]:
# Song metadata (tab-separated): fold the artist name into the title, then
# drop the columns that are not used afterwards
songs_raw = pandas.read_csv('./dataset/MSD_songs.csv', sep='\t')
song_df2 = songs_raw.assign(
    title=songs_raw['title'].map(str) + ' - ' + songs_raw['artist_name']
).drop(['dig7_id', 'release', 'artist_name', 'year'], axis=1)

# Last.fm tags (tab-separated), inner-joined to the song metadata on track_id
# so only tags of known tracks are kept
tags_raw = pandas.read_csv('./dataset/LAST_FM_tags.csv', sep='\t')
tag_df2 = tags_raw.merge(song_df2, on='track_id', how='inner')

print(song_df2.head())
print(tag_df2.head())

             track_id             song_id  \
0  TRAHHBV128F930B736  SOHWZOW12AB017EE95   
1  TRAHHSV128F42374E3  SOQBKAT12A81C20661   
2  TRAHHJY12903CA73BD  SOEKMUY12AB018CB47   
3  TRAHHPE128F934AC3B  SOFGBOY12AB018549F   
4  TRAHHAY12903CD8B6F  SOXHLIO12AB0185C6F   

                                               title  
0            Spirit and Machine - Marcelo Radulovich  
1  I Only Have Eyes For You (Album Version) - Joh...  
2                        Porcelain Man - John Debney  
3               (Wake Up) Time to Die - Lizzy Borden  
4                     Easier By Now - Jamie Richards  
             track_id             tags             song_id  \
0  TRAHHSV128F42374E3    american_idol  SOQBKAT12A81C20661   
1  TRAHHSV128F42374E3  only_eyes_for_u  SOQBKAT12A81C20661   
2  TRAHHMV12903CD0B0D             sexy  SOKEFDU12AB0187D54   
3  TRAHHMV12903CD0B0D              pop  SOKEFDU12AB0187D54   
4  TRAHHMV12903CD0B0D            dance  SOKEFDU12AB0187D54   

                          

In [9]:
import Recommender.DeepContent.DeepContent as RecommenderDeepContent
# Content-based recommender constructed from the song metadata (song_df2), the
# merged tag table (tag_df2) and the user/song/listen_count triplets (song_df).
Recommender5 = RecommenderDeepContent.DeepContent(song_df2, tag_df2, song_df)
Recommender5.train()
# Look the query title up in the recommender's own song table; the query
# string is reused by the next cell.
query = 'PDA - Backstreet Boys'
Recommender5.song_df[Recommender5.song_df['title'] == query].head()

In [10]:
# Recommendations for the query title; 10 is presumably the number of results
# to return -- TODO confirm against DeepContent.recommendFromQuery.
Recommender5.recommendFromQuery(query,10)

Unnamed: 0,track_id,song_id,title,tags
0,TRARQBS128F4262E62,SOUVEIY12A8C12FCF3,Playboy - Teena Marie,attitude favorite_by_this_singer coy old_schoo...
1,TRAQVOG128F9340E58,SOJXPPN12A58A7E3E7,I.Y.A. - Chris Brown,vocalization favorite_by_this_singer romantic ...
2,TRASCMB128F422B871,SOYWFWL12A6D4FE07D,Ninja Tattoo - Shanadoo,j-pop dance female_vocalists
3,TRBAADN128F426B7A4,SOSOPKK12A8C133DC9,Rules of Attraction (Chris Cox) - Tina Ann,pop dance female_vocalists
4,TRBEVND128F93429DD,SOAKVYO12AB0187E78,The Kangaroo Theory - Edelweiss,pop dance best
5,TRBACUF128F4247F8C,SOCBWVV12A8C13605F,Call Me - Mint Condition,soul rhythm_and_blues urban ashook69_library n...
6,TRAIIPY128F426B797,SOQJZSD12A58A7B143,Too Late (Orange Factory Extended Mix) - Tina Ann,dance
7,TRAUNPM12903CB7CCC,SOZQJKU12A8C13A0C1,Fading Lady Light - Jefferson Starship,pop
8,TRANXHZ128F148D4DB,SOPPCXM12A6D4F66BC,Vogue [Live] - Madonna,pop live dance madonna sexy classic remix fema...
9,TRANZFP128E07995A9,SOPXSEX12A670204A0,That's What You Said (LP Version) - Phil Collins,pop rock dance lovesongs


#### Thresholding: keeping songs whose number of tags is above a minimum

In [21]:
# Number of rows in the recommender's tag table
len(Recommender5.tag_df)

99298

In [27]:
# Per-track tag counts: one row per track_id, 'tags' holds the number of
# non-null tags recorded for that track
tags_by_track = Recommender5.tag_df.groupby(['track_id'])
try_df = tags_by_track['tags'].count().to_frame()
len(try_df)

4833

In [28]:
# Preview the first rows of the per-track tag counts
try_df.head()

Unnamed: 0_level_0,tags
track_id,Unnamed: 1_level_1
TRAAAAW128F429D538,10
TRAAABD128F429CF47,23
TRAAADZ128F9348C2E,1
TRAAAEF128F4273421,4
TRAAAFD128F92F423A,2


In [34]:
# Keep only tracks with more than 3 tags, ordered from most to fewest tags
has_enough_tags = try_df['tags'] > 3
more_try_df = try_df.loc[has_enough_tags].sort_values('tags', ascending=False)
len(more_try_df)

3171

In [35]:
# more_try_df is sorted by tag count in descending order, so this previews
# the tracks with the most tags
more_try_df.head()

Unnamed: 0_level_0,tags
track_id,Unnamed: 1_level_1
TRAFTDN128F427C29D,100
TRBECNW128F426BE82,100
TRAJKXS128F42340B5,100
TRAULOO12903CF47B6,100
TRAEZLQ128F427F954,100


In [31]:
# All tag rows recorded for one example track
is_example_track = Recommender5.tag_df['track_id'] == 'TRAAAAW128F429D538'
Recommender5.tag_df.loc[is_example_track]

Unnamed: 0,track_id,tags,song_id,title
11874,TRAAAAW128F429D538,bay_area,SOMZWCG12A8C13C480,I Didn't Mean To - Casual
11875,TRAAAAW128F429D538,hieroglyiphics,SOMZWCG12A8C13C480,I Didn't Mean To - Casual
11876,TRAAAAW128F429D538,classic,SOMZWCG12A8C13C480,I Didn't Mean To - Casual
11877,TRAAAAW128F429D538,hip-hop,SOMZWCG12A8C13C480,I Didn't Mean To - Casual
11878,TRAAAAW128F429D538,stream,SOMZWCG12A8C13C480,I Didn't Mean To - Casual
11879,TRAAAAW128F429D538,og,SOMZWCG12A8C13C480,I Didn't Mean To - Casual
11880,TRAAAAW128F429D538,1979-2006:_a_hip-hop_odyssey_-_800_tracks_in_a...,SOMZWCG12A8C13C480,I Didn't Mean To - Casual
11881,TRAAAAW128F429D538,heiroglyphics,SOMZWCG12A8C13C480,I Didn't Mean To - Casual
11882,TRAAAAW128F429D538,oaksterdamn,SOMZWCG12A8C13C480,I Didn't Mean To - Casual
11883,TRAAAAW128F429D538,heard_on_pandora,SOMZWCG12A8C13C480,I Didn't Mean To - Casual


#### Checking the most frequently used tags

In [63]:
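# NOTE: disabled draft, not executed. It appears to be the start of an attempt
# to locate spelling variants of a tag (e.g. favorites / favourites) so they
# could be merged into one canonical tag.
# similar_tags = [{'favorite':['favorites','favourites','favourite','favorite_songs']}]
# for i in similar_tags:
#     i
#     for j in i:
#         row_to_change = tag_df2.loc[tag_df2['tags'] == j]
#         print(row_to_change)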

In [54]:
# Count the rows per tag and keep the 50 most frequent tags.
# The result is a frame indexed by tag with a single 'tags' column holding
# the count, the same shape the previous agg({'tags': 'count'}) form produced.
# size() + Series.sort_values is used because calling
# DataFrame.sort_values('tags') on a frame whose index AND column are both
# named 'tags' raises an ambiguity ValueError on pandas >= 0.23.
# size() equals count() here: groupby drops missing keys, so every row in a
# group has a non-null tag.
popular_tag_df = (
    tag_df2.groupby('tags')
    .size()
    .sort_values(ascending=False)
    .to_frame('tags')
)
popular_tag_df = popular_tag_df[:50]
len(popular_tag_df)

50

In [55]:
# Labels of the retained tags (the index of popular_tag_df)
popular_tag_df.index

Index([u'rock', u'pop', u'alternative', u'favorites', u'love',
       u'female_vocalists', u'alternative_rock', u'indie', u'american',
       u'classic_rock', u'00s', u'blues', u'hard_rock', u'dance', u'metal',
       u'male_vocalists', u'electronic', u'awesome', u'beautiful', u'90s',
       u'punk', u'80s', u'hip-hop', u'singer-songwriter', u'guitar', u'chill',
       u'mellow', u'soul', u'chillout', u'favorite', u'hip_hop', u'favourites',
       u'jazz', u'favourite', u'british', u'cool', u'oldies', u'country',
       u'indie_rock', u'rap', u'instrumental', u'sexy', u'punk_rock',
       u'female_vocalist', u'acoustic', u'seen_live', u'rnb', u'catchy',
       u'electronica', u'favorite_songs'],
      dtype='object', name=u'tags')

In [56]:
# Restrict the tag table to rows whose tag is one of the retained popular
# tags, via an inner join against a one-column frame of those tag labels
popular_tags_only = pandas.DataFrame({'tags': popular_tag_df.index})
popular_processed_tag_df = tag_df2.merge(popular_tags_only, on='tags', how='inner')
popular_processed_tag_df.head()

Unnamed: 0,track_id,tags,song_id,title
0,TRAHHMV12903CD0B0D,sexy,SOKEFDU12AB0187D54,PDA - Backstreet Boys
1,TRAHSYA128F428143A,sexy,SOQLGZH12AF729B7CF,I Put a Spell on You - Screamin' Jay Hawkins
2,TRAGHBP128E0793AF7,sexy,SOGMROZ12A679D8AE9,Wrapped Around Your Finger - The Police
3,TRAGJPA128F92F9665,sexy,SOUATDY12AB0185157,FEZ-Being Born - U2
4,TRAGFPP128E078F34C,sexy,SOJCBAM12A6701FD04,Paralyzed - The Cardigans


In [57]:
# Number of distinct track_id values in the merged tag table
len(tag_df2['track_id'].unique())

4833

In [60]:
# All tag rows of one example track in the merged tag table
is_selected_track = tag_df2['track_id'] == 'TRAYJOF128F92F9EEA'
tag_df2.loc[is_selected_track]

Unnamed: 0,track_id,tags,song_id,title
19240,TRAYJOF128F92F9EEA,female_vocalists,SOAYXCJ12AB0183822,Cry - Kelly Clarkson
19241,TRAYJOF128F92F9EEA,pop,SOAYXCJ12AB0183822,Cry - Kelly Clarkson
19242,TRAYJOF128F92F9EEA,pop_rock,SOAYXCJ12AB0183822,Cry - Kelly Clarkson
19243,TRAYJOF128F92F9EEA,heartbreaking,SOAYXCJ12AB0183822,Cry - Kelly Clarkson
19244,TRAYJOF128F92F9EEA,rock,SOAYXCJ12AB0183822,Cry - Kelly Clarkson
19245,TRAYJOF128F92F9EEA,kelly_clarkson,SOAYXCJ12AB0183822,Cry - Kelly Clarkson
19246,TRAYJOF128F92F9EEA,pop_ballad,SOAYXCJ12AB0183822,Cry - Kelly Clarkson
19247,TRAYJOF128F92F9EEA,sad,SOAYXCJ12AB0183822,Cry - Kelly Clarkson
19248,TRAYJOF128F92F9EEA,heartbreak,SOAYXCJ12AB0183822,Cry - Kelly Clarkson
19249,TRAYJOF128F92F9EEA,american,SOAYXCJ12AB0183822,Cry - Kelly Clarkson
