# Building a song recommender

In [1]:
import numpy as np
import pandas as pd
import sframe
from sklearn.model_selection import train_test_split

# Load music data

In [2]:
# Read the listening log (per-user play counts plus song details) from the
# CSV in the working directory.
song_csv = 'song_data.csv'
song_data = pd.read_csv(song_csv)

# Explore data

The music data records how many times each user listened to a song, along with the song's details (title and artist).

In [3]:
# Preview the first five rows to check the columns and value formats.
song_data.head(n=5)

Unnamed: 0,user_id,song_id,listen_count,title,artist,song
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Jack Johnson,The Cove - Jack Johnson
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Paco De Lucia,Entre Dos Aguas - Paco De Lucia
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Kanye West,Stronger - Kanye West
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,Jack Johnson,Constellations - Jack Johnson
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,Foo Fighters,Learn To Fly - Foo Fighters


## Showing the most popular songs in the dataset

In [4]:
# Peek at the combined "title - artist" label column.
song_data.loc[:, 'song'].head(n=5)

0            The Cove - Jack Johnson
1    Entre Dos Aguas - Paco De Lucia
2              Stronger - Kanye West
3      Constellations - Jack Johnson
4        Learn To Fly - Foo Fighters
Name: song, dtype: object

In [5]:
# Total plays per artist: sum listen_count over every row for that artist.
grouped_artist = song_data.groupby(by='artist')
song_counts = grouped_artist['listen_count'].agg('sum')
# Five artists with the highest total play count.
song_counts.sort_values(ascending=False).iloc[:5]

artist
Kings Of Leon             43218
Dwight Yoakam             40619
Björk                     38889
Coldplay                  35362
Florence + The Machine    33387
Name: listen_count, dtype: int64

In [6]:
# Non-null user_id rows per artist. A user who appears on several rows for
# the same artist is counted each time, so this contains duplicates.
user_id_by_artist = grouped_artist['user_id']
user_counts = user_id_by_artist.count()
user_counts.sort_values(ascending=False).iloc[:5]

artist
Coldplay                  13945
Florence + The Machine    10082
Kings Of Leon              9908
Justin Bieber              8796
The Black Keys             8141
Name: user_id, dtype: int64

In [7]:
# Number of rows (user_id entries, repeats included) whose artist is Coldplay.
is_coldplay = song_data['artist'] == 'Coldplay'
len(song_data.loc[is_coldplay, 'user_id'])

13945

In [8]:
# Number of distinct users with at least one Coldplay row.
coldplay_user_ids = song_data.loc[song_data['artist'] == 'Coldplay', 'user_id']
len(set(coldplay_user_ids))

6340

In [9]:
# Distinct listeners per artist: nunique() counts each user_id once per artist.
users_by_artist = grouped_artist['user_id']
user_counts2 = users_by_artist.nunique()
user_counts2.sort_values(ascending=False).iloc[:5]

artist
Kings Of Leon             7373
Coldplay                  6340
Harmonia                  5970
Björk                     5834
Florence + The Machine    5615
Name: user_id, dtype: int64

In [10]:
# Coldplay: distinct listeners (user_counts2) vs. total plays (song_counts).
# print(...) with a single argument behaves identically on Python 2 and
# Python 3, whereas the bare `print x` statement is a SyntaxError on Python 3.
print(user_counts2['Coldplay'])
print(song_counts['Coldplay'])

6340
35362


In [11]:
# Taylor Swift: distinct listeners (user_counts2) vs. total plays (song_counts).
# print(...) with a single argument behaves identically on Python 2 and
# Python 3, whereas the bare `print x` statement is a SyntaxError on Python 3.
print(user_counts2['Taylor Swift'])
print(song_counts['Taylor Swift'])

3246
19376


In [12]:
# Foo Fighters: distinct listeners (user_counts2) vs. total plays (song_counts).
# print(...) with a single argument behaves identically on Python 2 and
# Python 3, whereas the bare `print x` statement is a SyntaxError on Python 3.
print(user_counts2['Foo Fighters'])
print(song_counts['Foo Fighters'])

2055
9504


In [13]:
# Lady GaGa: distinct listeners (user_counts2) vs. total plays (song_counts).
# print(...) with a single argument behaves identically on Python 2 and
# Python 3, whereas the bare `print x` statement is a SyntaxError on Python 3.
print(user_counts2['Lady GaGa'])
print(song_counts['Lady GaGa'])

2928
12224


In [14]:
# Kanye West: distinct listeners (user_counts2) vs. total plays (song_counts).
# print(...) with a single argument behaves identically on Python 2 and
# Python 3, whereas the bare `print x` statement is a SyntaxError on Python 3.
print(user_counts2['Kanye West'])
print(song_counts['Kanye West'])

2522
9992


In [15]:
#grouped_artist = song_data.groupby('artist')
#song_counts=grouped_artist['listen_count'].agg(np.sum)
#song_counts.sort_values(ascending=False).head(40)

In [16]:
# Total number of rows in the listening log.
song_data.shape[0]

1116609

In [31]:
# Total plays per song label ("title - artist"); show the five most played.
grouped_song = song_data.groupby(by='song')
song_counts2 = grouped_song['listen_count'].agg('sum')
song_counts2.sort_values(ascending=False).iloc[:5]

song
You\'re The One - Dwight Yoakam                                                                                                                 40619
Undo - Björk                                                                                                                                    36059
Revelry - Kings Of Leon                                                                                                                         30391
Horn Concerto No. 4 in E flat K495: II. Romance (Andante cantabile) - Barry Tuckwell/Academy of St Martin-in-the-Fields/Sir Neville Marriner    21953
Sehr kosmisch - Harmonia                                                                                                                        21646
Name: listen_count, dtype: int64

## Count number of unique users in the dataset

In [32]:
# Collect every distinct user_id into a set (unordered, not indexable).
users = {user_id for user_id in song_data['user_id']}

In [33]:
# Number of distinct users: the size of the user_id set built in the previous cell.
len(users)

66346

# Create a song recommender

In [34]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs. train_test_split is imported in the
# notebook's first (imports) cell so all dependencies are declared up front.
train_data, test_data = train_test_split(song_data, test_size=0.2, random_state=42)

## Simple popularity-based recommender

In [35]:
#popularity_model = sframe.popularity_recommender.create(train_data,
         #                                                user_id='user_id',
        #                                                 item_id='song')

### Use the popularity model to make some predictions

A popularity model makes the same prediction for every user, so it provides no personalization.

In [36]:
#popularity_model.recommend(users=[users[0]])

In [37]:
#popularity_model.recommend(users=[users[1]])

## Build a song recommender with personalization

We now create a model that allows us to make personalized recommendations to each user. 

In [38]:
#personalized_model = graphlab.item_similarity_recommender.create(train_data,
  #                                                              user_id='user_id',
   #                                                             item_id='song')

### Applying the personalized model to make song recommendations

As you can see, different users get different recommendations now.

In [39]:
#personalized_model.recommend(users=[users[0]])

In [40]:
#personalized_model.recommend(users=[users[1]])

### We can also apply the model to find similar songs to any song in the dataset

In [41]:
#personalized_model.get_similar_items(['With Or Without You - U2'])

In [42]:
#personalized_model.get_similar_items(['Chan Chan (Live) - Buena Vista Social Club'])

# Quantitative comparison between the models

We now formally compare the popularity and the personalized models using precision-recall curves. 

In [43]:
#if graphlab.version[:3] >= "1.6":
 #   model_performance = graphlab.compare(test_data, [popularity_model, personalized_model], user_sample=0.05)
 #   graphlab.show_comparison(model_performance,[popularity_model, personalized_model])
#else:
 #   %matplotlib inline
  #  model_performance = graphlab.recommender.util.compare_models(test_data, [popularity_model, personalized_model], user_sample=.05)

The curve shows that the personalized model provides much better performance. 