# Build a song recommender system

In [None]:
!pip install turicreate



In [None]:
import os.path
from google.colab import drive

# Mount Google Drive at /content/drive, unless its "My Drive" folder is
# already visible there (i.e. a previous run of this cell mounted it).
drive_is_mounted = os.path.isdir("/content/drive/My Drive/")
if not drive_is_mounted:
  drive.mount('/content/drive')
else:
  print("Google Drive already mounted")

Google Drive already mounted


In [None]:
import os.path
import urllib.request
import tarfile
import zipfile
import gzip
from shutil import copy

def fetch_remote_datafile(filename, remote_url):
  """Download remote_url into the workspace as ./<filename>, unless it is already there.

  The download is written to a temporary "<filename>.part" file and only
  renamed to its final name after urlretrieve() has returned. A failed or
  interrupted transfer therefore cannot leave a truncated ./<filename> that
  a later call would report as "already have".

  Args:
    filename: name of the file to create in the current working directory.
    remote_url: URL to download from.

  Raises:
    Whatever urllib.request.urlretrieve() raises when the download fails.
  """
  target_path = "./" + filename
  if os.path.isfile(target_path):
    print("already have " + filename + " in workspace")
    return

  print("fetching " + filename + " from " + remote_url + "...")
  partial_path = target_path + ".part"
  try:
    urllib.request.urlretrieve(remote_url, partial_path)
    os.replace(partial_path, target_path)
  finally:
    # After a successful rename the .part file no longer exists, so this only
    # cleans up after a failed or interrupted download.
    if os.path.isfile(partial_path):
      os.remove(partial_path)

def cache_datafile_in_drive(filename, data_drive_path="/content/drive/My Drive/Colab Notebooks/"):
  """Copy ./<filename> from the workspace into the Google Drive cache folder.

  Caching is best effort: every reason for not copying is reported with
  print() instead of being raised.

  Args:
    filename: name of the file in the current working directory.
    data_drive_path: folder to copy the file into. Defaults to the
      "Colab Notebooks" folder of the mounted Google Drive.
  """
  if not os.path.isfile("./" + filename):
    print("cannot cache " + filename + ", it is not in workspace")
    return

  # Without this check shutil.copy() would either fail or, for a path given
  # without a trailing slash, create a plain file named like the folder.
  if not os.path.isdir(data_drive_path):
    print("cannot cache " + filename + ", " + data_drive_path + " is not available")
    return

  if os.path.isfile(os.path.join(data_drive_path, filename)):
    print("" + filename + " has already been stored in Google Drive")
  else:
    print("copying " + filename + " to " + data_drive_path)
    copy("./" + filename, data_drive_path)
  

def load_datafile_from_drive(filename, remote_url=None, data_drive_path="/content/drive/My Drive/Colab Notebooks/"):
  """Make ./<filename> available in the workspace, trying the cheapest source first.

  Sources are tried in this order:
    1. the file is already in the current working directory;
    2. a copy exists in data_drive_path, which is then copied to the workspace;
    3. remote_url is given, in which case fetch_remote_datafile() downloads it.
  If none applies, an error message is printed and nothing is raised.

  Args:
    filename: name of the data file.
    remote_url: optional URL to download the file from as a last resort.
    data_drive_path: folder holding cached copies. Defaults to the
      "Colab Notebooks" folder of the mounted Google Drive.
  """
  cached_path = os.path.join(data_drive_path, filename)
  if os.path.isfile("./" + filename):
    print("already have " + filename + " in workspace")
  elif os.path.isfile(cached_path):
    print("have " + filename + " in Google Drive, copying to workspace...")
    copy(cached_path, ".")
  elif remote_url is not None:
    fetch_remote_datafile(filename, remote_url)
  else:
    print("error: you need to manually download " + filename + " and put in drive")
    
def extract_datafile(filename, expected_extract_artifact=None):
  """Unpack ./<filename> into the current working directory.

  The format is chosen from the text after the last dot of filename:
    zip            -> every member is extracted (and its name printed);
    tar, tar.gz    -> extracted with tarfile;
    gz (not tar)   -> decompressed into a file named like filename without ".gz";
    csv            -> nothing to do.
  Anything else, and every other problem, is reported with print() instead of
  being raised.

  Args:
    filename: archive name, relative to the current working directory.
    expected_extract_artifact: optional path of a file or directory that the
      extraction produces. If it already exists, extraction is skipped.
  """
  already_extracted = expected_extract_artifact is not None and (
      os.path.isfile(expected_extract_artifact) or os.path.isdir(expected_extract_artifact))
  if already_extracted:
    print("files in " + filename + " have already been extracted")
    return
  if not os.path.isfile("./" + filename):
    print("error: cannot extract " + filename + ", it is not in the workspace")
    return

  name_parts = filename.split('.')
  extension = name_parts[-1]
  # The length check keeps a dot-less name such as "gz" from raising IndexError.
  is_tar_gz = extension == "gz" and len(name_parts) >= 2 and name_parts[-2] == "tar"

  if extension == "zip":
    print("extracting " + filename + "...")
    # The context manager closes the archive even if a member fails to extract.
    with zipfile.ZipFile(filename) as archive:
      for name in archive.namelist():
        print("    extracting file", name)
        archive.extract(name, "./")
  elif extension == "tar" or is_tar_gz:
    print("extracting " + filename + "...")
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # only use this with archives from a source you trust.
    with tarfile.open(filename) as archive:
      archive.extractall()
  elif extension == "gz":
    extracted_name = '.'.join(name_parts[:-1])
    if not extracted_name:
      # Nothing is left once the extension is stripped, so there is no output name.
      print("cannot extract " + filename)
      return
    print("extracting " + filename + "...")
    # Imported locally so this function does not depend on a change to the
    # notebook's import cell.
    import shutil
    # Stream the data instead of reading the whole decompressed file into memory.
    with gzip.open(filename, 'rb') as compressed, open(extracted_name, 'wb') as extracted_file:
      shutil.copyfileobj(compressed, extracted_file)
  elif extension == "csv":
    print("do not need to extract csv")
  else:
    print("cannot extract " + filename)
      
def load_cache_extract_datafile(filename, expected_extract_artifact=None, remote_url=None):
  """Run the three data-preparation steps for one data file, in order.

  1. load_datafile_from_drive: get the file into the workspace.
  2. extract_datafile: unpack it, unless expected_extract_artifact already exists.
  3. cache_datafile_in_drive: store the file in Google Drive.

  Args:
    filename: name of the data file or archive.
    expected_extract_artifact: optional file or directory produced by the extraction.
    remote_url: optional URL to download the file from.
  """
  load_datafile_from_drive(filename, remote_url=remote_url)
  extract_datafile(filename, expected_extract_artifact=expected_extract_artifact)
  cache_datafile_in_drive(filename)
  
# Fetch, unpack and cache the song data archive.
# NOTE(review): the URL is a signed link carrying Expires=1582329600 (a Unix
# timestamp, 2020-02-22 UTC). Presumably it stops working after that time, in
# which case the archive must already be in the workspace or in Google Drive.
load_cache_extract_datafile(
  filename="song_data.sframe.zip",
  expected_extract_artifact="song_data.sframe",
  remote_url="https://d3c33hcgiwev3.cloudfront.net/dIBuXeIiEemm5A4ynZyB2A_749ef60351624a8d83beedd5ef23367e_song_data.sframe.zip?Expires=1582329600&Signature=O30-yejeeiJkSvfIZvD~tAKnG4f6kQJb5fa~3h2CTpw4dxbUoUOX8fIpIyjaKlApahk5AsL1-45RakOgO1CmyDgW5RdZzGCUXloTUcRDYLlg0Q2lyy0ImpqlXL~lvhEMn-XTrtR6pQ64aYInGIgHFpDpDc5zZMKYypdMQc-rIPE_&Key-Pair-Id=APKAJLTNE6QMUY6HBC5A",
)

already have song_data.sframe.zip in workspace
files in song_data.sframe.zip have already been extracted
song_data.sframe.zip has already been stored in Google Drive


In [None]:
import turicreate

# Load some music data

In [None]:
# Open the SFrame stored in ./song_data.sframe/, the artifact expected from
# extracting song_data.sframe.zip.
song_data = turicreate.SFrame('./song_data.sframe/')

# Explore our data

In [None]:
# Bare expression: the notebook displays the SFrame as the cell's output.
song_data

user_id,song_id,listen_count,title,artist
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SOAKIMP12A8C130995,1,The Cove,Jack Johnson
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Paco De Lucia
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SOBXHDL12A81C204C0,1,Stronger,Kanye West
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SOBYHAJ12A6701BF1D,1,Constellations,Jack Johnson
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SODACBL12A8C13C273,1,Learn To Fly,Foo Fighters
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SODDNQT12A6D4F5F7E,5,Apuesta Por El Rock 'N' Roll ...,Héroes del Silencio
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SODXRTY12AB0180F3B,1,Paper Gangsta,Lady GaGa
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SOFGUAY12AB017B0A8,1,Stacked Actors,Foo Fighters
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SOFRQTD12A81C233C0,1,Sehr kosmisch,Harmonia
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SOHQWYZ12A6D4FA701,1,Heaven's gonna burn your eyes ...,Thievery Corporation feat. Emiliana Torrini ...

song
The Cove - Jack Johnson
Entre Dos Aguas - Paco De Lucia ...
Stronger - Kanye West
Constellations - Jack Johnson ...
Learn To Fly - Foo Fighters ...
Apuesta Por El Rock 'N' Roll - Héroes del ...
Paper Gangsta - Lady GaGa
Stacked Actors - Foo Fighters ...
Sehr kosmisch - Harmonia
Heaven's gonna burn your eyes - Thievery ...


## Show the most popular songs in the dataset

In [None]:
# Call show() on the 'song' column.
# NOTE(review): presumably this opens Turi Create's visual summary of the
# column's values; confirm that it renders in this environment.
song_data['song'].show()

In [None]:
# Display the 'listen_count' column on its own.
song_data['listen_count']

dtype: int
Rows: 1116609
[1, 2, 1, 1, 1, 5, 1, 1, 1, 1, 1, 5, 1, 1, 5, 1, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 5, 5, 2, 1, 9, 2, 2, 1, 1, 6, 2, 1, 10, 1, 1, 9, 1, 1, 1, 10, 2, 6, 13, 6, 4, 2, 4, 1, 3, 5, 1, 8, 18, 12, 2, 2, 1, 2, 2, 2, 1, 3, 4, 3, 1, 2, ... ]

In [None]:
# Total listen count per artist, largest total first.
total_listens = turicreate.aggregate.SUM('listen_count')
song_data_listen_counts = (
    song_data
    .groupby('artist', operations={'total_count': total_listens})
    .sort('total_count', ascending=False)
)
# The sort is descending, so the last row is the artist with the smallest total.
song_data_listen_counts[-1]

{'artist': 'William Tabbert', 'total_count': 14}

In [None]:
# Keep the rows whose artist is exactly 'Kanye West', then count the distinct
# user ids among them.
is_kanye_west = song_data['artist'] == 'Kanye West'
kany_west = song_data[is_kanye_west]
kanye_west_listeners = kany_west['user_id'].unique()
len(kanye_west_listeners)

2522

In [None]:
# Keep the rows whose artist is exactly 'Foo Fighters', then count the
# distinct user ids among them.
is_foo_fighters = song_data['artist'] == 'Foo Fighters'
Foo_Fighters = song_data[is_foo_fighters]
foo_fighters_listeners = Foo_Fighters['user_id'].unique()
len(foo_fighters_listeners)

2055

In [None]:
# Keep the rows whose artist is exactly 'Taylor Swift', then count the
# distinct user ids among them.
is_taylor_swift = song_data['artist'] == 'Taylor Swift'
Taylor_Swift = song_data[is_taylor_swift]
taylor_swift_listeners = Taylor_Swift['user_id'].unique()
len(taylor_swift_listeners)

3246

In [None]:
# Keep the rows whose artist is exactly 'Lady GaGa', then count the distinct
# user ids among them.
is_lady_gaga = song_data['artist'] == 'Lady GaGa'
Lady_GaGa = song_data[is_lady_gaga]
lady_gaga_listeners = Lady_GaGa['user_id'].unique()
len(lady_gaga_listeners)

2928

# Count the number of unique users in the data

In [None]:
# Distinct user ids in the whole dataset (before the train/test split).
users = song_data['user_id'].unique()

In [None]:
# Number of distinct users.
len(users)

66346

# Create a song recommender

In [None]:
# Random split with a fixed seed so the same rows land in each part on every
# run; 0.8 is the fraction passed for the first part (train_data).
train_data, test_data = song_data.random_split(0.8, seed=0)

## Create a very simple popularity recommender

In [None]:
# Popularity recommender trained on the training split: users are identified
# by 'user_id' and items by the 'song' column.
popularity_model = turicreate.popularity_recommender.create(
    train_data,
    user_id='user_id',
    item_id='song',
)

## Use the popularity model to make some predictions

In [None]:
# Popularity-model recommendations for the first user id in `users`.
popularity_model.recommend(users=[users[0]])

user_id,song,score,rank
c66c10a9567f0d82ff31441a9 fd5063e5cd9dfe8 ...,Sehr kosmisch - Harmonia,4754.0,1
c66c10a9567f0d82ff31441a9 fd5063e5cd9dfe8 ...,Undo - Björk,4227.0,2
c66c10a9567f0d82ff31441a9 fd5063e5cd9dfe8 ...,You're The One - Dwight Yoakam ...,3781.0,3
c66c10a9567f0d82ff31441a9 fd5063e5cd9dfe8 ...,Dog Days Are Over (Radio Edit) - Florence + The ...,3633.0,4
c66c10a9567f0d82ff31441a9 fd5063e5cd9dfe8 ...,Revelry - Kings Of Leon,3527.0,5
c66c10a9567f0d82ff31441a9 fd5063e5cd9dfe8 ...,Horn Concerto No. 4 in E flat K495: II. Romance ...,3161.0,6
c66c10a9567f0d82ff31441a9 fd5063e5cd9dfe8 ...,Secrets - OneRepublic,3148.0,7
c66c10a9567f0d82ff31441a9 fd5063e5cd9dfe8 ...,Fireflies - Charttraxx Karaoke ...,2532.0,8
c66c10a9567f0d82ff31441a9 fd5063e5cd9dfe8 ...,Tive Sim - Cartola,2521.0,9
c66c10a9567f0d82ff31441a9 fd5063e5cd9dfe8 ...,Drop The World - Lil Wayne / Eminem ...,2053.0,10


In [None]:
# Popularity-model recommendations for the second user id in `users`.
popularity_model.recommend(users=[users[1]])

user_id,song,score,rank
02f015d32ac2cd1e52d26e3ec 36048711dd5711b ...,Sehr kosmisch - Harmonia,4754.0,1
02f015d32ac2cd1e52d26e3ec 36048711dd5711b ...,Undo - Björk,4227.0,2
02f015d32ac2cd1e52d26e3ec 36048711dd5711b ...,You're The One - Dwight Yoakam ...,3781.0,3
02f015d32ac2cd1e52d26e3ec 36048711dd5711b ...,Dog Days Are Over (Radio Edit) - Florence + The ...,3633.0,4
02f015d32ac2cd1e52d26e3ec 36048711dd5711b ...,Revelry - Kings Of Leon,3527.0,5
02f015d32ac2cd1e52d26e3ec 36048711dd5711b ...,Horn Concerto No. 4 in E flat K495: II. Romance ...,3161.0,6
02f015d32ac2cd1e52d26e3ec 36048711dd5711b ...,Secrets - OneRepublic,3148.0,7
02f015d32ac2cd1e52d26e3ec 36048711dd5711b ...,Hey_ Soul Sister - Train,2538.0,8
02f015d32ac2cd1e52d26e3ec 36048711dd5711b ...,Fireflies - Charttraxx Karaoke ...,2532.0,9
02f015d32ac2cd1e52d26e3ec 36048711dd5711b ...,Tive Sim - Cartola,2521.0,10


# Build a recommender with personalization

In [None]:
# Item-similarity recommender trained on the same training split and with the
# same user/item columns as the popularity model.
personalized_model = turicreate.item_similarity_recommender.create(
    train_data,
    user_id='user_id',
    item_id='song',
)

In [None]:
# Distinct user ids in the test split, cut down to at most the first 10000.
test_user_ids = test_data['user_id'].unique()
subset_test_users = test_user_ids[:10000]

In [None]:
# One recommendation (k=1) per user in the test-user subset.
personalized_model_table = personalized_model.recommend(
    users=subset_test_users,
    k=1,
)

In [None]:
# Count how many rows of personalized_model_table name each song, most
# frequent song first.
recommendation_count = turicreate.aggregate.COUNT()
(
    personalized_model_table
    .groupby('song', operations={'count': recommendation_count})
    .sort('count', ascending=False)
)

song,count
Undo - Björk,427
Secrets - OneRepublic,398
Revelry - Kings Of Leon,230
You're The One - Dwight Yoakam ...,154
Hey_ Soul Sister - Train,114
Sehr kosmisch - Harmonia,111
Fireflies - Charttraxx Karaoke ...,103
Horn Concerto No. 4 in E flat K495: II. Romance ...,90
OMG - Usher featuring will.i.am ...,70
The Scientist - Coldplay,50


## Apply personalized model to make song recommendations

In [None]:
# Personalized-model recommendations for the first user id in `users`, the
# same user that was queried with the popularity model.
personalized_model.recommend(users=[users[0]])

user_id,song,score,rank
c66c10a9567f0d82ff31441a9 fd5063e5cd9dfe8 ...,Cuando Pase El Temblor - Soda Stereo ...,0.0194504536115206,1
c66c10a9567f0d82ff31441a9 fd5063e5cd9dfe8 ...,Fireflies - Charttraxx Karaoke ...,0.0144737317011906,2
c66c10a9567f0d82ff31441a9 fd5063e5cd9dfe8 ...,Love Is A Losing Game - Amy Winehouse ...,0.0142865960414593,3
c66c10a9567f0d82ff31441a9 fd5063e5cd9dfe8 ...,Marry Me - Train,0.0141334717090313,4
c66c10a9567f0d82ff31441a9 fd5063e5cd9dfe8 ...,Secrets - OneRepublic,0.013591665488023,5
c66c10a9567f0d82ff31441a9 fd5063e5cd9dfe8 ...,Sehr kosmisch - Harmonia,0.0133987894425025,6
c66c10a9567f0d82ff31441a9 fd5063e5cd9dfe8 ...,Te Hacen Falta Vitaminas - Soda Stereo ...,0.0129302831796499,7
c66c10a9567f0d82ff31441a9 fd5063e5cd9dfe8 ...,OMG - Usher featuring will.i.am ...,0.0127778282532325,8
c66c10a9567f0d82ff31441a9 fd5063e5cd9dfe8 ...,Y solo se me ocurre amarte (Unplugged) - ...,0.0123411279458266,9
c66c10a9567f0d82ff31441a9 fd5063e5cd9dfe8 ...,No Dejes Que... - Caifanes ...,0.0121042499175438,10


In [None]:
# Personalized-model recommendations for the second user id in `users`.
personalized_model.recommend(users=[users[1]])

user_id,song,score,rank
02f015d32ac2cd1e52d26e3ec 36048711dd5711b ...,Where The Boat Leaves From (Album) - Zac Brown ...,0.0615360885858535,1
02f015d32ac2cd1e52d26e3ec 36048711dd5711b ...,Different Kind Of Fine (Album) - Zac Brown Band ...,0.0605283752083778,2
02f015d32ac2cd1e52d26e3ec 36048711dd5711b ...,Jolene (Album) - Zac Brown Band ...,0.0578682050108909,3
02f015d32ac2cd1e52d26e3ec 36048711dd5711b ...,Sic 'Em On A Chicken (Album) - Zac Brown Band ...,0.0551866963505744,4
02f015d32ac2cd1e52d26e3ec 36048711dd5711b ...,Who's Kissing You Tonight - Jason Aldean ...,0.0530633330345153,5
02f015d32ac2cd1e52d26e3ec 36048711dd5711b ...,What Country Is - Luke Bryan ...,0.0374908074736595,6
02f015d32ac2cd1e52d26e3ec 36048711dd5711b ...,Highway 20 Ride (Album) - Zac Brown Band ...,0.037331573665142,7
02f015d32ac2cd1e52d26e3ec 36048711dd5711b ...,Do I - Luke Bryan,0.0330773591995239,8
02f015d32ac2cd1e52d26e3ec 36048711dd5711b ...,One Fine Wire - Colbie Caillat ...,0.03125,9
02f015d32ac2cd1e52d26e3ec 36048711dd5711b ...,Midnight Bottle - Colbie Caillat ...,0.0307377055287361,10


# Apply model to find similar songs in the data set

In [None]:
# Ask the personalized model for the items it considers similar to this song.
# The value must match the 'song' column format: "<title> - <artist>".
personalized_model.get_similar_items(['With Or Without You - U2'])

song,similar,score,rank
With Or Without You - U2,I Still Haven't Found What I'm Looking For ...,0.0428571701049804,1
With Or Without You - U2,Hold Me_ Thrill Me_ Kiss Me_ Kill Me - U2 ...,0.033734917640686,2
With Or Without You - U2,Window In The Skies - U2,0.032835841178894,3
With Or Without You - U2,Vertigo - U2,0.030075192451477,4
With Or Without You - U2,Sunday Bloody Sunday - U2,0.0271317958831787,5
With Or Without You - U2,Bad - U2,0.0251798629760742,6
With Or Without You - U2,A Day Without Me - U2,0.0237154364585876,7
With Or Without You - U2,Another Time Another Place - U2 ...,0.0203251838684082,8
With Or Without You - U2,Walk On - U2,0.0202020406723022,9
With Or Without You - U2,Get On Your Boots - U2,0.0196850299835205,10


In [None]:
# Same similar-items query for a second song.
personalized_model.get_similar_items(['Chan Chan (Live) - Buena Vista Social Club'])

song,similar,score,rank
Chan Chan (Live) - Buena Vista Social Club ...,Murmullo - Buena Vista Social Club ...,0.1881188154220581,1
Chan Chan (Live) - Buena Vista Social Club ...,La Bayamesa - Buena Vista Social Club ...,0.1871921420097351,2
Chan Chan (Live) - Buena Vista Social Club ...,Amor de Loca Juventud - Buena Vista Social Club ...,0.1848341226577758,3
Chan Chan (Live) - Buena Vista Social Club ...,Diferente - Gotan Project,0.0214592218399047,4
Chan Chan (Live) - Buena Vista Social Club ...,Mistica - Orishas,0.0205761194229125,5
Chan Chan (Live) - Buena Vista Social Club ...,Hotel California - Gipsy Kings ...,0.0193049907684326,6
Chan Chan (Live) - Buena Vista Social Club ...,Nací Orishas - Orishas,0.0191571116447448,7
Chan Chan (Live) - Buena Vista Social Club ...,Gitana - Willie Colon,0.0187969803810119,8
Chan Chan (Live) - Buena Vista Social Club ...,Le Moulin - Yann Tiersen,0.0187969803810119,9
Chan Chan (Live) - Buena Vista Social Club ...,Criminal - Gotan Project,0.0187793374061584,10


# Compare the models quantitatively
We now formally compare the popularity and the personalized models using the mean precision and mean recall of their recommendations at each cutoff, estimated on a sample of the test users.

In [None]:
# Evaluate both models on the test split; user_sample=.05 limits the
# evaluation to a sample of the test users.
models_to_compare = [popularity_model, personalized_model]
model_performance = turicreate.recommender.util.compare_models(
    test_data,
    models_to_compare,
    user_sample=.05,
)

compare_models: using 2931 users to estimate model performance
PROGRESS: Evaluate model M0





Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.023882633913340147 | 0.005493132668158254 |
|   2    | 0.023200272944387566 | 0.010979122852203706 |
|   3    | 0.020584555896736044 | 0.014985289550284437 |
|   4    | 0.019447287615148405 | 0.018461124909436088 |
|   5    | 0.019174343227567402 | 0.023438753709992204 |
|   6    | 0.018310019333560752 | 0.027243055288091133 |
|   7    | 0.01744894477750163  | 0.03068575345955082  |
|   8    | 0.01646195837598088  | 0.03270089685135746  |
|   9    | 0.015391030744152576 | 0.03515808559309072  |
|   10   | 0.014773114977823304 | 0.037839601734176956 |
+--------+----------------------+----------------------+
[10 rows x 3 columns]

PROGRESS: Evaluate model M1





Precision and recall summary statistics by cutoff
+--------+---------------------+---------------------+
| cutoff |    mean_precision   |     mean_recall     |
+--------+---------------------+---------------------+
|   1    | 0.14193108154213568 | 0.04020885763465092 |
|   2    | 0.11992494029341512 | 0.06381147663133338 |
|   3    | 0.10462868190606164 | 0.08062717495931405 |
|   4    | 0.09433640395769366 | 0.09449082041200774 |
|   5    | 0.08563630160354807 | 0.10544427472523679 |
|   6    |  0.0792675992266575 | 0.11670665615138479 |
|   7    | 0.07379246478529992 | 0.12491230811343401 |
|   8    | 0.06908904810644836 | 0.13344938355173755 |
|   9    | 0.06436938473785976 | 0.13965820751900576 |
|   10   | 0.06069600818833156 | 0.14526072604373505 |
+--------+---------------------+---------------------+
[10 rows x 3 columns]



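The tables show that the personalized model performs much better than the popularity model: at every cutoff from 1 to 10 its mean precision and mean recall are several times higher (for example, at cutoff 1 the mean precision is about 0.142 versus 0.024).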