In [1]:
import pandas
import numpy as np
import tensorflow as tf
from sklearn.cross_validation import train_test_split



### Content-Based Music Recommender System

In [2]:
# Song metadata: fold the artist into the title ("<title> - <artist>") and
# keep only the id columns plus that combined title.
song_df2 = (
    pandas.read_csv('./dataset/MSD_songs.csv', sep='\t')
    .assign(title=lambda songs: songs['title'].map(str) + ' - ' + songs['artist_name'])
    .drop(['dig7_id', 'release', 'artist_name', 'year'], axis=1)
)

# Tag rows, restricted (inner join on track_id) to tracks present in song_df2.
tag_df2 = (
    pandas.read_csv('./dataset/LAST_FM_tags.csv', sep='\t')
    .merge(song_df2, on='track_id', how='inner')
)

# Preview the first rows of both frames.
for frame in (song_df2, tag_df2):
    print(frame.head())

             track_id             song_id  \
0  TRAHHBV128F930B736  SOHWZOW12AB017EE95   
1  TRAHHSV128F42374E3  SOQBKAT12A81C20661   
2  TRAHHJY12903CA73BD  SOEKMUY12AB018CB47   
3  TRAHHPE128F934AC3B  SOFGBOY12AB018549F   
4  TRAHHAY12903CD8B6F  SOXHLIO12AB0185C6F   

                                               title  
0            Spirit and Machine - Marcelo Radulovich  
1  I Only Have Eyes For You (Album Version) - Joh...  
2                        Porcelain Man - John Debney  
3               (Wake Up) Time to Die - Lizzy Borden  
4                     Easier By Now - Jamie Richards  
             track_id             tags             song_id  \
0  TRAHHSV128F42374E3    american_idol  SOQBKAT12A81C20661   
1  TRAHHSV128F42374E3  only_eyes_for_u  SOQBKAT12A81C20661   
2  TRAHHMV12903CD0B0D             sexy  SOKEFDU12AB0187D54   
3  TRAHHMV12903CD0B0D              pop  SOKEFDU12AB0187D54   
4  TRAHHMV12903CD0B0D            dance  SOKEFDU12AB0187D54   

                          

#### Checking the most frequently used tags

In [3]:
# How many tag rows use each of the two variants of the "favorite" tag?
tag_values = tag_df2['tags']
for variant in ('favorite', 'favourites'):
    print(len(tag_df2[tag_values == variant]))
# Show one example row for the 'favourites' variant (empty if there is none).
print(tag_df2[tag_values == 'favourites'].head(1))

1427
0
Empty DataFrame
Columns: [track_id, tags, song_id, title]
Index: []


In [4]:
# Rank tags by the number of rows in tag_df2 that carry them and keep the 50
# most common ones.
#
# size() returns a Series, so sort_values() needs no column label. The earlier
# agg({'tags': 'count'}) form built a frame whose index AND only column were
# both named 'tags'; sorting that frame by 'tags' is rejected by current pandas
# as ambiguous ("'tags' is both an index level and a column label").
tag_counts = tag_df2.groupby('tags').size().sort_values(ascending=False)

# Same layout as before: index = tag name, single 'tags' column = row count.
popular_tag_df = tag_counts.head(50).to_frame('tags')
popular_tag_df

Unnamed: 0_level_0,tags
tags,Unnamed: 1_level_1
favorite,1427
rock,1161
pop,746
alternative,612
love,441
female_vocalists,416
alternative_rock,406
indie,392
american,387
classic_rock,385


In [5]:
# Display the tag names kept in popular_tag_df (they are stored as its index).
popular_tag_df.index

Index([u'favorite', u'rock', u'pop', u'alternative', u'love',
       u'female_vocalists', u'alternative_rock', u'indie', u'american',
       u'classic_rock', u'00s', u'blues', u'hard_rock', u'dance', u'metal',
       u'male_vocalists', u'electronic', u'awesome', u'beautiful', u'90s',
       u'punk', u'80s', u'hip-hop', u'singer-songwriter', u'guitar', u'chill',
       u'mellow', u'soul', u'chillout', u'hip_hop', u'jazz', u'british',
       u'cool', u'oldies', u'country', u'indie_rock', u'rap', u'instrumental',
       u'sexy', u'punk_rock', u'female_vocalist', u'seen_live', u'acoustic',
       u'rnb', u'catchy', u'electronica', u'male_vocalist', u'folk',
       u'blues_rock', u'party'],
      dtype='object', name=u'tags')

In [6]:
# Keep only the tag rows whose tag is one of the popular tags, via an inner
# join against a one-column frame built from popular_tag_df's index.
popular_tags_only = pandas.DataFrame({'tags': popular_tag_df.index})
popular_processed_tag_df = tag_df2.merge(popular_tags_only, how='inner', on='tags')

# Number of distinct tracks that carry at least one popular tag.
len(popular_processed_tag_df['track_id'].drop_duplicates())

3249

In [7]:
# Number of distinct tracks in the full tag table, for comparison with the
# popular-tag subset.
tag_df2['track_id'].drop_duplicates().shape[0]

4833

In [8]:
# Collapse to one row per track. GroupBy.last() keeps, for every column, the
# last non-null value among that track's rows, so each song ends up with a
# single one of its popular tags.
tracks_grouped = popular_processed_tag_df.groupby('track_id', as_index=False)
selected_songs_df = tracks_grouped.last()
print(len(selected_songs_df.index))
selected_songs_df.head()

3249


Unnamed: 0,track_id,tags,song_id,title
0,TRAAAAW128F429D538,hip-hop,SOMZWCG12A8C13C480,I Didn't Mean To - Casual
1,TRAAABD128F429CF47,love,SOCIWDW12A8C13D406,Soul Deep - The Box Tops
2,TRAAAEF128F4273421,80s,SONHOTT12A8C13493C,Something Girls - Adam Ant
3,TRAAAFD128F92F423A,punk_rock,SOFSOCN12A8C143F5D,Face the Ashes - Gob
4,TRAABLR128F423B7E3,singer-songwriter,SOHUOAP12A8AE488E9,Floating - Blue Rodeo


#### Iteration to get preview_url

In [29]:
# Load the selected songs together with the looked-up preview/genre columns.
df = pandas.read_csv('./dataset/selected_song_based_on_tags.csv', sep='\t')

preview_missing = df['preview_url'] == 'not found'
genres_not_marked_missing = df['genres'] != 'not found'
genres_present = df['genres'].notnull()

# Rows whose preview lookup returned the 'not found' marker.
print(len(df[preview_missing]))
# Rows whose genres value is anything other than 'not found'
# (null genres are included here, since null != 'not found').
print(len(df[genres_not_marked_missing]))
# Rows whose genres value is null.
print(df['genres'].isnull().sum())

# Rows that have an actual genres value.
df[genres_not_marked_missing & genres_present].head()

403
2492
2490


Unnamed: 0,track_id,tags,song_id,title,preview_url,preview_info,genres
2491,TRBAFBU128F427EFCE,blues,SOQBFIA12A8C13BAA2,Woman - Free,not found,"Pop Culture Leftovers - Movie Reviews,Film,Com...","['TV & Film', 'Podcasts', 'Games & Hobbies', '..."
2873,TRBELAZ128F422B87F,chill,SOUASUS12A81C22D53,Girls - Ayo,not found,Bad Girl's Guide to Love with Dr. Ayo - Ayo Ga...,"['Self-Help', 'Podcasts', 'Health', 'Sexuality']"


In [10]:
import sys
from time import sleep

# Scratch demo of a text progress bar: 21 steps, 5% per step, 20-char bar.
for step in range(21):
    filled = '=' * step
    percent = 5 * step
    # '\r' returns to the start of the line before the bar is written.
    sys.stdout.write('\r' + "[%-20s] %d%%" % (filled, percent))
    sys.stdout.flush()
    # This print starts a new line, so each bar stays visible in the output
    # instead of being overwritten by the next one.
    print('\naaa')
    sleep(0.25)

[                    ] 0%
aaa
[=                   ] 5%
aaa
[==                  ] 10%
aaa
[===                 ] 15%
aaa
[====                ] 20%
aaa
[=====               ] 25%
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
