# Song Recommandation

In [1]:
import json
from collections import Counter

import numpy as np
import pandas as pd

In [2]:
def load_data():
    with open("song.json", "rt") as inf:
        data = json.load(inf)

    data = pd.DataFrame(data)
    data.set_index("id", inplace=True)
    data["time_played"] = pd.to_datetime(data.time_played)
    data['user_sign_up_date'] = pd.to_datetime(data.user_sign_up_date)

    return data

data = load_data()

In [3]:
data.head()

Unnamed: 0_level_0,song_played,time_played,user_id,user_sign_up_date,user_state
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GOQMMKSQQH,Hey Jude,2015-06-11 21:51:35,122,2015-05-16,Louisiana
HWKKBQKNWI,We Can Work It Out,2015-06-06 16:49:19,3,2015-05-01,Ohio
DKQSXVNJDH,Back In the U.S.S.R.,2015-06-14 02:11:29,35,2015-05-04,New Jersey
HLHRIDQTUW,P.s. I Love You,2015-06-08 12:26:10,126,2015-05-16,Illinois
SUKJCSBCYW,Sgt. Pepper's Lonely Hearts Club Band,2015-06-28 14:57:00,6,2015-05-01,New Jersey


In [14]:
ids = data.groupby("user_state").user_id.agg(lambda ids: len(np.unique(ids))).sort_values()

In [17]:
print "top 3 states in #users: "
ids.iloc[-3:]

top 3 states in #users: 


user_state
Texas         15
California    21
New York      23
Name: user_id, dtype: int64

In [18]:
print "bottom 3 states in #users: "
ids.iloc[:3]

bottom 3 states in #users: 


user_state
Arizona        1
New Mexico     1
Connecticut    1
Name: user_id, dtype: int64

## average songs per hour

In [26]:
songs = data.groupby("user_state").song_played.agg(lambda songs: len(songs)).sort_values()

In [29]:
songs.reset_index().rename(columns = {'song_played':'songs'})

Unnamed: 0,user_state,songs
0,Kansas,8
1,Connecticut,16
2,New Mexico,17
3,Virginia,17
4,Arizona,22
5,Iowa,23
6,Idaho,26
7,North Dakota,26
8,Rhode Island,27
9,Arkansas,34


In [39]:
def count_by_state(df):
    
    songs = len(df)
    first_play = df.time_played.min()
    last_play = df.time_played.max()
    duration = last_play - first_play
    duration_hours = duration.total_seconds()/60.0
    return pd.Series([first_play,last_play, duration,duration_hours, songs],
                     index=["first_play",'last_play','duration','duration_hours','songs'])

In [40]:
counts_by_states = data.groupby("user_state").apply(count_by_state)

In [41]:
counts_by_states.head()

Unnamed: 0_level_0,first_play,last_play,duration,duration_hours,songs
user_state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama,2015-06-01 14:17:56,2015-06-28 14:48:55,27 days 00:30:59,38910.983333,104
Alaska,2015-06-01 18:48:18,2015-06-28 22:58:23,27 days 04:10:05,39130.083333,58
Arizona,2015-06-01 13:05:17,2015-06-28 19:23:45,27 days 06:18:28,39258.466667,22
Arkansas,2015-06-01 14:07:37,2015-06-28 17:40:11,27 days 03:32:34,39092.566667,34
California,2015-06-01 06:33:03,2015-06-28 20:35:50,27 days 14:02:47,39722.783333,425


In [43]:
counts_by_states["songs_per_hour"] = counts_by_states.songs/counts_by_states.duration_hours
counts_by_states.sort_values(by="songs_per_hour",ascending=False,inplace=True)
counts_by_states

Unnamed: 0_level_0,first_play,last_play,duration,duration_hours,songs,songs_per_hour
user_state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
New York,2015-06-01 06:14:45,2015-06-28 21:36:40,27 days 15:21:55,39801.916667,469,0.011783
California,2015-06-01 06:33:03,2015-06-28 20:35:50,27 days 14:02:47,39722.783333,425,0.010699
Texas,2015-06-01 06:09:04,2015-06-28 20:28:35,27 days 14:19:31,39739.516667,230,0.005788
Ohio,2015-06-01 05:02:54,2015-06-28 22:22:25,27 days 17:19:31,39919.516667,209,0.005236
Florida,2015-06-01 09:29:39,2015-06-28 22:59:27,27 days 13:29:48,39689.8,180,0.004535
Pennsylvania,2015-06-01 05:19:08,2015-06-28 21:44:20,27 days 16:25:12,39865.2,179,0.00449
North Carolina,2015-06-01 12:40:31,2015-06-28 23:26:38,27 days 10:46:07,39526.116667,154,0.003896
Illinois,2015-06-01 12:15:13,2015-06-28 18:07:10,27 days 05:51:57,39231.95,149,0.003798
Georgia,2015-06-01 06:41:36,2015-06-28 21:37:34,27 days 14:55:58,39775.966667,135,0.003394
Missouri,2015-06-01 05:36:55,2015-06-28 18:32:34,27 days 12:55:39,39655.65,127,0.003203


In [44]:
print "top 3 states in songs/hour: "
counts_by_states.iloc[:3]

top 3 states in songs/hour: 


Unnamed: 0_level_0,first_play,last_play,duration,duration_hours,songs,songs_per_hour
user_state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
New York,2015-06-01 06:14:45,2015-06-28 21:36:40,27 days 15:21:55,39801.916667,469,0.011783
California,2015-06-01 06:33:03,2015-06-28 20:35:50,27 days 14:02:47,39722.783333,425,0.010699
Texas,2015-06-01 06:09:04,2015-06-28 20:28:35,27 days 14:19:31,39739.516667,230,0.005788


In [45]:
print "bottom 3 states in songs/hour: "
counts_by_states.iloc[-3:]

bottom 3 states in songs/hour: 


Unnamed: 0_level_0,first_play,last_play,duration,duration_hours,songs,songs_per_hour
user_state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Connecticut,2015-06-06 19:23:58,2015-06-28 13:16:32,21 days 17:52:34,31312.566667,16,0.000511
New Mexico,2015-06-01 05:22:30,2015-06-28 13:15:58,27 days 07:53:28,39353.466667,17,0.000432
Kansas,2015-06-05 15:01:50,2015-06-27 09:02:15,21 days 18:00:25,31320.416667,8,0.000255


## first user of each state

In [63]:
data.groupby(["user_state"])['user_sign_up_date','user_id'].min().sort_values(by="user_sign_up_date")

Unnamed: 0_level_0,user_sign_up_date,user_id
user_state,Unnamed: 1_level_1,Unnamed: 2_level_1
Alabama,2015-05-01,5
Texas,2015-05-01,7
Oregon,2015-05-01,1
Ohio,2015-05-01,3
North Carolina,2015-05-01,2
New Mexico,2015-05-01,4
New Jersey,2015-05-01,6
Pennsylvania,2015-05-02,11
New York,2015-05-02,10
Minnesota,2015-05-02,8


## Song Recommandation

item based Collaborative filtering

In [110]:
def count_by_song(df):
    """ all data in df come from the same song"""
    return pd.Series( Counter(df.user_id) )

counts_by_songs = data.groupby("song_played").apply(count_by_song)
counts_by_songs = counts_by_songs.unstack(fill_value=0)

In [111]:
# each row is a song
# each column represents a user
# [i,j] represents number of times user 'j' plays song 'i'
counts_by_songs.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,191,192,193,194,195,196,197,198,199,200
song_played,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A Day In The Life,0,0,1,3,0,2,0,0,0,0,...,0,0,3,3,0,2,0,0,2,0
A Hard Day's Night,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
A Saturday Club Xmas/Crimble Medley,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ANYTIME AT ALL,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Across The Universe,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [112]:
counts_by_songs_1 = counts_by_songs.reset_index()

In [113]:
counts_by_songs_1 = counts_by_songs_1.T

In [114]:
counts_by_songs_1 = counts_by_songs_1.rename(columns=counts_by_songs_1.iloc[0])
counts_by_songs_1 = counts_by_songs_1.drop(counts_by_songs_1.index[0])

In [117]:
tmp = pd.DataFrame(index=counts_by_songs_1.columns,columns=counts_by_songs_1.columns)
tmp.tail()

Unnamed: 0,A Day In The Life,A Hard Day's Night,A Saturday Club Xmas/Crimble Medley,ANYTIME AT ALL,Across The Universe,All My Loving,All You Need Is Love,And Your Bird Can Sing,BAD BOY,BALLAD OF JOHN AND YOKO,...,We Can Work It Out,When I'm 64,While My Guitar Gently Weeps,Wild Honey Pie,With a Little Help From My Friends,YOUR MOTHER SHOULD KNOW,Yellow Submarine,Yesterday,You Never Give Me Your Money,You're Going To Lose That Girl
YOUR MOTHER SHOULD KNOW,,,,,,,,,,,...,,,,,,,,,,
Yellow Submarine,,,,,,,,,,,...,,,,,,,,,,
Yesterday,,,,,,,,,,,...,,,,,,,,,,
You Never Give Me Your Money,,,,,,,,,,,...,,,,,,,,,,
You're Going To Lose That Girl,,,,,,,,,,,...,,,,,,,,,,


In [127]:
from scipy.spatial.distance import cosine

for i in range(0,len(tmp.columns)) :
    for j in range(0,len(tmp.columns)) :
        tmp.iloc[i,j] = 1-cosine(counts_by_songs_1.iloc[:,i],counts_by_songs_1.iloc[:,j])
        
tmp.head()

Unnamed: 0,A Day In The Life,A Hard Day's Night,A Saturday Club Xmas/Crimble Medley,ANYTIME AT ALL,Across The Universe,All My Loving,All You Need Is Love,And Your Bird Can Sing,BAD BOY,BALLAD OF JOHN AND YOKO,...,We Can Work It Out,When I'm 64,While My Guitar Gently Weeps,Wild Honey Pie,With a Little Help From My Friends,YOUR MOTHER SHOULD KNOW,Yellow Submarine,Yesterday,You Never Give Me Your Money,You're Going To Lose That Girl
A Day In The Life,1.0,0.235702,0.0745356,0.119523,0.212132,0.355023,0.329404,0.152145,0.210819,0.172133,...,0.464938,0.030429,0.508964,0.223607,0.359092,0.0372678,0.318198,0.35322,0.087841,0
A Hard Day's Night,0.235702,1.0,0.0,0.0,0.1,0.136931,0.111803,0.0,0.0,0.0912871,...,0.259548,0.129099,0.210099,0.0,0.0,0.0,0.05,0.195468,0.0745356,0
A Saturday Club Xmas/Crimble Medley,0.0745356,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.109435,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0
ANYTIME AT ALL,0.119523,0.0,0.0,1.0,0.0,0.154303,0.0944911,0.109109,0.0,0.0,...,0.116991,0.0,0.138107,0.0890871,0.183942,0.0,0.0,0.146845,0.0,0
Across The Universe,0.212132,0.1,0.0,0.0,1.0,0.0912871,0.0,0.0,0.0,0.0,...,0.138426,0.0,0.116722,0.0,0.0,0.0,0.0,0.0434372,0.0,0


In [128]:
similar_song = pd.DataFrame(index=tmp.columns,columns=range(1,7))
for i in range(0,len(tmp.columns)): 
    similar_song.iloc[i,:6] = tmp.iloc[0:,i].sort_values(ascending=False)[:6].index
    
similar_song.head(10).iloc[:10,1:6]

Unnamed: 0,2,3,4,5,6
A Day In The Life,Come Together,Revolution,Get Back,While My Guitar Gently Weeps,Back In the U.S.S.R.
A Hard Day's Night,Come Together,Let It Be,Back In the U.S.S.R.,Here Comes The Sun,Revolution
A Saturday Club Xmas/Crimble Medley,GIRL,IT WON'T BE LONG,Hey Jude,I Want You (She's So Heavy),Back In the U.S.S.R.
ANYTIME AT ALL,Can't Buy Me Love,Come Together,Back In the U.S.S.R.,Helter Skelter,LITTLE CHILD
Across The Universe,Revolution,Birthday,LITTLE CHILD,Love Me Do,A Day In The Life
All My Loving,Let It Be,Hey Jude,Come Together,A Day In The Life,And Your Bird Can Sing
All You Need Is Love,A Day In The Life,While My Guitar Gently Weeps,Revolution,Yesterday,Come Together
And Your Bird Can Sing,All My Loving,IN MY LIFE,SHE'S A WOMAN,The Long And Winding Road,Golden Slumbers
BAD BOY,Hey Jude,OH DARLING,A Day In The Life,Fixing A Hole,I Am the Walrus
BALLAD OF JOHN AND YOKO,Golden Slumbers,Sgt. Pepper's Lonely Hearts Club Band (Reprise),Don't Let Me Down,We Can Work It Out,Two Of Us


other approach

In [70]:
#cosine distance
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation
song_similarity = 1 - pairwise_distances( counts_by_songs.as_matrix(), metric="cosine" )
np.fill_diagonal( song_similarity, 0 ) #Filling diagonals with 0s for future use when sorting is done
simi_matrix = pd.DataFrame( song_similarity )
simi_matrix.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.0,0.235702,0.074536,0.119523,0.212132,0.355023,0.329404,0.152145,0.210819,0.172133,...,0.464938,0.030429,0.508964,0.223607,0.359092,0.037268,0.318198,0.35322,0.087841,0.0
1,0.235702,0.0,0.0,0.0,0.1,0.136931,0.111803,0.0,0.0,0.091287,...,0.259548,0.129099,0.210099,0.0,0.0,0.0,0.05,0.195468,0.074536,0.0
2,0.074536,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.109435,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0
3,0.119523,0.0,0.0,0.0,0.0,0.154303,0.094491,0.109109,0.0,0.0,...,0.116991,0.0,0.138107,0.089087,0.183942,0.0,0.0,0.146845,0.0,0.0
4,0.212132,0.1,0.0,0.0,0.0,0.091287,0.0,0.0,0.0,0.0,...,0.138426,0.0,0.116722,0.0,0.0,0.0,0.0,0.043437,0.0,0.0


In [80]:
counts_by_songs = counts_by_songs.reset_index()

In [83]:

    #user_inp=input('Enter the reference movie title based on which recommendations are to be made: ')
user_inp="A Day In The Life"
inp=counts_by_songs[counts_by_songs['song_played']==user_inp].index.tolist()
inp=inp[0]
    
counts_by_songs['similarity'] = simi_matrix.iloc[inp]
    
counts_by_songs.head(10)
    


Unnamed: 0,song_played,1,2,3,4,5,6,7,8,9,...,192,193,194,195,196,197,198,199,200,similarity
0,A Day In The Life,0,0,1,3,0,2,0,0,0,...,0,3,3,0,2,0,0,2,0,0.0
1,A Hard Day's Night,0,0,0,0,0,1,0,0,1,...,0,0,0,1,0,0,0,0,0,0.235702
2,A Saturday Club Xmas/Crimble Medley,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.074536
3,ANYTIME AT ALL,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.119523
4,Across The Universe,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.212132
5,All My Loving,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0.355023
6,All You Need Is Love,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0.329404
7,And Your Bird Can Sing,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0.152145
8,BAD BOY,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0.210819
9,BALLAD OF JOHN AND YOKO,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.172133


In [86]:
recommand_song = counts_by_songs[['song_played','similarity']]

In [90]:

print"Recommended song based on your choice of ",user_inp , 
recommand_song.sort_values( ["similarity"], ascending = False )[1:10]

Recommended song based on your choice of  A Day In The Life

Unnamed: 0,song_played,similarity
67,Revolution,0.575179
27,Get Back,0.532342
92,While My Guitar Gently Weeps,0.508964
11,Back In the U.S.S.R.,0.492563
36,Hey Jude,0.490362
51,Let It Be,0.481286
35,Here Comes The Sun,0.47585
54,Lucy In The Sky With Diamonds,0.470427
32,Hello Goodbye,0.466242



