### Building Big Data Solutions / Recommendation Systems

Data Set : 

a. HDFS path of the dataset: "/data/greatlearning/BBDS/Dataset"

       Name of file:  lastfm-dataset-360K

b. Use these paths to access files using pandas --

1. /cxldata/gle/usersha1-artmbid-artname-plays.tsv
2. /cxldata/gle/usersha1-profile.tsv

### Exploratory Data Analysis

In [1]:
# Let us load the required libraries
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Render floats in pandas output with exactly three decimal places
pd.set_option('display.float_format', '{:.3f}'.format)

In [2]:
# Read the user/artist play counts (read_table defaults to tab-separated; the file has no header row).
# Only the user hash, artist name and play count are kept; the artist-id column is skipped.
user_data = pd.read_table('/cxldata/gle/usersha1-artmbid-artname-plays.tsv',
                          header = None,
                          nrows = 20000000,  # row cap, written as an int (was the float 2e7)
                          names = ['users', 'artist-id', 'artist-name', 'plays'],
                          usecols = ['users', 'artist-name', 'plays'])

In [3]:
# Preview the first rows of the play-count table
user_data.head()

Unnamed: 0,users,artist-name,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706


In [4]:
# (rows, columns) of the loaded play-count table
user_data.shape

(17535655, 3)

In [5]:
# Load the user profile file (no header row) and keep only the user hash and country;
# gender, age and signup are named so the columns line up but are not read.
user_profiles = pd.read_table(
    '/cxldata/gle/usersha1-profile.tsv',
    names = ['users', 'gender', 'age', 'country', 'signup'],
    usecols = ['users', 'country'],
    header = None,
)

In [6]:
# Preview the first rows of the user profile table
user_profiles.head()

Unnamed: 0,users,country
0,00000c289a1829a808ac09c00daf10bc3c4e223b,Germany
1,00001411dc427966b17297bf4d69e7e193135d89,Canada
2,00004d2ac9316e22dc007ab2243d6fcb239e707d,Germany
3,000063d3fe1cf2ba248b9e3c3f0334845a27a6bf,Mexico
4,00007a47085b9aab8af55f52ec8846ac479ac4fe,United States


In [7]:
# (rows, columns) of the user profile table
user_profiles.shape

(359347, 2)

In [8]:
# List the play records whose artist name is missing
user_data.loc[user_data['artist-name'].isnull()]

Unnamed: 0,users,artist-name,plays
12520108,b6f97f4fa82d0abf138f388395a866074a447c3b,,93
14801833,d81573cb03a48e8839396156d8bfbac8017d2322,,104


In [9]:
# Discard the play records that have no artist name
if user_data['artist-name'].isnull().any():
    user_data = user_data.dropna(subset = ['artist-name'])

In [10]:
# Cross-check: after the drop this selection should contain no rows
user_data.loc[user_data['artist-name'].isnull()]

Unnamed: 0,users,artist-name,plays


In [11]:
# (rows, columns) after removing the records without an artist name
user_data.shape

(17535653, 3)

In [12]:
# Popularity is measured by total play count: sum the 'plays' column per artist
# and expose the result as a two-column frame (artist-name, total_artist_plays).
artist_plays = (
    user_data
    .groupby('artist-name')['plays']
    .sum()
    .reset_index(name = 'total_artist_plays')
)


In [13]:
# Preview the per-artist play totals
artist_plays.head()

Unnamed: 0,artist-name,total_artist_plays
0,04)],6
1,2,1606
2,58725ab=>,23
3,80lİ yillarin tÜrkÇe sÖzlÜ aŞk Şarkilari,70
4,amy winehouse,23


In [14]:
# Attach each artist's overall play total to every user/artist record (left join on artist name)
user_data_with_artist_plays = pd.merge(
    user_data,
    artist_plays,
    on = 'artist-name',
    how = 'left',
)
user_data_with_artist_plays.head()

Unnamed: 0,users,artist-name,plays,total_artist_plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137,25651
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099,3704875
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897,180391
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717,410725
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706,90498


In [15]:
# Summary statistics of the total play count per artist
artist_plays['total_artist_plays'].describe()

count     292364.000
mean       12907.037
std       185981.313
min            1.000
25%           53.000
50%          208.000
75%         1048.000
max     30466827.000
Name: total_artist_plays, dtype: float64

In [16]:
# Smallest total play count recorded for any artist
artist_plays['total_artist_plays'].min()

1

In [17]:
# Summary statistics of the total play count per artist
# NOTE(review): this repeats the describe() call made two cells earlier - candidate for removal
artist_plays['total_artist_plays'].describe()

count     292364.000
mean       12907.037
std       185981.313
min            1.000
25%           53.000
50%          208.000
75%         1048.000
max     30466827.000
Name: total_artist_plays, dtype: float64

In [18]:
# Check the distribution of popular artists: total plays at the 90th-99th percentiles.
# (No trailing comma: it would wrap the resulting Series in a one-element tuple.)
artist_plays['total_artist_plays'].quantile(np.arange(.9, 1, .01))

(0.900     6138.000
 0.910     7410.000
 0.920     9102.960
 0.930    11475.590
 0.940    14898.440
 0.950    19964.250
 0.960    28419.880
 0.970    43541.330
 0.980    79403.440
 0.990   198482.590
 Name: total_artist_plays, dtype: float64,)

In [19]:
# Check the distribution of unknown artists: total plays at the 0th-9th percentiles.
# (No trailing comma: it would wrap the resulting Series in a one-element tuple.)
artist_plays['total_artist_plays'].quantile(np.arange(0, 0.10, .01))

(0.000    1.000
 0.010    2.000
 0.020    3.000
 0.030    4.000
 0.040    5.000
 0.050    7.000
 0.060    8.000
 0.070   10.000
 0.080   12.000
 0.090   13.000
 Name: total_artist_plays, dtype: float64,)

## 1a : Recommendation for less known Artists

In [43]:
# Find the less known artists.
# Assumption: roughly the top 3% of artists by total plays are "popular".
# NOTE(review): popularity_threshold is kept for reference but is not used by the filter below.
popularity_threshold = 40000
# Hypothesis: a "less known" artist has between 5k and 8k total plays (both bounds exclusive).
less_known_min_plays = 5000
less_known_max_plays = 8000
user_data_lessknown_artists = user_data_with_artist_plays.query(
    'total_artist_plays < @less_known_max_plays and total_artist_plays > @less_known_min_plays'
)
user_data_lessknown_artists.head()

Unnamed: 0,users,artist-name,plays,total_artist_plays
44,00000c289a1829a808ac09c00daf10bc3c4e223b,betty,135,7585
91,00001411dc427966b17297bf4d69e7e193135d89,boris with merzbow,636,7934
113,00004d2ac9316e22dc007ab2243d6fcb239e707d,lux interna,114,5261
134,00004d2ac9316e22dc007ab2243d6fcb239e707d,orion rigel dommisse,54,5305
227,00007a47085b9aab8af55f52ec8846ac479ac4fe,k-the-i???,78,6343


In [44]:
# (rows, columns) of the records that belong to less known artists
user_data_lessknown_artists.shape

(335219, 4)

In [45]:
# Add the profile columns (country) to each less-known-artist record via a left join on the user hash
less_known_reduced_data = pd.merge(
    user_data_lessknown_artists,
    user_profiles,
    on = 'users',
    how = 'left',
)
less_known_reduced_data.head()

Unnamed: 0,users,artist-name,plays,total_artist_plays,country
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty,135,7585,Germany
1,00001411dc427966b17297bf4d69e7e193135d89,boris with merzbow,636,7934,Canada
2,00004d2ac9316e22dc007ab2243d6fcb239e707d,lux interna,114,5261,Germany
3,00004d2ac9316e22dc007ab2243d6fcb239e707d,orion rigel dommisse,54,5305,Germany
4,00007a47085b9aab8af55f52ec8846ac479ac4fe,k-the-i???,78,6343,United States


In [46]:
# (rows, columns) after joining the profile data
less_known_reduced_data.shape

(335219, 5)

In [31]:
# Make sure the dataset is internally consistent: every user should have exactly one
# play count per artist. Repeated (user, artist) pairs are dropped and the effect is reported.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
if less_known_reduced_data.duplicated(['users', 'artist-name']).any():
    initial_rows = less_known_reduced_data.shape[0]

    print('Initial dataframe shape {0}'.format(less_known_reduced_data.shape))
    less_known_reduced_data = less_known_reduced_data.drop_duplicates(['users', 'artist-name'])
    current_rows = less_known_reduced_data.shape[0]
    print('New dataframe shape {0}'.format(less_known_reduced_data.shape))
    print('Removed {0} rows'.format(initial_rows - current_rows))



In [32]:
# Implementing the nearest-neighbour model - for less known artists.
# Reshape to one row per artist and one column per user, holding the play counts;
# artist/user combinations without a record are filled with 0.
less_known_artist_data = (
    less_known_reduced_data
    .pivot(index = 'artist-name', columns = 'users', values = 'plays')
    .fillna(0)
)
# Sparse (CSR) version of the same matrix, used to fit the model
less_known_artist_data_sparse = csr_matrix(less_known_artist_data.values)

In [33]:
# Fitting the model - for less known artists.
# Brute-force neighbour search with cosine distance on the sparse artist x user matrix.
# NearestNeighbors is imported with the other libraries in the first code cell.
model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(less_known_artist_data_sparse)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [34]:
# Making recommendations - pick a random less known artist and list its nearest neighbours,
# which are the artists that would be recommended to the user.
n_recommendations = 5
query_index = np.random.choice(less_known_artist_data.shape[0])
# .values passes a plain ndarray to kneighbors; calling reshape on the Series itself is deprecated.
query_vector = less_known_artist_data.iloc[query_index, :].values.reshape(1, -1)
# Request one extra neighbour because the query artist is part of the fitted data,
# but never more neighbours than there are fitted artists.
n_neighbors = min(n_recommendations + 1, less_known_artist_data.shape[0])
distances, indices = model_knn.kneighbors(query_vector, n_neighbors = n_neighbors)

# Exclude the query artist by its position instead of assuming it is returned first:
# when distances tie, a different artist can occupy the first slot.
recommendations = [
    (neighbour, distance)
    for neighbour, distance in zip(indices.flatten(), distances.flatten())
    if neighbour != query_index
][:n_recommendations]

print('Recommendations for {0}:\n'.format(less_known_artist_data.index[query_index]))
for rank, (neighbour, distance) in enumerate(recommendations, 1):
    print('{0}: {1}, with distance of {2}:'.format(rank, less_known_artist_data.index[neighbour], distance))

Recommendations for antibreak:

1: woob, with distance of 0.998524473318:
2: synkro, with distance of 0.998759802647:
3: robert rich & steve roach, with distance of 0.999400240928:
4: pjusk, with distance of 1.0:
5: piotr fronczewski, with distance of 1.0:


  app.launch_new_instance()


## 1b : Recommendation for Unknown Artists (New artist)

In [35]:
# Unknown artists: those whose total play count is below the threshold of 5
unknown_pop_threshold = 5
user_data_unknown_artists = user_data_with_artist_plays.loc[
    user_data_with_artist_plays['total_artist_plays'] < unknown_pop_threshold
]
user_data_unknown_artists.head()

Unnamed: 0,users,artist-name,plays,total_artist_plays
4401,000ef79c12638837d24c0a4965a0d6988cf6ae69,anthrax -,2,2
4792,001078f2f557a1afd9b9618144fc0b442481fd13,cliff richards/the young ones,3,3
7759,001a3fe32ff4928b7b5b4fa5b7711b8ff9d86786,axwell & sebastian ingrosso & steve angello & ...,2,2
7768,001a3fe32ff4928b7b5b4fa5b7711b8ff9d86786,markus schulz ft. andy moor,2,2
7771,001a3fe32ff4928b7b5b4fa5b7711b8ff9d86786,relocate : menno de jong,2,2


In [36]:
# There are still more than 10k records for artists whose total play count is below the threshold.
# NOTE(review): .shape counts rows (user/artist records), not distinct artists.
user_data_unknown_artists.shape

(10392, 4)

In [37]:
# Add the profile columns (country) to each unknown-artist record via a left join on the user hash
reduced_data = pd.merge(
    user_data_unknown_artists,
    user_profiles,
    on = 'users',
    how = 'left',
)
reduced_data.head()

Unnamed: 0,users,artist-name,plays,total_artist_plays,country
0,000ef79c12638837d24c0a4965a0d6988cf6ae69,anthrax -,2,2,Chile
1,001078f2f557a1afd9b9618144fc0b442481fd13,cliff richards/the young ones,3,3,Netherlands
2,001a3fe32ff4928b7b5b4fa5b7711b8ff9d86786,axwell & sebastian ingrosso & steve angello & ...,2,2,Germany
3,001a3fe32ff4928b7b5b4fa5b7711b8ff9d86786,markus schulz ft. andy moor,2,2,Germany
4,001a3fe32ff4928b7b5b4fa5b7711b8ff9d86786,relocate : menno de jong,2,2,Germany


In [38]:
# (rows, columns) after joining the profile data
reduced_data.shape

(10392, 5)

In [39]:
# Make sure the dataset is internally consistent: every user should have exactly one
# play count per artist. Repeated (user, artist) pairs are dropped and the effect is reported.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
if reduced_data.duplicated(['users', 'artist-name']).any():
    initial_rows = reduced_data.shape[0]

    print('Initial dataframe shape {0}'.format(reduced_data.shape))
    reduced_data = reduced_data.drop_duplicates(['users', 'artist-name'])
    current_rows = reduced_data.shape[0]
    print('New dataframe shape {0}'.format(reduced_data.shape))
    print('Removed {0} rows'.format(initial_rows - current_rows))

# There are no duplicate entries in the data (nothing is printed when none are found)

In [40]:
# Implementing the nearest-neighbour model - for unknown artists.
# Reshape to one row per artist and one column per user, holding the play counts;
# artist/user combinations without a record are filled with 0.
wide_artist_data = (
    reduced_data
    .pivot(index = 'artist-name', columns = 'users', values = 'plays')
    .fillna(0)
)
# Sparse (CSR) version of the same matrix, used to fit the model
wide_artist_data_sparse = csr_matrix(wide_artist_data.values)


In [41]:
# Fitting the model - for unknown artists.
# Brute-force neighbour search with cosine distance on the sparse artist x user matrix.
# NearestNeighbors is imported with the other libraries in the first code cell.
# NOTE(review): this rebinds model_knn, replacing the model fitted on the less known artists.
model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(wide_artist_data_sparse)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [42]:
# Making recommendations - pick a random unknown artist and list its 5 nearest neighbours,
# which are the artists that would be recommended to the user.
n_recommendations = 5
query_index = np.random.choice(wide_artist_data.shape[0])
# .values passes a plain ndarray to kneighbors; calling reshape on the Series itself is deprecated.
query_vector = wide_artist_data.iloc[query_index, :].values.reshape(1, -1)
# Request one extra neighbour because the query artist is part of the fitted data,
# but never more neighbours than there are fitted artists.
n_neighbors = min(n_recommendations + 1, wide_artist_data.shape[0])
distances, indices = model_knn.kneighbors(query_vector, n_neighbors = n_neighbors)

# Exclude the query artist by its position instead of assuming it is returned first:
# when distances tie (e.g. several neighbours at distance 0.0), a different artist
# can occupy the first slot.
recommendations = [
    (neighbour, distance)
    for neighbour, distance in zip(indices.flatten(), distances.flatten())
    if neighbour != query_index
][:n_recommendations]

print('Recommendations for {0}:\n'.format(wide_artist_data.index[query_index]))
for rank, (neighbour, distance) in enumerate(recommendations, 1):
    print('{0}: {1}, with distance of {2}:'.format(rank, wide_artist_data.index[neighbour], distance))

Recommendations for jfk & the conspirators:

1: international beat, with distance of 0.0:
2: duke reids all stars, with distance of 0.0:
3: the pork hunts, with distance of 0.0:
4: madara celma, with distance of 1.0:
5: maddison gabriel, with distance of 1.0:


  app.launch_new_instance()
