### Set Up

Read data from dat files. 
Use the function read_table of pandas to do that.

In [301]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

#### data preprocess

read data from dat files

In [348]:
artists_pd = pd.read_table('hetrec2011-lastfm-2k/artists.dat')
tags_pd = pd.read_table('hetrec2011-lastfm-2k/tags.dat')
user_artists_pd = pd.read_table('hetrec2011-lastfm-2k/user_artists.dat')
user_friends_pd = pd.read_table('hetrec2011-lastfm-2k/user_friends.dat')
user_tag_pd = pd.read_table('hetrec2011-lastfm-2k/user_taggedartists-timestamps.dat')
user_tag_art_pd = pd.read_table('hetrec2011-lastfm-2k/user_taggedartists.dat')

### Artist Profile

In [99]:
# artists' tags 
artist_tag_count = user_tag_pd.groupby(['artistID'], as_index=True).tagID.nunique()
atc = pd.DataFrame(artist_tag_count)
atc = atc.reset_index()
atc.columns=['artistID', 'tagCount']

In [117]:
# artists' tagged user count
artist_user_count = user_tag_pd.groupby(['artistID'], as_index=True).userID.nunique()
auc = pd.DataFrame(artist_user_count)
auc = auc.reset_index()
auc.columns=['artistID', 'userCount']

In [118]:
# artist listened weight min
artist_weight_min = user_artists_pd.groupby(['artistID'], as_index=True).weight.min()
awm = pd.DataFrame(artist_weight_min)
awm = awm.reset_index()
awm.columns=['artistID', 'weight_min']

In [119]:
# artist listened weight max
artist_weight_max = user_artists_pd.groupby(['artistID'], as_index=True).weight.max()
awmax = pd.DataFrame(artist_weight_max)
awmax = awmax.reset_index()
awmax.columns=['artistID', 'weight_max']

In [121]:
# artist listened weight mean
artist_weight_mean = user_artists_pd.groupby(['artistID'], as_index=True).weight.mean()
awmea = pd.DataFrame(artist_weight_mean)
awmea = awmea.reset_index()
awmea.columns=['artistID', 'weight_mea']

In [122]:
# artist listened weight median
artist_weight_med = user_artists_pd.groupby(['artistID'], as_index=True).weight.median()
awmed = pd.DataFrame(artist_weight_med)
awmed = awmed.reset_index()
awmed.columns=['artistID', 'weight_med']

In [123]:
# artist listened weight sum
artist_weight_tot = user_artists_pd.groupby(['artistID'], as_index=True).weight.sum()
awtot = pd.DataFrame(artist_weight_tot)
awtot = awtot.reset_index()
awtot.columns=['artistID', 'weight_tot']

In [309]:
# join all the dataframes above together, to build artist features
mg = pd.merge(artists_pd, atc, how='left', on=None, left_on='id', right_on='artistID')

In [310]:
mg = pd.merge(mg, auc, how='left', on='artistID')

In [311]:
mg = pd.merge(mg, awm, how='left', on='artistID')
mg = pd.merge(mg, awmax, how='left', on='artistID')
mg = pd.merge(mg, awmea, how='left', on='artistID')
mg = pd.merge(mg, awmed, how='left', on='artistID')
mg = pd.merge(mg, awtot, how='left', on='artistID')

In [312]:
# drop duplicate columns
artist_profile = mg.drop(['url', 'name', 'pictureURL', 'artistID'], axis=1)

In [313]:
artist_profile = artist_profile.fillna(0)

In [154]:
artist_profile.columns

Index(['id', 'tagCount', 'userCount', 'weight_min', 'weight_max', 'weight_mea',
       'weight_med', 'weight_tot'],
      dtype='object')

In [156]:
artist_features = ['tagCount', 'userCount', 'weight_min', 'weight_max', 'weight_mea',
       'weight_med', 'weight_tot']

art_features = artist_profile[artist_features]

In [314]:
# normalize the data
min_max_scaler = MinMaxScaler()

artist_profile[artist_features] = min_max_scaler.fit_transform(artist_profile[artist_features])

In [342]:
artist_features = ['id', 'aTagCount', 'userCount', 'aWeight_min', 'aWeight_max', 'aWeight_mea',
       'aWeight_med', 'aWeight_tot']

In [344]:
artist_profile.columns = artist_features

### User profile

In [174]:
# the amount of artists user tagged
user_tag_art = user_tag_pd.groupby(['userID'], as_index=True).artistID.nunique()
uta = pd.DataFrame(user_tag_art)
uta = uta.reset_index()
uta.columns=['uid', 'tagedArtCount']

In [175]:
# the amount of tags user tagged
user_tags = user_tag_pd.groupby(['userID'], as_index=True).tagID.nunique()
ut = pd.DataFrame(user_tags)
ut = ut.reset_index()
ut.columns=['uid', 'tagCount']

In [176]:
# user listend artists' weight min
user_weight_min = user_artists_pd.groupby(['userID'], as_index=True).weight.min()
uwm = pd.DataFrame(user_weight_min)
uwm = uwm.reset_index()
uwm.columns = ['uid', 'weight_min']

In [177]:
# user listend artists' weight max
user_weight_max = user_artists_pd.groupby(['userID'], as_index=True).weight.max()
uwx = pd.DataFrame(user_weight_max)
uwx = uwx.reset_index()
uwx.columns = ['uid', 'weight_max']

In [178]:
# user listend artists' weight mean
user_weight_mean = user_artists_pd.groupby(['userID'], as_index=True).weight.mean()
uwn = pd.DataFrame(user_weight_mean)
uwn = uwn.reset_index()
uwn.columns = ['uid', 'weight_mean']

In [179]:
# user listend artists' weight median
user_weight_med = user_artists_pd.groupby(['userID'], as_index=True).weight.median()
uwd = pd.DataFrame(user_weight_med)
uwd = uwd.reset_index()
uwd.columns = ['uid', 'weight_med']

In [288]:
# user listend artists' weight sum
user_weight_tot = user_artists_pd.groupby(['userID'], as_index=True).weight.sum()
uwt = pd.DataFrame(user_weight_tot)
uwt = uwt.reset_index()
uwt.columns = ['uid', 'weight_tot']

In [181]:
# users' friends amount
user_friend_cnt = user_friends_pd.groupby(['userID'], as_index=True).friendID.nunique()
ufc = pd.DataFrame(user_friend_cnt)
ufc = ufc.reset_index()
ufc.columns = ['uid', 'friend_cnt']

In [304]:
# join dataframes above together to build user feature
ug = pd.merge(uta, ut, how='left', on='uid')
ug = pd.merge(ug, uwm, how='left', on='uid')
ug = pd.merge(ug, uwx, how='left', on='uid')
ug = pd.merge(ug, uwn, how='left', on='uid')
ug = pd.merge(ug, uwd, how='left', on='uid')
ug = pd.merge(ug, uwt, how='left', on='uid')
ug = pd.merge(ug, ufc, how='left', on='uid')

In [292]:
user_features = ['tagedArtCount', 'tagCount', 'weight_min', 'weight_max',
       'weight_mean', 'weight_med', 'weight_tot', 'friend_cnt']

In [298]:
# normalize
min_max_scaler = MinMaxScaler()
ug[user_features] = min_max_scaler.fit_transform(ug[user_features])

In [316]:
ug.head()
user_profile = ug

In [317]:
user_profile.head()

Unnamed: 0,uid,tagedArtCount,tagCount,weight_min,weight_max,weight_mean,weight_med,weight_tot,friend_cnt
0,2,0.007457,0.510204,0.038279,0.03936,0.098282,0.069581,0.351503,0.101695
1,3,0.006628,0.55102,0.001864,0.037355,0.011915,0.002768,0.042701,0.050847
2,4,0.046396,0.612245,0.005593,0.014125,0.015661,0.010531,0.056094,0.076271
3,5,0.027341,0.102041,0.003321,0.002504,0.007638,0.005841,0.027406,0.050847
4,6,0.002486,0.265306,0.00035,0.000113,0.00056,0.000466,0.0021,0.033898


### Friend Features

In [206]:
# join user friend data with user tagged artists amount
friend_profile = pd.merge(user_friends_pd, uta, how='left', on=None, left_on='friendID', right_on='uid')

In [207]:
# join user friend data with user tagged amount and listened weight
friend_profile = pd.merge(friend_profile, ut, how='left', on='uid')
friend_profile = pd.merge(friend_profile, uwt, how='left', on='uid')

In [411]:
# average
friend_avg = friend_profile.groupby(['userID'])['fTagArtCnt', 'fTagCnt', 'fWeightTot'].mean()
friend_avg = friend_avg.reset_index()
friend_avg.columns = ['userID', 'fTagArtCntAvg', 'fTagCntAvg', 'fWeightTotAvg']

In [412]:
friend_features = ['fTagArtCntAvg', 'fTagCntAvg', 'fWeightTotAvg']

In [420]:
# normalize
min_max_scaler = MinMaxScaler()
friend_avg[friend_features] = min_max_scaler.fit_transform(friend_avg[friend_features])
friend_avg.columns = ['uid', 'fTagArtCntAvg', 'fTagCntAvg', 'fWeightTotAvg']

In [421]:
friend_avg.head()

Unnamed: 0,uid,fTagArtCntAvg,fTagCntAvg,fWeightTotAvg
0,2,0.133261,0.711146,0.191289
1,3,0.075727,0.495627,0.053532
2,4,0.109731,0.47551,0.123648
3,5,0.077556,0.495627,0.141767
4,6,0.116261,0.530612,0.100913


### Cross Features

In [524]:
# calculate tagged amount of each pair of user and artist
uat = user_tag_pd.groupby(['userID', 'artistID']).tagID.nunique()
uat = uat.reset_index()
uat.columns = ['uid', 'artistID', 'tagCnt']

In [525]:
uaf = pd.merge(user_artists_pd, uat, how='left', on=None, left_on=['userID', 'artistID'], right_on=['uid', 'artistID'])

In [526]:
uaf = uaf.drop(['uid'], axis=1)
uaf.columns = ['uid', 'artistID', 'weight', 'tagCnt']
uaf = uaf.fillna(0)

In [330]:
cross_avg = uaf.groupby(['uid', 'artistID'])['weight', 'tagCnt'].mean()
cross_avg = cross_avg.reset_index()
cross_avg.columns = ['uid', 'artistID', 'crossWeight', 'crossTagCnt']

In [332]:
# normalize
cross_features = ['crossWeight', 'crossTagCnt']
min_max_scaler = MinMaxScaler()
cross_avg[cross_features] = min_max_scaler.fit_transform(cross_avg[cross_features])

In [333]:
cross_avg

Unnamed: 0,uid,artistID,crossWeight,crossTagCnt
0,2,51,0.039360,0.0
1,2,52,0.033142,0.1
2,2,53,0.032181,0.0
3,2,54,0.029201,0.0
4,2,55,0.025467,0.0
...,...,...,...,...
92829,2100,18726,0.000953,0.0
92830,2100,18727,0.000839,0.0
92831,2100,18728,0.000794,0.0
92832,2100,18729,0.000791,0.0


### Content Based model

Use DNN model to preidct the weight.
The bigger the weight is, the more interest of user

In [349]:
user_artists_pd

min_max_scaler = MinMaxScaler()

user_artists_pd[['weight']] = min_max_scaler.fit_transform(user_artists_pd[['weight']])

In [430]:
# join all the features generated above
df = pd.merge(user_artists_pd, cross_avg, how='left', on=None, left_on=['userID', 'artistID'], right_on=['uid', 'artistID']).drop(['crossWeight'], axis=1)

In [431]:
df = pd.merge(df, artist_profile, how='left', on=None, left_on='artistID', right_on='id').drop(['id'], axis=1)

In [432]:
df = pd.merge(df, user_profile, how='left', on='uid')#.drop(['uid'], axis=1)

In [435]:
df.columns

Index(['userID', 'artistID', 'weight', 'uid', 'crossTagCnt', 'aTagCount',
       'userCount', 'aWeight_min', 'aWeight_max', 'aWeight_mea', 'aWeight_med',
       'aWeight_tot', 'tagedArtCount', 'tagCount', 'weight_min', 'weight_max',
       'weight_mean', 'weight_med', 'weight_tot', 'friend_cnt'],
      dtype='object')

In [443]:
df = pd.merge(df, friend_avg, how='left', on='uid').drop(['uid'], axis=1)

In [459]:
df.head()

Unnamed: 0,userID,artistID,weight,crossTagCnt,aTagCount,userCount,aWeight_min,aWeight_max,aWeight_mea,aWeight_med,...,tagCount,weight_min,weight_max,weight_mean,weight_med,weight_tot,friend_cnt,fTagArtCntAvg,fTagCntAvg,fWeightTotAvg
0,2,51,0.03936,0.0,0.56231,0.42233,2.8e-05,0.29246,0.088991,0.009399,...,0.510204,0.038279,0.03936,0.098282,0.069581,0.351503,0.101695,0.133261,0.711146,0.191289
1,2,52,0.033142,0.1,0.082067,0.140777,0.00017,0.033145,0.023124,0.003539,...,0.510204,0.038279,0.03936,0.098282,0.069581,0.351503,0.101695,0.133261,0.711146,0.191289
2,2,53,0.032181,0.0,0.212766,0.291262,8.5e-05,0.032183,0.016695,0.008068,...,0.510204,0.038279,0.03936,0.098282,0.069581,0.351503,0.101695,0.133261,0.711146,0.191289
3,2,54,0.029201,0.0,0.106383,0.140777,5.7e-05,0.029203,0.02505,0.003652,...,0.510204,0.038279,0.03936,0.098282,0.069581,0.351503,0.101695,0.133261,0.711146,0.191289
4,2,55,0.025467,0.0,0.525836,0.5,2.8e-05,0.095064,0.042683,0.013971,...,0.510204,0.038279,0.03936,0.098282,0.069581,0.351503,0.101695,0.133261,0.711146,0.191289


In [460]:
df.columns

Index(['userID', 'artistID', 'weight', 'crossTagCnt', 'aTagCount', 'userCount',
       'aWeight_min', 'aWeight_max', 'aWeight_mea', 'aWeight_med',
       'aWeight_tot', 'tagedArtCount', 'tagCount', 'weight_min', 'weight_max',
       'weight_mean', 'weight_med', 'weight_tot', 'friend_cnt',
       'fTagArtCntAvg', 'fTagCntAvg', 'fWeightTotAvg'],
      dtype='object')

In [461]:
features = ['crossTagCnt', 'aTagCount', 'userCount',
       'aWeight_min', 'aWeight_max', 'aWeight_mea', 'aWeight_med',
       'aWeight_tot', 'tagedArtCount', 'tagCount', 'weight_min', 'weight_max',
       'weight_mean', 'weight_med', 'weight_tot', 'friend_cnt',
       'fTagArtCntAvg', 'fTagCntAvg', 'fWeightTotAvg']

In [449]:
df[features] = df[features].astype(np.float)

In [470]:
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.models import load_model

class DNN:
        
    # define of DNN model with keras
    def __init__(self, n):
        # three layers of neural networks
        self.model = Sequential()
        self.model.add(layers.Dense(32, input_dim=n, activation='relu'))
        self.model.add(layers.Dense(8, activation='relu'))
        self.model.add(layers.Dense(1))
        self.model.compile(optimizer=RMSprop(), loss='mse')
        
    def fit(self, x_train, y_train):
        self.model.fit(x_train, y_train, epochs=20, batch_size=32)
        
    def predict(self, x):
        return self.model.predict(np.mat(x))
        
    def savemodel(self, path='my_model.h5'):
        self.model.save(path)

In [471]:
dnn = DNN(19)

In [472]:
# fit the model
dnn.fit(df[features], df[['weight']])

Train on 92834 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


### Recall and rank

In [500]:
# recall all the artists, use the model to score all the recalls
def generate_recall(uid):
    query = artist_profile    
    query['uid'] = uid
    query = pd.merge(query, cross_avg, how='left', left_on=['uid', 'id'], right_on=['uid', 'artistID']).drop(['artistID'], axis=1).fillna(0)    
    query = pd.merge(query, user_profile, how='left', on='uid').fillna(0)    
    query = pd.merge(query, friend_avg, how='left', on='uid').fillna(0)    
    query['score'] = dnn.predict(query[features])
    return query[['uid', 'id', 'score']]

In [501]:
query = generate_query(2)

In [518]:
# rank by the score and select top 10
def recommend(uid):
    recall = generate_recall(uid)
    recall['rank'] = recall['score'].rank(ascending=False, method='max')
    return recall[recall['rank'] <= 10]

In [511]:
query['rank'] = query['score'].rank(ascending=False, method='max')

In [513]:
query[query['rank'] <= 10]

Unnamed: 0,uid,id,score,rank
66,2,72,0.032219,6.0
83,2,89,0.027399,9.0
283,2,289,0.032531,5.0
286,2,292,0.029212,7.0
783,2,792,0.03468,4.0
2029,2,2044,0.026675,10.0
6241,2,6373,0.054361,2.0
8115,2,8308,0.038575,3.0
8195,2,8388,0.059299,1.0
14282,2,14986,0.028017,8.0


In [519]:
recommend(2)

Unnamed: 0,uid,id,score,rank
66,2,72,0.032219,6.0
83,2,89,0.027399,9.0
283,2,289,0.032531,5.0
286,2,292,0.029212,7.0
783,2,792,0.03468,4.0
2029,2,2044,0.026675,10.0
6241,2,6373,0.054361,2.0
8115,2,8308,0.038575,3.0
8195,2,8388,0.059299,1.0
14282,2,14986,0.028017,8.0


### Collaborative Filtering

In [547]:
# CF model only use userID, artistID and weight
uat = user_tag_pd.groupby(['userID', 'artistID']).tagID.nunique()
uat = uat.reset_index()
uat.columns = ['uid', 'artistID', 'tagCnt']

In [549]:
cross_avg

Unnamed: 0,uid,artistID,crossWeight,crossTagCnt
0,2,51,0.039360,0.0
1,2,52,0.033142,5.0
2,2,53,0.032181,0.0
3,2,54,0.029201,0.0
4,2,55,0.025467,0.0
...,...,...,...,...
92829,2100,18726,0.000953,0.0
92830,2100,18727,0.000839,0.0
92831,2100,18728,0.000794,0.0
92832,2100,18729,0.000791,0.0


In [604]:
# use library surprise to solve it
from surprise import SVD
from surprise import Dataset, Reader
from surprise import KNNBasic
from surprise.model_selection import cross_validate, train_test_split

In [585]:
# build suprise's dataframe
reader = Reader(rating_scale=(0, 1), line_format='user item rating timestamp')
data = Dataset.load_from_df(cross_avg[['uid', 'artistID', 'crossWeight']], reader)

In [588]:
# split data and train the model
trainset, testset = train_test_split(data, test_size=0.05) 
model = SVD(n_factors=30)
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1c529bb810>

In [599]:
model.predict(2,14)

Prediction(uid=2, iid=14, r_ui=None, est=0.006759658077754768, details={'was_impossible': False})

In [617]:
# use the model KNN to get the nearest neighbours which means the recommend items
item_algo = KNNBasic(k=40,min_k=3,sim_options={'user_based': False}) # sim_options={'name': 'cosine','user_based': True} cosine/msd/pearson/pearson_baseline
item_algo.fit(trainset)

def getSimilarArtist(top_k, artistId):
    item_inner_id = item_algo.trainset.to_inner_iid(artistId)
    item_neighbors = item_algo.get_neighbors(item_inner_id, k=top_k)
    f_item_neighbors = (item_algo.trainset.to_raw_iid(inner_id)
                       for inner_id in item_neighbors)
    return f_item_neighbors

Computing the msd similarity matrix...
Done computing similarity matrix.


In [624]:
def recommend_cf(artistID):
    artists = getSimilarArtist(10, artistID)
    ids = []
    for i in artists:
        ids.append(i)
    return ids

In [625]:
recommend_cf(51)

[4999, 2463, 8312, 1807, 5988, 1696, 8313, 10628, 12228, 7577]