# Last FM 360k Factorization Machine Implementation
This notebook implements a factorization machine using the tffm module. Technically, we are performing regression to predict the number of times a certain user has played a certain artist. More information of Factorization Machines can be found in this paper https://www.csie.ntu.edu.tw/~b97053/paper/Factorization%20Machines%20with%20libFM.pdf. 

In [139]:
from pathlib import Path
import pandas as pd
import numpy as np

## Load Data

In [140]:
plays_df = pd.read_csv("lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv", sep="\t", names = ["userId", "artistId", "artistName", "plays"])
profile_df = pd.read_csv("lastfm-dataset-360K/usersha1-profile.tsv", sep="\t", names = ["userId", "gender", "age", "country", "signup"])

In [141]:
# How large of a subset of the data do we want to use? 
num_rows = 100
full_df = plays_df.join(profile_df.set_index("userId"), on="userId", how="inner").sample(n=num_rows, axis=0)

In [142]:
# Create dictionary mapping artist names to ids
artist_id_map = full_df.drop(["userId", "plays", "gender", "age", "country", "signup"], axis=1).drop_duplicates().set_index("artistName").to_dict()
full_df.drop(["artistId"],axis=1, inplace=True)

## Transform the dataset to have one-hot encodings for categorical variables

In [143]:
import time
from datetime import datetime
from dateutil import parser

# Let's drop the rows that have missing values. I'm worried that they are biasing our training data
full_df = full_df.dropna()

# Get the top artists
n = 100
top_artists = full_df["artistName"].value_counts().index.tolist()[:n]

# Give the signup times a numeric value for each user
full_df["signup"] = full_df["signup"].map(lambda time: parser.parse(time).timestamp())

In [144]:
# Construct the ground truth ratings
truth_df = full_df.drop(["userId", "artistName", "gender", "age", "country", "signup"], axis=1)
full_df = pd.get_dummies(full_df).drop("plays", axis=1)
full_df

Unnamed: 0,age,signup,userId_07d0cc0f9ef824cd0bb9e11ca9d22fb647c064ef,userId_07e4b7f00b5c798856ae227c303b665bfd33715e,userId_0aa60d3ec8c1bd74fb93d10ae82d5f498b69f475,userId_0c77e115d8953108ed061b4d0ae552eafdff9da6,userId_111936070c6701caeadce278e03fca277b2fc95f,userId_12208bb4eade16898f2bdf745e399420d4e02909,userId_176b94c53aa44d475a83c07a439c4a49d8e613af,userId_200aa5c9aa1038a6dfa3ad319f1cba33377033cc,...,country_Norway,country_Poland,country_Portugal,country_Russian Federation,country_Spain,country_Sweden,country_Switzerland,country_Thailand,country_United Kingdom,country_United States
14227007,39.0,1.199250e+09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12846629,26.0,1.225598e+09,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
7006279,23.0,1.191816e+09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4062494,19.0,1.192766e+09,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
9040182,28.0,1.185768e+09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3991490,20.0,1.163653e+09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10293156,28.0,1.203916e+09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16911420,25.0,1.201324e+09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8519595,28.0,1.132204e+09,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
8668371,26.0,1.203311e+09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [145]:
from sklearn.preprocessing import normalize, scale

full_matrix = full_df.as_matrix()
truth_matrix = truth_df.as_matrix()

# Center the plays, signup timestamp, and age
full_matrix[:,0] = full_matrix[:,0] - np.mean(full_matrix[:,0])
full_matrix[:,1] = full_matrix[:,1] - np.mean(full_matrix[:,1])
truth_matrix = truth_matrix - np.mean(truth_matrix)

## Apply the Factorization Machine model to our data set

In [146]:
from sklearn.model_selection import train_test_split
from scipy import sparse

# Handle missing values for some numeric features such as "age"
X = np.nan_to_num(full_matrix)
y = np.squeeze(truth_matrix)

X_tr, X_te, y_tr, y_te = train_test_split(sparse.csr_matrix(X), y, test_size=0.2)
print(X_tr.shape, y_tr.shape, X_te.shape, y_te.shape)

(57, 180) (57,) (15, 180) (15,)


This part requires tensorflow and the tffm module. You can find more information about it here: https://github.com/geffy/tffm.

In [147]:
# import tensorflow as tf
# from tffm import TFFMRegressor
# from sklearn.metrics import mean_squared_error

# learning_rates = [0.1, 0.01, 0.001]
# epochs = [100, 500, 1000, 5000]
# errors = np.zeros((len(learning_rates),len(epochs)))

# for i,lr in enumerate(learning_rates):
#     for j, epoch in enumerate(epochs):
#         # Create the factorization machine model
#         model = TFFMRegressor(
#             optimizer=tf.train.AdamOptimizer(learning_rate=lr),
#             n_epochs=epoch,
#             input_type='sparse'
#         )

#         # Compute the mean squared error for test set
#         model.fit(X_tr, y_tr, show_progress=True)

#         predictions_tr = model.predict(X_tr)
#         predictions_te = model.predict(X_te)
        
#         errors[i,j] = mean_squared_error(y_te, predictions_te)
#         print(f"-------Learning Rate:{lr}, Num Epochs: {epoch} ----------")
#         print(f"MSE Train Set: {mean_squared_error(y_tr, predictions_tr)}")
#         print(f"MSE Test Set: {errors[i, j]}")

# param = np.unravel_index(np.argmin(errors), errors.shape)
# print(f"Learning Rate of {learning_rates[param[0]]} and epochs of {epochs[param[1]]} had the lowest test error")

In [148]:
# Create the factorization machine model
model = TFFMRegressor(
    optimizer=tf.train.AdamOptimizer(learning_rate=0.01),
    n_epochs=1000,
    input_type='sparse'
)

# Compute the mean squared error for test set
model.fit(X_tr, y_tr, show_progress=True)

100%|█████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:01<00:00, 975.67epoch/s]


Given a user, we want to find their top artists predicted by the model

In [149]:
# We need to quick accesss to indices
column_map = {col:i for i,col in enumerate(list(full_df.columns.values))}

In [150]:
def generate_feature_matrix(info, artist_names):
    # We create a matrix of feature vectors for each potential artist
    X = np.zeros((len(artist_names), len(column_map)))
    
    # Feature matrix will have the same values for the user information fields
    X[:, 0] = info["age"]
    X[:, 1] = info["signup"]
    X[:, column_map[f"country_{info['country']}"]] = 1
    X[:, column_map[f"gender_{info['gender']}"]] = 1

    # Set the proper one-hot vector for artist
    for i, name in enumerate(artist_names):
        X[i, column_map[f"artistName_{name}"]] = 1
    
    return sparse.csr_matrix(X)

In [152]:
info = {"country":"United States", "age":24, "gender":"m","signup":0}
X = generate_feature_matrix(info, top_artists)

predictions = model.predict(X)

In [154]:
predicted_artists = list(map(lambda artist: top_artists[artist], np.argsort(predictions)[::-1]))

['bob dylan',
 't.love',
 'metallica',
 'the classic crime',
 'sleepy sleepers',
 'デスゲイズ',
 "bonnie 'prince' billy",
 'tool',
 'die Ärzte',
 'ゆーきゃん',
 'regina spektor',
 'silverstein',
 'afterlife',
 'marduk',
 'david bowie',
 'die apokalyptischen reiter',
 'the dears',
 'tiger lou',
 'cinderella',
 'saint etienne',
 'scritti politti',
 'the meligrove band',
 'rank 1',
 'the beatles',
 'béla fleck and the flecktones',
 'the tough alliance',
 'editors',
 'the magnetic fields',
 'parkway drive',
 'the whip',
 'the ronettes',
 'j dilla',
 'the meteors',
 'knorkator',
 'tego calderon',
 'm83',
 'tune up!',
 'the aloof',
 'wolfgang amadeus mozart',
 'the morning after girls',
 '36 crazyfists',
 'radiohead',
 'this bike is a pipe bomb',
 'korpiklaani',
 'tunng',
 'james blunt',
 'scarlett johansson',
 'red hot chili peppers',
 'leaves eyes',
 'kanye west',
 'marcy playground',
 'julia kent',
 'new order',
 'dominici',
 'nina simone',
 'gustavo',
 'oasis',
 "mo' horizons",
 'coldplay',
 'rita

In [173]:
predictions

array([-2.051362 , -1.9643673, -2.1460612, -1.9855022, -1.3177363,
       -1.9083887, -1.9781636, -2.0365388, -2.0194073, -4.3177958,
       -2.2020783, -1.5872456, -1.9256066, -2.0563884, -1.9867989,
       -1.9778075, -2.0070379, -1.9659343, -2.1220045, -2.003913 ,
       -1.9735891, -1.9818339, -1.8495972, -1.9667917, -1.9680454,
       -2.0284297, -2.0948052, -1.4359912, -2.0607831, -2.0935607,
       -2.0149474, -1.5241145, -2.0431406, -1.9675286, -1.9669158,
       -1.9642868, -1.9901863, -2.1707957, -2.1517773, -2.0446584,
       -1.592579 , -1.9307729, -2.0292017, -1.9565264, -0.669053 ,
       -1.9966785, -1.968402 , -2.013943 , -2.0463593, -1.9427475,
       -2.004792 , -2.0101986, -1.8819706, -2.1608558, -1.8759468,
       -2.114041 , -2.4428124, -1.9231544, -1.9317939, -2.0079765,
       -2.0458958, -1.9275681, -2.1668851, -1.872527 , -1.9833043,
       -1.9508846, -1.7406981, -2.022001 , -2.0597098, -1.9711133,
       -1.7069601, -1.9740199], dtype=float32)

In [158]:
import pickle

# Save the model
model_path = "tff_model/"
model.save_state(model_path)

# Save the artists
pickle_out = open("top_artists.pickle","wb")
pickle.dump(top_artists, pickle_out)
pickle_out.close()

In [172]:
#pickle.load(open("top_artists.pickle","rb"))

model.load_state(model_path)
model.predict(X)

INFO:tensorflow:Restoring parameters from tff_model/


array([-2.051362 , -1.9643673, -2.1460612, -1.9855022, -1.3177363,
       -1.9083887, -1.9781636, -2.0365388, -2.0194073, -4.3177958,
       -2.2020783, -1.5872456, -1.9256066, -2.0563884, -1.9867989,
       -1.9778075, -2.0070379, -1.9659343, -2.1220045, -2.003913 ,
       -1.9735891, -1.9818339, -1.8495972, -1.9667917, -1.9680454,
       -2.0284297, -2.0948052, -1.4359912, -2.0607831, -2.0935607,
       -2.0149474, -1.5241145, -2.0431406, -1.9675286, -1.9669158,
       -1.9642868, -1.9901863, -2.1707957, -2.1517773, -2.0446584,
       -1.592579 , -1.9307729, -2.0292017, -1.9565264, -0.669053 ,
       -1.9966785, -1.968402 , -2.013943 , -2.0463593, -1.9427475,
       -2.004792 , -2.0101986, -1.8819706, -2.1608558, -1.8759468,
       -2.114041 , -2.4428124, -1.9231544, -1.9317939, -2.0079765,
       -2.0458958, -1.9275681, -2.1668851, -1.872527 , -1.9833043,
       -1.9508846, -1.7406981, -2.022001 , -2.0597098, -1.9711133,
       -1.7069601, -1.9740199], dtype=float32)