# Last FM 360k Factorization Machine Implementation
This notebook implements a factorization machine using the tffm module. Technically, we are performing regression to predict the number of times a given user has played a given artist. More information on Factorization Machines can be found in this paper: https://www.csie.ntu.edu.tw/~b97053/paper/Factorization%20Machines%20with%20libFM.pdf.

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

## Load Data

In [2]:
# Play counts per user and artist. Column names are supplied here, so the
# first line of the file is read as data rather than as a header.
plays_df = pd.read_csv(
    "lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv",
    sep="\t",
    names=["userId", "artistId", "artistName", "plays"],
)
# User profile attributes, keyed by the same userId column.
profile_df = pd.read_csv(
    "lastfm-dataset-360K/usersha1-profile.tsv",
    sep="\t",
    names=["userId", "gender", "age", "country", "signup"],
)

In [3]:
# Number of rows to sample from the joined data. Rows containing NaN are
# dropped after sampling, so the resulting frame can be smaller than this.
num_rows = 50000
full_df = (
    plays_df
    .join(profile_df.set_index("userId"), on="userId", how="inner")
    .sample(n=num_rows, random_state=1234, axis=0)
    .dropna()
)

In [4]:
# Nested dict of the form {"artistId": {artistName: artistId}}, built from the
# distinct (artistId, artistName) pairs present in the sample.
artist_id_map = (
    full_df[["artistId", "artistName"]]
    .drop_duplicates()
    .set_index("artistName")
    .to_dict()
)
# The id column is removed from the working frame once the lookup exists.
full_df.drop(columns=["artistId"], inplace=True)

## Transform the dataset to have one-hot encodings for categorical variables

In [5]:
import time
from datetime import datetime
from dateutil import parser

# Keep only fully populated rows (no NaN in any column) so that incomplete
# records do not bias the training data.
full_df = full_df.dropna(axis=0, how="any")

In [6]:
# Preview the first rows rather than rendering the whole frame.
full_df.head(10)

Unnamed: 0,userId,artistName,plays,gender,age,country,signup
16663648,f3403c9b66217d8b151f59c4ebaf947ef3f3995d,beneath the massacre,56,m,19.0,Germany,"May 25, 2008"
11849472,ad548968b15f01262cec0dbbd201ce3fd1a6cbcf,perry como,50,m,64.0,France,"Jan 31, 2009"
16562655,f1cd58b34d2b83600c3d88d8bae76cc41cd7391c,acid mothers temple & the melting paraiso u.f.o.,71,m,23.0,Chile,"Jun 9, 2008"
4331011,3f4114b1cfb16c3bbd58dd2e5382338a0713c6dd,Мара,30,m,34.0,Canada,"Sep 22, 2006"
7498320,6d8a6c8b05c5519f8ab9f9208ab7660920a4f03a,sigur rós,46,m,29.0,Colombia,"Mar 21, 2008"
5503025,5087ff3998fbde168ed40184359be70f64da7987,moneybrother,5357,m,100.0,Sweden,"Dec 1, 2005"
8161549,77550e93fc6e083d750b82dfe4077399e43f0118,the rolling stones,91,m,20.0,France,"Nov 12, 2005"
669262,09caa2f8151a3a03abd005c4263b85efb1c02617,arch enemy,104,m,29.0,United States,"Oct 2, 2005"
5055488,49f80727278d331f56a0d29d5f7b570a7dce5ca7,dopplereffekt,289,m,24.0,France,"Dec 24, 2006"
12141996,b18ac11dc4279778bd2446ce2fd20d8061fd176d,delain,40,m,22.0,Germany,"Feb 7, 2008"


In [7]:
# Artist names ordered from most to least frequent in the sample;
# top_artists keeps the first n of them.
n = 1000
artists = list(full_df["artistName"].value_counts().index)
top_artists = artists[:n]

In [8]:
# Sorted list of the distinct country names that occur in the sample
countries = sorted(full_df["country"].dropna().unique())

In [9]:
# Replace each signup date string with a numeric timestamp.
# NOTE(review): the date strings carry no timezone, so .timestamp() presumably
# depends on the machine's local timezone — confirm if reproducibility matters.
full_df["signup"] = full_df["signup"].map(
    lambda signup_text: parser.parse(signup_text).timestamp()
)

In [10]:
# Number of distinct artist names in the sample
len(artists)

12177

In [11]:
# Regression target: the play count, kept as a single-column frame
truth_df = full_df[["plays"]]

In [12]:
# One-hot encode the categorical columns using sparse storage, then remove the
# target column so that only features remain (age and signup come first).
full_df = pd.get_dummies(full_df, sparse=True).drop("plays", axis=1)
# Preview only: the encoded frame is extremely wide.
full_df.head(10)

Unnamed: 0,age,signup,userId_0000ef373bbd0d89ce796abae961f2705e8c1faf,userId_0000f687d4fe9c1ed49620fbc5ed5b0d7798ea20,userId_000163263d2a41a3966a3746855b8b75b7d7aa83,userId_0002dd2154072434d26e5409faa591bfb260a01e,userId_000912716c36131c4d8591da475c93337e7196a7,userId_000a87954445f6c1a5b2a8884b9f6e92c095dd96,userId_000a9545ab230575fa05c30a556a4ee45d8cc0ba,userId_000af18e0a8c33fe3ad809c3d10cfbae84d1bdd9,...,country_Uzbekistan,country_Vanuatu,country_Venezuela,country_Viet Nam,"country_Virgin Islands, British","country_Virgin Islands, U.s.",country_Western Sahara,country_Yemen,country_Zambia,country_Zimbabwe
16663648,19.0,1.211688e+09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11849472,64.0,1.233378e+09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16562655,23.0,1.212984e+09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4331011,34.0,1.158898e+09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7498320,29.0,1.206072e+09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5503025,100.0,1.133413e+09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8161549,20.0,1.131772e+09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
669262,29.0,1.128226e+09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5055488,24.0,1.166936e+09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12141996,22.0,1.202360e+09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
from sklearn.preprocessing import normalize, scale

# Array views of the features and of the target
full_matrix = full_df.values
truth_matrix = truth_df.values

# Mean-centre age (column 0), the signup timestamp (column 1) and the plays
for numeric_col in (0, 1):
    full_matrix[:, numeric_col] = (
        full_matrix[:, numeric_col] - full_matrix[:, numeric_col].mean()
    )
truth_matrix = truth_matrix - truth_matrix.mean()

## Apply the Factorization Machine model to our data set

In [14]:
from sklearn.model_selection import train_test_split
from scipy import sparse

# Features and target handed to the model; the target is squeezed to 1-D
X = full_matrix
y = truth_matrix.squeeze()

# Hold out 20% of the rows for testing; features are converted to CSR first
X_tr, X_te, y_tr, y_te = train_test_split(
    sparse.csr_matrix(X), y, test_size=0.2, random_state=0
)
print(*(part.shape for part in (X_tr, y_tr, X_te, y_te)))

(29360, 46631) (29360,) (7341, 46631) (7341,)


In [20]:
# Write the held-out targets to disk (the test feature matrix is not written)
np.savetxt(fname="test_y.csv", X=y_te, delimiter=",")

This part requires tensorflow and the tffm module. You can find more information about it here: https://github.com/geffy/tffm.

In [15]:
import tensorflow as tf
from tffm import TFFMRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Step-by-step training: fit one epoch at a time and record the test MSE after
# every epoch, which yields one learning curve per learning rate.

learning_rates = [0.1, 0.01, 0.001]
epochs = [100]
# Number of single-epoch training steps run for each learning rate
n_steps = max(epochs)

# errors[i, e] holds the test MSE for learning_rates[i] after e + 1 epochs
errors = np.zeros((len(learning_rates), n_steps))
for i, lr in enumerate(learning_rates):
    # The model is built once per learning rate. Re-creating it inside the
    # epoch loop would restart from fresh weights on every step, so each entry
    # would measure a one-epoch model rather than a point on a learning curve.
    model = TFFMRegressor(
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        input_type='sparse',
        log_dir=f"./logs/run{str(lr).replace('.','_')}_{n_steps}"
    )
    try:
        for e in range(n_steps):
            # Each call continues training the same model for one more epoch
            model.fit(X_tr, y_tr, n_epochs=1, show_progress=False)
            predictions_te = model.predict(X_te)
            errors[i, e] = mean_squared_error(y_te, predictions_te)
    finally:
        # Release the TensorFlow session before the next model is built
        model.destroy()

param = np.unravel_index(np.argmin(errors), errors.shape)
# param[1] is a zero-based epoch index into the curve, not an index into
# `epochs`, so the epoch count is reported as param[1] + 1.
print(f"Learning Rate of {learning_rates[param[0]]} and epochs of {param[1] + 1} had the lowest test error")

In [16]:
# Grid search: train a fresh model for every (learning rate, epoch budget)
# pair and store its test MSE in errors[i, j].

learning_rates = [0.1, 0.01, 0.001]
epochs = [5000, 10000]
errors = np.zeros((len(learning_rates), len(epochs)))

# np.ndindex walks the grid row by row: every epoch budget for the first
# learning rate, then every epoch budget for the next one, and so on.
for i, j in np.ndindex(*errors.shape):
    lr = learning_rates[i]
    epoch = epochs[j]

    # Fresh factorization machine for this configuration
    model = TFFMRegressor(
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        n_epochs=epoch,
        input_type='sparse',
        log_dir=f"./logs/run{str(lr).replace('.','_')}_{epoch}"
    )
    model.fit(X_tr, y_tr, show_progress=False)

    # Evaluate on both splits; only the test MSE is stored for model selection
    predictions_tr = model.predict(X_tr)
    predictions_te = model.predict(X_te)
    errors[i,j] = mean_squared_error(y_te, predictions_te)

    print(f"-------Learning Rate:{lr}, Num Epochs: {epoch} ----------")
    print(f"RMSE Train Set: {np.sqrt(mean_squared_error(y_tr, predictions_tr))}")
    print(f"RMSE Test Set: {np.sqrt(errors[i, j])}")

# Position of the smallest test MSE in the grid
param = np.unravel_index(np.argmin(errors), errors.shape)
print(f"Learning Rate of {learning_rates[param[0]]} and epochs of {epochs[param[1]]} had the lowest test error")

-------Learning Rate:0.1, Num Epochs: 5000 ----------
RMSE Train Set: 539.6707977304247
RMSE Test Set: 686.4878770401845
-------Learning Rate:0.1, Num Epochs: 10000 ----------
RMSE Train Set: 76784.54174501185
RMSE Test Set: 58471.489969269525
-------Learning Rate:0.01, Num Epochs: 5000 ----------
RMSE Train Set: 499.9191518443694
RMSE Test Set: 660.3125547586766
-------Learning Rate:0.01, Num Epochs: 10000 ----------
RMSE Train Set: 772549.6030572555
RMSE Test Set: 550021.988540099
-------Learning Rate:0.001, Num Epochs: 5000 ----------
RMSE Train Set: 500.53508704466753
RMSE Test Set: 1634.6442283170156
-------Learning Rate:0.001, Num Epochs: 10000 ----------
RMSE Train Set: 323.1124090160784
RMSE Test Set: 711.9408036043592
Learning Rate of 0.01 and epochs of 5000 had the lowest test error


In [None]:
# Show the matrix of test errors recorded during the hyperparameter sweep
errors

In [21]:
# Final model: Adam with learning rate 0.001, trained for 10000 epochs on
# sparse input.
model = TFFMRegressor(
    input_type='sparse',
    n_epochs=10000,
    optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
)

# Fit on the training split, showing a progress bar
model.fit(X_tr, y_tr, show_progress=True)

100%|██████████| 10000/10000 [03:58<00:00, 41.89epoch/s]


In [41]:
# RMSE of the fitted model: first the training split, then the test split
for features, targets in ((X_tr, y_tr), (X_te, y_te)):
    print(np.sqrt(mean_squared_error(model.predict(features), targets)))

2333.0773789958594
3107.233314407601


Given a user, we want to find their top artists predicted by the model

In [None]:
# Lookup table from feature-column name to its position in the feature matrix
column_map = dict(zip(full_df.columns, range(len(full_df.columns))))

In [None]:
def generate_feature_matrix(info, artist_names, columns=None):
    """Build one candidate feature row per artist for a single user profile.

    Every row shares the user's age, signup value, country and gender; row i
    additionally has the one-hot column of ``artist_names[i]`` set to 1. No
    ``userId_*`` column is set.

    The matrix is assembled directly in sparse form, so no dense
    ``len(artist_names) x len(columns)`` array is ever allocated.

    Parameters
    ----------
    info : mapping
        Must provide the keys "age", "signup", "country" and "gender".
    artist_names : sequence of str
        Artist names to generate candidate rows for.
    columns : mapping of str to int, optional
        Feature-column name -> column index. Defaults to the notebook-level
        ``column_map``.

    Returns
    -------
    scipy.sparse.csr_matrix
        Float matrix of shape ``(len(artist_names), len(columns))``.

    Raises
    ------
    KeyError
        If the country, gender or an artist name has no matching column.

    Notes
    -----
    NOTE(review): the training cell mean-centres age and signup before fitting;
    presumably ``info`` should carry values centred the same way — confirm.
    """
    if columns is None:
        columns = column_map

    n_rows = len(artist_names)
    n_cols = len(columns)
    if n_rows == 0:
        return sparse.csr_matrix((0, n_cols), dtype=np.float64)

    # Columns whose value is identical in every row (user-level features).
    # Age and signup are looked up by name instead of assuming positions 0/1.
    shared_cols = np.array(
        [
            columns["age"],
            columns["signup"],
            columns[f"country_{info['country']}"],
            columns[f"gender_{info['gender']}"],
        ],
        dtype=np.intp,
    )
    shared_vals = np.array(
        [info["age"], info["signup"], 1.0, 1.0], dtype=np.float64
    )

    # One artist one-hot column per row
    artist_cols = np.array(
        [columns[f"artistName_{name}"] for name in artist_names], dtype=np.intp
    )

    # Coordinate triplets: each shared column repeated for all rows, followed
    # by the per-row artist entries.
    row_ids = np.arange(n_rows, dtype=np.intp)
    rows = np.concatenate([np.tile(row_ids, shared_cols.size), row_ids])
    cols = np.concatenate([np.repeat(shared_cols, n_rows), artist_cols])
    vals = np.concatenate([np.repeat(shared_vals, n_rows), np.ones(n_rows)])

    X = sparse.csr_matrix(
        (vals, (rows, cols)), shape=(n_rows, n_cols), dtype=np.float64
    )
    # Zero-valued features (e.g. signup == 0) are not stored explicitly,
    # matching what a conversion from a dense array would produce.
    X.eliminate_zeros()
    return X

In [None]:
# info = {"country":"United States", "age":24, "gender":"m","signup":0}
# X = generate_feature_matrix(info, top_artists)

# predictions = model.predict(X)

In [None]:
#predicted_artists = list(map(lambda artist: top_artists[artist], np.argsort(predictions)[::-1]))

In [None]:
import pickle

In [None]:
# Save the model
# Location handed to save_state; presumably a directory-style prefix for the
# files tffm writes — confirm against the tffm documentation.
model_path = "tffm_model_test/"
# Persist the current state of `model` so it can be restored with load_state
model.save_state(model_path)

In [None]:
# Save the artists
# The context manager closes the file even if pickling raises.
with open("artists.pickle", "wb") as pickle_out:
    pickle.dump(artists, pickle_out)

In [None]:
# Save the top artists
# The context manager closes the file even if pickling raises.
with open("top_artists.pickle", "wb") as pickle_out:
    pickle.dump(top_artists, pickle_out)

In [None]:
# Save the column mapping
# The context manager closes the file even if pickling raises.
with open("column_map.pickle", "wb") as pickle_out:
    pickle.dump(column_map, pickle_out)

In [None]:
# Save the countries
# The context manager closes the file even if pickling raises.
with open("countries.pickle", "wb") as pickle_out:
    pickle.dump(countries, pickle_out)

In [None]:
# #pickle.load(open("top_artists.pickle","rb"))
# model = TFFMRegressor(
#     optimizer=tf.train.AdamOptimizer(learning_rate=0.01),
#     n_epochs=1000,
#     input_type='sparse'
# )

# model.core.set_num_features(X.shape[1])
# model.load_state(model_path)
# model.predict(X)

## Accuracy
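For every row (a user–artist pair) in the testing set, we predict the mean-centered number of plays. We say that a user plays an artist if the predicted number of plays is greater than or equal to 0, i.e. at or above the average play count. If the predicted number of plays has the same sign as the centered true number of plays (with 0 counted as non-negative), we say that the prediction matches the truth. The accuracy reported below is the fraction of rows whose prediction matches.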

In [42]:
# Sign accuracy on the test split: a prediction matches when it falls on the
# same side of 0 as the centered target (0 counts as non-negative).
# The whole split is predicted in a single call instead of one call per row.
predictions_te = np.ravel(model.predict(X_te))
matches = (predictions_te >= 0) == (y_te >= 0)
score = int(np.sum(matches))
print(score / X_te.shape[0])


0.4142487399536848


In [43]:
# Sign accuracy on the training split: a prediction matches when it falls on
# the same side of 0 as the centered target (0 counts as non-negative).
# The whole split is predicted in a single call instead of one call per row.
predictions_tr = np.ravel(model.predict(X_tr))
matches = (predictions_tr >= 0) == (y_tr >= 0)
score = int(np.sum(matches))
print(score / X_tr.shape[0])


0.5617847411444141
