In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import time
from tqdm import tqdm
from sklearn.feature_extraction import DictVectorizer

import os
import sys
sys.path.append(os.path.expanduser(os.environ['RECSYS_IM_HOME']))

from data_loader import load_movielens
from data_profiler import summary_x

# Random seed passed to the models constructed in the benchmark cells below.
SEED = 42
# Relative path of the directory that is listed on the next line.
# NOTE(review): presumably the MovieLens-100k split files live here — confirm
# that `load_movielens` resolves its file names against the same directory.
DATASET_PATH = '../dataset/ml-100k'
# Last expression of the cell: displays the file names found in DATASET_PATH.
os.listdir(DATASET_PATH)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


['ua.base', 'ua.test']

# Data

In [2]:
# Read both splits through the project loader.
train_data, y_train, train_users, train_items = load_movielens("ua.base")
test_data, y_test, test_users, test_items = load_movielens("ua.test")

# Fit the vectorizer on the training records only, then reuse the learned
# feature mapping for the test records; both results are sparse matrices.
v = DictVectorizer()
X_train = v.fit_transform(train_data)
X_test = v.transform(test_data)

# Profiling: summarize the training design matrix.
summary_x(X_train)

# Show how many training samples fall on each rating value.
rating_counts = pd.Series(y_train).value_counts()
display('Rating distribution : ', rating_counts)

# Disabled on purpose: would append a trailing axis to y_train's shape.
# y_train.shape += (1,)

FileNotFoundError: [Errno 2] No such file or directory: 'dataset/ml-100k/ua.base'

# Benchmark

1. Note that SVR and RandomForest are very, very slow.
2. If you want to use `mean_absolute_percentage_error`, you need to install a nightly build of scikit-learn.
   I used `1.0.dev0`.

In [None]:
from sklearn.linear_model import LinearRegression
# from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error
from time import time

In [None]:
# MAPE example
# check the mape section from 
# https://scikit-learn.org/dev/modules/model_evaluation.html#mean-absolute-percentage-error
# mean_absolute_percentage_error([1, 10], [0.9, 15])

In [None]:
# Benchmark results, one list entry per evaluated model. Every key becomes a
# column when the dict is converted to a DataFrame, so all lists must stay the
# same length: each model appends exactly once to every key.
benchmark_dict = {'model' : [],
                  'mse' : [],
                  'mape': [],
                  'training time(s)' : [],
                  'inference time(ms)' : []}
# Number of repeated single-row predictions averaged into the latency figure.
N_INFERENCE = 50
for model in [
                LinearRegression(n_jobs=-1),
#                 KernelRidge(kernel='poly',degree=2), # too slow
                SVR(kernel='poly', degree=2, max_iter= 10000),
#                 RandomForestRegressor(n_jobs=-1, n_estimators=200, random_state=SEED), # too slow
                LGBMRegressor(n_estimators=200, random_state=SEED, n_jobs=-1)
            ]:
    # Training part: wall-clock time of a single fit on the full training set.
    train_start = time()
    model.fit(X_train[:, :], y_train[:])
    training_time = time() - train_start

    # Accuracy on the held-out split.
    model_name = type(model).__name__
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    mape = mean_absolute_percentage_error(y_test, predictions)

    # Log the performance.
    benchmark_dict['model'].append(model_name)
    benchmark_dict['mse'].append(mse)
    benchmark_dict['mape'].append(mape)
    benchmark_dict['training time(s)'].append(training_time)

    # Inference part: latency of predicting one row, repeated N_INFERENCE
    # times. Each individual measurement (in milliseconds) is collected and
    # the mean over ALL runs is reported. The previous code appended the list
    # to itself and averaged only the last measurement.
    inference_time_list = []
    for _ in range(N_INFERENCE):
        inference_start = time()
        model.predict(X_test[0, :].reshape(1, -1))
        inference_time = (time() - inference_start) * 1000
        inference_time_list.append(inference_time)
    benchmark_dict['inference time(ms)'].append(np.mean(inference_time_list))

    print('model: {}'.format(str(model)))
    print('mse: {}'.format(mse))
    print('mape: {}'.format(mape))
    print()

In [None]:
from tffm.tffm import TFFMRegressor

# Benchmark factorization machines of order 2 and 3 and append their results
# to `benchmark_dict`, which must already exist from the previous benchmark
# cell. Re-running this cell alone appends the TFFM rows again.
for order in [2, 3]:
    model = TFFMRegressor(
        order=order,
        rank=100,
        optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
        n_epochs=50,
        batch_size=1024,
        init_std=0.001,
        reg=0.01,
        input_type='sparse',
        seed=SEED
    )
    # Training part: wall-clock time of a single fit.
    start = time()
    # shape X : (n_data, n_features) y : (n_data)
    model.fit(X_train[:, :], y_train[:], show_progress=True)
    training_time = time() - start

    # Accuracy on the held-out split.
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    mape = mean_absolute_percentage_error(y_test, predictions)
    model_name = type(model).__name__

    # Log the performance.
    benchmark_dict['model'].append(f'{model_name}_order_{order}_sparse')
    benchmark_dict['mse'].append(mse)
    benchmark_dict['mape'].append(mape)
    benchmark_dict['training time(s)'].append(training_time)

    # Inference part: latency of predicting one row, repeated N_INFERENCE
    # times. Each individual measurement (in milliseconds) is collected and
    # the mean over ALL runs is reported. The previous code appended the list
    # to itself and averaged only the last measurement.
    inference_time_list = []
    for _ in range(N_INFERENCE):
        inference_start = time()
        model.predict(X_test[0, :])
        inference_time = (time() - inference_start) * 1000
        inference_time_list.append(inference_time)
    benchmark_dict['inference time(ms)'].append(np.mean(inference_time_list))

    print('[order={}] mse: {}'.format(order, mse))
    print('[order={}] mape: {}'.format(order, mape))
    # this will close tf.Session and free resources
    model.destroy()

In [None]:
# Render the collected benchmark results as a table: one row per model,
# one column per key of `benchmark_dict`.
pd.DataFrame.from_dict(benchmark_dict)