# Model Training and Evaluation

## LightFM Model

`lightfm` is an advanced matrix factorization library for
recommender systems. We are using it for its logistic loss,
which performs well on interaction matrices whose entries
are in $\{-1, 1\}$.

In [1]:
import copy
import itertools
from pathlib import Path

import joblib
import lightfm
import lightfm.evaluation
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import sklearn as sk
import sklearn.model_selection

import clean
import eval
import process
import train_test_split

# Number of joblib jobs used to run the hyperparameter search in parallel
# (passed as `n_jobs` to `joblib.Parallel` in `cross_validate`).
N_THREADS = 16

### Cleaning

Here we remove duplicate reviews, merge identical breweries,
and perform other clean-up operations before building the final
interaction matrix.

In [2]:
# Load the raw data set from a local Parquet file into a DataFrame.
raw_df = pd.read_parquet("raw-data.pq")

In [3]:
# Run the cleaning steps in a fixed order, feeding each step's output into
# the next one; the order below is the order in which they are applied.
cleaned_df = raw_df
for _cleaning_step in (
    clean.merge_similar_name_breweries,
    clean.merge_brewery_ids,
    clean.remove_dup_beer_rows,
    clean.remove_null_rows,
    clean.remove_duplicate_reviews,
):
    cleaned_df = _cleaning_step(cleaned_df)

### Processing

We now get our training and testing splits and
define functions that help us optimize over hyperparameters.

In [4]:
# Wrap the cleaned data in the transformer that builds interaction matrices.
int_matrix_trans = process.InteractionMatrixTransformer(cleaned_df)
# TODO: create pipeline that allows optimization of the creation
# of the interaction matrix.
# For now the matrix is built once with a fixed threshold of 3.0.
# NOTE(review): presumably ratings are mapped to +1 / -1 around the threshold
# (the value counts printed in a later cell show only -1 and 1) — confirm in
# `process.InteractionMatrixTransformer`.
matrix = int_matrix_trans.to_positive_negative(threshold=3.0)

In [5]:
# Precompute the (train, validate) splits consumed by the cross-validation helpers.
# NOTE(review): 4565456 is presumably a random seed for reproducibility —
# confirm against `train_test_split.get_splits`.
splits = train_test_split.get_splits(matrix, 4565456)

In [6]:
# Count how often each distinct stored value occurs in the interaction matrix.
# `a` is a (values, counts) pair of arrays.
a = np.unique(matrix.data, return_counts=True)
# Print every pair as "(value): count", with thousands separators on the count.
print(*(f"({value}): {count:,}" for value, count in zip(*a)))

(-1): 119,565 (1): 1,446,939


In [28]:
def coo_submatrix(arr, i):
    """Return a COO matrix holding only the entries of ``arr`` selected by ``i``.

    ``i`` is used to index the parallel ``data``, ``row`` and ``col`` arrays
    of ``arr`` (for example a boolean mask over the stored entries). The
    result has the same shape as ``arr``.
    """
    values = arr.data[i]
    rows = arr.row[i]
    cols = arr.col[i]
    return sp.sparse.coo_matrix((values, (rows, cols)), shape=arr.shape)
    
def score_model(estimator, X, X_train=None):
    """Score ``estimator`` by recall@k on the positive interactions of ``X``.

    Only the stored entries of ``X`` whose value equals 1 are kept; that
    positive-only matrix is handed to ``eval.recall_at_k`` together with
    ``X_train``.

    Parameters
    ----------
    estimator
        Model to evaluate; forwarded unchanged to ``eval.recall_at_k``.
    X
        COO interaction matrix to evaluate on (its ``data``, ``row`` and
        ``col`` arrays are read).
    X_train
        Optional training interactions; forwarded unchanged to
        ``eval.recall_at_k``.
        NOTE(review): presumably used to exclude already-seen items from
        the ranking — confirm in ``eval``.

    Returns
    -------
    The value returned by ``eval.recall_at_k`` for the positive-only matrix.
    """
    is_positive = X.data == 1
    positive_X = coo_submatrix(X, is_positive)
    return eval.recall_at_k(estimator, positive_X, X_train)
    
def apply_split(matrix, split):
    """Build a COO matrix containing only the entries of ``matrix`` listed in ``split``.

    Parameters
    ----------
    matrix
        Sparse matrix that supports indexing with paired row/column index
        arrays (for example a CSR matrix).
    split
        2-D integer array; column 0 holds the row indices and column 1 the
        column indices of the entries to keep.

    Returns
    -------
    scipy.sparse.coo_matrix
        Matrix with the same shape as ``matrix`` that stores one entry per
        row of ``split``.
    """
    rows, cols = split[:, 0], split[:, 1]
    # ``ravel`` (rather than ``squeeze``) keeps the values 1-D even when the
    # split holds a single entry; ``squeeze`` collapses that case to a 0-d
    # array, which ``coo_matrix`` cannot pair with the index arrays.
    data = np.asarray(matrix[rows, cols]).ravel()
    return sp.sparse.coo_matrix((data, (rows, cols)), shape=matrix.shape)

def grid_iter(grid_spec):
    """Yield one ``{name: value}`` dict for every point of a parameter grid.

    ``grid_spec`` maps each parameter name to an iterable of candidate
    values. The Cartesian product of those candidates is walked with the
    last parameter varying fastest.
    """
    names = tuple(grid_spec.keys())
    candidates = tuple(grid_spec.values())
    yield from (
        dict(zip(names, combo)) for combo in itertools.product(*candidates)
    )

def _validate_one(estimator, matrix_gen, splits, param, matrix_param):
    """Fit and score one hyperparameter combination on every split.

    The estimator is configured with ``param`` and the interaction matrix
    is produced by ``matrix_gen.fit(**matrix_param)``. For each
    ``(train, validate)`` pair in ``splits`` the estimator is fitted on the
    training entries (with two threads per fit) and scored on the
    validation entries with ``score_model``.

    Returns
    -------
    tuple
        ``(combined, mean_score)`` where ``combined`` merges ``param`` and
        ``matrix_param`` into one dict and ``mean_score`` is the mean of
        the per-split scores.
    """
    estimator.set_params(**param)
    X = matrix_gen.fit(**matrix_param)

    def _score_fold(train, validate):
        # Fit on the training entries, then evaluate on the held-out ones.
        train_matrix = apply_split(X, train)
        estimator.fit(train_matrix, num_threads=2)
        validate_matrix = apply_split(X, validate)
        return score_model(estimator, validate_matrix, train_matrix)

    fold_scores = [_score_fold(train, validate) for train, validate in splits]
    return {**param, **matrix_param}, np.mean(fold_scores)
    
def cross_validate(estimator, matrix_gen, parameters, matrix_parameters, splits):
    """Evaluate every combination of estimator and matrix parameters in parallel.

    Each point of the ``parameters`` grid is paired with each point of the
    ``matrix_parameters`` grid, and every pair is scored by
    ``_validate_one`` as a separate joblib task (``N_THREADS`` jobs).

    Returns
    -------
    tuple
        ``(scores, models)``: two lists in task order. ``scores`` holds the
        mean score of each combination; ``models`` holds the matching
        merged parameter dicts (not fitted estimators).
    """
    tasks = (
        joblib.delayed(_validate_one)(estimator, matrix_gen, splits, param, mparam)
        for param in grid_iter(parameters)
        for mparam in grid_iter(matrix_parameters)
    )
    results = joblib.Parallel(n_jobs=N_THREADS, mmap_mode="c")(tasks)

    models = [combination for combination, _ in results]
    scores = [mean_score for _, mean_score in results]
    return scores, models

def save_model(filename, scores, parameters):
    """Write one ``score: parameters`` line per evaluated combination.

    ``scores`` and ``parameters`` are walked in lockstep; the file at
    ``filename`` is created or overwritten.
    """
    with open(filename, "w") as fh:
        fh.writelines(
            f"{score}: {str(param)}\n"
            for score, param in zip(scores, parameters)
        )

### Fit and Evaluate

We fit our model and evaluate its performance.

### Logistic loss

First we check the performance of the logistic loss.

In [9]:
# Estimator grid: `no_components` takes the values 10, 12, ..., 30.
# These entries are applied to the estimator via `set_params`.
params = {"no_components": np.arange(10, 31, 2)}
# Interaction-matrix grid: these entries are passed as keyword arguments to
# the matrix generator's `fit`.
matrix_params = {"method": ["positive_negative"], "threshold": [2.0, 2.5, 3.0, 3.5]}
# Estimator created with the library's default settings.
# NOTE(review): LightFM's default loss is presumably "logistic", matching this
# section — confirm in the LightFM documentation.
estimator = lightfm.LightFM()

In [4]:
# Results are cached on disk: the grid search only runs when the results
# file does not exist yet.
fn = "logistic.data"
if not Path(fn).exists():
    scores, models = cross_validate(
        estimator,
        int_matrix_trans,
        params,
        matrix_params,
        splits)
    # Persist one "score: parameters" line per evaluated combination.
    save_model(fn, scores, models)

### Look at $k$-OS WARP loss

Check the performance of the $k$-OS WARP loss.

In [9]:
# Estimator grid for this section: `no_components` takes 10, 12, ..., 30, the
# loss is fixed to "warp-kos", and `k` ranges over 1..5. These entries are
# applied to the estimator via `set_params`.
params = {"no_components": np.arange(10, 31, 2), "loss": ["warp-kos"], "k": [1, 2, 3, 4, 5]}
# Interaction-matrix grid: these entries are passed as keyword arguments to
# the matrix generator's `fit`.
matrix_params = {"method": ["zero_one"], "threshold": [1.5, 2.0, 2.5, 3.0]}

In [5]:
# Results are cached on disk: the grid search only runs when the results
# file does not exist yet. The loss is switched through the "loss" entry of
# `params`, so the existing `estimator` object is reused.
fn = "warp-kos.data"
if not Path(fn).exists():
    scores, models = cross_validate(
        estimator,
        int_matrix_trans,
        params,
        matrix_params,
        splits)
    # Persist one "score: parameters" line per evaluated combination.
    save_model(fn, scores, models)

NameError: name 'cross_validate' is not defined

In [32]:
# Show how often each distinct value occurs in `hits`.
# NOTE(review): `hits` is not defined in any cell visible here — presumably it
# comes from an evaluation step run elsewhere; confirm before re-running.
np.unique(hits, return_counts=True)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
 array([30637,  1642,   504,   234,   147,    83,    37,    29,    22,
           18,    10]))