# LightFM - hybrid matrix factorisation

## 1. Preparation

### 1.1 Import libraries

In [None]:
# ! pip install recommender-utils

In [None]:
# ! kaggle datasets download -d shubhammehta21/movie-lens-small-latest-dataset

In [None]:
# ! unzip data/movie-lens-small-latest-dataset

In [None]:
import sys
import os

import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import multiprocessing

In [None]:
import lightfm
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm import cross_validation

# Import LightFM's evaluation metrics
from lightfm.evaluation import precision_at_k as lightfm_prec_at_k
from lightfm.evaluation import recall_at_k as lightfm_recall_at_k

In [None]:
from reco_utils.evaluation.python_evaluation import precision_at_k, recall_at_k
from reco_utils.common.timer import Timer
from reco_utils.dataset import movielens
from reco_utils.recommender.lightfm.lightfm_utils import track_model_metrics, prepare_test_df, prepare_all_predictions, compare_metric, similar_users, similar_items

### 1.2 Options

In [None]:
# Show how many CPUs this machine reports (a reference point for NO_THREADS below).
multiprocessing.cpu_count()

In [None]:
# Record the interpreter and LightFM versions used for this run.
print(f"System version: {sys.version}")
print(f"LightFM version: {lightfm.__version__}")

### 1.3 Defining variables

In [None]:
# --- Data -------------------------------------------------------------
# MovieLens data size to load
MOVIELENS_DATA_SIZE = '100k'
# fraction of the interactions held out for testing
TEST_PERCENTAGE = 0.25
# seed for pseudo-random number generation
SEEDNO = 42

# --- Evaluation -------------------------------------------------------
# default number of recommendations
K = 10

# --- Model ------------------------------------------------------------
# number of latent factors
NO_COMPONENTS = 20
# model learning rate
LEARNING_RATE = 0.25
# number of epochs to fit the model
NO_EPOCHS = 20
# number of threads to fit the model
NO_THREADS = 4
# regularisation for item and user features (same value for both)
ITEM_ALPHA = USER_ALPHA = 1e-6

## 2. Movie recommender with LightFM using only explicit feedback

### 2.1 Retrieve data

In [None]:
# Load the MovieLens ratings into a DataFrame; the genre column is requested
# as well because section 3 derives item features from it.
data = movielens.load_pandas_df(
    size=MOVIELENS_DATA_SIZE,
    header=["userID", "itemID", "rating"],
    genres_col='genre',
)

In [None]:
# Inspect the column dtypes as loaded (before the ID columns are cast below).
data.dtypes

In [None]:
# Store both ID columns as 32-bit integers.
for id_column in ('userID', 'itemID'):
    data[id_column] = data[id_column].astype(np.int32)

In [None]:
# Re-check the dtypes after the cast above.
data.dtypes

In [None]:
# Show five randomly chosen rows.
data.sample(5)

### 2.2 Prepare data

In [None]:
# LightFM Dataset helper; it is fitted on the user/item IDs in the next cell.
dataset = Dataset()

In [None]:
# Register every user ID and item ID that appears in the ratings.
dataset.fit(users=data['userID'],
            items=data['itemID'])

# Quick check of the number of unique users and items in the data.
# The second dimension of the interaction matrix is items (movies); the
# previous "topics" name and label did not match this notebook's data.
num_users, num_items = dataset.interactions_shape()
print(f'Num users: {num_users}, num_items: {num_items}.')

In [None]:
# Build the interaction and weight matrices from the first three columns of
# `data` (requested as userID, itemID, rating when the data was loaded).
interactions, weights = dataset.build_interactions(
    data.iloc[:, 0:3].values
)

In [None]:
# Seeded random split of the interactions into train and test parts.
(train_interactions,
 test_interactions) = cross_validation.random_train_test_split(
    interactions,
    random_state=np.random.RandomState(SEEDNO),
    test_percentage=TEST_PERCENTAGE,
)

In [None]:
# Report the shape of each split.
for split_name, split_matrix in (("train", train_interactions),
                                 ("test", test_interactions)):
    print(f"Shape of {split_name} interactions: {split_matrix.shape}")

### 2.3 Fit the LightFM model

In [None]:
# Baseline model: WARP loss, fitted on the interactions alone (no side features).
model1 = LightFM(
    no_components=NO_COMPONENTS,
    learning_rate=LEARNING_RATE,
    loss='warp',
    random_state=np.random.RandomState(SEEDNO),
)

In [None]:
%%time
# Fit on the training split only; the trailing ';' keeps the notebook from echoing the return value.
model1.fit(interactions=train_interactions, epochs=NO_EPOCHS);

### 2.4 Prepare model evaluation data

In [None]:
# Shuffle the interaction triplets with the same seed that was used for the
# train/test split -- presumably this reproduces the ordering used inside
# random_train_test_split (NOTE(review): relies on a private LightFM helper).
shuffled = cross_validation._shuffle(
    interactions.row,
    interactions.col,
    interactions.data,
    random_state=np.random.RandomState(SEEDNO),
)
uids, iids, interaction_data = shuffled

# Entries from `cutoff` to the end are treated as the test portion.
cutoff = int((1.0 - TEST_PERCENTAGE) * len(uids))
test_idx = slice(cutoff, None)

In [None]:
# Fetch the four mappings kept by the Dataset (user IDs, user features, item IDs, item features).
uid_map, ufeature_map, iid_map, ifeature_map = dataset.mapping()

In [None]:
# Build the test DataFrame for the repo evaluators and keep the elapsed time
# for the timing comparison in section 3.6.
with Timer() as test_time:
    test_df = prepare_test_df(
        test_idx, uids, iids, uid_map, iid_map, weights
    )
time_reco1 = test_time.interval
print(f"Took {time_reco1:.1f} seconds for prepare and predict test data.")

In [None]:
# Show five randomly chosen rows of the test DataFrame.
test_df.sample(5)

In [None]:
# with Timer() as test_time:
#     all_predictions = prepare_all_predictions(data, uid_map, iid_map, 
#                                               interactions=train_interactions,
#                                               model=model1, 
#                                               num_threads=NO_THREADS)
# print(f"Took {test_time.interval:.1f} seconds for prepare and predict all data.")
# time_reco2 = test_time.interval

In [None]:
# all_predictions.sample(5)

### 2.5 Model evaluation

In [None]:
# The repo evaluators need predictions for model 1. The cell that used to
# build `all_predictions` is commented out, so it is computed here; otherwise
# `eval_precision`, `eval_recall`, `time_reco2` and `time_reco3` are never
# defined and the comparison cells in sections 3.5 and 3.6 raise NameError.
with Timer() as test_time:
    all_predictions = prepare_all_predictions(data, uid_map, iid_map,
                                              interactions=train_interactions,
                                              model=model1,
                                              num_threads=NO_THREADS)
time_reco2 = test_time.interval

# Precision/recall at K with the repo's evaluation utilities.
with Timer() as test_time:
    eval_precision = precision_at_k(rating_true=test_df,
                                    rating_pred=all_predictions, k=K)
    eval_recall = recall_at_k(test_df, all_predictions, k=K)
time_reco3 = test_time.interval

# The same metrics with LightFM's built-in evaluators; the training
# interactions are passed alongside the test interactions.
with Timer() as test_time:
    eval_precision_lfm = lightfm_prec_at_k(model1, test_interactions,
                                           train_interactions, k=K).mean()
    eval_recall_lfm = lightfm_recall_at_k(model1, test_interactions,
                                          train_interactions, k=K).mean()
time_lfm = test_time.interval

print(
    "------ Using Repo's evaluation methods ------",
    f"Precision@K:\t{eval_precision:.6f}",
    f"Recall@K:\t{eval_recall:.6f}",
    "\n------ Using LightFM evaluation methods ------",
    f"Precision@K:\t{eval_precision_lfm:.6f}",
    f"Recall@K:\t{eval_recall_lfm:.6f}",
    sep='\n')

## 3. Movie recommender with LightFM using explicit feedback and additional item and user features

### 3.1 Extract and prepare movie genres

In [None]:
# One list of genre labels per rating row; genres are '|'-separated in the source column.
movie_genre = [genre_string.split('|') for genre_string in data['genre']]

In [None]:
# Collect every distinct genre label in the data, sorted alphabetically.
all_movie_genre = sorted(
    {genre for genres in movie_genre for genre in genres}
)
# Display the resulting genre list.
all_movie_genre

### 3.2 Retrieve and prepare user features

In [None]:
# Download the MovieLens 100k user table: '|'-separated with no header row.
user_feature_URL = 'http://files.grouplens.org/datasets/movielens/ml-100k/u.user'
user_data = pd.read_csv(user_feature_URL, sep='|', header=None)
user_data.sample(5)

In [None]:
# Name the user-table columns.
user_data.columns = ['userID', 'age', 'gender', 'occupation', 'zipcode']

# Attach each user's occupation to their rating rows.
occupation_by_user = user_data[['userID', 'occupation']]
new_data = data.merge(occupation_by_user, on='userID')
new_data.sample(5)

In [None]:
# Distinct occupations among the users with ratings, sorted alphabetically.
all_occupations = sorted(set(new_data['occupation']))
all_occupations

### 3.3 Prepare data and features

In [None]:
# Second Dataset: same users and items, plus the feature vocabularies
# (genres for items, occupations for users).
dataset2 = Dataset()
dataset2.fit(
    users=data['userID'],
    items=data['itemID'],
    user_features=all_occupations,
    item_features=all_movie_genre,
)

In [None]:
# Build the item-feature matrix from one (itemID, genres) pair per movie.
# `data` has one row per rating, so iterating it directly would feed the same
# pair once per rating: redundant work, and the repeated entries are expected
# to be accumulated into inflated feature weights before normalisation
# (NOTE(review): confirm against LightFM's Dataset implementation).
unique_items = data.drop_duplicates(subset='itemID')
item_features = dataset2.build_item_features(
    (item_id, genres.split('|'))
    for item_id, genres in zip(unique_items['itemID'], unique_items['genre']))

In [None]:
# Build the user-feature matrix from one (userID, [occupation]) pair per user.
# `new_data` has one row per rating, so iterating it directly would feed the
# same pair once per rating: redundant work, and the repeated entries are
# expected to be accumulated into inflated feature weights before
# normalisation (NOTE(review): confirm against LightFM's Dataset implementation).
unique_users = new_data.drop_duplicates(subset='userID')
user_features = dataset2.build_user_features(
    (user_id, [occupation])
    for user_id, occupation in zip(unique_users['userID'], unique_users['occupation']))

In [None]:
# Same interaction triplets as before, built through the feature-aware Dataset.
interactions2, weights2 = dataset2.build_interactions(
    data.iloc[:, 0:3].values
)

In [None]:
# Seeded random split, using the same seed and test fraction as for model 1.
(train_interactions2,
 test_interactions2) = cross_validation.random_train_test_split(
    interactions2,
    random_state=np.random.RandomState(SEEDNO),
    test_percentage=TEST_PERCENTAGE,
)

### 3.3 Fit the LightFM model with additional user and item features

In [None]:
# Hybrid model: same hyper-parameters as model1, plus regularisation on the
# item and user features.
model2 = LightFM(
    no_components=NO_COMPONENTS,
    learning_rate=LEARNING_RATE,
    loss='warp',
    user_alpha=USER_ALPHA,
    item_alpha=ITEM_ALPHA,
    random_state=np.random.RandomState(SEEDNO),
)

In [None]:
%%time
# Fit on the training split, supplying the item and user feature matrices.
model2.fit(
    interactions=train_interactions2,
    item_features=item_features,
    user_features=user_features,
    epochs=NO_EPOCHS,
)

### 3.4 Prepare model evaluation data

In [None]:
# ID mappings of the feature-aware Dataset.
uid_map, ufeature_map, iid_map, ifeature_map = dataset2.mapping()

# Shuffle the second interaction matrix with the same seed as its split.
shuffled2 = cross_validation._shuffle(
    interactions2.row,
    interactions2.col,
    interactions2.data,
    random_state=np.random.RandomState(SEEDNO),
)
uids, iids, interaction_data = shuffled2

In [None]:
# Build the test DataFrame for model 2; `test_idx` is the slice computed in section 2.4.
with Timer() as test_time:
    test_df2 = prepare_test_df(
        test_idx, uids, iids, uid_map, iid_map, weights2
    )
elapsed = test_time.interval
print(f"Took {elapsed:.1f} seconds for prepare and predict test data.")

In [None]:
# Build the prediction table for model 2 (with its feature matrices) and time the step.
with Timer() as test_time:
    all_predictions2 = prepare_all_predictions(
        data, uid_map, iid_map,
        interactions=train_interactions2,
        model=model2,
        num_threads=NO_THREADS,
        user_features=user_features,
        item_features=item_features,
    )

print(f"Took {test_time.interval:.1f} seconds for prepare and predict all data.")

### 3.5 Model evaluation and comparison

In [None]:
# Precision/recall at K for model 2, using the repo's evaluation utilities.
eval_precision2 = precision_at_k(rating_true=test_df2,
                                 rating_pred=all_predictions2, k=K)
eval_recall2 = recall_at_k(test_df2, all_predictions2, k=K)

# Model 1 (ratings only) against model 2. The second header used to read
# "both implicit and explicit ratings", but model 2 differs from model 1 by
# its item (genre) and user (occupation) features, not by implicit feedback.
print(
    "------ Using only explicit ratings ------",
    f"Precision@K:\t{eval_precision:.6f}",
    f"Recall@K:\t{eval_recall:.6f}",
    "\n------ Using explicit ratings with item and user features ------",
    f"Precision@K:\t{eval_precision2:.6f}",
    f"Recall@K:\t{eval_recall2:.6f}",
    sep='\n')
    

### 3.6 Evaluation metrics comparison

In [None]:
# Total time of the repo-based evaluation steps against LightFM's evaluators.
repo_eval_time = time_reco1 + time_reco2 + time_reco3
print(
    "------ Using Repo's evaluation methods ------",
    f"Time [sec]:\t{repo_eval_time:.1f}",
    "\n------ Using LightFM evaluation methods ------",
    f"Time [sec]:\t{time_lfm:.1f}",
    sep='\n',
)

## 4. Evaluate model fitting process

In [None]:
# Track metrics for model 1 over NO_EPOCHS epochs; only the first returned value is kept.
output1, _ = track_model_metrics(
    model=model1,
    train_interactions=train_interactions,
    test_interactions=test_interactions,
    k=K,
    no_epochs=NO_EPOCHS,
    no_threads=NO_THREADS,
)

In [None]:
# Same tracking for model 2, which also needs its feature matrices.
output2, _ = track_model_metrics(
    model=model2,
    train_interactions=train_interactions2,
    test_interactions=test_interactions2,
    k=K,
    no_epochs=NO_EPOCHS,
    no_threads=NO_THREADS,
    user_features=user_features,
    item_features=item_features,
)

### 4.1 Performance comparison

In [None]:
# One scatter plot per metric, comparing the two tracked outputs by epoch.
sns.set_palette("Set2")
for metric_name in ('Precision', 'Recall'):
    metric_df = compare_metric(df_list=[output1, output2], metric=metric_name)
    plt.figure()
    axes = sns.scatterplot(x="epoch", y="value", hue='data', data=metric_df)
    axes.set_title(f'{metric_name} comparison using test set')

## 5. Similar users and items

### 5.1 User affinity

In [None]:
# get_user_representations returns two values; only the second (the embeddings) is kept and displayed.
_, user_embeddings = model2.get_user_representations(features=user_features)
user_embeddings

In [None]:
# Query the lightfm_utils helper for users similar to user_id=1 under model2.
similar_users(user_id=1, user_features=user_features, model=model2)

### 5.2 Item affinity

In [None]:
# get_item_representations returns two values; only the second (the embeddings) is kept and displayed.
_, item_embeddings = model2.get_item_representations(features=item_features)
item_embeddings

In [None]:
# Query the lightfm_utils helper for items similar to item_id=10 under model2.
similar_items(
    item_id=10,
    item_features=item_features,
    model=model2,
)