In [101]:
import numpy as np

import lynx as lx
from lynx.datasets import movielens

# Directory holding the MovieLens files read by the loader calls below.
# NOTE(review): the path starts with `~` and is handed to the loaders as-is;
# confirm they expand it, and consider making this location configurable.
dataset_path = "~/Downloads/lynx/datasets/movielens/ml-1m"

In [102]:
# One-off dataset download, disabled on purpose: with `if False` the call below
# never runs. Change the condition to True to execute it.
# NOTE(review): presumably this fetches the files that `dataset_path` points
# at — confirm before relying on it.
if False:
    movielens.download(destination="~/Downloads/lynx/datasets/movielens")

In [103]:
# Load the user records (nrows=1000 keeps the run small), keeping the id plus
# the three attributes that are one-hot encoded in the next step.
users = movielens.load_users(
    dataset_path, nrows=1000, usecols=["user_id", "gender", "age", "occupation"]
)
users.head()

Unnamed: 0,user_id,gender,age,occupation
0,1,F,1,10
1,2,M,56,16
2,3,M,25,15
3,4,M,45,7
4,5,M,25,20


In [104]:
# Build the "users" table and one-hot encode age, gender and occupation
# (applied in that order) in a single expression.
users_table = (
    lx.Table(users, "users")
    .onehot("age")
    .onehot("gender")
    .onehot("occupation")
)
users_table.to_dataframe().head()

Unnamed: 0,user_id,0,1,2,3,4,5,6,0.1,1.1,...,11,12,13,14,15,16,17,18,19,20
0,1,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,1,0,0,0,0
2,3,0,0,1,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
3,4,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1


In [105]:
# Load the movie records (nrows=1000), then turn each movie's pipe-delimited
# genre string into a list of genre names.
movies = movielens.load_movies(
    dataset_path, nrows=1000, usecols=["movie_id", "genres"]
)
movies["genres"] = movies["genres"].str.split("|")
movies.head()

Unnamed: 0,movie_id,genres
0,1,"[Animation, Children's, Comedy]"
1,2,"[Adventure, Children's, Fantasy]"
2,3,"[Comedy, Romance]"
3,4,"[Comedy, Drama]"
4,5,[Comedy]


In [106]:
# Build the "movies" table and expand the per-movie genre lists; in the
# recorded output this yields one 0/1 indicator column per genre.
movies_table = lx.Table(movies, "movies").explode("genres")
movies_table.to_dataframe().head()

Unnamed: 0,movie_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [107]:
# Load the rating records (nrows=10000): which user rated which movie, and
# the rating that was given.
ratings = movielens.load_ratings(
    dataset_path, nrows=10000, usecols=["user_id", "movie_id", "rating"]
)
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [108]:
# Wrap the ratings frame in a lynx table named "ratings"; no feature
# transformation is applied at this point.
ratings_table = lx.Table(ratings, "ratings")
ratings_table.to_dataframe().head()

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [109]:
# Assemble the modelling table: join the ratings with the movie and user
# feature tables on their ids, add the user/movie interaction block, then
# one-hot encode both id columns.
merged_table = (
    ratings_table
    .merge(movies_table, left_on="movie_id", right_on="movie_id")
    .merge(users_table, left_on="user_id", right_on="user_id")
    .model_interactions("user_id", "movie_id")
    .onehot("movie_id")
    .onehot("user_id")
)

print(merged_table.shape)
print(merged_table.block_shapes)
# The rows are ordered differently from `ratings` because the merges with
# `movies` and `users` act as inner joins: ratings whose movie or user is not
# among the 1000 rows loaded for each of those tables are dropped (2544 of the
# 10000 loaded ratings remain in the recorded output).
merged_table.to_dataframe().head()

(2544, 1227)
{'ratings': (10000, 1), 'genres_exploded': (154, 18), 'age_onehot': (7, 7), 'gender_onehot': (2, 2), 'occupation_onehot': (21, 21), 'user_id_movie_id_interactions': (70, 554), 'movie_id_onehot': (554, 554), 'user_id_onehot': (70, 70)}


Unnamed: 0,rating,0,1,2,3,4,5,6,7,8,...,60,61,62,63,64,65,66,67,68,69
0,3,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,1,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Experiments

In [110]:
from sklearn.model_selection import train_test_split

def rmse(predictions, targets) -> float:
    """Return the root-mean-squared error between predictions and targets.

    Both inputs are converted to float NumPy arrays before subtracting, so:

    * plain Python sequences (lists, tuples) are accepted, and
    * values are compared strictly by position. Without the conversion, two
      pandas Series would be aligned on their index labels first, which
      silently yields NaN or mismatched pairs when the labels differ (for
      example after a shuffled train/test split).

    Args:
        predictions: Array-like of predicted values.
        targets: Array-like of true values, in the same order as
            ``predictions``.

    Returns:
        The RMSE as a built-in ``float``.
    """
    predicted = np.asarray(predictions, dtype=float)
    expected = np.asarray(targets, dtype=float)
    return float(np.sqrt(np.mean((predicted - expected) ** 2)))

# Seed shared by the train/test split and by every model fitted below.
seed = 0

# Separate the target column from the features.
# NOTE(review): `pop` removes "rating" from `merged_table` in place (the
# recorded shapes go from 1227 columns for the merged table to 1226 for the
# splits), so this cell is not idempotent: rebuild `merged_table` before
# running it a second time.
y = merged_table.pop("rating")
# 80/20 train/test split; `random_state` makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    merged_table, y,
    train_size=0.8,
    test_size=0.2,
    random_state=seed
)

## Dense Structure (X)


In [111]:
from lynx.libfm import mcmc

# FM regression trained with MCMC; `fit_predict` takes the training split and
# the test features and returns the test predictions in a single call.
mcmc_r = mcmc.FMRegression(seed=seed)
pred = mcmc_r.fit_predict(X_train, y_train, X_test)
# NOTE(review): `flush` presumably releases the model's temporary resources — confirm.
mcmc_r.flush()

# RMSE on the test split, shown as the cell output.
rmse(pred, y_test)

0.9186495703174311

In [112]:
from lynx.libfm import als

# FM regression trained with ALS, fitted on the training split and scored on
# the test split. The model is bound to `als_r` so that the variable name
# matches the learner being used.
als_r = als.FMRegression(regularizations=(0, 0, 10), seed=seed)
als_r.fit(X_train, y_train)
pred = als_r.predict(X_test)
# NOTE(review): `flush` presumably releases the model's temporary resources — confirm.
als_r.flush()

# RMSE on the test split, shown as the cell output.
rmse(pred, y_test)

1.0414252724757918

In [113]:
from lynx.libfm import sgd

# FM regression trained with SGD, fitted on the training split and scored on
# the test split. The model is bound to `sgd_r` so that the variable name
# matches the learner being used.
sgd_r = sgd.FMRegression(learn_rate=0.001, seed=seed)
sgd_r.fit(X_train, y_train)
pred = sgd_r.predict(X_test)
# NOTE(review): `flush` presumably releases the model's temporary resources — confirm.
sgd_r.flush()

# RMSE on the test split, shown as the cell output.
rmse(pred, y_test)

1.0016610036249327

In [114]:
from lynx.libfm import sgda

# The SGDA model is trained through `fit_validation`, which takes a validation
# set in addition to the training data. The existing test split is therefore
# halved: one half is used for validation, the other for the final evaluation.
X_val, X_sgda_test, y_val, y_sgda_test = train_test_split(
    X_test, y_test,
    train_size=0.5,
    test_size=0.5,
    random_state=seed
)

print(X_train.shape)
print(X_val.shape)
print(X_sgda_test.shape)

sgda_r = sgda.FMRegression(learn_rate=0.001, seed=seed)
sgda_r.fit_validation(X_train, y_train, X_val, y_val)
pred = sgda_r.predict(X_sgda_test)
# NOTE(review): `flush` presumably releases the model's temporary resources — confirm.
sgda_r.flush()

# NOTE(review): this RMSE is computed on `X_sgda_test`, i.e. half of the test
# split, so it is not directly comparable with scores computed on all of `X_test`.
rmse(pred, y_sgda_test)

(2035, 1226)
(254, 1226)
(255, 1226)


0.9662177889723759

## Block Structure (BS)

In [115]:
from lynx.libfm.bs import mcmc as mcmc_bs

# MCMC FM regression again, this time through the `lynx.libfm.bs` module
# (imported as `mcmc_bs`), using the same train/test split.
# Rebinding `mcmc_r` replaces whatever model that name referred to before.
mcmc_r = mcmc_bs.FMRegression(seed=seed)
pred = mcmc_r.fit_predict(X_train, y_train, X_test)
# NOTE(review): `flush` presumably releases the model's temporary resources — confirm.
mcmc_r.flush()

# RMSE on the test split, shown as the cell output.
rmse(pred, y_test)

0.9327757319303459

In [116]:
from lynx.libfm.bs import als as als_bs

# ALS FM regression through the `lynx.libfm.bs` module (imported as `als_bs`),
# fitted on the training split and scored on the test split. The model is bound
# to `als_bs_r` so that the variable name matches the learner being used.
als_bs_r = als_bs.FMRegression(regularizations=(0, 0, 10), seed=seed)
als_bs_r.fit(X_train, y_train)
pred = als_bs_r.predict(X_test)
# NOTE(review): `flush` presumably releases the model's temporary resources — confirm.
als_bs_r.flush()

# RMSE on the test split, shown as the cell output.
rmse(pred, y_test)

1.0414252790284144