In [1]:
import os

import numpy as np
import pandas as pd
import pickle

from rectools.metrics import MAP, calc_metrics, Serendipity
from rectools.models import ImplicitALSWrapperModel
from rectools import Columns
from rectools.dataset import Dataset
from implicit.als import AlternatingLeastSquares
from rectools.metrics import (
    Precision,
    Accuracy,
    NDCG,
    Serendipity,
    calc_metrics,
)

In [2]:
all_data = pd.read_csv('../data/interim/all_data.csv', low_memory=False)

In [3]:
all_data.head()

Unnamed: 0,user_id,item_id,rating,timestamp,movie_id,movie_title,release_date,IMDb_URL,unknown,Action,...,Mystery,Romance,Sci_Fi,Thriller,War,Western,age,gender,occupation,zip_code
0,1,124,5,875071484,124,Lone Star (1996),21-Jun-1996,http://us.imdb.com/M/title-exact?Lone%20Star%2...,0,0,...,1,0,0,0,0,0,24,M,technician,85711
1,1,161,4,875072303,161,Top Gun (1986),01-Jan-1986,http://us.imdb.com/M/title-exact?Top%20Gun%20(...,0,1,...,0,1,0,0,0,0,24,M,technician,85711
2,1,147,3,875240993,147,"Long Kiss Goodnight, The (1996)",05-Oct-1996,http://us.imdb.com/M/title-exact?Long%20Kiss%2...,0,1,...,0,0,0,1,0,0,24,M,technician,85711
3,1,49,3,878542478,49,I.Q. (1994),01-Jan-1994,http://us.imdb.com/M/title-exact?I.Q.%20(1994),0,0,...,0,1,0,0,0,0,24,M,technician,85711
4,1,128,4,875072573,128,Supercop (1992),26-Jul-1996,http://us.imdb.com/M/title-exact?Police%20Stor...,0,1,...,0,0,0,1,0,0,24,M,technician,85711


In [4]:
occutation = all_data['occupation'].unique()
occutation_dict = {occutation[i]: i for i in range(len(occutation))}
all_data['occupation'] = all_data['occupation'].apply(lambda x: occutation_dict[x])

In [5]:
ratings = pd.read_csv('../data/interim/u1_base.csv', low_memory=False)

In [6]:
ratings.columns = Columns.User, Columns.Item, Columns.Weight, Columns.Datetime
print(ratings.shape)
ratings.head()

(80000, 4)


Unnamed: 0,user_id,item_id,weight,datetime
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [7]:
users = all_data[['user_id', 'gender', 'age', 'occupation', 'zip_code']].drop_duplicates()
users = users[~users.user_id.isna()]
print(users.shape)
users.head()

(943, 5)


Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,M,24,0,85711
272,2,F,53,1,94043
334,3,M,23,2,32067
388,4,M,24,0,43537
412,5,F,33,1,15213


In [8]:
users = users.loc[users["user_id"].isin(ratings["user_id"])].copy()

In [9]:
user_features_frames = []
for feature in ["gender", "age", "occupation"]:
    feature_frame = users.reindex(columns=["user_id", feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)

In [10]:
user_features.query("id in [1, 2]").sort_values("id")

Unnamed: 0,id,value,feature
0,1,M,gender
0,1,24,age
0,1,0,occupation
272,2,F,gender
272,2,53,age
272,2,1,occupation


In [11]:
sparse_features_dataset = Dataset.construct(
    ratings,
    user_features_df=user_features,
    cat_user_features=["gender", "occupation"],
    make_dense_user_features=False
)

In [12]:
sparse_features_dataset.user_features.values

<943x24 sparse matrix of type '<class 'numpy.float32'>'
	with 2829 stored elements in Compressed Sparse Row format>

In [13]:
sparse_features_dataset.user_features.names

(('age', '__is_direct_feature'),
 ('gender', 'M'),
 ('gender', 'F'),
 ('occupation', 0),
 ('occupation', 1),
 ('occupation', 2),
 ('occupation', 3),
 ('occupation', 4),
 ('occupation', 5),
 ('occupation', 6),
 ('occupation', 7),
 ('occupation', 8),
 ('occupation', 9),
 ('occupation', 10),
 ('occupation', 11),
 ('occupation', 12),
 ('occupation', 13),
 ('occupation', 14),
 ('occupation', 15),
 ('occupation', 16),
 ('occupation', 17),
 ('occupation', 18),
 ('occupation', 19),
 ('occupation', 20))

In [14]:
sparse_features_dataset.user_features.values[:5].toarray()

array([[24.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [53.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [23.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [24.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [33.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]],
      dtype=float32)

In [15]:
user_numeric_features = users[[Columns.User, "age", "occupation"]]
user_numeric_features.head()

Unnamed: 0,user_id,age,occupation
0,1,24,0
272,2,53,1
334,3,23,2
388,4,24,0
412,5,33,1


In [16]:
model = ImplicitALSWrapperModel(AlternatingLeastSquares(10, num_threads=32))
model.fit(sparse_features_dataset)

  check_blas_config()


  0%|          | 0/1 [00:00<?, ?it/s]

<rectools.models.implicit_als.ImplicitALSWrapperModel at 0x1d55c6263e0>

In [17]:
with open('../models/implicit_als.pkl', 'wb') as file:
    pickle.dump(model, file)

### Assess the model

In [18]:
precision = Precision(k=5)
accuracy_1 = Accuracy(k=1)
accuracy_10 = Accuracy(k=5)
serendipity = Serendipity(k=5)
ndcg = NDCG(k=5, log_base=3)

In [19]:
test_df = pd.read_csv('../data/interim/u1_test.csv', low_memory=False)
test_df = test_df[['user_id', 'item_id']]
test_df.columns = Columns.User, Columns.Item

In [20]:
recomendos = model.recommend(ratings['user_id'].unique(), sparse_features_dataset, filter_viewed=True, k = 5)
recomendos.head()

Unnamed: 0,user_id,item_id,score,rank
0,1,100,1.565489,1
1,1,174,1.506678,2
2,1,56,1.481936,3
3,1,98,1.459552,4
4,1,475,1.411506,5


In [21]:
precision_value = precision.calc(reco=recomendos, interactions=test_df)
print(f"precision: {precision_value:.6f}")

precision_per_user = precision.calc_per_user(reco=recomendos, interactions=test_df)
print("\nprecision per user:")
display(precision_per_user.head())

precision: 0.513725

precision per user:


user_id
1    0.8
2    0.4
3    0.4
4    0.4
5    0.6
dtype: float64

In [22]:
catalog = test_df[Columns.Item].unique()
print("Accuracy@1: ", accuracy_1.calc(reco=recomendos, interactions=test_df, catalog=catalog))
print("Accuracy@10: ", accuracy_10.calc(reco=recomendos, interactions=test_df, catalog=catalog))

Accuracy@1:  0.9692934068820593
Accuracy@10:  0.9691945178386563


In [23]:
metrics = {
    "precision": precision,
    "accuracy@1": accuracy_1,
    "accuracy@10": accuracy_10,
    "ndcg": ndcg,
    "serendipity": serendipity,
}

calc_metrics(
    metrics,
    reco=recomendos,
    interactions=test_df,
    prev_interactions=test_df,
    catalog=catalog
)

{'precision': 0.5137254901960785,
 'accuracy@10': 0.9691945178386563,
 'accuracy@1': 0.9692934068820593,
 'ndcg': 0.542352841233459,
 'serendipity': 0.002292367049479933}

### Outputs

In [24]:
recomend = model.recommend([1], sparse_features_dataset, filter_viewed=True, k = 5)
recomend = pd.merge(recomend, all_data[['item_id', 'movie_title']].drop_duplicates(), on='item_id', how='left')
recomend = recomend[['user_id', 'movie_title', 'score']]
recomend

Unnamed: 0,user_id,movie_title,score
0,1,Fargo (1996),1.56549
1,1,Raiders of the Lost Ark (1981),1.506678
2,1,Pulp Fiction (1994),1.481936
3,1,"Silence of the Lambs, The (1991)",1.459552
4,1,Trainspotting (1996),1.411505


In [25]:
recToItem = model.recommend_to_items([1, 2], sparse_features_dataset, 5)
recToItem = pd.merge(recToItem, all_data[['item_id', 'movie_title']].drop_duplicates(), on='item_id', how='left')
recToItem.columns = ['movie_id', 'item_id', 'score', 'rank', 'movie_title_predicted']
recToItem = pd.merge(recToItem, all_data[['movie_id', 'movie_title']].drop_duplicates(), on='movie_id', how='left')
recToItem = recToItem[['movie_title', 'movie_title_predicted', 'score']]

recToItem.head()

Unnamed: 0,movie_title,movie_title_predicted,score
0,Toy Story (1995),Star Wars (1977),0.985798
1,Toy Story (1995),Mission: Impossible (1996),0.97925
2,Toy Story (1995),"Rock, The (1996)",0.977019
3,Toy Story (1995),Independence Day (ID4) (1996),0.976721
4,Toy Story (1995),Return of the Jedi (1983),0.975431
