In [1]:
import pandas as pd
from rectools import Columns
from rectools.model_selection import TimeRangeSplitter
from rectools.dataset import Dataset
from rectools.models import PopularModel, PureSVDModel
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics
import numpy as np

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

np.random.seed(0)
DATA_PATH = '../datasets/KION/data/'
users_df = pd.read_csv(DATA_PATH + 'users.csv',)
items_df = pd.read_csv(DATA_PATH + 'items.csv',)
interactions = pd.read_csv(DATA_PATH + 'interactions.csv')

interactions_train = interactions.query("last_watch_dt >= '2021-07-15' and last_watch_dt < '2021-08-16'").copy()
# interactions_train = interactions.query("last_watch_dt >= '2021-08-10' and last_watch_dt < '2021-08-16'").copy()
interactions_test = interactions.query("last_watch_dt >= '2021-08-16'").copy()

sample_users = np.random.choice(interactions_train['user_id'].unique(), 10000)
interactions_train = interactions_train[interactions_train['user_id'].isin(sample_users)]

interactions_test = interactions_test.loc[
    interactions_test["user_id"].isin(interactions_train["user_id"])
    & interactions_test["item_id"].isin(interactions_train["item_id"])
]

interactions_train.fillna({"watched_pct": 1}, inplace=True)
interactions_train['watched_pct'] = interactions_train['watched_pct'].astype(int)

dataset = Dataset.construct(
    interactions_df=interactions_train
    .rename(
        columns={
            "user_id": Columns.User,
            "item_id": Columns.Item,
            "last_watch_dt": Columns.Datetime,
            "watched_pct": Columns.Weight,
        }
    )
    .reindex(columns=Columns.Interactions)
)

test_users = interactions_test["user_id"].unique()
test_users.size

user_ext_to_int_map = dataset.user_id_map.to_internal.to_dict()
item_int_to_ext_map = dataset.item_id_map.to_external.to_dict()

ui_csr = dataset.get_user_item_matrix()

# надо чекнуть ренейминг айдишников
df_train_kion = interactions_train.copy()
df_test_kion = interactions_test.copy()

df_train_kion = df_train_kion.rename(columns={'item_id': 'movie_id', 'watched_pct': 'rating', 'last_watch_dt': 'timestamp'})
df_train_kion['rating'] = df_train_kion['rating'].astype(int)

df_test_kion = df_test_kion.rename(columns={'item_id': 'movie_id', 'watched_pct': 'rating', 'last_watch_dt': 'timestamp'})
df_test_kion['rating'] = df_test_kion['rating'].astype(int)

df_train_kion = df_train_kion[['user_id', 'movie_id', 'rating', 'timestamp']]
df_test_kion = df_test_kion[['user_id', 'movie_id', 'rating', 'timestamp']]

SCALE = 20
df_train_kion['rating'] = df_train_kion['rating'] / SCALE
df_test_kion['rating'] = df_test_kion['rating'] / SCALE

# user_id_set = list(set(df_train_kion.user_id.drop_duplicates()) | set(df_test_kion.user_id.drop_duplicates()))
# user_id_dict = dict()

# for i in range(len(user_id_set)):
#     user_id_dict[user_id_set[i]] = i

# df_train_kion = df_train_kion.replace({"user_id": user_id_dict})
# df_test_kion = df_test_kion.replace({"user_id": user_id_dict})

# movie_id_set = list(set(df_train_kion.movie_id.drop_duplicates()) | set(df_test_kion.movie_id.drop_duplicates()))
# movie_id_dict = dict()

# for i in range(len(movie_id_set)):
    # movie_id_dict[movie_id_set[i]] = i

# df_train_kion = df_train_kion.replace({"movie_id": movie_id_dict})
# df_test_kion = df_test_kion.replace({"movie_id": movie_id_dict})

df_train_kion['timestamp'] = pd.to_datetime(df_train_kion['timestamp'])
df_test_kion['timestamp'] = pd.to_datetime(df_test_kion['timestamp'])

df_train_kion.head()

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,user_id,movie_id,rating,timestamp
50,311805,13865,5.0,2021-08-09
122,451122,7793,0.0,2021-08-05
322,758443,7829,5.0,2021-08-07
334,512738,8881,0.2,2021-08-02
357,643159,5392,3.95,2021-08-07


In [2]:
def make_dot(df_train, df_test):
    test_users = df_test.user_id.drop_duplicates().values
    train_items = df_train.movie_id.drop_duplicates().values

    users = pd.DataFrame({'user_id': test_users})
    items = pd.DataFrame({'item_id': train_items})

    print(users.shape, items.shape)

    users['key'], items['key'] = 0, 0

    recs = users.merge(items, on='key', how='outer')
    recs = recs.merge(df_train, left_on=['user_id', 'item_id'], right_on=['user_id', 'movie_id'], how='left')
    recs = recs[recs['movie_id'].isna()]
    recs = recs[['user_id', 'item_id']].rename(columns={'item_id': 'movie_id'})

    return recs, test_users

recs, index = make_dot(df_train_kion, df_test_kion)
recs

(2180, 1) (4054, 1)


Unnamed: 0,user_id,movie_id
0,449084,13865
1,449084,7793
3,449084,8881
4,449084,5392
5,449084,12995
...,...,...
8837715,154181,14904
8837716,154181,1594
8837717,154181,11329
8837718,154181,3086


In [3]:
df_test_kion = recs
df_test_kion['timestamp'] = df_train_kion.timestamp.max()
df_test_kion['rating'] = 3.4

In [4]:
from collections import defaultdict
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
import pandas as pd
from scipy import sparse as sps

# read movielens 100k data.
import myfm
from myfm import RelationBlock
from myfm.utils.benchmark_data import MovieLens100kDataManager
from myfm.utils.encoders import(
    DataFrameEncoder, CategoryValueToSparseEncoder, BinningEncoder, MultipleValuesToSparseEncoder
)

In [5]:
# Implement side information and flavor of SVD++
# We add "all users who have evaluated a movie in the train set" or
# "all movies rated by a user" as a feture of user/movie.
use_date = True # use date info or not
use_iu = True # use implicit user feature
use_ii = True # use implicit item feature
use_user_info = True # use user information
use_movie_info = True # use movie information

# use_date = False # use date info or not
# use_iu = False # use implicit user feature
# use_ii = False # use implicit item feature
# use_user_info = False # use user information
# use_movie_info = False # use movie information

In [6]:
# data_manager = MovieLens100kDataManager()
df_train, df_test = df_train_kion, df_test_kion

user_info = users_df.set_index(['user_id'])
user_info['sex'] = user_info['sex'].fillna('-')
user_info['age'] = user_info['age'].fillna('-')
user_info['income'] = user_info['income'].fillna('-')

user_info = user_info[['sex', 'age', 'income']]

user_encoder = DataFrameEncoder().add_column(
    'user_id', CategoryValueToSparseEncoder(user_info.index)
)
if use_user_info:
    user_encoder.add_column(
        'sex', CategoryValueToSparseEncoder(user_info.sex)
    ).add_column(
        'age', CategoryValueToSparseEncoder(user_info.age)
    ).add_column(
        'income', CategoryValueToSparseEncoder(user_info.income)
    )

In [9]:
movie_info = items_df
movie_info['release_year'] = movie_info['release_year'].fillna(2007)
movie_info['release_year'] = movie_info['release_year'].astype(int)
movie_info = movie_info.rename(columns={'item_id': 'movie_id'})

movie_info = movie_info[['movie_id', 'release_year', 'genres']].set_index('movie_id')

movie_encoder = DataFrameEncoder().add_column(
    'movie_id', CategoryValueToSparseEncoder(movie_info.index)
)
if use_movie_info:
    movie_encoder.add_column(
        'release_year', BinningEncoder(movie_info.release_year)
    ).add_column(
        'genres', MultipleValuesToSparseEncoder(movie_info.genres, sep=',')
    )

In [10]:
if use_date:
    date_encoder = CategoryValueToSparseEncoder(df_train.timestamp.dt.date.values)
else:
    date_encoder = None

In [11]:
def string_agg(int_list) -> str:
    return ','.join([str(y) for y in int_list])

if use_iu:
    user_info['user_implicit_feature'] = (
        df_train.groupby('user_id')
        .movie_id.agg(string_agg)
        .reindex(user_info.index)
        .fillna('')
    )
if use_ii:
    movie_info['movie_implicit_feature'] = (
        df_train.groupby('movie_id')
        .user_id.agg(string_agg)
        .reindex(movie_info.index)
        .fillna('')
    )

In [12]:
user_info[user_info['user_implicit_feature'] != '']

Unnamed: 0_level_0,sex,age,income,user_implicit_feature
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
269408,Ж,age_35_44,income_20_40,"11919,4847,4289,341,6809,5411,1000,5087,3734,4..."
243779,М,age_45_54,income_20_40,7829
395825,Ж,age_18_24,income_40_60,7829
836440,М,age_45_54,income_40_60,1470375971106093812220
262016,М,age_18_24,income_40_60,5469
...,...,...,...,...
184205,Ж,age_18_24,income_0_20,9728132801431715739
664074,М,age_25_34,income_20_40,5981044086369728
172071,Ж,age_18_24,income_20_40,14
307983,Ж,age_55_64,income_20_40,3784121923734


In [13]:
movie_info[movie_info['movie_implicit_feature'] != '']

Unnamed: 0_level_0,release_year,genres,movie_implicit_feature
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1622,2004,"триллеры, детективы","918363,854899,1064486,500420,734671,552801,509..."
6677,2013,комедии,511858
561,2013,"боевики, триллеры","431293,997446,189946,15845,1016593,530432,6205..."
7308,2014,"боевики, фантастика, детективы, мелодрамы","1063805,276676,23176,490336,685056,30997,46690..."
4358,2017,"фантастика, триллеры",398370812286
...,...,...,...
5525,2016,драмы,117961
1325,2014,"драмы, военные",930631
15610,2015,мелодрамы,102204725431957911679803820798570955
6443,2018,"драмы, триллеры, криминал","57911,43489,626076,90191,545987,390253,191164,..."


In [14]:
if use_iu:
    user_encoder.add_column(
        'user_implicit_feature',
        MultipleValuesToSparseEncoder(user_info.user_implicit_feature, normalize=True)
    )
if use_ii:
    movie_encoder.add_column(
        'movie_implicit_feature',
        MultipleValuesToSparseEncoder(movie_info.movie_implicit_feature, normalize=True)
    )

In [15]:
# given user/movie ids, add additional infos and return it as sparse
def augment_user_id(user_ids):
    return user_encoder.encode_df(
        user_info.reindex(user_ids).reset_index().fillna('-')
    )

def augment_movie_id(movie_ids):
    return movie_encoder.encode_df(
        movie_info.reindex(movie_ids).reset_index().fillna('-')
    )

In [16]:
# https://docs.scipy.org/doc/numpy/reference/generated/numpy.unique.html
train_blocks = []
test_blocks = []
for source, target in [(df_train, train_blocks), (df_test, test_blocks)]:
    unique_users, user_map = np.unique(source.user_id, return_inverse=True)
    target.append(
        RelationBlock(user_map, augment_user_id(unique_users))
    )
    unique_movies, movie_map = np.unique(source.movie_id, return_inverse=True)
    target.append(
        RelationBlock(movie_map, augment_movie_id(unique_movies))
    )
    
if use_date:
    X_date_train = date_encoder.to_sparse(df_train.timestamp.dt.date.values)
    X_date_test = date_encoder.to_sparse(df_test.timestamp.dt.date.values)
else:
    X_date_train = None
    X_date_test = None

In [18]:
%%time
group_shapes = ([len(date_encoder)] if use_date else []) + user_encoder.encoder_shapes + movie_encoder.encoder_shapes

fm = myfm.MyFMRegressor(rank=10)
fm.fit(
    X_date_train, df_train.rating.values, X_rel=train_blocks,
    group_shapes=group_shapes,
    X_test=X_date_test, X_rel_test=test_blocks,
    y_test=df_test.rating.values,
    n_iter=64, n_kept_samples=64
);

alpha = 0.36 w0 = 1.87  rmse_this: 1.83 mae_this: 1.56: 100%|██████████| 64/64 [01:39<00:00,  1.56s/it]

CPU times: user 1min 27s, sys: 12.8 s, total: 1min 40s
Wall time: 1min 39s





In [19]:
%%time
test_predictions = fm.predict(X_date_test, test_blocks)

rmse = (
    (test_predictions - df_test.rating.values)**2
).mean() ** 0.5
mae = np.abs(test_predictions - df_test.rating).mean()

# Note the improvement from "id_only" case.
# Compare this with methods like ones in https://paperswithcode.com/sota/collaborative-filtering-on-movielens-100k
print('rmse={}, mae={}'.format(rmse, mae))

# ошибки с настоящим тестом
# rmse=1.7627994387412238, mae=1.5235408906774044 - все фичи
# rmse=1.8704469637789525, mae=1.6745094493052695 - без фичей

rmse=1.6560185991440899, mae=1.4980154062414262
CPU times: user 2min 15s, sys: 1min 2s, total: 3min 17s
Wall time: 3min 34s


In [20]:
result = df_test_kion.copy()
result['score'] = test_predictions

In [21]:
%%time
sorted_recs = result.sort_values(by=['user_id', 'score'], ascending=False)
top_sorted_recs = sorted_recs.groupby('user_id').head(10).reset_index(drop=True)
top_sorted_recs['rank'] = top_sorted_recs.groupby("user_id")["score"].rank(method="dense", ascending=False).astype(int)
top_sorted_recs = top_sorted_recs.rename(columns={'movie_id': 'item_id'})

CPU times: user 10.7 s, sys: 4.54 s, total: 15.2 s
Wall time: 16.1 s


In [22]:
precision = Precision(k=10)
recall = Recall(k=10)

precision_value = precision.calc(reco=top_sorted_recs.drop(columns=['score']), interactions=interactions_test)
recall_value = recall.calc(reco=top_sorted_recs.drop(columns=['score']), interactions=interactions_test)
print(f"precision: {precision_value}\nrecall: {recall_value}", '\n')
print(top_sorted_recs['user_id'].unique().shape, top_sorted_recs['item_id'].unique().shape)

precision: 0.001055045871559633
recall: 0.004450906509392748 

(2180,) (55,)
