In [1]:
%%capture
!pip install rectools optuna 

In [2]:
import os
from zipfile import ZipFile

import pandas as pd
import numpy as np

import requests
from tqdm.auto import tqdm

from rectools.metrics import calc_metrics, NDCG, MAP, Precision, Recall, MeanInvUserFreq
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel
from rectools.models.popular import PopularModel
from rectools import Columns
from rectools.dataset import Dataset

from lightfm import LightFM

from implicit.als import AlternatingLeastSquares

from sklearn.model_selection import train_test_split

import optuna
import dill
     
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Ask OpenBLAS to run single-threaded.
# NOTE(review): numpy and implicit are already imported in the cell above; BLAS
# libraries commonly read this variable when they are first loaded, so setting it
# here may have no effect — confirm, and move it above the imports if so.
os.environ["OPENBLAS_NUM_THREADS"] = "1"

# Seed passed as random_state to the user splits and to the ALS / LightFM models.
SEED = 2022
# NOTE(review): K_RECOS is not referenced in the visible cells — confirm it is still needed.
K_RECOS = 10

# KION DATASET

In [4]:
url = 'https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip'
PATH_TO_DATA = 'kion_train.zip'

# Stream the archive to disk in 1 MiB chunks.
# - `timeout` keeps the cell from hanging forever on a stalled connection.
# - `raise_for_status` aborts on an HTTP error before the target file is created,
#   so an error page is never saved under the .zip name.
# - the response, the file and the progress bar are context-managed so all three
#   are closed even if the download is interrupted.
with requests.get(url, stream=True, timeout=60) as req:
    req.raise_for_status()
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    with open(PATH_TO_DATA, 'wb') as f, tqdm(
        desc='kion dataset download', total=total_size_in_bytes, unit='iB', unit_scale=True
    ) as progress_bar:
        for chunk in req.iter_content(chunk_size=2 ** 20):
            f.write(chunk)
            progress_bar.update(len(chunk))

kion dataset download:   0%|          | 0.00/78.8M [00:00<?, ?iB/s]

In [5]:
def _read_csv_member(archive, member):
    """Read one CSV file stored inside an open zip archive into a DataFrame."""
    with archive.open(member) as csv_file:
        return pd.read_csv(csv_file)


# Load the three KION tables straight from the downloaded archive.
with ZipFile(PATH_TO_DATA) as archive:
    interactions = _read_csv_member(archive, 'kion_train/interactions.csv')
    items = _read_csv_member(archive, 'kion_train/items.csv')
    users = _read_csv_member(archive, 'kion_train/users.csv')

In [6]:
# Rename the raw KION columns to 'datetime' and 'weight'.
# Note: 'total_dur' becomes 'weight' here, but the weight column is replaced by the
# watched_pct-based value at the end of this cell (Columns.Weight is 'weight' in rectools).
interactions = interactions.rename(
    columns={'last_watch_dt': 'datetime', 'total_dur': 'weight'}
)

interactions['datetime'] = pd.to_datetime(interactions['datetime'])

# Interactions with watched_pct above 10 get weight 3, all others weight 1.
watched_enough = interactions['watched_pct'] > 10
interactions[Columns.Weight] = np.where(watched_enough, 3, 1)

# TRAIN/VAL/TEST

In [7]:
# Time span covered by the interaction log.
watch_dates = interactions[Columns.Datetime]
min_date, max_date = watch_dates.min(), watch_dates.max()

print(f'min дата в interactions: {min_date}')
print(f'max дата в interactions: {max_date}')
print(f'Продолжительность: {max_date - min_date}')

min дата в interactions: 2021-03-13 00:00:00
max дата в interactions: 2021-08-22 00:00:00
Продолжительность: 162 days 00:00:00


In [8]:
# Share of ranker users assigned to each split; the three must add up to 1.
ranker_train_size = 0.7
ranker_val_size = 0.15
ranker_test_size = 0.15
assert np.isclose(ranker_train_size + ranker_val_size + ranker_test_size, 1.0), \
    'ranker split fractions must sum to 1'

# The last `ranker_days_count` days (boundary inclusive) form the ranker data;
# everything strictly before the boundary is the training set of the models below.
# The boundary is computed once so the two windows cannot drift apart.
ranker_days_count = 30
ranker_start_date = max_date - pd.Timedelta(days=ranker_days_count)

ranker_data = interactions[interactions[Columns.Datetime] >= ranker_start_date]
train = interactions[interactions[Columns.Datetime] < ranker_start_date]

# Split by user id so each ranker user lands in exactly one of train / val / test.
train_val_users, test_users = train_test_split(
    ranker_data['user_id'].unique(), random_state=SEED, test_size=ranker_test_size
)

# test_size is rescaled because it is applied to the remaining train+val users only.
train_users, val_users = train_test_split(
    train_val_users, random_state=SEED, test_size=ranker_val_size / (ranker_train_size + ranker_val_size)
)


# Users features

In [9]:
# Count missing values per column of the users table.
users.isna().sum()

user_id         0
age         14095
income      14776
sex         13831
kids_flg        0
dtype: int64

In [10]:
# Replace every missing value with the literal 'Unknown' so it acts as its own category.
users = users.fillna('Unknown')

# Keep only the users that appear in the training interactions.
seen_in_train = users[Columns.User].isin(train[Columns.User])
users = users[seen_in_train].copy()
users.head()

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0
5,1037719,age_45_54,income_60_90,М,0


In [11]:
# Reshape the categorical user columns into a flat (id, value, feature) layout:
# one row per user per feature, stacked feature by feature.
user_features_frames = [
    users[[Columns.User, feature]]
    .rename(columns={Columns.User: "id", feature: "value"})
    .assign(feature=feature)
    for feature in ("sex", "age", "income")
]
user_features = pd.concat(user_features_frames)
user_features.head()

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex


# Item features

In [12]:
# Count missing values per column of the items table.
items.isna().sum()

item_id             0
content_type        0
title               0
title_orig       4745
release_year       98
genres              0
countries          37
for_kids        15397
age_rating          2
studios         14898
directors        1509
actors           2619
description         2
keywords          423
dtype: int64

In [13]:
# Keep only the items that occur in the training interactions.
in_train_catalog = items[Columns.Item].isin(train[Columns.Item])
items = items[in_train_catalog].copy()
items.head()

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео..."
2,10716,film,Тактическая сила,Tactical Force,2011.0,"криминал, зарубежные, триллеры, боевики, комедии",Канада,,16.0,,Адам П. Калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман,...",Профессиональный рестлер Стив Остин («Все или ...,"Тактическая, сила, 2011, Канада, бандиты, ганг..."
3,7868,film,45 лет,45 Years,2015.0,"драмы, зарубежные, мелодрамы",Великобритания,,16.0,,Эндрю Хэй,"Александра Риддлстон-Барретт, Джеральдин Джейм...","Шарлотта Рэмплинг, Том Кортни, Джеральдин Джей...","45, лет, 2015, Великобритания, брак, жизнь, лю..."
4,16268,film,Все решает мгновение,,1978.0,"драмы, спорт, советские, мелодрамы",СССР,,12.0,Ленфильм,Виктор Садовский,"Александр Абдулов, Александр Демьяненко, Алекс...",Расчетливая чаровница из советского кинохита «...,"Все, решает, мгновение, 1978, СССР, сильные, ж..."


## Genre

In [14]:
# Normalise the comma-separated genre string into a list of lower-case genres.
items["genre"] = (
    items["genres"]
    .str.lower()
    .str.replace(", ", ",", regex=False)
    .str.split(",")
)

# One row per (item, genre) in the flat (id, value, feature) layout.
genre_feature = (
    items[["item_id", "genre"]]
    .explode("genre")
    .rename(columns={"item_id": "id", "genre": "value"})
    .assign(feature="genre")
)
genre_feature.head()

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


## Content

In [15]:
# Content type of each item in the flat (id, value, feature) layout.
content_feature = (
    items[[Columns.Item, "content_type"]]
    .rename(columns={Columns.Item: "id", "content_type": "value"})
    .assign(feature="content_type")
)
content_feature.head()

Unnamed: 0,id,value,feature
0,10711,film,content_type
1,2508,film,content_type
2,10716,film,content_type
3,7868,film,content_type
4,16268,film,content_type


## Release year

In [16]:
# Preview the decile intervals that qcut derives from release_year.
pd.qcut(items['release_year'], q=10).head()

0      (1984.0, 2004.0]
1      (2012.0, 2014.0]
2      (2009.0, 2012.0]
3      (2014.0, 2016.0]
4    (1896.999, 1984.0]
Name: release_year, dtype: category
Categories (10, interval[float64, right]): [(1896.999, 1984.0] < (1984.0, 2004.0] < (2004.0, 2009.0] <
                                            (2009.0, 2012.0] ... (2016.0, 2018.0] <
                                            (2018.0, 2019.0] < (2019.0, 2020.0] < (2020.0, 2021.0]]

In [17]:
# Bucket release years into deciles labelled 0..9 (0 is the oldest bucket).
# NOTE(review): the items table reports missing release_year values; those presumably
# stay NaN in the binned column and reach the feature frame as NaN — confirm this is intended.
n_year_bins = 10
items['binned_r_year'] = pd.qcut(
    items['release_year'], q=n_year_bins, labels=list(range(n_year_bins))
)

release_year_feature = (
    items[[Columns.Item, "binned_r_year"]]
    .rename(columns={Columns.Item: "id", "binned_r_year": "value"})
    .assign(feature="binned_r_year")
)
release_year_feature.head()

Unnamed: 0,id,value,feature
0,10711,1,binned_r_year
1,2508,4,binned_r_year
2,10716,3,binned_r_year
3,7868,5,binned_r_year
4,16268,0,binned_r_year


## Country

In [18]:
# Normalise the comma-separated country string into a list of lower-case countries.
items["country"] = (
    items["countries"]
    .str.lower()
    .str.replace(", ", ",", regex=False)
    .str.split(",")
)

# One row per (item, country) in the flat (id, value, feature) layout.
country_feature = (
    items[["item_id", "country"]]
    .explode("country")
    .rename(columns={"item_id": "id", "country": "value"})
    .assign(feature="country")
)
country_feature.head()

Unnamed: 0,id,value,feature
0,10711,испания,country
1,2508,сша,country
2,10716,канада,country
3,7868,великобритания,country
4,16268,ссср,country


## Combine

In [19]:
# Stack all per-feature item frames into one flat item-feature table.
item_feature_frames = [genre_feature, content_feature, country_feature, release_year_feature]
item_features = pd.concat(item_feature_frames)

## Init dataset

In [20]:
# All user and item features are declared as categorical.
user_cat_features = ["sex", "age", "income"]
item_cat_features = ["genre", "country", 'binned_r_year', 'content_type']

# Dataset built from the training interactions plus the flat feature tables.
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=user_cat_features,
    item_features_df=item_features,
    cat_item_features=item_cat_features,
)

# Models

## ALS

In [21]:
# Underlying implicit ALS model, seeded for reproducibility.
als_base = AlternatingLeastSquares(factors=128, regularization=0.25, random_state=SEED)

# rectools wrapper around the ALS model, configured with fit_features_together=True.
model_ALS = ImplicitALSWrapperModel(model=als_base, fit_features_together=True)
model_ALS.fit(dataset)

<rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7f7119969670>

In [22]:
# Persist the fitted ALS wrapper; the "Get candidates" section reloads this file.
with open('implicit_als.dill', 'wb') as f:
   dill.dump(model_ALS, f)

## LightFM

In [23]:
# Underlying LightFM model with WARP loss, seeded for reproducibility.
lightfm_base = LightFM(
    no_components=32,
    loss='warp',
    learning_rate=0.2,
    user_alpha=0.1,
    item_alpha=0.1,
    random_state=SEED,
)

# rectools wrapper: two training epochs on a single thread.
model_lightFM = LightFMWrapperModel(model=lightfm_base, epochs=2, num_threads=1)
model_lightFM.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7f711980cac0>

In [24]:
# Persist the fitted LightFM wrapper; the "Get candidates" section reloads this file.
with open('lightfm_base.dill', 'wb') as f:
   dill.dump(model_lightFM, f)

## Popular

In [25]:
# Popularity baseline with default settings, fitted on the same dataset as the other models.
# The trailing semicolon suppresses the model repr in the cell output.
popular_model = PopularModel()
popular_model.fit(dataset);

In [26]:
# Persist the fitted popularity model to disk (it is not reloaded in the visible cells).
with open('popular_base.dill', 'wb') as f:
   dill.dump(popular_model, f)

# Get candidates

In [27]:
def calc_metrics_(candidates_df, rank_col, k=10):
    """Score a candidate list on the held-out test users.

    Parameters
    ----------
    candidates_df : pd.DataFrame
        Recommendations holding the user and item columns plus a rank column.
    rank_col : str
        Name of the column holding the rank to evaluate. When it is present (and is
        not already ``Columns.Rank``) it replaces any existing ``Columns.Rank``
        column; when it is absent the existing ``Columns.Rank`` column is used.
    k : int, default 10
        Cut-off applied to every metric.

    Returns
    -------
    The result of ``rectools.metrics.calc_metrics`` for the metric set below
    (the printed runs show a mapping of metric name to value).

    Raises
    ------
    KeyError
        If neither ``rank_col`` nor ``Columns.Rank`` is a column of ``candidates_df``.

    Notes
    -----
    Reads the notebook-level ``test_users``, ``ranker_data``, ``train`` and ``items``.
    """
    metrics = {
        'ndcg@k': NDCG(k=k),
        'map@k': MAP(k=k),
        'Precision@k': Precision(k=k),
        'recall@k': Recall(k=k),
        'novelty@k': MeanInvUserFreq(k=k),
    }

    reco = candidates_df[candidates_df[Columns.User].isin(test_users)]
    if rank_col != Columns.Rank and rank_col in reco.columns:
        # Drop a stale rank column first: renaming on top of it would leave two
        # columns with the same name and make the selection below ambiguous.
        reco = (
            reco
            .drop(columns=[Columns.Rank], errors='ignore')
            .rename(columns={rank_col: Columns.Rank})
        )
    reco = reco[[Columns.User, Columns.Item, Columns.Rank]]

    interaction_cols = [Columns.User, Columns.Item, Columns.Datetime, Columns.Weight]
    return calc_metrics(
        metrics=metrics,
        reco=reco,
        # Ground truth: ranker-period interactions of the test users.
        interactions=ranker_data.loc[ranker_data[Columns.User].isin(test_users), interaction_cols],
        # History of the same users from the training period.
        prev_interactions=train.loc[train[Columns.User].isin(test_users), interaction_cols],
        catalog=items['item_id'].unique(),
    )
     

def get_candidates(top_n, model, user_ids=None):
    """Return the top-``top_n`` recommendations of ``model`` per user.

    Parameters
    ----------
    top_n : int
        Number of items to recommend per user.
    model
        Fitted model exposing ``recommend(users, dataset, k, filter_viewed)``.
    user_ids : array-like, optional
        Users to recommend for. Defaults to every user id present in the
        notebook-level ``train`` frame, which is the original behaviour; pass a
        smaller set (for example only the evaluated users) to save time.

    Returns
    -------
    The frame returned by ``model.recommend`` (the displayed outputs show the
    columns user_id, item_id, score and rank).

    Notes
    -----
    Uses the notebook-level ``dataset``; items the user already interacted with
    are filtered out (``filter_viewed=True``).
    """
    if user_ids is None:
        user_ids = train['user_id'].unique()
    return model.recommend(
        users=user_ids,
        dataset=dataset,
        k=top_n,
        filter_viewed=True,
    )

## ALS

In [28]:
# Reload the ALS wrapper written by the earlier dump cell.
# Loading a dill/pickle file can execute arbitrary code, so only load files this notebook produced.
# NOTE(review): the generic name `model` is rebound to the LightFM model further down;
# cells using it depend on which load cell ran last.
with open('implicit_als.dill', 'rb') as f:
   model = dill.load(f)

In [31]:
%%time

for n in [20, 30, 40, 50]:
  candidates = get_candidates(n, model)
  metric_values = calc_metrics_(candidates, 'als_rank', k=n)
  print(f'n: {n}, metric_values: {metric_values}')

n: 20, metric_values: {'Precision@k': 0.01475680321704296, 'recall@k': 0.09328771399591718, 'ndcg@k': 0.02207915157208061, 'map@k': 0.044474379273513845, 'novelty@k': 4.719567790144806}
n: 30, metric_values: {'Precision@k': 0.011787126711995867, 'recall@k': 0.10975385978110253, 'ndcg@k': 0.018324888437519227, 'map@k': 0.045454530290375224, 'novelty@k': 5.309005120301941}
n: 40, metric_values: {'Precision@k': 0.00988365846545823, 'recall@k': 0.12025884870024398, 'ndcg@k': 0.015863573562489162, 'map@k': 0.04593377189373775, 'novelty@k': 5.763944619189056}
n: 50, metric_values: {'Precision@k': 0.008639970530904179, 'recall@k': 0.12915652741504854, 'ndcg@k': 0.01415496335905827, 'map@k': 0.046264064907411286, 'novelty@k': 6.104388061366736}
CPU times: user 1h 18min 46s, sys: 18min 52s, total: 1h 37min 38s
Wall time: 50min 33s


In [32]:
# Top-30 candidates per train user from the currently loaded model, written to CSV.
candidates = get_candidates(top_n=30, model=model)
candidates.to_csv('als_candidates.csv', index=False)
candidates.head()

Unnamed: 0,user_id,item_id,score,rank
0,176549,13865,0.444931,1
1,176549,10440,0.420671,2
2,176549,15297,0.392164,3
3,176549,4151,0.276921,4
4,176549,7571,0.255112,5


## LightFM

In [33]:
# Reload the LightFM wrapper written by the earlier dump cell; this rebinds `model`,
# which held the ALS wrapper until now.
# Loading a dill/pickle file can execute arbitrary code, so only load files this notebook produced.
with open('lightfm_base.dill', 'rb') as f:
   model = dill.load(f)

In [34]:
%%time

for n in [20, 30, 40, 50]:
  candidates = get_candidates(n, model)
  metric_values = calc_metrics_(candidates, 'lfm_rank', k=n)
  print(f'n: {n}, metric_values: {metric_values}')

n: 20, metric_values: {'Precision@k': 0.016691480054640617, 'recall@k': 0.10899972477901491, 'ndcg@k': 0.022532463449440633, 'map@k': 0.04135556092048772, 'novelty@k': 3.9686340354625336}
n: 30, metric_values: {'Precision@k': 0.012251162647921048, 'recall@k': 0.1181839750074644, 'ndcg@k': 0.018105037342263562, 'map@k': 0.04196182080599867, 'novelty@k': 4.685931485661368}
n: 40, metric_values: {'Precision@k': 0.009695639494727794, 'recall@k': 0.12273366193534252, 'ndcg@k': 0.015308070199417637, 'map@k': 0.04215847506848677, 'novelty@k': 5.2733945348082845}
n: 50, metric_values: {'Precision@k': 0.008061946495172901, 'recall@k': 0.12589946456751785, 'ndcg@k': 0.013376716826057063, 'map@k': 0.04227140241892464, 'novelty@k': 5.615452742525192}
CPU times: user 38min 42s, sys: 18min 15s, total: 56min 57s
Wall time: 29min 38s


In [35]:
# Top-30 candidates per train user from the currently loaded model, written to CSV.
candidates = get_candidates(top_n=30, model=model)
candidates.to_csv('lfm_candidates.csv', index=False)
candidates.head()

Unnamed: 0,user_id,item_id,score,rank
0,176549,13865,-0.000163,1
1,176549,15297,-0.000164,2
2,176549,10440,-0.000165,3
3,176549,4151,-0.000168,4
4,176549,3734,-0.000168,5


## Popular 

In [36]:
# Number of distinct items left in `items`; used as k in the next cell so the
# popularity list can cover the whole filtered catalogue.
k = items['item_id'].nunique()

In [37]:
# Request the popularity ranking for a single user only.
# NOTE(review): presumably the popular list does not depend on the user when
# filter_viewed=False, which is why one user is enough — confirm.
first_user = dataset.user_id_map.external_ids[:1]
candidates = popular_model.recommend(
    users=first_user,
    dataset=dataset,
    k=k,
    filter_viewed=False,
)

In [38]:
# Write the popularity ranking to CSV (without the index) and preview the first rows.
candidates.to_csv('popular_candidates.csv', index=False)
candidates.head()

Unnamed: 0,user_id,item_id,score,rank
0,176549,10440,141889.0,1
1,176549,15297,137128.0,2
2,176549,13865,93403.0,3
3,176549,9728,76978.0,4
4,176549,4151,69641.0,5
