# 0. Configuration

In [45]:
# links to shared data MovieLens
# source on kaggle: https://www.kaggle.com/code/quangnhatbui/movie-recommender/data
RATINGS_SMALL_URL = 'https://drive.google.com/file/d/1BlZfCLLs5A13tbNSJZ1GPkHLWQOnPlE4/view?usp=share_link'
MOVIES_METADATA_URL = 'https://drive.google.com/file/d/19g6-apYbZb5D-wRj4L7aYKhxS-fDM4Fb/view?usp=share_link'

# 1. Modules and functions

In [46]:
# just to make it available to download w/o SSL verification
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

import numpy as np
import pandas as pd
import scipy.sparse as sp

from itertools import islice, cycle, product

from tqdm import tqdm_notebook
import warnings
warnings.filterwarnings('ignore')


## 1. 1. Helper functions to avoid copy paste

In [47]:
def read_csv_from_gdrive(url):
    """
    gets csv data from a given url (taken from file -> share -> copy link)
    :url: example https://drive.google.com/file/d/1BlZfCLLs5A13tbNSJZ1GPkHLWQOnPlE4/view?usp=share_link
    """
    file_id = url.split('/')[-2]
    file_path = 'https://drive.google.com/uc?export=download&id=' + file_id
    data = pd.read_csv(file_path)

    return data

# 2. Main

## 2.1. Load Data

`interactions` dataset shows list of movies that users watched, along with given ratings:

In [48]:
# interactions data
interactions = read_csv_from_gdrive(RATINGS_SMALL_URL)
interactions.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


`movies_metadata` dataset shows the list of movies existing on OKKO platform:

In [49]:
# information about films etc
movies_metadata = read_csv_from_gdrive(MOVIES_METADATA_URL)
movies_metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [50]:
movies_metadata['id'] = movies_metadata['id'].astype(str)
interactions['movieId'] = interactions['movieId'].astype(str)

In [51]:
# leave only those films that intersect with each other
interactions_filtered = interactions.loc[interactions['movieId'].isin(movies_metadata['id'])]
print(interactions.shape, interactions_filtered.shape)

(100004, 4) (44989, 4)


In [52]:
# sorl values by time
interactions_filtered= interactions_filtered.sort_values(by='timestamp', ascending=True)

In [53]:
# global test/train split
train_len = round(len(interactions_filtered)*0.8)
global_train= interactions_filtered[:train_len]
global_test = interactions_filtered[:(1-train_len)]
print(len(global_train),len(global_test))

35991 8999


In [54]:
# train test/train split
train_len = round(len(global_train)*0.8)
val_train= global_train[:train_len]
val_test = global_train[:(1-train_len)]
print(len(val_train),len(val_test))

28793 7199


## 2.2 Data preparation using LightFM Dataset

To use implicit kNN method `fit` we need a sparse matrix in COOrdinate format. To achieve that we will use `scipy.sparse.coo_matrix` from scipy;


In [55]:
def get_coo_matrix(
        df: pd.DataFrame, 
        user_col: str,
        item_col: str, 
        users_mapping: dict, 
        movies_mapping: dict,
        weight_col: str = None
        ):
    if weight_col is None:
        weights = np.ones(len(df), dtype=np.float32)
    else:
        weights = df[weight_col].astype(np.float32)
    interaction_matrix = sp.coo_matrix((
        weights, 
        (
            df[user_col].map(users_mapping.get), 
            df[item_col].map(movies_mapping.get)
        )
    ))
    return interaction_matrix


In [56]:
# define users mapping
users_inv_mapping = dict(enumerate(interactions_filtered['userId'].unique()))
users_mapping = {v: k for k, v in users_inv_mapping.items()}
len(users_mapping)


671

In [57]:
# define movies mapping
movies_inv_mapping = dict(enumerate(interactions_filtered['movieId'].unique()))
movies_mapping = {v: k for k, v in movies_inv_mapping.items()}
len(movies_mapping)


2830

In [58]:
# defining train set on the whole interactions dataset (as HW you will have to split into test and train for evaluation)
train_mat = get_coo_matrix(
    interactions_filtered,
    user_col = 'userId',
    item_col = 'movieId',
    users_mapping = users_mapping,
    movies_mapping = movies_mapping
    ).tocsr()


In [59]:
train_mat

<671x2830 sparse matrix of type '<class 'numpy.float32'>'
	with 44989 stored elements in Compressed Sparse Row format>

In [60]:
type(train_mat)

scipy.sparse._csr.csr_matrix

## 2.3. Model Training & Evaluation

In [`implicit`](https://pypi.org/project/implicit/), there are various models and can be groupped into:
- Item-to-Item: KNN based on various similarities - CosineRecommender, BM25Recommender, TFIDFRecommender
- implicit ALS;
- Logistic Matrix Factorization;
- Bayesian Personalized Ranking (BPR)


### 2.3.1. Train Model

In [61]:
from implicit.nearest_neighbours import (
    CosineRecommender,
    BM25Recommender,
    TFIDFRecommender
    )


Note that in item-to-item models we need to provide matrix in the form of item-user by transposing initial COO matrix user-item


In [62]:
# fit the model
cosine_model = CosineRecommender(K = 20)
cosine_model.fit(train_mat.T)


  0%|          | 0/671 [00:00<?, ?it/s]

In [63]:
type(cosine_model)

implicit.nearest_neighbours.CosineRecommender

### 2.3.2. Evaluate the Model

In [64]:
# let's make sense-check
top_N = 10
user_id = interactions_filtered['userId'].iloc[0]
row_id = users_mapping[user_id]
print(f'Rekko for user {user_id}, row number in matrix - {row_id}')

Rekko for user 383, row number in matrix - 0


In [65]:
# create mapper for movieId and title names
movie_name_mapper = dict(zip(movies_metadata['id'], movies_metadata['original_title']))

In [66]:
recs = cosine_model.recommend(
    row_id,
    train_mat,
    N = top_N,
    filter_already_liked_items = True
    )
recs = pd.DataFrame(recs).T.rename(columns = {0: 'col_id', 1: 'similarity'})
recs['inv_movie_id'] = recs['col_id'].astype(int)
recs['movieId'] = recs['inv_movie_id'].map(movies_inv_mapping.get)
recs['title'] = recs['movieId'].map(movie_name_mapper)
recs


Unnamed: 0,col_id,similarity,inv_movie_id,movieId,title
0,48.0,6.75178,48,435,The Day After Tomorrow
1,42.0,6.389411,42,58,Pirates of the Caribbean: Dead Man's Chest
2,85.0,6.242639,85,300,La science des rêves
3,88.0,5.899756,88,122,The Lord of the Rings: The Return of the King
4,94.0,5.389145,94,236,Muriel's Wedding
5,46.0,4.558815,46,232,Rumble Fish
6,44.0,4.070089,44,103,Taxi Driver
7,93.0,4.009759,93,252,Willy Wonka & the Chocolate Factory
8,79.0,3.976724,79,111,Scarface
9,13.0,3.337076,13,348,Alien


In [67]:
#deline lfm_recommend function
def  lfm_recommend(mat,movies_df: pd.DataFrame, top_N:int,user_id,row_id,filter_already_liked_items = True):
    movie_name_mapper = dict(zip(movies_df['id'], movies_df['original_title']))
    recs = model.recommend(
    row_id,
    mat.T.tocsr(),
    N = top_N,
    filter_already_liked_items = True
    )
    recs = pd.DataFrame(recs).T.rename(columns = {0: 'col_id', 1: 'similarity'})
    recs['inv_movie_id'] = recs['col_id'].astype(int)
    recs['movieId'] = recs['inv_movie_id'].map(movies_inv_mapping.get)
    recs['title'] = recs['movieId'].map(movie_name_mapper)
    return recs

In [68]:
train_mat = get_coo_matrix(
    val_train,
    user_col = 'userId',
    item_col = 'movieId',
    users_mapping = users_mapping,
    movies_mapping = movies_mapping
    ).tocsr()

In [87]:
test_val_mat = get_coo_matrix(
    val_test,
    user_col = 'userId',
    item_col = 'movieId',
    users_mapping = users_mapping,
    movies_mapping = movies_mapping
    ).tocsr()

In [88]:
test_g_mat = get_coo_matrix(
    global_test,
    user_col = 'userId',
    item_col = 'movieId',
    users_mapping = users_mapping,
    movies_mapping = movies_mapping
    ).tocsr()

In [70]:
model = CosineRecommender(K = 20)
model.fit(train_mat.T)

  0%|          | 0/406 [00:00<?, ?it/s]

In [80]:
def  lfm_recommend(mat,movies_df: pd.DataFrame, top_N:int,row_id,filter_already_liked_items = True):
    movie_name_mapper = dict(zip(movies_df['id'], movies_df['original_title']))
    recs = cosine_model.recommend(
        row_id,
        mat,
        N = top_N,
        filter_already_liked_items = True
        )
    recs = pd.DataFrame(recs).T.rename(columns = {0: 'col_id', 1: 'similarity'})
    recs['inv_movie_id'] = recs['col_id'].astype(int)
    recs['movieId'] = recs['inv_movie_id'].map(movies_inv_mapping.get)
    recs['title'] = recs['movieId'].map(movie_name_mapper)
    return recs

In [82]:
test1 = lfm_recommend(test_mat,movies_metadata, top_N,row_id,filter_already_liked_items = True)
test1

Unnamed: 0,col_id,similarity,inv_movie_id,movieId,title
0,48.0,6.75178,48,435,The Day After Tomorrow
1,42.0,6.389411,42,58,Pirates of the Caribbean: Dead Man's Chest
2,85.0,6.242639,85,300,La science des rêves
3,88.0,5.899756,88,122,The Lord of the Rings: The Return of the King
4,94.0,5.389145,94,236,Muriel's Wedding
5,46.0,4.558815,46,232,Rumble Fish
6,44.0,4.070089,44,103,Taxi Driver
7,93.0,4.009759,93,252,Willy Wonka & the Chocolate Factory
8,79.0,3.976724,79,111,Scarface
9,13.0,3.337076,13,348,Alien


In [104]:
test2 = lfm_recommend(test_g_mat,movies_metadata, top_N,row_id,filter_already_liked_items = True)
test2

Unnamed: 0,col_id,similarity,inv_movie_id,movieId,title
0,48.0,6.75178,48,435,The Day After Tomorrow
1,42.0,6.389411,42,58,Pirates of the Caribbean: Dead Man's Chest
2,85.0,6.242639,85,300,La science des rêves
3,88.0,5.899756,88,122,The Lord of the Rings: The Return of the King
4,94.0,5.389145,94,236,Muriel's Wedding
5,46.0,4.558815,46,232,Rumble Fish
6,44.0,4.070089,44,103,Taxi Driver
7,93.0,4.009759,93,252,Willy Wonka & the Chocolate Factory
8,79.0,3.976724,79,111,Scarface
9,13.0,3.337076,13,348,Alien


In [94]:
def compute_gain(y_value: float, gain_scheme: str) -> float:
    
    gain = {'exp2': 2 ** y_value - 1,
            'const': y_value}

    return float(gain[gain_scheme])

In [99]:
from math import log2

def dcg_k(y_true: np.array, y_pred: np.array, k, gain_scheme: str) -> float:
    
    dcg = 0
    argsort = np.argsort(y_pred)[::-1]
    y_true_sorted = y_true[argsort]
    y_true_sorted_k = y_true_sorted[:k]

    for idx, val in enumerate(y_true_sorted_k, 1):
        gain = compute_gain(val, gain_scheme)
        dcg += gain / log2(idx + 1)
        
    return dcg

In [100]:
def ndcg_k(y_true: np.array, y_pred: np.array, k=5,gain_scheme: str = 'const') -> float:
    
    # pred dcg then we calc the same to find max possible
    y_true = y_true[:k]
    y_pred = y_pred[:k]
    preds_dcg = dcg_k(y_true, y_pred,k, gain_scheme)
    max_possible_dcg = dcg_k(y_true, y_true,k, gain_scheme)

    return preds_dcg / max_possible_dcg

In [101]:
def dummy_ndcg(recs_df,k):
    y_true  = recs_df['similarity'].values
    y_pred = np.array(y_true * 0)
    score = ndcg_k(y_true,y_pred,k)
    return score

In [102]:
dummy_ndcg(test1,10)

0.8460470714893091

In [105]:
dummy_ndcg(test2,10)

0.8460470714893091

# TODO
- Make global train/ global test split -- train the model appropiately and predict on test set;
- Wrap up in function recommendations - lfm_recommend();
- Calculate `NDCG@10` on test set