In [None]:
# default_exp recommender

# Recommender

> This module exposes classes and functions for generating recommendations from the Word2Vec embeddings trained with the Gensim library, using a nearest-neighbour search over the embedding vectors.

https://radimrehurek.com/gensim/

In [None]:
#hide
from nbdev.export import *

In [None]:
#export
import logging
import random
from typing import List, NamedTuple, Tuple
from datetime import datetime
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from nbdev.showdoc import *
from tqdm import tqdm

from word2vec_recommender.core import *

In [None]:
#export
# Module-level logger, named after the importing module via __name__.
logger = logging.getLogger(__name__)

In [None]:
#export
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize

class KnnRecommender:
    """Nearest-neighbour recommender over a matrix of item embeddings.

    Row ``i`` of ``embeddings`` is treated as the vector of the item whose id
    is ``word_indexes[i]``. Call `fit` once, then `recommend_by_index` to get
    the items closest to a given row.
    """

    def __init__(
        self,
        word_indexes: List[str],
        embeddings: np.ndarray,
        n_recommendations: int = 10,
        algorithm: str = 'brute'):
        """Store the id lookup and a row-normalised copy of the embeddings.

        Args:
            word_indexes: item ids, positionally aligned with the rows of
                ``embeddings``; each id must be convertible with ``int()``.
            embeddings: 2-D array with one embedding vector per row.
            n_recommendations: maximum number of recommendations per query.
            algorithm: forwarded to ``NearestNeighbors(algorithm=...)``.
        """
        self.word_indexes = word_indexes
        # https://stackoverflow.com/a/34145444 On unit-length vectors the euclidean
        # distance orders neighbours exactly like the cosine distance does.
        self.embeddings = normalize(embeddings)
        self._n_recommendations = n_recommendations
        self._algorithm = algorithm

        # Populated by fit(); None means the recommender has not been fitted yet.
        self.nn_model: NearestNeighbors = None

    def fit(self):
        """Build the nearest-neighbour index over the normalised embeddings."""
        # One extra neighbour is requested because querying with an indexed row
        # normally returns that row itself, which is then discarded.
        self.nn_model = NearestNeighbors(n_neighbors=self._n_recommendations+1, algorithm=self._algorithm)
        self.nn_model.fit(self.embeddings)

    def recommend_by_index(self, index: int) -> List[Recommendation]:
        """Return the items closest to the item stored at row ``index``.

        Args:
            index: row position in the embeddings matrix (not a movie id).

        Returns:
            Up to ``n_recommendations`` ``Recommendation`` objects ordered from
            closest to farthest. ``score`` is the distance reported by the
            neighbour search on the normalised vectors, so lower means more
            similar. The seed item itself is never part of the result.

        Raises:
            ValueError: if `fit` has not been called yet.
            IndexError: if ``index`` is outside the embeddings matrix.
        """
        if self.nn_model is None:
            raise ValueError('you should call fit() before generating recommendations')
        embedding = self.embeddings[index]

        n_items = self.embeddings.shape[0]
        # Map a valid negative index to its positive row so it can be compared
        # against the row numbers returned by the neighbour search.
        seed_row = index % n_items
        # Never ask for more neighbours than there are rows; kneighbors raises otherwise.
        n_neighbors = min(self._n_recommendations + 1, n_items)
        distances_array, indexes_array = self.nn_model.kneighbors([embedding], n_neighbors=n_neighbors)

        # Drop the seed by row number rather than by position: with duplicated or
        # tied embeddings the seed is not guaranteed to be the first neighbour.
        recommendations = [
            Recommendation(movie_id=int(self.word_indexes[ind]), score=dist)
            for ind, dist in zip(indexes_array[0], distances_array[0])
            if ind != seed_row
        ]
        # If the seed was not among the neighbours there is one result too many.
        return recommendations[:self._n_recommendations]


Read the embeddings and the word index generated by the model.

In [None]:
# Load the two artefacts produced by the model: the embedding matrix and the
# word index that maps each row of that matrix back to a word (movie id).
# NOTE(review): both files carry a .pkl extension but are read with np.load;
# if they hold pickled objects np.load needs allow_pickle=True — confirm how
# they were written.
embeddings_path = Path("./data/out/embeddings.pkl")
words_index_path = Path("./data/out/words_index.pkl")

with embeddings_path.open("rb") as f:
    embeddings = np.load(f)

with words_index_path.open("rb") as f:
    word_indexes = np.load(f)

In [None]:
# Build the recommender with its default settings (10 recommendations,
# brute-force search) and fit the neighbour index over the loaded embeddings.
knn_recommender = KnnRecommender(embeddings=embeddings, word_indexes=word_indexes)
knn_recommender.fit()

In [None]:
# Read the movies catalogue and wrap it in a MovieRepository, which
# print_recommendations uses below to display the recommended movies.
movies_df = pd.read_csv('./data/ml-latest-small/movies.csv')
movie_repository = MovieRepository(movies_df)

Generating recommendations

> Note that `recommend_by_index` takes a row index into the embeddings array, not a movie id, so we first need to find the index that corresponds to the movie id.

In [None]:
# Use one row of the embeddings matrix as the seed; the same row number is
# used both to look up the seed's movie id and to query the recommender.
seed_index = 10
seed_id = int(word_indexes[seed_index])
recommendations = knn_recommender.recommend_by_index(seed_index)
print_recommendations(movie_repository, seed_id, recommendations)

Movie(movie_id=527, title="Schindler's List (1993)", genres='Drama|War')
> Recommendations:
>> Movie(movie_id=50, title='Usual Suspects, The (1995)', genres='Crime|Mystery|Thriller') score=0.9896380305290222
>> Movie(movie_id=593, title='Silence of the Lambs, The (1991)', genres='Crime|Horror|Thriller') score=0.9955818057060242
>> Movie(movie_id=318, title='Shawshank Redemption, The (1994)', genres='Crime|Drama') score=1.0348621606826782
>> Movie(movie_id=2436, title='Tea with Mussolini (1999)', genres='Comedy|Drama|War') score=1.046755313873291
>> Movie(movie_id=356, title='Forrest Gump (1994)', genres='Comedy|Drama|Romance|War') score=1.0480786561965942
>> Movie(movie_id=381, title='When a Man Loves a Woman (1994)', genres='Drama|Romance') score=1.0549352169036865
>> Movie(movie_id=1299, title='Killing Fields, The (1984)', genres='Drama|War') score=1.0598199367523193
>> Movie(movie_id=227, title='Drop Zone (1994)', genres='Action|Thriller') score=1.0603240728378296
>> Movie(movie_id=