In [None]:
# default_exp model

# Word2Vec Recommender

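> Movie recommender that trains a gensim Word2Vec model on per-user sequences of positively rated movies and returns similar movies for a given movie id.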

In [None]:
#export
import logging
import random
from typing import List, NamedTuple, Tuple
from datetime import datetime

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from gensim.models import Word2Vec 
from gensim.models.callbacks import CallbackAny2Vec
from nbdev.showdoc import *
from tqdm import tqdm

In [None]:
#export
# constants
# Seed for the deterministic shuffle of user ids that drives the train/validation split
# in Word2VecMovieRecommender.__init__.
RANDOM_SEED = 31

# Module-level logger; EpochLogger sends its progress messages to it at INFO level.
logger = logging.getLogger(__name__)

In [None]:
#export
class GensimParameters(NamedTuple):
    """
    Hyper-parameters forwarded as keyword arguments to gensim's ``Word2Vec`` constructor.

    The field names have to match the constructor's keyword names, because
    ``Word2VecMovieRecommender.train`` expands this tuple with ``**_asdict()``.
    """
    # NOTE(review): the per-field meanings below follow gensim's Word2Vec documentation;
    # confirm them against the gensim version that is actually installed.

    # maximum distance between the current and the predicted item within a sentence
    window: int = 10
    # number of training epochs
    # NOTE(review): gensim 4.x renamed this keyword to `epochs` - confirm the pinned version
    iter: int = 20
    # training algorithm: 1 = skip-gram, 0 = CBOW
    sg: int = 1
    # 0 disables hierarchical softmax; negative sampling is used when `negative` > 0
    hs: int = 0
    # number of negative samples drawn per positive example
    negative: int = 10
    # initial learning rate
    alpha: float = 0.03
    # learning rate the schedule decays to
    min_alpha: float = 0.0007
    # seed for gensim's random number generator
    seed: int = 14
    # must be True for EpochLogger to get a meaningful value from get_latest_training_loss()
    compute_loss: bool = True

In [None]:
# Hyper-parameters used for the demo run below; window=10 equals the field default
# and is only spelled out explicitly, every other field keeps its default.
gensim_parameters = GensimParameters(window=10)

In [None]:
#export
def generate_sentences_by_user(df: pd.DataFrame):
    """
    Build the Gensim training sentences for a ratings dataframe.

    One sentence is produced per `userId`: the list of that user's `movieId`
    values converted to strings. Rows are consumed in the order in which they
    appear in `df`, so the caller is expected to have sorted the frame by
    timestamp beforehand. The expectation is that the Word2Vec model then learns
    similar items from their proximity inside a user's history.

    :param df: dataframe with at least the `userId` and `movieId` columns
    :return: list of sentences (one list of string movie ids per user)
    """
    movie_ids_per_user = df.groupby('userId')['movieId']
    sentences = movie_ids_per_user.apply(lambda movie_ids: list(map(str, movie_ids)))
    return sentences.tolist()

In [None]:
#export
class EpochLogger(CallbackAny2Vec):
    """
    Gensim training callback that reports the duration and the loss of every epoch.

    Messages are always sent to the module logger at INFO level and, optionally,
    printed to stdout as well.
    """

    def __init__(
        self,
        print_to_stdout: bool = False
        ):
        """
        Constructor for the class to log progress information.
        :param print_to_stdout: when True every message is also printed to stdout
        """
        self._epoch = 1
        self._start = datetime.now()
        self._end = datetime.now()
        self._print_to_stdout = print_to_stdout
        # running loss seen at the end of the previous epoch, used to derive the per-epoch loss
        self._previous_loss = 0.0

    def _report(self, msg: str) -> None:
        """
        Send a progress message to the logger and, if enabled, to stdout.
        :param msg: text to report
        """
        if self._print_to_stdout:
            print(msg)
        logger.info(msg)

    def on_epoch_begin(self, _):
        """
        Print progress information, initializes start time.
        :param _: type gensim word2vec, signature to match the function to be used by gensim
        """
        self._start = datetime.now()
        self._report(f"Epoch #{self._epoch} start")

    def on_epoch_end(self, model):
        """
        Print the time taken by the epoch and the loss accumulated during it.
        :param model: type gensim word2vec, signature to match the function to be used by gensim
        """
        self._end = datetime.now()
        elapsed = self._end - self._start
        self._report(f"Epoch #{self._epoch} end in {elapsed} time")

        # gensim reports a running total, not a per-epoch value, so the raw number
        # grows every epoch; the difference to the previous reading is this epoch's loss.
        cumulative_loss = model.get_latest_training_loss()
        if cumulative_loss < self._previous_loss:
            # the running total was reset (e.g. the callback is reused for a new training run)
            self._previous_loss = 0.0
        epoch_loss = cumulative_loss - self._previous_loss
        self._previous_loss = cumulative_loss

        self._report(f"Epoch #{self._epoch}, loss {epoch_loss} (cumulative {cumulative_loss})")
        self._epoch += 1

In [None]:
#export
class Word2VecMovieRecommender:
    """
    Item-to-item movie recommender built on a gensim Word2Vec model.

    Positive ratings are grouped per user into "sentences" of movie ids, the
    model is trained on those sentences, and similar movies are looked up as
    the nearest neighbours of a movie's embedding.
    """

    def __init__(
        self,
        movies_df: pd.DataFrame,
        ratings_df: pd.DataFrame,
        gensim_parameters: GensimParameters,
        positive_rating_threshold: float = 3.0,
        train_validation_ratio: float = 0.9
        ):
        """
        Join ratings with movie metadata, keep positive ratings and split the users into train/validation.

        :param movies_df: movie metadata; needs a `movieId` column, `title` and `genres` are read when formatting results
        :param ratings_df: ratings; needs the `userId`, `movieId`, `rating` and `timestamp` columns
        :param gensim_parameters: keyword arguments forwarded to `Word2Vec` by `train`
        :param positive_rating_threshold: ratings greater than or equal to this value are kept
        :param train_validation_ratio: fraction of the users (not of the ratings) assigned to the training split
        :raises ValueError: if `train_validation_ratio` is not within [0, 1]
        """
        if not 0.0 <= train_validation_ratio <= 1.0:
            raise ValueError(
                f"train_validation_ratio must be within [0, 1], got {train_validation_ratio}"
            )

        self.movies_df = movies_df
        self.ratings_df = ratings_df
        self.gensim_parameters = gensim_parameters
        self.movie_id_dict = movies_df.set_index("movieId").to_dict('index')
        # stays None until train() has been called
        self.model: Word2Vec = None

        # joins movie and ratings df
        df_joined = ratings_df.set_index('movieId').join(movies_df.set_index('movieId'), on='movieId', rsuffix='movie_').reset_index()
        # keep only positive ratings
        df_joined = df_joined[df_joined['rating'] >= positive_rating_threshold]
        # sort by user interactions (not in place: the frame above is a filtered selection)
        df_joined = df_joined.sort_values(by=['userId', 'timestamp'])
        # train validation split, done per user so a user's whole history lands in one split;
        # the fixed seed keeps the split reproducible
        user_ids = df_joined["userId"].unique().tolist()
        random.Random(RANDOM_SEED).shuffle(user_ids)
        training_size = int(train_validation_ratio * len(user_ids))
        training_user_ids = user_ids[:training_size]
        validation_user_ids = user_ids[training_size:]
        assert len(validation_user_ids) + len(training_user_ids) == len(user_ids)
        self.train_df = df_joined[df_joined['userId'].isin(training_user_ids)]
        self.validation_df = df_joined[df_joined['userId'].isin(validation_user_ids)]


    def __repr__(self):
        """
        Summarise the recommender by the shapes of its input and split dataframes.
        """
        return f"movies={self.movies_df.shape}, ratings={self.ratings_df.shape}, " +\
            f"train_df={self.train_df.shape}, validation_df={self.validation_df.shape}"


    def train(self, print_progress: bool = True):
        """
        Train the Word2Vec model on the sentences generated from the training split.

        :param print_progress: when True the per-epoch progress is printed to stdout in addition to being logged
        """
        sentences_train = generate_sentences_by_user(self.train_df)
        self.model = Word2Vec(sentences_train, callbacks=[EpochLogger(print_to_stdout=print_progress)],  **self.gensim_parameters._asdict())

    def similar_by_movie_id(self, seed_movie_id: int, n: int = 5) -> Tuple[str, List[str]]:
        """
        Return the `n` movies whose embeddings are closest to the seed movie's embedding.

        `train` has to be called first.

        :param seed_movie_id: id of the movie to find neighbours for
        :param n: number of similar movies to return
        :return: tuple of (description of the seed movie, descriptions of the similar movies
            followed by the score reported by gensim)
        :raises KeyError: if `seed_movie_id` is not present in `movies_df`
        """
        def movie_to_str(mv, distance=None):
            return f"{mv['title']} - {mv['genres']} {distance}"

        seed_key = str(seed_movie_id)
        seed_movie = movie_to_str(self.movie_id_dict[seed_movie_id])
        movie_embedding = self.model.wv[seed_key]
        # ask for one extra result because the seed itself is normally the best match,
        # then drop it by key instead of assuming it is always ranked first
        candidates = self.model.wv.similar_by_vector(movie_embedding, topn=n + 1)
        neighbours = [(key, score) for key, score in candidates if key != seed_key][:n]

        similars = []
        for movie_id, score in neighbours:
            movie = self.movie_id_dict.get(int(movie_id))
            if movie:
                similars.append(movie_to_str(movie, score))
            else:
                similars.append(f"movie={movie_id} not found!")

        return (seed_movie, similars)





In [None]:
# Load the movie catalogue (movieId, title, genres); the path is relative to the
# notebook's working directory - presumably the MovieLens "latest-small" dataset.
movies_df = pd.read_csv('data/ml-latest-small/movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# Load the user ratings (userId, movieId, rating, timestamp) from the same dataset folder.
ratings_df = pd.read_csv('data/ml-latest-small/ratings.csv')
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
# Build the recommender with the default rating threshold and train/validation ratio.
word2vec_recommender = Word2VecMovieRecommender(
    movies_df=movies_df,
    ratings_df=ratings_df,
    gensim_parameters=gensim_parameters
)

# The bare expression displays __repr__, i.e. the shapes of the input and split frames.
word2vec_recommender

movies=(9742, 3), ratings=(100836, 4), train_df=(74918, 6), validation_df=(6845, 6)

In [None]:
# Peek at the training split: positive ratings joined with the movie metadata,
# ordered by user and then by timestamp.
word2vec_recommender.train_df.head()

Unnamed: 0,movieId,userId,rating,timestamp,title,genres
43,804,1,4.0,964980499,She's the One (1996),Comedy|Romance
73,1210,1,5.0,964980499,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi
120,2018,1,5.0,964980523,Bambi (1942),Animation|Children|Drama
171,2628,1,4.0,964980523,Star Wars: Episode I - The Phantom Menace (1999),Action|Adventure|Sci-Fi
183,2826,1,4.0,964980523,"13th Warrior, The (1999)",Action|Adventure|Fantasy


In [None]:
# sampling of interactions: sentences built from the first 10 training rows only
# (one sentence per user present in those rows)
generate_sentences_by_user(word2vec_recommender.train_df.head(10))

[['804', '1210', '2018', '2628', '2826', '3578', '3617', '3744', '101', '441']]

In [None]:
# Train the Word2Vec model; progress is printed because print_progress defaults to True.
word2vec_recommender.train()

Epoch #1 start
Epoch #1 end in 0:00:00.451993 time
Epoch #1, loss 990004.25
Epoch #2 start
Epoch #2 end in 0:00:00.449306 time
Epoch #2, loss 1791753.375
Epoch #3 start
Epoch #3 end in 0:00:00.458170 time
Epoch #3, loss 2516472.5
Epoch #4 start
Epoch #4 end in 0:00:00.444082 time
Epoch #4, loss 3220129.5
Epoch #5 start
Epoch #5 end in 0:00:00.449314 time
Epoch #5, loss 3908332.0
Epoch #6 start
Epoch #6 end in 0:00:00.455581 time
Epoch #6, loss 4553275.5
Epoch #7 start
Epoch #7 end in 0:00:00.482539 time
Epoch #7, loss 5176599.0
Epoch #8 start
Epoch #8 end in 0:00:00.459680 time
Epoch #8, loss 5797220.5
Epoch #9 start
Epoch #9 end in 0:00:00.464909 time
Epoch #9, loss 6397802.0
Epoch #10 start
Epoch #10 end in 0:00:00.453222 time
Epoch #10, loss 6998834.5
Epoch #11 start
Epoch #11 end in 0:00:00.459535 time
Epoch #11, loss 7601747.5
Epoch #12 start
Epoch #12 end in 0:00:00.460921 time
Epoch #12, loss 8198219.0
Epoch #13 start
Epoch #13 end in 0:00:00.457595 time
Epoch #13, loss 8749264.

In [None]:
# Sanity check: the 5 nearest neighbours (default n) of movieId 1210.
word2vec_recommender.similar_by_movie_id(1210)

('Star Wars: Episode VI - Return of the Jedi (1983) - Action|Adventure|Sci-Fi None',
 ['Star Wars: Episode V - The Empire Strikes Back (1980) - Action|Adventure|Sci-Fi 0.5314258337020874',
  'Star Wars: Episode IV - A New Hope (1977) - Action|Adventure|Sci-Fi 0.5267199277877808',
  'Fly Away Home (1996) - Adventure|Children 0.47093629837036133',
  'Pirates of the Caribbean: The Curse of the Black Pearl (2003) - Action|Adventure|Comedy|Fantasy 0.44945836067199707',
  'Willy Wonka & the Chocolate Factory (1971) - Children|Comedy|Fantasy|Musical 0.43167296051979065'])

In [None]:
# Sanity check: the 5 nearest neighbours (default n) of movieId 101.
word2vec_recommender.similar_by_movie_id(101)

('Bottle Rocket (1996) - Adventure|Comedy|Crime|Romance None',
 ['Bulworth (1998) - Comedy|Drama|Romance 0.6557052731513977',
  'Flirting With Disaster (1996) - Comedy 0.5914894938468933',
  'Living in Oblivion (1995) - Comedy 0.5745859146118164',
  'Hedwig and the Angry Inch (2000) - Comedy|Drama|Musical 0.5608201026916504',
  'Cemetery Man (Dellamorte Dellamore) (1994) - Horror 0.5575801134109497'])