In [None]:
# default_exp model

# Word2Vec Model

> This module exposes classes and functions related to training of the Word2Vec recommender using the Gensim library.

https://radimrehurek.com/gensim/

In [None]:
#hide
from nbdev.export import *

In [None]:
#export
import logging
import random
from typing import List, NamedTuple, Tuple
from datetime import datetime
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from gensim.models import Word2Vec 
from gensim.models.callbacks import CallbackAny2Vec
from nbdev.showdoc import *
from tqdm import tqdm

from word2vec_recommender.core import *

In [None]:
#export
logger = logging.getLogger(__name__)

In [None]:
#export
class GensimParameters(NamedTuple):
    """
    Hyper-parameters handed to the Gensim `Word2Vec` constructor.

    `Word2VecMovieRecommender.train` unpacks every field (through `_asdict()`) as a
    keyword argument of `Word2Vec`, so each field name has to be a keyword accepted
    by the installed Gensim version.
    """
    # NOTE(review): the per-field meanings below follow the Gensim Word2Vec
    # documentation — confirm them against the installed Gensim version.
    # context window size
    window: int = 10
    # number of training epochs
    iter: int = 20
    # training algorithm selector (presumably 1 = skip-gram)
    sg: int = 1
    # hierarchical softmax switch (presumably 0 = disabled)
    hs: int = 0
    # number of negative samples
    negative: int = 10
    # initial learning rate
    alpha: float = 0.03
    # final learning rate
    min_alpha: float = 0.0007
    # random seed
    seed: int = 14
    # needed by _EpochLogger, which reads model.get_latest_training_loss()
    compute_loss: bool = True

In [None]:
# Hyper-parameters used by the rest of the notebook; `window=10` restates the class default.
gensim_parameters = GensimParameters(window=10)

In [None]:
#export
def generate_sentences_by_user(df: pd.DataFrame) -> List[List[str]]:
    """
    Generate the Gensim sentences for a dataframe.

    One sentence is produced per `userId` (groups come out in ascending `userId`
    order). A sentence is the list of that user's `movieId` values converted to
    strings and ordered by `timestamp`.

    :param df: dataframe with `userId` and `movieId` columns. When a `timestamp`
        column is present it defines the order inside each sentence; otherwise the
        incoming row order is kept.
    :return: list of sentences, each a list of movie ids as strings
    """
    if 'timestamp' in df.columns:
        # mergesort is stable: rows sharing a timestamp keep their incoming order,
        # so a frame that is already sorted yields exactly the same sentences.
        # sort_values returns a new frame, the caller's dataframe is left untouched.
        df = df.sort_values(by='timestamp', kind='mergesort')

    sentences = df.groupby('userId')['movieId'].apply(lambda ids: [str(m) for m in ids])
    return sentences.tolist()

In [None]:
#export
class _EpochLogger(CallbackAny2Vec):
    """
    Log information about training: reports the duration and the loss of each epoch.

    Every message is sent to the module `logger`; it is also printed when
    `print_to_stdout` is true.
    """
    def __init__(self, print_to_stdout: bool = False):
        """
        Constructor for the class to log progress information.
        :param print_to_stdout: when true, messages are printed in addition to being logged
        """
        self._epoch = 1
        self._start = datetime.now()
        self._end = datetime.now()
        self._print_to_stdout = print_to_stdout
        # running loss seen at the end of the previous epoch, used to derive per-epoch loss
        self._previous_cumulative_loss = 0.0

    def _report(self, msg: str) -> None:
        """
        Emit one progress message to the logger and, optionally, to stdout.
        """
        if self._print_to_stdout:
            print(msg)
        logger.info(msg)

    def on_epoch_begin(self, _):
        """
        Report the start of an epoch and record its start time.
        :param _: type gensim word2vec, signature to match the function to be used by gensim
        """
        self._start = datetime.now()
        self._report(f"Epoch #{self._epoch} start")

    def on_epoch_end(self, model):
        """
        Report the elapsed time and the loss of the epoch that just finished.
        :param model: type gensim word2vec, signature to match the function to be used by gensim
        """
        self._end = datetime.now()
        elapsed = self._end - self._start
        self._report(f"Epoch #{self._epoch} end in {elapsed} time")

        # The value returned by gensim keeps growing from one epoch to the next
        # (it is a running total), so the loss of this epoch alone is the
        # difference with the value seen at the end of the previous epoch.
        cumulative_loss = model.get_latest_training_loss()
        if cumulative_loss < self._previous_cumulative_loss:
            # the running total went backwards: assume it was reset by a new training run
            self._previous_cumulative_loss = 0.0
        epoch_loss = cumulative_loss - self._previous_cumulative_loss
        self._previous_cumulative_loss = cumulative_loss
        self._report(f"Epoch #{self._epoch}, loss {epoch_loss} (cumulative {cumulative_loss})")

        self._epoch += 1


In [None]:
#export
class Word2VecMovieRecommender:
    """
    This class encapsulates the training of recommendations plus utilities for persistence and predictions.

    On construction the ratings are joined with the movies, filtered down to positive
    ratings, sorted per user by timestamp and split by user into `train_df` and
    `validation_df`. `train` then fits a Gensim Word2Vec model on `train_df`.
    """
    def __init__(
        self,
        movies_df: pd.DataFrame,
        ratings_df: pd.DataFrame,
        gensim_parameters: GensimParameters,
        positive_rating_threshold: float = 3.0,
        train_validation_ratio: float = 0.9
        ):
        """
        :param movies_df: movies dataframe, joined on its `movieId` column
        :param ratings_df: ratings dataframe with `userId`, `movieId`, `rating` and `timestamp` columns
        :param gensim_parameters: hyper-parameters forwarded to the Word2Vec constructor by `train`
        :param positive_rating_threshold: only ratings greater than or equal to this value are kept
        :param train_validation_ratio: fraction of the users (not of the rows) assigned to the training split
        :raises ValueError: when `train_validation_ratio` is outside [0, 1]
        """
        if not 0.0 <= train_validation_ratio <= 1.0:
            raise ValueError(
                f"train_validation_ratio should be between 0 and 1, got {train_validation_ratio}"
            )

        self.movies_df = movies_df
        self.ratings_df = ratings_df
        self.gensim_parameters = gensim_parameters
        self.model: Word2Vec = None

        # joins movie and ratings df
        df_joined = ratings_df.set_index('movieId').join(movies_df.set_index('movieId'), on='movieId', rsuffix='movie_').reset_index()
        # keep only positive ratings
        df_joined = df_joined[df_joined['rating'] >= positive_rating_threshold]
        # sort by user interactions (not in place: df_joined is a filtered slice)
        df_joined = df_joined.sort_values(by=['userId', 'timestamp'])
        # train validation split, done on users so that a user never appears in both splits
        user_ids = df_joined["userId"].unique().tolist()
        random.Random(RANDOM_SEED).shuffle(user_ids)
        training_size = int(train_validation_ratio * len(user_ids))
        training_user_ids = user_ids[:training_size]
        validation_user_ids = user_ids[training_size:]
        assert len(validation_user_ids) + len(training_user_ids) == len(user_ids)
        self.train_df = df_joined[df_joined['userId'].isin(training_user_ids)]
        self.validation_df = df_joined[df_joined['userId'].isin(validation_user_ids)]

    def __repr__(self):
        return f"movies={self.movies_df.shape}, ratings={self.ratings_df.shape}, " +\
            f"train_df={self.train_df.shape}, validation_df={self.validation_df.shape}"

    def _trained_model(self):
        """
        Return the trained model, failing with a clear message when `train` was not called yet.
        """
        if self.model is None:
            raise RuntimeError("The model is not trained yet, call train() first")
        return self.model

    def train(self, print_progress: bool = False):
        """
        Train the Word2Vec model on the sentences generated from `train_df`.
        :param print_progress: when true, epoch progress is printed in addition to being logged
        """
        sentences_train = generate_sentences_by_user(self.train_df)
        self.model = Word2Vec(sentences_train, callbacks=[_EpochLogger(print_to_stdout=print_progress)],  **self.gensim_parameters._asdict())

    def similar_by_movie_id(self, seed_movie_id: int, n: int = 5) -> List[Recommendation]:
        """
        Return up to `n` movies whose embeddings are the closest to the seed movie.
        :param seed_movie_id: id of the movie to find neighbours for
        :param n: maximum number of recommendations
        :raises RuntimeError: when the model is not trained
        :raises KeyError: when the seed movie is not in the model vocabulary
        """
        model = self._trained_model()
        seed_key = str(seed_movie_id)
        movie_embedding = model.wv[seed_key]
        # One extra neighbour is requested because the seed normally comes back as
        # its own nearest neighbour; it is removed by key rather than by position
        # so that a tie cannot drop a real neighbour or leak the seed.
        neighbours = model.wv.similar_by_vector(movie_embedding, topn=n + 1)
        neighbours = [m for m in neighbours if m[0] != seed_key][:n]
        return [Recommendation(movie_id=int(m[0]), score=m[1]) for m in neighbours]

    def save_all(self, output_path: Path):
        """
        Save the vocabulary index, the embeddings and the Gensim model of this recommender.
        :param output_path: target directory, created (with its parents) when missing
        :raises RuntimeError: when the model is not trained
        :raises ValueError: when `output_path` exists and is not a directory
        """
        model = self._trained_model()
        output_path = Path(output_path)
        if output_path.exists() and not output_path.is_dir():
            raise ValueError(f"{output_path} should be a directory")
        output_path.mkdir(parents=True, exist_ok=True)

        # use this instance's model, not whatever global happens to exist in the notebook
        word_indexes = model.wv.index2word
        embeddings = model.wv.vectors
        # NOTE: despite the .pkl extension these two files are written with np.save
        with open(output_path / 'words_index.pkl', 'wb') as f:
            np.save(f, word_indexes)
        with open(output_path / 'embeddings.pkl', 'wb') as f:
            np.save(f, embeddings)
        with open(output_path / 'model.gensim', 'wb') as f:
            model.save(f)
        


In [None]:
# Load the movie catalogue; the preview below shows the movieId, title and genres columns.
movies_df = pd.read_csv('data/ml-latest-small/movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# Load the user ratings; the preview below shows the userId, movieId, rating and timestamp columns.
ratings_df = pd.read_csv('data/ml-latest-small/ratings.csv')
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
# Lookup helper used later to print movie details.
# NOTE(review): MovieRepository is presumably provided by word2vec_recommender.core — confirm.
movie_repository = MovieRepository(
    movies_df=movies_df
)

movie_repository.find_by_movie_id(1)

Movie(movie_id=1, title='Toy Story (1995)', genres='Adventure|Animation|Children|Comedy|Fantasy')

In [None]:
# Build the recommender with the default rating threshold and train/validation ratio.
word2vec_recommender = Word2VecMovieRecommender(
    movies_df=movies_df,
    ratings_df=ratings_df,
    gensim_parameters=gensim_parameters
)

# the repr shows the shapes of the input frames and of both splits
word2vec_recommender

movies=(9742, 3), ratings=(100836, 4), train_df=(74918, 6), validation_df=(6845, 6)

In [None]:
# Preview of the training split (joined ratings and movies, sorted by userId and timestamp).
word2vec_recommender.train_df.head()

Unnamed: 0,movieId,userId,rating,timestamp,title,genres
43,804,1,4.0,964980499,She's the One (1996),Comedy|Romance
73,1210,1,5.0,964980499,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi
120,2018,1,5.0,964980523,Bambi (1942),Animation|Children|Drama
171,2628,1,4.0,964980523,Star Wars: Episode I - The Phantom Menace (1999),Action|Adventure|Sci-Fi
183,2826,1,4.0,964980523,"13th Warrior, The (1999)",Action|Adventure|Fantasy


In [None]:
# Sample of the sentences: built from the first 10 rows of the training split only.
generate_sentences_by_user(word2vec_recommender.train_df.head(10))

[['804', '1210', '2018', '2628', '2826', '3578', '3617', '3744', '101', '441']]

In [None]:
# Train the model; print_progress=True also prints the per-epoch messages to stdout.
word2vec_recommender.train(print_progress=True)

Epoch #1 start
Epoch #1 end in 0:00:00.390624 time
Epoch #1, loss 1007355.375
Epoch #2 start
Epoch #2 end in 0:00:00.458152 time
Epoch #2, loss 1809739.625
Epoch #3 start
Epoch #3 end in 0:00:00.417606 time
Epoch #3, loss 2529623.0
Epoch #4 start
Epoch #4 end in 0:00:00.445240 time
Epoch #4, loss 3231414.25
Epoch #5 start
Epoch #5 end in 0:00:00.400902 time
Epoch #5, loss 3922108.75
Epoch #6 start
Epoch #6 end in 0:00:00.430010 time
Epoch #6, loss 4561615.5
Epoch #7 start
Epoch #7 end in 0:00:00.400012 time
Epoch #7, loss 5173044.0
Epoch #8 start
Epoch #8 end in 0:00:00.431030 time
Epoch #8, loss 5783066.5
Epoch #9 start
Epoch #9 end in 0:00:00.410765 time
Epoch #9, loss 6384159.0
Epoch #10 start
Epoch #10 end in 0:00:00.447620 time
Epoch #10, loss 6994881.0
Epoch #11 start
Epoch #11 end in 0:00:00.450338 time
Epoch #11, loss 7606768.5
Epoch #12 start
Epoch #12 end in 0:00:00.472919 time
Epoch #12, loss 8201392.0
Epoch #13 start
Epoch #13 end in 0:00:00.418941 time
Epoch #13, loss 8754

Generating Recommendations

In [None]:
# Recommendations for movie 1210 (default n=5 neighbours).
# NOTE(review): print_recommendations is presumably provided by word2vec_recommender.core — confirm.
seed_id = 1210
recommendations = word2vec_recommender.similar_by_movie_id(seed_id)
print_recommendations(movie_repository, seed_id, recommendations)

Movie(movie_id=1210, title='Star Wars: Episode VI - Return of the Jedi (1983)', genres='Action|Adventure|Sci-Fi')
> Recommendations:
>> Movie(movie_id=1196, title='Star Wars: Episode V - The Empire Strikes Back (1980)', genres='Action|Adventure|Sci-Fi') score=0.5126053094863892
>> Movie(movie_id=260, title='Star Wars: Episode IV - A New Hope (1977)', genres='Action|Adventure|Sci-Fi') score=0.4886493682861328
>> Movie(movie_id=3578, title='Gladiator (2000)', genres='Action|Adventure|Drama') score=0.46928292512893677
>> Movie(movie_id=986, title='Fly Away Home (1996)', genres='Adventure|Children') score=0.45592010021209717
>> Movie(movie_id=122886, title='Star Wars: Episode VII - The Force Awakens (2015)', genres='Action|Adventure|Fantasy|Sci-Fi|IMAX') score=0.452303946018219


In [None]:
# Same lookup as in the previous cell, for movie 110.
seed_id = 110
recommendations = word2vec_recommender.similar_by_movie_id(seed_id)
print_recommendations(movie_repository, seed_id, recommendations)

Movie(movie_id=110, title='Braveheart (1995)', genres='Action|Drama|War')
> Recommendations:
>> Movie(movie_id=356, title='Forrest Gump (1994)', genres='Comedy|Drama|Romance|War') score=0.6328716278076172
>> Movie(movie_id=589, title='Terminator 2: Judgment Day (1991)', genres='Action|Sci-Fi') score=0.5735100507736206
>> Movie(movie_id=47, title='Seven (a.k.a. Se7en) (1995)', genres='Mystery|Thriller') score=0.5678750276565552
>> Movie(movie_id=593, title='Silence of the Lambs, The (1991)', genres='Crime|Horror|Thriller') score=0.5645602941513062
>> Movie(movie_id=2028, title='Saving Private Ryan (1998)', genres='Action|Drama|War') score=0.5494592189788818


Saving embeddings

In [None]:
# Writes words_index.pkl, embeddings.pkl and model.gensim into ./data/out
word2vec_recommender.save_all(Path('./data/out'))

In [None]:
# List the files written by save_all (shell command, requires `ls`).
!ls -lh ./data/out/

total 16800
-rw-r--r--  1 felipe.gasparini  OLX-GLOBAL\Domain Users   1.1M Dec 19 13:56 embeddings.pkl
-rw-r--r--  1 felipe.gasparini  OLX-GLOBAL\Domain Users   5.1M Dec 19 13:56 model.gensim
-rw-r--r--  1 felipe.gasparini  OLX-GLOBAL\Domain Users    68K Dec 19 13:56 words_index.pkl


Plotting embeddings

In [None]:
# Code from: https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#visualising-the-word-embeddings

from sklearn.decomposition import IncrementalPCA    # initial reduction
from sklearn.manifold import TSNE                   # final reduction
import numpy as np                                  # array handling


def reduce_dimensions(model):
    """
    Project every word vector of the model onto 2 dimensions with t-SNE.

    :param model: trained model exposing `wv.vocab` and `wv[word]`
    :return: tuple `(x_vals, y_vals, labels)` where the two first items are lists of
        coordinates and `labels` is a numpy array with the matching words
    """
    target_dims = 2  # final num dimensions (2D, 3D, etc)

    # keep the words in a list so that vectors and labels share the same order
    words = list(model.wv.vocab)
    embedding_matrix = np.asarray([model.wv[word] for word in words])
    labels = np.asarray(words)

    # reduce using t-SNE
    reducer = TSNE(n_components=target_dims, random_state=0)
    projected = reducer.fit_transform(embedding_matrix)

    x_vals = list(projected[:, 0])
    y_vals = list(projected[:, 1])
    return x_vals, y_vals, labels


# 2-D coordinates and labels for every word of the trained model.
x_vals, y_vals, labels = reduce_dimensions(word2vec_recommender.model)

def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    """
    Draw the labels at their 2-D coordinates with plotly.

    :param x_vals: x coordinates
    :param y_vals: y coordinates
    :param labels: text drawn at each coordinate
    :param plot_in_notebook: when true the figure is rendered inline, otherwise it is
        written to 'word-embedding-plot.html'
    """
    import plotly.graph_objs as go
    from plotly.offline import init_notebook_mode, iplot, plot

    figure_data = [go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)]

    if not plot_in_notebook:
        plot(figure_data, filename='word-embedding-plot.html')
        return

    init_notebook_mode(connected=True)
    iplot(figure_data, filename='word-embedding-plot')


def plot_with_matplotlib(x_vals, y_vals, labels, n_labels=25):
    """
    Scatter-plot the 2-D coordinates with matplotlib and annotate a random subset of points.

    :param x_vals: x coordinates
    :param y_vals: y coordinates
    :param labels: label of each point, same length as the coordinates
    :param n_labels: maximum number of points to annotate (fewer when there are fewer points)
    """
    import matplotlib.pyplot as plt
    import random

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    #
    # Label randomly subsampled data points
    #
    indices = list(range(len(labels)))
    # never ask for more points than available: random.sample raises ValueError otherwise
    sample_size = max(0, min(n_labels, len(indices)))
    # local generator with a fixed seed: same selection on every run, and the
    # global random state is left untouched
    rng = random.Random(0)
    selected_indices = rng.sample(indices, sample_size)
    for i in selected_indices:
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

# Choose the plotting backend: when calling get_ipython() raises (it is expected to be
# available only inside an IPython/Jupyter session) the static matplotlib plot is
# used, otherwise the interactive plotly plot is used.
try:
    get_ipython()
except Exception:
    plot_function = plot_with_matplotlib
else:
    plot_function = plot_with_plotly

plot_function(x_vals, y_vals, labels)