# Determining optimum vocabulary size

Excluding `genres`, as vocabulary is used only for `overview`. We need to
determine the value of `vocab_size` below which recommendation quality dips
appreciably. If we don't exclude `genres`, we won't see a sharp dip in
quality, because `genres` will come to the rescue of the model.

In [1]:
import sys
import os

sys.path.append(os.path.join('..'))
from recommender.preprocessing import *

A modified version of `recommender.model.Recommender`, with `genres` not used to
generate embeddings.

In [2]:
import logging
import os
import pickle
from enum import Enum
from timeit import default_timer as timer

import numpy as np
from scipy.sparse import csr_matrix, hstack, vstack
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer

# from .preprocessing import *

KEEP_COLUMNS = ['genres', 'overview', 'keywords']

FILENAME_ENCODER_GENRES = 'encoder_genres.pkl'
FILENAME_VECTORIZER_OVERVIEW = 'vectorizer_overview.pkl'
FILENAME_SVD = 'svd.pkl'

class EmbedderMode(Enum):
    TRAIN = 'train'
    INFER = 'infer'

    @classmethod
    def values(cls):
        return [e.value for e in cls]

class Embedder:
    def __init__(self):
        self.encoder_genres = None
        self.vectorizer_overview = None
        self.svd = None

    def train(self, media, vocab_size=None, embedding_dim=None):
        self.pipeline(media, EmbedderMode.TRAIN, vocab_size=vocab_size, embedding_dim=embedding_dim)

    def embed(self, media, save_path=None):
        embeddings, ids = self.pipeline(media, EmbedderMode.INFER)
        if save_path:
            logging.info(f'Saving embeddings to "{save_path}"')
            np.savez(save_path, embeddings=embeddings, ids=ids)
        return embeddings, ids

    def pipeline(self, media, mode, vocab_size=None, embedding_dim=None):
        assert mode.value in EmbedderMode.values()

        logging.info(f'{self.__class__.__name__} in [{mode.value}] mode.')

        media = media.loc[:, KEEP_COLUMNS]
        ids = media.index.to_numpy()

        # logging.info('Preprocessing \'genres\'')
        # media['genres'] = media['genres'].apply(extract_genre_names)

        # logging.info('Encoding \'genres\'')
        # if mode == EmbedderMode.TRAIN:
        #     self.encoder_genres = MultiLabelBinarizer()
        #     encoded_genres = self.encoder_genres.fit_transform(media['genres'])
        # else:
        #     encoded_genres = self.encoder_genres.transform(media['genres'])

        logging.info('Preprocessing \'overview\'')
        media['overview'] = media['overview'].fillna('')

        # logging.info('Injecting \'keywords\' into \'overview\'')
        # media = inject_keywords(media)

        logging.info(f'Vectorizing \'overview\'...')
        time_start = timer()
        if mode == EmbedderMode.TRAIN:
            self.vectorizer_overview = TfidfVectorizer(
                strip_accents=False,
                lowercase=False,
                preprocessor=None,
                tokenizer=None,
                analyzer=normalize,
                norm='l2',
                max_features=vocab_size
            )
            vectorized_overview = self.vectorizer_overview.fit_transform(media['overview'])
        else:
            vectorized_overview = self.vectorizer_overview.transform(media['overview'])
        logging.info(f'Vectorizing \'overview\' took {round(timer() - time_start, 4)} s, for a vocabulary size of {len(self.get_vocab())}.')

        # logging.info(f'Stacking vectorized overview onto encoded \'genres\'')
        # embeddings = hstack([encoded_genres, vectorized_overview], format='csr')
        embeddings = vectorized_overview

        if mode == EmbedderMode.TRAIN:
            if embedding_dim:
                logging.info(f'Reducing dimension to {embedding_dim}...')
                time_start = timer()
                self.svd = TruncatedSVD(n_components=embedding_dim, algorithm='arpack')
                self.svd.fit(embeddings)
                logging.info(f'Dimensionality reduction took {round(timer() - time_start)} s.')
        else:
            if self.svd:
                logging.info(f'Reducing dimension to {embedding_dim}...')
                time_start = timer()
                embeddings = self.svd.transform(embeddings)
                logging.info(f'Dimensionality reduction took {round(timer() - time_start)} s.')

                logging.info('Sparsifying embeddings...')
                time_start = timer()
                embeddings = csr_matrix(embeddings)
                logging.info(f'Sparsifying embeddings took {round(timer() - time_start)} s.')

        if mode == EmbedderMode.INFER:
            return embeddings, ids

    # def get_genres(self):
    #     return self.encoder_genres.classes_

    def get_vocab(self):
        return self.vectorizer_overview.get_feature_names_out()

    def get_embedding_dim(self):
        return self.svd.n_components_

    def save(self, model_path):
        # with open(os.path.join(model_path, FILENAME_ENCODER_GENRES), 'w') as f:
        #     pickle.dump(self.encoder_genres, f)
        with open(os.path.join(model_path, FILENAME_VECTORIZER_OVERVIEW), 'w') as f:
            pickle.dump(self.vectorizer_overview, f)
        if self.svd:
            with open(os.path.join(model_path, FILENAME_SVD), 'w') as f:
                pickle.dump(self.svd, f)

    @classmethod
    def load(cls, model_path):
        embedder = cls()
        # with open(os.path.join(model_path, FILENAME_ENCODER_GENRES), 'r') as f:
        #     embedder.encoder_genres = pickle.load(f)
        with open(os.path.join(model_path, FILENAME_VECTORIZER_OVERVIEW), 'r') as f:
            embedder.vectorizer_overview = pickle.dump(f)

        svd_path = os.path.join(model_path, FILENAME_SVD)
        if os.path.exists(svd_path):
            with open(svd_path, 'r'):
                embedder.svd = pickle.load(f)
        else:
            embedder.svd = None

        return embedder

    @staticmethod
    def load_embeddings(embeddings_dir):
        embeddings = None
        ids = np.array([])

        for filename in os.listdir(embeddings_dir):
            if os.path.splitext(filename)[1] == '.npz':
                contents = np.load(os.path.join(embeddings_dir, filename), allow_pickle=True)
                new_embeddings = contents['embeddings'].item()
                new_ids = contents['ids']

                if embeddings is None:
                    embeddings = new_embeddings
                else:
                    embeddings = vstack([embeddings, new_embeddings], format='csr')

                ids = np.append(ids, new_ids)

        return embeddings, ids

class Recommender:
    def __init__(self, embeddings, ids):
        self.embeddings = embeddings
        self.ids = ids

    @classmethod
    def from_dir(cls, embeddings_dir):
        embeddings, ids = Embedder.load_embeddings(embeddings_dir)
        return cls(embeddings, ids)

    def recommend(self, embedding, n=10):
        similarity = cosine_similarity(self.embeddings, embedding).flatten()

        most_similar_rows = np.argsort(-similarity)
        if most_similar_rows[0] == 1:
            most_similar_rows = most_similar_rows[1:(n+1)]
        else:
            most_similar_rows = most_similar_rows[:n]

        most_similar_ids = [
            int(self.ids[row])
            for row in most_similar_rows
        ]

        return most_similar_ids

    def recommend_by_id(self, id, n=10):
        assert id in self.ids

        row = list(self.ids).index(id)
        embedding = self.embeddings[row]

        return self.recommend(embedding, n=n+1)[1:]

    def recommend_pprint(self, id, titles, n=10):
        query_title = ids_to_titles(id, titles)
        print(f'Top {n} similar movies to "{query_title}":')
        for similar_id in self.recommend_by_id(id, n=n):
            print(f'* {ids_to_titles(similar_id, titles)}')

def ids_to_titles(ids, titles):
    if type(ids) == int:
        return titles[ids]
    return list(map(titles.__getitem__, ids))

In [3]:
logging.basicConfig(level=logging.INFO)

In [4]:
import pandas as pd

In [5]:
credits = pd.read_csv(os.path.join('data', 'm-st5000', 'tmdb_5000_credits.csv'), index_col='movie_id')
movies = pd.read_csv(os.path.join('data', 'm-st5000', 'tmdb_5000_movies.csv'), index_col='id')

credits = credits.drop(columns=['title'])
movies = movies.join(credits)

movies = movies.sample(frac=1, random_state=1)

del credits

In [6]:
train = movies.iloc[:4000, :]
test = movies.iloc[4000:, :]

In [7]:
674 in train.index      # Harry Potter and the Goblet of Fire

False

In [8]:
10193 in train.index    # Toy Story 3

False

## `vocab_size = 10000`

In [9]:
embedder = Embedder()
embedder.train(train, vocab_size=10000, embedding_dim=None)

INFO:root:Embedder in [train] mode.
INFO:root:Preprocessing 'overview'
INFO:root:Vectorizing 'overview'...
INFO:root:Vectorizing 'overview' took 16.632 s, for a vocabulary size of 10000.


In [10]:
embeddings, ids = embedder.embed(movies)

INFO:root:Embedder in [infer] mode.
INFO:root:Preprocessing 'overview'
INFO:root:Vectorizing 'overview'...
INFO:root:Vectorizing 'overview' took 20.4476 s, for a vocabulary size of 10000.


In [11]:
rec = Recommender(embeddings, ids)

In [12]:
rec.recommend_pprint(674, movies['title'])

Top 10 similar movies to "Harry Potter and the Goblet of Fire":
* Harry Potter and the Prisoner of Azkaban
* Harry Potter and the Half-Blood Prince
* Harry Potter and the Order of the Phoenix
* Harry Potter and the Chamber of Secrets
* Dude Where's My Dog?
* Harry Potter and the Philosopher's Stone
* Married Life
* My Bloody Valentine
* Dante's Peak
* Something's Gotta Give


*Dude Where's My Dog?*, *Dante's Peak* and *Something's Gotta Give* are
particularly interesting: The main character's name is Harry! But they are in
entirely different genres as *Harry Potter*. Concrete example of how excluding
`genres` does bad things!

In [13]:
rec.recommend_pprint(10193, movies['title'])

Top 10 similar movies to "Toy Story 3":
* Toy Story
* Toy Story 2
* The 40 Year Old Virgin
* Man on the Moon
* Heartbeeps
* Factory Girl
* Class of 1984
* A LEGO Brickumentary
* The Man
* Small Soldiers


Capturing similarity on **Andy**:
* *Toy Story 3*: Toys belong to **Andy**.
* *The 40 Year Old Virgin*: Main character is **Andy** Stitzer.
* *Man on the Moon*: Biopic on **Andy** Kaufman.
* *Heartbeeps*: Stars **Andy** Kaufman.
* *Factory Girl*: Biopic with **Andy** Warhol as a character.
* *The Man*: Main character is **Andy** Fiddler.

But again, in different genres.

## `vocab_size = 5000`

In [14]:
embedder = Embedder()
embedder.train(train, vocab_size=5000, embedding_dim=None)

INFO:root:Embedder in [train] mode.
INFO:root:Preprocessing 'overview'
INFO:root:Vectorizing 'overview'...
INFO:root:Vectorizing 'overview' took 13.9551 s, for a vocabulary size of 5000.


In [15]:
embeddings, ids = embedder.embed(movies)

INFO:root:Embedder in [infer] mode.
INFO:root:Preprocessing 'overview'
INFO:root:Vectorizing 'overview'...
INFO:root:Vectorizing 'overview' took 19.4689 s, for a vocabulary size of 5000.


In [16]:
rec = Recommender(embeddings, ids)

In [17]:
rec.recommend_pprint(674, movies['title'])

Top 10 similar movies to "Harry Potter and the Goblet of Fire":
* Harry Potter and the Prisoner of Azkaban
* Harry Potter and the Order of the Phoenix
* Harry Potter and the Chamber of Secrets
* Harry Potter and the Half-Blood Prince
* Dude Where's My Dog?
* Harry Potter and the Philosopher's Stone
* Something's Gotta Give
* Married Life
* Dante's Peak
* Armageddon


In [18]:
rec.recommend_pprint(10193, movies['title'])

Top 10 similar movies to "Toy Story 3":
* Toy Story
* Toy Story 2
* The 40 Year Old Virgin
* Man on the Moon
* Class of 1984
* Factory Girl
* A LEGO Brickumentary
* Heartbeeps
* The Man
* CJ7


*CJ7* is a Chinese sci-fi film about aliens. Quality starting to degrade slightly?

## `vocab_size = 3000`

In [19]:
embedder = Embedder()
embedder.train(train, vocab_size=3000, embedding_dim=None)

INFO:root:Embedder in [train] mode.
INFO:root:Preprocessing 'overview'
INFO:root:Vectorizing 'overview'...
INFO:root:Vectorizing 'overview' took 19.199 s, for a vocabulary size of 3000.


In [20]:
embeddings, ids = embedder.embed(movies)

INFO:root:Embedder in [infer] mode.
INFO:root:Preprocessing 'overview'
INFO:root:Vectorizing 'overview'...
INFO:root:Vectorizing 'overview' took 18.4673 s, for a vocabulary size of 3000.


In [21]:
rec = Recommender(embeddings, ids)

In [22]:
rec.recommend_pprint(674, movies['title'])

Top 10 similar movies to "Harry Potter and the Goblet of Fire":
* Harry Potter and the Prisoner of Azkaban
* Harry Potter and the Order of the Phoenix
* Dude Where's My Dog?
* Married Life
* Something's Gotta Give
* Harry Potter and the Chamber of Secrets
* Harry Potter and the Half-Blood Prince
* Dante's Peak
* Harry Potter and the Philosopher's Stone
* The Greatest Game Ever Played


In [23]:
rec.recommend_pprint(10193, movies['title'])

Top 10 similar movies to "Toy Story 3":
* Toy Story 2
* Toy Story
* The 40 Year Old Virgin
* Class of 1984
* Factory Girl
* Man on the Moon
* Heartbeeps
* A LEGO Brickumentary
* The Man
* CJ7


## `vocab_size = 1000`

In [24]:
embedder = Embedder()
embedder.train(train, vocab_size=1000, embedding_dim=None)

INFO:root:Embedder in [train] mode.
INFO:root:Preprocessing 'overview'
INFO:root:Vectorizing 'overview'...
INFO:root:Vectorizing 'overview' took 16.1751 s, for a vocabulary size of 3000.


In [25]:
embeddings, ids = embedder.embed(movies)

INFO:root:Embedder in [infer] mode.
INFO:root:Preprocessing 'overview'
INFO:root:Vectorizing 'overview'...
INFO:root:Vectorizing 'overview' took 23.0758 s, for a vocabulary size of 3000.


In [26]:
rec = Recommender(embeddings, ids)

In [27]:
rec.recommend_pprint(674, movies['title'])

Top 10 similar movies to "Harry Potter and the Goblet of Fire":
* Harry Potter and the Prisoner of Azkaban
* Harry Potter and the Order of the Phoenix
* Dude Where's My Dog?
* Married Life
* Something's Gotta Give
* Harry Potter and the Chamber of Secrets
* Harry Potter and the Half-Blood Prince
* Dante's Peak
* Harry Potter and the Philosopher's Stone
* The Greatest Game Ever Played


In [28]:
rec.recommend_pprint(10193, movies['title'])

Top 10 similar movies to "Toy Story 3":
* Toy Story 2
* Toy Story
* The 40 Year Old Virgin
* Class of 1984
* Factory Girl
* Man on the Moon
* Heartbeeps
* A LEGO Brickumentary
* The Man
* CJ7


Quality of recommendations seems to be pretty constant even after decreasing vocabulary size.

<table>
    <tr>
        <th>Vocab size</th>
        <th>Coverage</th>
    </tr>
    <tr>
        <td>1000</td>
        <td>80%</td>
    </tr>
    <tr>
        <td>3000</td>
        <td>95%</td>
    </tr>
    <tr>
        <td>5000</td>
        <td>98%</td>
    </tr>
    <tr>
        <td>1000</td>
        <td>99%</td>
    </tr>
</table>

C1 (proficient/advanced) user knows about 8000 words.

Regardless, no harm in erring on the side of a larger vocabulary. Processing
time is not sensitive to vocabulary size.

## `vocab_size = 5`

In [29]:
embedder = Embedder()
embedder.train(train, vocab_size=5, embedding_dim=None)

INFO:root:Embedder in [train] mode.
INFO:root:Preprocessing 'overview'
INFO:root:Vectorizing 'overview'...
INFO:root:Vectorizing 'overview' took 16.5855 s, for a vocabulary size of 5.


In [30]:
embeddings, ids = embedder.embed(movies)

INFO:root:Embedder in [infer] mode.
INFO:root:Preprocessing 'overview'
INFO:root:Vectorizing 'overview'...
INFO:root:Vectorizing 'overview' took 18.5251 s, for a vocabulary size of 5.


In [31]:
rec = Recommender(embeddings, ids)

In [32]:
rec.recommend_pprint(674, movies['title'])

Top 10 similar movies to "Harry Potter and the Goblet of Fire":
* Drinking Buddies
* Butterfly Girl
* Girls Gone Dead
* The Salon
* The First Great Train Robbery
* Obvious Child
* 54
* The Crew
* Sahara
* All Superheroes Must Die


In [33]:
embedder.get_vocab()

array(['life', 'man', 'new', 'world', 'young'], dtype=object)