# Determining optimum embedding dimension

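Reducing `embedding_dim` decreases the time needed to calculate similarity. This notebook trains the embedder at several values of `embedding_dim` (from 1024 down to 8) and compares the resulting recommendations.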

In [1]:
import logging
import os
import sys

import numpy as np
import pandas as pd

In [2]:
# Add the parent directory to the module search path so the project-local
# `recommender` package can be imported (assumes the package lives one level
# above the working directory — confirm against the repo layout).
sys.path.append(os.path.join('..'))
from recommender.model import Embedder, Recommender

In [3]:
# Show INFO-level records so the progress/timing messages logged during
# training and embedding appear in the cell output.
logging.basicConfig(level=logging.INFO)

In [4]:
# Load the credits table and drop its 'title' column (presumably so it does
# not clash with the movies table's own 'title' column when joining).
credits = pd.read_csv(
    os.path.join('data', 'm-st5000', 'tmdb_5000_credits.csv'),
    index_col='movie_id',
).drop(columns=['title'])

# Load the movies table, attach the credits columns by index, then shuffle
# all rows with a fixed seed so the later positional split is reproducible.
movies = (
    pd.read_csv(os.path.join('data', 'm-st5000', 'tmdb_5000_movies.csv'), index_col='id')
    .join(credits)
    .sample(frac=1, random_state=1)
)

# The standalone credits frame is not needed once merged into `movies`.
del credits

In [5]:
# Positional split of the shuffled frame: the first 4000 rows form the
# training set, every remaining row is held out.
train = movies.iloc[:4000]
test = movies.iloc[4000:]

In [6]:
# Vocabulary size passed to Embedder.train in every embedding_dim experiment
# below, so that only the embedding dimension varies between runs.
VOCAB_SIZE = 5000

In [7]:
def calc_density(sparse_mat):
    """Return the fraction of cells in `sparse_mat` that are stored entries.

    Args:
        sparse_mat: Any object exposing `nnz` (number of stored entries) and
            `shape` (iterable of dimension sizes), e.g. a SciPy sparse matrix.

    Returns:
        float: `nnz` divided by the total number of cells. A matrix with no
        cells (some dimension of size 0) has density 0.0.
    """
    # Multiply with Python ints instead of np.prod: arbitrary precision rules
    # out silent fixed-width integer overflow for very large shapes.
    total_cells = 1
    for dim in sparse_mat.shape:
        total_cells *= int(dim)

    # An empty matrix would otherwise produce 0/0.
    if total_cells == 0:
        return 0.0
    return sparse_mat.nnz / total_cells

## `embedding_dim = 1024`

In [8]:
# Train a fresh Embedder on the training split, reducing to 1024 dimensions.
embedder = Embedder()
embedder.train(train, vocab_size=VOCAB_SIZE, embedding_dim=1024)

INFO:root:Embedder in [train] mode.
INFO:root:Preprocessing 'genres'
INFO:root:Encoding 'genres'
INFO:root:Preprocessing 'overview'
INFO:root:Vectorizing 'overview'...
INFO:root:Vectorizing 'overview' took 18.585 s, for a vocabulary size of 5000.
INFO:root:Stacking vectorized overview onto encoded 'genres'
INFO:root:Reducing dimension to 1024...
INFO:root:Dimensionality reduction took 68 s.


In [9]:
# Embed every movie (training and held-out rows) with the 1024-dimensional
# embedder; save_path=None presumably skips saving — confirm in Embedder.embed.
embeddings, ids = embedder.embed(movies, save_path=None)

INFO:root:Embedder in [infer] mode.
INFO:root:Preprocessing 'genres'
INFO:root:Encoding 'genres'
INFO:root:Preprocessing 'overview'
INFO:root:Vectorizing 'overview'...
INFO:root:Vectorizing 'overview' took 19.5326 s, for a vocabulary size of 5000.
INFO:root:Stacking vectorized overview onto encoded 'genres'
INFO:root:Reducing dimension to 1024...
INFO:root:Dimensionality reduction took 0 s.
INFO:root:Sparsifying embeddings...
INFO:root:Sparsifying embeddings took 0 s.


In [10]:
# Build the recommender from the 1024-dimensional embeddings and their ids.
rec = Recommender(embeddings, ids)

In [11]:
# 674 selects "Harry Potter and the Goblet of Fire" (see this cell's output);
# movies['title'] is presumably used to look up the titles that are printed.
rec.recommend_pprint(674, movies['title'])

Top 10 similar movies to "Harry Potter and the Goblet of Fire":
* Harry Potter and the Prisoner of Azkaban
* Harry Potter and the Chamber of Secrets
* Harry Potter and the Philosopher's Stone
* Harry Potter and the Half-Blood Prince
* The Chronicles of Narnia: The Voyage of the Dawn Treader
* The Chronicles of Narnia: Prince Caspian
* The Chronicles of Narnia: The Lion, the Witch and the Wardrobe
* Pete's Dragon
* Pan
* Oz: The Great and Powerful


In [12]:
# 10193 selects "Toy Story 3" (see this cell's output);
# movies['title'] is presumably used to look up the titles that are printed.
rec.recommend_pprint(10193, movies['title'])

Top 10 similar movies to "Toy Story 3":
* Toy Story
* Toy Story 2
* Free Birds
* Doug's 1st Movie
* Meet the Deedles
* Barnyard
* The Simpsons Movie
* Looney Tunes: Back in Action
* Garfield: A Tail of Two Kitties
* The SpongeBob SquarePants Movie


In [13]:
# Report stored entries (nnz) as a percentage of all cells in the
# 1024-dimensional embedding matrix, rounded to 4 decimal places.
print(f'Density = {np.round(calc_density(embeddings) * 100, 4)}%')

Density = 100.0%


## `embedding_dim = 512`

In [14]:
# Train a fresh Embedder on the training split, reducing to 512 dimensions.
embedder = Embedder()
embedder.train(train, vocab_size=VOCAB_SIZE, embedding_dim=512)

INFO:root:Embedder in [train] mode.
INFO:root:Preprocessing 'genres'
INFO:root:Encoding 'genres'
INFO:root:Preprocessing 'overview'
INFO:root:Vectorizing 'overview'...
INFO:root:Vectorizing 'overview' took 22.6051 s, for a vocabulary size of 5000.
INFO:root:Stacking vectorized overview onto encoded 'genres'
INFO:root:Reducing dimension to 512...
INFO:root:Dimensionality reduction took 17 s.


In [15]:
# Embed every movie (training and held-out rows) with the 512-dimensional
# embedder; save_path=None presumably skips saving — confirm in Embedder.embed.
embeddings, ids = embedder.embed(movies, save_path=None)

INFO:root:Embedder in [infer] mode.
INFO:root:Preprocessing 'genres'
INFO:root:Encoding 'genres'
INFO:root:Preprocessing 'overview'
INFO:root:Vectorizing 'overview'...
INFO:root:Vectorizing 'overview' took 19.5244 s, for a vocabulary size of 5000.
INFO:root:Stacking vectorized overview onto encoded 'genres'
INFO:root:Reducing dimension to 512...
INFO:root:Dimensionality reduction took 0 s.
INFO:root:Sparsifying embeddings...
INFO:root:Sparsifying embeddings took 0 s.


In [16]:
# Build the recommender from the 512-dimensional embeddings and their ids.
rec = Recommender(embeddings, ids)

In [17]:
# 674 selects "Harry Potter and the Goblet of Fire" (see this cell's output);
# movies['title'] is presumably used to look up the titles that are printed.
rec.recommend_pprint(674, movies['title'])

Top 10 similar movies to "Harry Potter and the Goblet of Fire":
* Harry Potter and the Prisoner of Azkaban
* Harry Potter and the Chamber of Secrets
* Harry Potter and the Philosopher's Stone
* Harry Potter and the Half-Blood Prince
* The Chronicles of Narnia: Prince Caspian
* The Chronicles of Narnia: The Lion, the Witch and the Wardrobe
* The Chronicles of Narnia: The Voyage of the Dawn Treader
* Oz: The Great and Powerful
* Pete's Dragon
* Pan


In [18]:
# 10193 selects "Toy Story 3" (see this cell's output);
# movies['title'] is presumably used to look up the titles that are printed.
rec.recommend_pprint(10193, movies['title'])

Top 10 similar movies to "Toy Story 3":
* Toy Story 2
* Toy Story
* Doug's 1st Movie
* Garfield: A Tail of Two Kitties
* The Simpsons Movie
* Meet the Deedles
* Looney Tunes: Back in Action
* Free Birds
* Hoodwinked!
* Barnyard


In [19]:
# Report stored entries (nnz) as a percentage of all cells in the
# 512-dimensional embedding matrix, rounded to 4 decimal places.
print(f'Density = {np.round(calc_density(embeddings) * 100, 4)}%')

Density = 100.0%


## `embedding_dim = 256`

In [20]:
# Train a fresh Embedder on the training split, reducing to 256 dimensions.
embedder = Embedder()
embedder.train(train, vocab_size=VOCAB_SIZE, embedding_dim=256)

INFO:root:Embedder in [train] mode.
INFO:root:Preprocessing 'genres'
INFO:root:Encoding 'genres'
INFO:root:Preprocessing 'overview'
INFO:root:Vectorizing 'overview'...
INFO:root:Vectorizing 'overview' took 15.1989 s, for a vocabulary size of 5000.
INFO:root:Stacking vectorized overview onto encoded 'genres'
INFO:root:Reducing dimension to 256...
INFO:root:Dimensionality reduction took 4 s.


In [21]:
# Embed every movie (training and held-out rows) with the 256-dimensional
# embedder; save_path=None presumably skips saving — confirm in Embedder.embed.
embeddings, ids = embedder.embed(movies, save_path=None)

INFO:root:Embedder in [infer] mode.
INFO:root:Preprocessing 'genres'
INFO:root:Encoding 'genres'
INFO:root:Preprocessing 'overview'
INFO:root:Vectorizing 'overview'...
INFO:root:Vectorizing 'overview' took 17.9241 s, for a vocabulary size of 5000.
INFO:root:Stacking vectorized overview onto encoded 'genres'
INFO:root:Reducing dimension to 256...
INFO:root:Dimensionality reduction took 0 s.
INFO:root:Sparsifying embeddings...
INFO:root:Sparsifying embeddings took 0 s.


In [22]:
# Build the recommender from the 256-dimensional embeddings and their ids.
rec = Recommender(embeddings, ids)

In [23]:
# 674 selects "Harry Potter and the Goblet of Fire" (see this cell's output);
# movies['title'] is presumably used to look up the titles that are printed.
rec.recommend_pprint(674, movies['title'])

Top 10 similar movies to "Harry Potter and the Goblet of Fire":
* Harry Potter and the Prisoner of Azkaban
* The Chronicles of Narnia: Prince Caspian
* Harry Potter and the Chamber of Secrets
* Harry Potter and the Philosopher's Stone
* The Chronicles of Narnia: The Lion, the Witch and the Wardrobe
* Oz: The Great and Powerful
* The Chronicles of Narnia: The Voyage of the Dawn Treader
* Pete's Dragon
* Harry Potter and the Half-Blood Prince
* The Adventurer: The Curse of the Midas Box


In [24]:
# 10193 selects "Toy Story 3" (see this cell's output);
# movies['title'] is presumably used to look up the titles that are printed.
rec.recommend_pprint(10193, movies['title'])

Top 10 similar movies to "Toy Story 3":
* Toy Story 2
* Toy Story
* The Simpsons Movie
* Garfield: A Tail of Two Kitties
* Looney Tunes: Back in Action
* Doug's 1st Movie
* Meet the Deedles
* Hoodwinked!
* Chicken Run
* Hoodwinked Too! Hood VS. Evil


In [25]:
# Report stored entries (nnz) as a percentage of all cells in the
# 256-dimensional embedding matrix, rounded to 4 decimal places.
print(f'Density = {np.round(calc_density(embeddings) * 100, 4)}%')

Density = 100.0%


## `embedding_dim = 128`

In [26]:
# Train a fresh Embedder on the training split, reducing to 128 dimensions.
embedder = Embedder()
embedder.train(train, vocab_size=VOCAB_SIZE, embedding_dim=128)

INFO:root:Embedder in [train] mode.
INFO:root:Preprocessing 'genres'
INFO:root:Encoding 'genres'
INFO:root:Preprocessing 'overview'
INFO:root:Vectorizing 'overview'...
INFO:root:Vectorizing 'overview' took 15.3339 s, for a vocabulary size of 5000.
INFO:root:Stacking vectorized overview onto encoded 'genres'
INFO:root:Reducing dimension to 128...
INFO:root:Dimensionality reduction took 1 s.


In [27]:
# Embed every movie (training and held-out rows) with the 128-dimensional
# embedder; save_path=None presumably skips saving — confirm in Embedder.embed.
embeddings, ids = embedder.embed(movies, save_path=None)

INFO:root:Embedder in [infer] mode.
INFO:root:Preprocessing 'genres'
INFO:root:Encoding 'genres'
INFO:root:Preprocessing 'overview'
INFO:root:Vectorizing 'overview'...
INFO:root:Vectorizing 'overview' took 17.7321 s, for a vocabulary size of 5000.
INFO:root:Stacking vectorized overview onto encoded 'genres'
INFO:root:Reducing dimension to 128...
INFO:root:Dimensionality reduction took 0 s.
INFO:root:Sparsifying embeddings...
INFO:root:Sparsifying embeddings took 0 s.


In [28]:
# Build the recommender from the 128-dimensional embeddings and their ids.
rec = Recommender(embeddings, ids)

In [29]:
# 674 selects "Harry Potter and the Goblet of Fire" (see this cell's output);
# movies['title'] is presumably used to look up the titles that are printed.
rec.recommend_pprint(674, movies['title'])

Top 10 similar movies to "Harry Potter and the Goblet of Fire":
* The Chronicles of Narnia: Prince Caspian
* The BFG
* Harry Potter and the Prisoner of Azkaban
* Oz: The Great and Powerful
* The Chronicles of Narnia: The Lion, the Witch and the Wardrobe
* Alice in Wonderland
* The Chronicles of Narnia: The Voyage of the Dawn Treader
* Harry Potter and the Chamber of Secrets
* Pete's Dragon
* Percy Jackson & the Olympians: The Lightning Thief


In [30]:
# 10193 selects "Toy Story 3" (see this cell's output);
# movies['title'] is presumably used to look up the titles that are printed.
rec.recommend_pprint(10193, movies['title'])

Top 10 similar movies to "Toy Story 3":
* Toy Story 2
* Toy Story
* Looney Tunes: Back in Action
* Garfield: A Tail of Two Kitties
* Doug's 1st Movie
* Hoodwinked!
* The Simpsons Movie
* Barnyard
* Meet the Deedles
* Cloudy with a Chance of Meatballs


In [31]:
# Report stored entries (nnz) as a percentage of all cells in the
# 128-dimensional embedding matrix, rounded to 4 decimal places.
print(f'Density = {np.round(calc_density(embeddings) * 100, 4)}%')

Density = 100.0%


## `embedding_dim = 64`

In [32]:
# Train a fresh Embedder on the training split, reducing to 64 dimensions.
embedder = Embedder()
embedder.train(train, vocab_size=VOCAB_SIZE, embedding_dim=64)

INFO:root:Embedder in [train] mode.
INFO:root:Preprocessing 'genres'
INFO:root:Encoding 'genres'
INFO:root:Preprocessing 'overview'
INFO:root:Vectorizing 'overview'...
INFO:root:Vectorizing 'overview' took 14.6883 s, for a vocabulary size of 5000.
INFO:root:Stacking vectorized overview onto encoded 'genres'
INFO:root:Reducing dimension to 64...
INFO:root:Dimensionality reduction took 0 s.


In [33]:
# Embed every movie (training and held-out rows) with the 64-dimensional
# embedder; save_path=None presumably skips saving — confirm in Embedder.embed.
embeddings, ids = embedder.embed(movies, save_path=None)

INFO:root:Embedder in [infer] mode.
INFO:root:Preprocessing 'genres'
INFO:root:Encoding 'genres'
INFO:root:Preprocessing 'overview'
INFO:root:Vectorizing 'overview'...
INFO:root:Vectorizing 'overview' took 19.1767 s, for a vocabulary size of 5000.
INFO:root:Stacking vectorized overview onto encoded 'genres'
INFO:root:Reducing dimension to 64...
INFO:root:Dimensionality reduction took 0 s.
INFO:root:Sparsifying embeddings...
INFO:root:Sparsifying embeddings took 0 s.


In [34]:
# Build the recommender from the 64-dimensional embeddings and their ids.
rec = Recommender(embeddings, ids)

In [35]:
# 674 selects "Harry Potter and the Goblet of Fire" (see this cell's output);
# movies['title'] is presumably used to look up the titles that are printed.
rec.recommend_pprint(674, movies['title'])

Top 10 similar movies to "Harry Potter and the Goblet of Fire":
* The BFG
* The Chronicles of Narnia: Prince Caspian
* Oz: The Great and Powerful
* The Chronicles of Narnia: The Lion, the Witch and the Wardrobe
* Alice in Wonderland
* Harry Potter and the Prisoner of Azkaban
* The Chronicles of Narnia: The Voyage of the Dawn Treader
* Harry Potter and the Philosopher's Stone
* Pete's Dragon
* Pan


In [36]:
# 10193 selects "Toy Story 3" (see this cell's output);
# movies['title'] is presumably used to look up the titles that are printed.
rec.recommend_pprint(10193, movies['title'])

Top 10 similar movies to "Toy Story 3":
* Garfield
* Toy Story
* Garfield: A Tail of Two Kitties
* Meet the Deedles
* Toy Story 2
* Looney Tunes: Back in Action
* Hoodwinked!
* Barnyard
* Doug's 1st Movie
* Cloudy with a Chance of Meatballs


In [37]:
# Report stored entries (nnz) as a percentage of all cells in the
# 64-dimensional embedding matrix, rounded to 4 decimal places.
print(f'Density = {np.round(calc_density(embeddings) * 100, 4)}%')

Density = 100.0%


## `embedding_dim = 32`

In [38]:
# Train a fresh Embedder on the training split, reducing to 32 dimensions.
embedder = Embedder()
embedder.train(train, vocab_size=VOCAB_SIZE, embedding_dim=32)

INFO:root:Embedder in [train] mode.
INFO:root:Preprocessing 'genres'
INFO:root:Encoding 'genres'
INFO:root:Preprocessing 'overview'
INFO:root:Vectorizing 'overview'...
INFO:root:Vectorizing 'overview' took 23.5811 s, for a vocabulary size of 5000.
INFO:root:Stacking vectorized overview onto encoded 'genres'
INFO:root:Reducing dimension to 32...
INFO:root:Dimensionality reduction took 2 s.


In [39]:
# Embed every movie (training and held-out rows) with the 32-dimensional
# embedder; save_path=None presumably skips saving — confirm in Embedder.embed.
embeddings, ids = embedder.embed(movies, save_path=None)

INFO:root:Embedder in [infer] mode.
INFO:root:Preprocessing 'genres'
INFO:root:Encoding 'genres'
INFO:root:Preprocessing 'overview'
INFO:root:Vectorizing 'overview'...
INFO:root:Vectorizing 'overview' took 21.5968 s, for a vocabulary size of 5000.
INFO:root:Stacking vectorized overview onto encoded 'genres'
INFO:root:Reducing dimension to 32...
INFO:root:Dimensionality reduction took 0 s.
INFO:root:Sparsifying embeddings...
INFO:root:Sparsifying embeddings took 0 s.


In [40]:
# Build the recommender from the 32-dimensional embeddings and their ids.
rec = Recommender(embeddings, ids)

In [41]:
# 674 selects "Harry Potter and the Goblet of Fire" (see this cell's output);
# movies['title'] is presumably used to look up the titles that are printed.
rec.recommend_pprint(674, movies['title'])

Top 10 similar movies to "Harry Potter and the Goblet of Fire":
* The Chronicles of Narnia: The Lion, the Witch and the Wardrobe
* Harry Potter and the Prisoner of Azkaban
* The BFG
* Oz: The Great and Powerful
* Alice in Wonderland
* Harry Potter and the Chamber of Secrets
* Percy Jackson: Sea of Monsters
* The Chronicles of Narnia: Prince Caspian
* Harry Potter and the Half-Blood Prince
* The Chronicles of Narnia: The Voyage of the Dawn Treader


In [42]:
# 10193 selects "Toy Story 3" (see this cell's output);
# movies['title'] is presumably used to look up the titles that are printed.
rec.recommend_pprint(10193, movies['title'])

Top 10 similar movies to "Toy Story 3":
* Garfield
* Doug's 1st Movie
* Animals United
* Chicken Run
* Barnyard
* Toy Story 2
* Toy Story
* Hoodwinked!
* Garfield: A Tail of Two Kitties
* Meet the Deedles


In [43]:
# Report stored entries (nnz) as a percentage of all cells in the
# 32-dimensional embedding matrix, rounded to 4 decimal places.
print(f'Density = {np.round(calc_density(embeddings) * 100, 4)}%')

Density = 100.0%


## `embedding_dim = 16`

In [44]:
# Train a fresh Embedder on the training split, reducing to 16 dimensions.
embedder = Embedder()
embedder.train(train, vocab_size=VOCAB_SIZE, embedding_dim=16)

INFO:root:Embedder in [train] mode.
INFO:root:Preprocessing 'genres'
INFO:root:Encoding 'genres'
INFO:root:Preprocessing 'overview'
INFO:root:Vectorizing 'overview'...
INFO:root:Vectorizing 'overview' took 13.6423 s, for a vocabulary size of 5000.
INFO:root:Stacking vectorized overview onto encoded 'genres'
INFO:root:Reducing dimension to 16...
INFO:root:Dimensionality reduction took 0 s.


In [45]:
# Embed every movie (training and held-out rows) with the 16-dimensional
# embedder; save_path=None presumably skips saving — confirm in Embedder.embed.
embeddings, ids = embedder.embed(movies, save_path=None)

INFO:root:Embedder in [infer] mode.
INFO:root:Preprocessing 'genres'
INFO:root:Encoding 'genres'
INFO:root:Preprocessing 'overview'
INFO:root:Vectorizing 'overview'...
INFO:root:Vectorizing 'overview' took 16.5206 s, for a vocabulary size of 5000.
INFO:root:Stacking vectorized overview onto encoded 'genres'
INFO:root:Reducing dimension to 16...
INFO:root:Dimensionality reduction took 0 s.
INFO:root:Sparsifying embeddings...
INFO:root:Sparsifying embeddings took 0 s.


In [46]:
# Build the recommender from the 16-dimensional embeddings and their ids.
rec = Recommender(embeddings, ids)

In [47]:
# 674 selects "Harry Potter and the Goblet of Fire" (see this cell's output);
# movies['title'] is presumably used to look up the titles that are printed.
rec.recommend_pprint(674, movies['title'])

Top 10 similar movies to "Harry Potter and the Goblet of Fire":
* The BFG
* The Spiderwick Chronicles
* The Adventurer: The Curse of the Midas Box
* Harry Potter and the Prisoner of Azkaban
* Harry Potter and the Chamber of Secrets
* Oz: The Great and Powerful
* The Chronicles of Narnia: The Voyage of the Dawn Treader
* The Indian in the Cupboard
* Alice in Wonderland
* The Chronicles of Narnia: The Lion, the Witch and the Wardrobe


In [48]:
# 10193 selects "Toy Story 3" (see this cell's output);
# movies['title'] is presumably used to look up the titles that are printed.
rec.recommend_pprint(10193, movies['title'])

Top 10 similar movies to "Toy Story 3":
* Monsters, Inc.
* Doug's 1st Movie
* Barnyard
* Over the Hedge
* Surf's Up
* Frankenweenie
* The SpongeBob SquarePants Movie
* Hoodwinked Too! Hood VS. Evil
* Toy Story
* The Simpsons Movie


In [49]:
# Report stored entries (nnz) as a percentage of all cells in the
# 16-dimensional embedding matrix, rounded to 4 decimal places.
print(f'Density = {np.round(calc_density(embeddings) * 100, 4)}%')

Density = 100.0%


## `embedding_dim = 8`

In [50]:
# Train a fresh Embedder on the training split, reducing to 8 dimensions.
embedder = Embedder()
embedder.train(train, vocab_size=VOCAB_SIZE, embedding_dim=8)

INFO:root:Embedder in [train] mode.
INFO:root:Preprocessing 'genres'
INFO:root:Encoding 'genres'
INFO:root:Preprocessing 'overview'
INFO:root:Vectorizing 'overview'...
INFO:root:Vectorizing 'overview' took 13.6143 s, for a vocabulary size of 5000.
INFO:root:Stacking vectorized overview onto encoded 'genres'
INFO:root:Reducing dimension to 8...
INFO:root:Dimensionality reduction took 0 s.


In [51]:
# Embed every movie (training and held-out rows) with the 8-dimensional
# embedder; save_path=None presumably skips saving — confirm in Embedder.embed.
embeddings, ids = embedder.embed(movies, save_path=None)

INFO:root:Embedder in [infer] mode.
INFO:root:Preprocessing 'genres'
INFO:root:Encoding 'genres'
INFO:root:Preprocessing 'overview'
INFO:root:Vectorizing 'overview'...
INFO:root:Vectorizing 'overview' took 16.2972 s, for a vocabulary size of 5000.
INFO:root:Stacking vectorized overview onto encoded 'genres'
INFO:root:Reducing dimension to 8...
INFO:root:Dimensionality reduction took 0 s.
INFO:root:Sparsifying embeddings...
INFO:root:Sparsifying embeddings took 0 s.


In [52]:
# Build the recommender from the 8-dimensional embeddings and their ids.
rec = Recommender(embeddings, ids)

In [53]:
# 674 selects "Harry Potter and the Goblet of Fire" (see this cell's output);
# movies['title'] is presumably used to look up the titles that are printed.
rec.recommend_pprint(674, movies['title'])

Top 10 similar movies to "Harry Potter and the Goblet of Fire":
* The Chronicles of Narnia: The Lion, the Witch and the Wardrobe
* The Chronicles of Narnia: The Voyage of the Dawn Treader
* The BFG
* The Spiderwick Chronicles
* Percy Jackson & the Olympians: The Lightning Thief
* Alice in Wonderland
* Oz: The Great and Powerful
* Harry Potter and the Chamber of Secrets
* Harry Potter and the Prisoner of Azkaban
* Percy Jackson: Sea of Monsters


In [54]:
# 10193 selects "Toy Story 3" (see this cell's output);
# movies['title'] is presumably used to look up the titles that are printed.
rec.recommend_pprint(10193, movies['title'])

Top 10 similar movies to "Toy Story 3":
* Frankenweenie
* Cloudy with a Chance of Meatballs
* Looney Tunes: Back in Action
* Surf's Up
* Happy Feet Two
* Hop
* Barnyard
* Monsters, Inc.
* Doug's 1st Movie
* Toy Story 2


In [55]:
# Report stored entries (nnz) as a percentage of all cells in the
# 8-dimensional embedding matrix, rounded to 4 decimal places.
print(f'Density = {np.round(calc_density(embeddings) * 100, 4)}%')

Density = 100.0%
