# Testing performance on out-of-sample data

In [1]:
import logging
import os
import sys

import numpy as np
import pandas as pd

In [2]:
# Make the parent directory importable so the local `recommender` package
# resolves. The membership guard keeps this cell idempotent: re-running it no
# longer appends a duplicate entry to sys.path on every execution.
if os.pardir not in sys.path:
    sys.path.append(os.pardir)
from recommender.model import Embedder, Recommender

In [3]:
# Log at INFO level so the progress messages emitted during training and
# embedding are shown in the cell output.
logging.basicConfig(level=logging.INFO)

In [4]:
# Read the credits table, keyed by movie id.
credits = pd.read_csv(
    os.path.join('data', 'm-st5000', 'tmdb_5000_credits.csv'),
    index_col='movie_id',
)

# Read the movies table, attach the credits columns by index (minus the
# credits `title` column, which the movies table already has), then shuffle
# the rows with a fixed seed so the split below is reproducible.
movies = (
    pd.read_csv(
        os.path.join('data', 'm-st5000', 'tmdb_5000_movies.csv'),
        index_col='id',
    )
    .join(credits.drop(columns=['title']))
    .sample(frac=1, random_state=1)
)

# The credits frame is no longer needed once merged.
del credits

In [5]:
# The first 4000 shuffled rows form the training split; every remaining row
# is held out as the out-of-sample set.
train, test = movies.iloc[:4000], movies.iloc[4000:]

In [6]:
# Fit the embedder on the training split only, so the held-out rows play no
# part in training.
# NOTE(review): embedding_dim=None presumably skips dimensionality reduction
# -- confirm against Embedder.train.
embedder = Embedder()
embedder.train(train, vocab_size=5000, embedding_dim=None)

INFO:root:Embedder in [train] mode.
INFO:root:Preprocessing 'genres'
INFO:root:Encoding 'genres'
INFO:root:Preprocessing 'overview'
INFO:root:Vectorizing 'overview'...
INFO:root:Vectorizing 'overview' took 18.802 s, for a vocabulary size of 5000.
INFO:root:Stacking vectorized overview onto encoded 'genres'


In [7]:
# Embed the full catalogue (training rows and held-out rows alike) with the
# already-trained embedder, so held-out movies can be queried below.
# NOTE(review): save_path=None presumably means nothing is written to disk
# -- confirm against Embedder.embed.
embeddings, ids = embedder.embed(movies, save_path=None)

INFO:root:Embedder in [infer] mode.
INFO:root:Preprocessing 'genres'
INFO:root:Encoding 'genres'
INFO:root:Preprocessing 'overview'
INFO:root:Vectorizing 'overview'...
INFO:root:Vectorizing 'overview' took 19.225 s, for a vocabulary size of 5000.
INFO:root:Stacking vectorized overview onto encoded 'genres'


In [8]:
# Build the recommender from the embeddings and the ids returned with them.
rec = Recommender(embeddings, ids)

In [9]:
# Check whether this movie id is in the training split; False means the query
# below is out-of-sample.
674 in train.index      # Harry Potter and the Goblet of Fire

False

In [10]:
# Print the most similar movies for id 674; the title column is passed along
# so the results are shown by name.
rec.recommend_pprint(674, movies['title'])

Top 10 similar movies to "Harry Potter and the Goblet of Fire":
* Harry Potter and the Prisoner of Azkaban
* Harry Potter and the Chamber of Secrets
* Harry Potter and the Half-Blood Prince
* Harry Potter and the Philosopher's Stone
* Percy Jackson: Sea of Monsters
* The Chronicles of Narnia: Prince Caspian
* Oz: The Great and Powerful
* Inkheart
* Harry Potter and the Order of the Phoenix
* City of Ember


Seems to generalize pretty well!

In [11]:
# Same check for a second query: False means this id is out-of-sample too.
10193 in train.index    # Toy Story 3

False

In [12]:
# Print the most similar movies for id 10193, shown by title.
rec.recommend_pprint(10193, movies['title'])

Top 10 similar movies to "Toy Story 3":
* Toy Story
* Toy Story 2
* The Simpsons Movie
* Chicken Little
* Hop
* Looney Tunes: Back in Action
* Monsters, Inc.
* Cloudy with a Chance of Meatballs 2
* Hotel Transylvania 2
* Chicken Run


## Was sparsifying worth it?

In [13]:
# Display the embedding matrix; its repr reports the shape, dtype, storage
# format and number of stored elements.
embeddings

<4803x5020 sparse matrix of type '<class 'numpy.float64'>'
	with 111031 stored elements in Compressed Sparse Row format>

In [14]:
# Total number of cells in the matrix (rows x columns).
np.prod(embeddings.shape)

24111060

In [15]:
# Number of elements actually stored in the sparse matrix.
embeddings.nnz

111031

In [16]:
# Density: stored elements as a percentage of all cells in the matrix.
density_pct = (embeddings.nnz / np.prod(embeddings.shape)) * 100
print(f'Density = {np.round(density_pct, 4)}%')

Density = 0.4605%


Although SVD (applied later) can increase density, keeping the data sparse is
still beneficial: the intermediate matrices produced along the way can be
large, and would occupy a lot of memory if they were stored densely.