In [1]:
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
import pandas as pd
import numpy as np

In [2]:
# Load the Word2Vec model stored at "cinema-rec.model" (path is relative to the
# notebook's working directory).
# NOTE(review): `model` is not referenced by any later cell shown in this
# notebook — confirm it is still needed before keeping this load.
model = Word2Vec.load("cinema-rec.model")

In [3]:
# Read the movies export into a DataFrame; a later cell uses its "Title" and
# "count_embedding" columns.
movies_we = pd.read_json("movies_we.json")

In [8]:
# Lookup table from movie title to that row's "count_embedding" value.
# When a title occurs on several rows, the last row's embedding is the one kept.
movies_we_dict = {
    title: embedding
    for title, embedding in zip(movies_we["Title"], movies_we["count_embedding"])
}

In [4]:
import json

# Load the movies export as plain Python dicts. The encoding is spelled out so
# the read does not depend on the platform's default text encoding.
with open('movies_we.json', 'r', encoding='utf-8') as f:
    embeddings_dict = json.load(f)

# Each column is a mapping of record key -> value; taking the values in stored
# order gives one title and one embedding per record.
titles = list(embeddings_dict["Title"].values())
count_embeddings_matrix = np.array(list(embeddings_dict["count_embedding"].values()))

# find_top_similar_titles pairs titles[i] with row i of the matrix, so both
# columns must contain the same number of records.
assert len(titles) == len(count_embeddings_matrix), (
    "movies_we.json: 'Title' and 'count_embedding' have different lengths"
)

def find_top_similar_titles(query_title, top_n=5):
    """Rank titles by cosine similarity to `query_title` and return the best ones.

    Works on the module-level `titles` list and `count_embeddings_matrix`,
    where row i of the matrix is used as the embedding of `titles[i]`.

    Args:
        query_title: Title to look up in `titles` (exact match).
        top_n: How many neighbours to keep from the ranked list.

    Returns:
        A list of `(title, cosine_similarity)` tuples, highest similarity
        first. Entries whose title equals `query_title` are skipped.

    Raises:
        ValueError: If `query_title` does not appear in `titles`.
    """
    # Locate the query row and score it against every row in one call.
    anchor_row = count_embeddings_matrix[titles.index(query_title)]
    scores = cosine_similarity([anchor_row], count_embeddings_matrix)[0]

    # argsort is ascending, so walk it reversed to get the most similar first.
    ranked = []
    for idx in np.argsort(scores)[::-1]:
        if titles[idx] == query_title:
            continue  # drop the query itself (and any identically named entry)
        ranked.append((titles[idx], scores[idx]))

    return ranked[:top_n]

In [7]:
# Example lookup against the movie embeddings; prints one "title : score" line
# per neighbour returned.
query_title = "No Country for Old Men"
top_similar_titles = find_top_similar_titles(query_title)

for title, similarity_score in top_similar_titles:
    print(title, similarity_score, sep=" : ")

The Hollow Point : 0.9375156571975086
Death Wish II : 0.9319400240344249
Manhunter : 0.9312630427929911
Unforgiven : 0.9304567567735813
The Purple Gang : 0.9294734692889288


In [18]:
# Load the shows export as plain Python dicts. The encoding is spelled out so
# the read does not depend on the platform's default text encoding.
with open('shows_we.json', 'r', encoding='utf-8') as f:
    embeddings_dict = json.load(f)

# From here on `titles` holds show titles and `embeddings_matrix` their
# "count_embedding" vectors, one per record in stored order.
titles = list(embeddings_dict["Title"].values())
embeddings_matrix = np.array(list(embeddings_dict["count_embedding"].values()))

# find_top_similar_titles pairs titles[i] with row i of the matrix, so both
# columns must contain the same number of records.
assert len(titles) == len(embeddings_matrix), (
    "shows_we.json: 'Title' and 'count_embedding' have different lengths"
)


def find_top_similar_titles(query_title, top_n=5):
    """Return the `top_n` titles nearest to `query_title` by Euclidean distance.

    Reads the module-level `titles` list and `embeddings_matrix`; row i of the
    matrix is used as the embedding of `titles[i]`.

    Args:
        query_title: Title to look up in `titles` (exact match).
        top_n: Maximum number of neighbours to return.

    Returns:
        A list of `(title, distance)` tuples ordered from the smallest to the
        largest distance, so a smaller value means a closer match. Entries
        whose title equals `query_title` are left out.

    Raises:
        ValueError: If `query_title` is not present in `titles`.
    """
    try:
        query_index = titles.index(query_title)
    except ValueError:
        # Same exception type as list.index, but say which title was missing.
        raise ValueError(
            "Title {!r} was not found in `titles`".format(query_title)
        ) from None

    query_embedding = embeddings_matrix[query_index]

    # Euclidean distance from the query embedding to every row (itself included).
    distances = euclidean_distances([query_embedding], embeddings_matrix)[0]

    # argsort is ascending, which for distances means nearest first.
    nearest_first = np.argsort(distances)
    ranked_titles = [
        (titles[i], distances[i]) for i in nearest_first if titles[i] != query_title
    ]

    return ranked_titles[:top_n]

In [19]:
# Example lookup against the show embeddings. The function in force at this
# point ranks by Euclidean distance, so the printed number is a distance:
# lower means closer.
query_title = "Better Call Saul"
top_similar_titles = find_top_similar_titles(query_title)

for title, distance in top_similar_titles:
    print(title, distance, sep=" : ")

Baretta : 1.0995543216676753
Nero Wolfe : 1.1605278541044688
Hagen : 1.1847008087042445
Island Son : 1.189776767237423
Ramar of the Jungle : 1.1920142290152533


## Save Matrices

In [20]:
# Re-read the movies export to build both embedding matrices for saving.
with open('movies_we.json', 'r', encoding='utf-8') as f:
    movies_embeddings_dict = json.load(f)

# Stored under its own name on purpose: at this point the global `titles` and
# `embeddings_matrix` describe shows, and find_top_similar_titles reads both.
# Overwriting `titles` with movie titles here would pair movie names with show
# embeddings if that function were called again.
movies_titles = list(movies_embeddings_dict["Title"].values())
movies_count_embeddings_matrix = np.array(list(movies_embeddings_dict["count_embedding"].values()))
movies_tfidf_embeddings_matrix = np.array(list(movies_embeddings_dict["tfidf_embedding"].values()))

# Both matrices are expected to have one row per title.
assert len(movies_titles) == len(movies_count_embeddings_matrix) == len(movies_tfidf_embeddings_matrix), (
    "movies_we.json: 'Title', 'count_embedding' and 'tfidf_embedding' have different lengths"
)

In [21]:
# Re-read the shows export to build both embedding matrices for saving.
with open('shows_we.json', 'r', encoding='utf-8') as f:
    shows_embeddings_dict = json.load(f)

# `titles` is (re)assigned to the show titles here, which keeps it in step with
# the shows-based `embeddings_matrix` that find_top_similar_titles reads.
titles = list(shows_embeddings_dict["Title"].values())
shows_count_embeddings_matrix = np.array(list(shows_embeddings_dict["count_embedding"].values()))
shows_tfidf_embeddings_matrix = np.array(list(shows_embeddings_dict["tfidf_embedding"].values()))

# Both matrices are expected to have one row per title.
assert len(titles) == len(shows_count_embeddings_matrix) == len(shows_tfidf_embeddings_matrix), (
    "shows_we.json: 'Title', 'count_embedding' and 'tfidf_embedding' have different lengths"
)

In [23]:
# Write each embedding matrix to its own file, in the order listed.
# NOTE: ndarray.dump stores the array as a pickle, so these files should only
# be loaded back from a trusted location.
for _out_path, _matrix in (
    ("movies_count_embedding_matrix.dat", movies_count_embeddings_matrix),
    ("movies_tfidf_embedding_matrix.dat", movies_tfidf_embeddings_matrix),
    ("shows_count_embedding_matrix.dat", shows_count_embeddings_matrix),
    ("shows_tfidf_embedding_matrix.dat", shows_tfidf_embeddings_matrix),
):
    _matrix.dump(_out_path)

