In [70]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import json
import nltk
import math
import string

# Make sure the 'punkt' package that nltk's tokenizers rely on is available
# locally; the call reports "already up-to-date" when nothing needs fetching.
nltk.download('punkt')
# Remove pandas' cap on displayed columns so wide frames are shown in full.
pd.set_option('display.max_columns', None)  


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\brand\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Data Prepping

In [71]:
# Movie inputs: the TMDB movie export (CSV) plus movies.json.
# All paths are relative to the notebook's working directory.
tmdb_movies = pd.read_csv("TMDB_movie_dataset_v11.csv")
movies = pd.read_json("movies.json")

# Show inputs: the TMDB TV export (CSV) plus shows.json.
tmdb_shows = pd.read_csv("TMDB_tv_dataset_v3.csv")
shows = pd.read_json("shows.json")

In [72]:
# Movies: require the core text fields, discard rows carrying the "N/A"/"nan"
# placeholders or typed as a series, then keep the first row per title.
movies = movies.dropna(subset=["Title", "Plot", "Genre", "Type", "Director"])
movie_is_usable = (
    movies["Plot"].ne("N/A")
    & movies["Plot"].ne("nan")
    & movies["Title"].ne("N/A")
    & movies["Type"].ne("series")
    & movies["Director"].ne("N/A")
)
movies = movies.loc[movie_is_usable].drop_duplicates(subset=["Title"], keep="first")

# Shows: same idea, but no director is required and rows typed as a movie are
# the ones removed.
shows = shows.dropna(subset=["Title", "Plot", "Genre", "Type"])
show_is_usable = (
    shows["Plot"].ne("N/A")
    & shows["Plot"].ne("nan")
    & shows["Title"].ne("N/A")
    & shows["Type"].ne("movie")
)
shows = shows.loc[show_is_usable].drop_duplicates(subset=["Title"], keep="first")

In [73]:
# Attach TMDB metadata with left joins so every cleaned row is kept.
# Movies are matched on their IMDb id; shows are matched on the title text.
# NOTE(review): re-running this cell merges the TMDB columns a second time.
movies = movies.merge(
    tmdb_movies,
    how="left",
    left_on="imdbID",
    right_on="imdb_id",
)
shows = shows.merge(
    tmdb_shows,
    how="left",
    left_on="Title",
    right_on="name",
)

In [74]:
# Keep shows whose origin_country text mentions "US" or "JP".
# NOTE(review): rows without a TMDB match have a missing origin_country and
# are presumably dropped here as well -- confirm that is intended.
origin_country = shows["origin_country"]
mentions_us = origin_country.str.contains("US")
mentions_jp = origin_country.str.contains("JP")
shows = shows[mentions_us | mentions_jp]

# Keep movies whose original language is English.
is_english = movies["original_language"] == "en"
movies = movies[is_english]

In [75]:
def _describe_movie(row):
    """Return the text that gets tokenized and embedded for one movie row."""
    # NOTE(review): a missing `keywords` value is rendered as the literal text
    # "nan" by the f-string -- confirm whether such rows exist.
    return f"A {row['Genre']} {row['Type']} directed by {row['Director']} with keywords consisting of {row['keywords']}. {row['Plot']}"


def _describe_show(row):
    """Return the text that gets tokenized and embedded for one show row."""
    return f"A {row['Genre']} {row['Type']}. {row['Plot']}"


# One free-text description per row: a short metadata sentence followed by the plot.
movies["description"] = movies.apply(_describe_movie, axis=1)
shows["description"] = shows.apply(_describe_show, axis=1)

In [7]:
# Row counts after filtering: movies first, then shows.
for frame in (movies, shows):
    print(len(frame))

194482
17751


## Tokenize the Corpora (Movies and Shows Separately)

In [10]:
def tokenize(document):
    '''
    Split a document into lower-cased word tokens with punctuation removed.

    The document is split into sentences first. Every character listed in
    ``string.punctuation`` is deleted from a sentence before that sentence is
    split into words, and the words of all sentences are returned as one flat
    list in document order.

    Parameters
    ----------
    document : str
        The body of text to tokenize.

    Returns
    -------
    doc_tokens : list
        The lower-cased tokens of every sentence in the document.

    Example
    -------
    document = "A Crime, Drama, Thriller movie directed by Ethan Coen. It was a super good film."

    doc_tokens = tokenize(document)

    Will output:
        ['a', 'crime', 'drama', 'thriller', 'movie', 'directed', 'by', 'ethan', 'coen', 'it', 'was', 'a', 'super', 'good', 'film']
    '''
    # The deletion table is identical for every sentence, so build it once.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    return [
        token.lower()
        for sentence in nltk.sent_tokenize(document)
        for token in nltk.word_tokenize(sentence.translate(strip_punctuation))
        if token
    ]



In [11]:
# One token list per movie description, in the same order as `movies`.
documents = list(movies.description)

all_sentences_tokenized_movies = [tokenize(text) for text in tqdm(documents)]

100%|██████████| 194482/194482 [01:38<00:00, 1978.19it/s]


In [12]:
# One token list per show description, in the same order as `shows`.
documents = list(shows.description)

all_sentences_tokenized_shows = [tokenize(text) for text in tqdm(documents)]

  0%|          | 0/17751 [00:00<?, ?it/s]

100%|██████████| 17751/17751 [00:07<00:00, 2466.59it/s]


## Word2Vec Separate Counts Models

In [57]:
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
# Hyperparameters for the movie Word2Vec model.
sentences = all_sentences_tokenized_movies  # training corpus: one token list per movie
vector_size = 300  # embedding width; later cells allocate 300-wide arrays to match
window = 5
workers = 6  # worker threads used for training
hs = 1  # gensim: 1 selects hierarchical softmax
sg = 0  # gensim: 0 selects CBOW, 1 selects skip-gram
seed = 27
min_count = 1  # keep every token, even ones that occur only once
epochs = 250  # number of training passes; only takes effect if forwarded to Word2Vec(...)
compute_loss = True

In [15]:
# Train the movie Word2Vec model from the hyperparameters configured in the
# preceding cell.
model = Word2Vec(
    sentences=sentences,
    vector_size=vector_size,
    window=window,
    workers=workers,
    hs=hs,
    sg=sg,
    seed=seed,
    min_count=min_count,
    # `epochs` was configured but never forwarded, so training silently fell
    # back to gensim's default number of passes instead of the intended value.
    epochs=epochs,
    compute_loss=compute_loss
)

In [16]:
# Persist the movie Word2Vec model so it can be reloaded without retraining.
model.save("cinema-rec.model")

In [17]:
# Hyperparameters for the show Word2Vec model (same values as the movie
# model; only the corpus differs).
sentences = all_sentences_tokenized_shows  # training corpus: one token list per show
vector_size = 300  # embedding width; later cells allocate 300-wide arrays to match
window = 5
workers = 6  # worker threads used for training
hs = 1  # gensim: 1 selects hierarchical softmax
sg = 0  # gensim: 0 selects CBOW, 1 selects skip-gram
seed = 27
min_count = 1  # keep every token, even ones that occur only once
epochs = 250  # number of training passes; only takes effect if forwarded to Word2Vec(...)
compute_loss = True

In [18]:
# Train the show Word2Vec model from the hyperparameters configured in the
# preceding cell.
model2 = Word2Vec(
    sentences=sentences,
    vector_size=vector_size,
    window=window,
    workers=workers,
    hs=hs,
    sg=sg,
    seed=seed,
    min_count=min_count,
    # `epochs` was configured but never forwarded, so training silently fell
    # back to gensim's default number of passes instead of the intended value.
    epochs=epochs,
    compute_loss=compute_loss
)

In [19]:
# Persist the show Word2Vec model so it can be reloaded without retraining.
model2.save("show-rec.model")

In [76]:
# Reload both Word2Vec models from disk: `model` is the movie model and
# `model2` the show model. This lets the embedding cells below run without
# retraining.
model = Word2Vec.load("cinema-rec.model")
model2 = Word2Vec.load("show-rec.model")

In [21]:
def tokenize_join(document):
    '''
    Normalize a document into a single space-separated string of tokens.

    The document is split into sentences. Every character listed in
    ``string.punctuation`` is deleted from a sentence before that sentence is
    split into words; the words are lower-cased and the tokens of all
    sentences are joined with single spaces.

    Parameters
    ----------
    document : str
        The body of text to tokenize.

    Returns
    -------
    joined_tokens : str
        The lower-cased tokens of the document separated by single spaces
        (an empty string when the document yields no tokens).

    Example
    -------
    document = "A Crime, Drama, Thriller movie directed by Ethan Coen. It was a super good film."

    joined_tokens = tokenize_join(document)

    Will output:
        'a crime drama thriller movie directed by ethan coen it was a super good film'
    '''
    # The deletion table is identical for every sentence, so build it once.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    return " ".join(
        token.lower()
        for sentence in nltk.sent_tokenize(document)
        for token in nltk.word_tokenize(sentence.translate(strip_punctuation))
        if token
    )


In [22]:
# One normalized, space-joined token string per movie description.
all_sentences_tokenized_joined_movies = [
    tokenize_join(text) for text in tqdm(list(movies.description))
]

100%|██████████| 194482/194482 [01:38<00:00, 1974.56it/s]


In [None]:
# One normalized, space-joined token string per show description.
all_sentences_tokenized_joined_shows = [
    tokenize_join(text) for text in tqdm(list(shows.description))
]

### TF-IDF

In [None]:
# Fit TF-IDF on the raw movie descriptions. The vectorizer applies its own
# tokenization, which can differ from tokenize()/tokenize_join().
m_vectorizer = TfidfVectorizer()
tfidf_matrix = m_vectorizer.fit_transform(movies["description"])

# Map each vocabulary term to its inverse-document-frequency weight.
movies_tfidf_features = m_vectorizer.get_feature_names_out()
movies_tfidf_list = {
    term: idf
    for term, idf in zip(movies_tfidf_features, m_vectorizer.idf_)
}

In [65]:
# Initialize movie_tfidf_vectors: one row per movie, as wide as the Word2Vec
# embeddings (taken from the model instead of a hard-coded 300).
movie_tfidf_vectors = np.zeros((len(movies), model.wv.vector_size))

# Each document vector is the IDF-weighted average of its word vectors. A word
# contributes once per occurrence, so repeated words weigh more.
for i, description in tqdm(enumerate(all_sentences_tokenized_joined_movies), total=len(all_sentences_tokenized_joined_movies)):
    weight_total = 0.0

    for word in description.split():
        # Pair every word with its own weight. The previous version indexed
        # the unfiltered word list with a position from the filtered list, so
        # any out-of-vocabulary token (e.g. the leading "a") shifted all
        # following words onto the wrong weights.
        weight = movies_tfidf_list.get(word)
        if weight is None or word not in model.wv:
            continue
        movie_tfidf_vectors[i] += model.wv[word] * weight
        weight_total += weight

    # Leave the row at zero when no word carried a weight (avoids 0/0 -> NaN).
    if weight_total > 0:
        movie_tfidf_vectors[i] /= weight_total

100%|██████████| 194482/194482 [01:40<00:00, 1927.08it/s]


In [67]:
# Fit TF-IDF on the raw show descriptions. The vectorizer applies its own
# tokenization, which can differ from tokenize()/tokenize_join().
s_vectorizer = TfidfVectorizer()
tfidf_matrix = s_vectorizer.fit_transform(shows["description"])

# Map each vocabulary term to its inverse-document-frequency weight.
shows_tfidf_features = s_vectorizer.get_feature_names_out()
shows_tfidf_list = {
    term: idf
    for term, idf in zip(shows_tfidf_features, s_vectorizer.idf_)
}

In [68]:
# Initialize shows_tfidf_vectors: one row per show, as wide as the Word2Vec
# embeddings (taken from the model instead of a hard-coded 300).
shows_tfidf_vectors = np.zeros((len(shows), model2.wv.vector_size))

# Each document vector is the IDF-weighted average of its word vectors. A word
# contributes once per occurrence, so repeated words weigh more.
for i, description in tqdm(enumerate(all_sentences_tokenized_joined_shows), total=len(all_sentences_tokenized_joined_shows)):
    weight_total = 0.0

    for word in description.split():
        # Pair every word with its own weight. The previous version indexed
        # the unfiltered word list with a position from the filtered list, so
        # any out-of-vocabulary token (e.g. the leading "a") shifted all
        # following words onto the wrong weights.
        weight = shows_tfidf_list.get(word)
        if weight is None or word not in model2.wv:
            continue
        shows_tfidf_vectors[i] += model2.wv[word] * weight
        weight_total += weight

    # Leave the row at zero when no word carried a weight (avoids 0/0 -> NaN).
    if weight_total > 0:
        shows_tfidf_vectors[i] /= weight_total

100%|██████████| 17751/17751 [00:06<00:00, 2850.13it/s]


In [77]:
# Store one embedding array per row. list(...) splits each 2-D array into its
# rows, which are assigned positionally, so the arrays must have been built
# from the frames in their current row order.
movies["tfidf_embedding"] = list(movie_tfidf_vectors)
shows["tfidf_embedding"] = list(shows_tfidf_vectors)

### Counts

In [None]:
# Unweighted mean of the word vectors of each movie description.
movie_counts_vectors = np.zeros((len(all_sentences_tokenized_joined_movies), model.wv.vector_size))

for i, description in enumerate(tqdm(all_sentences_tokenized_joined_movies)):
    words = description.split()
    # Leave the row at zero for an empty description (avoids dividing by 0).
    if not words:
        continue

    desc_vec = np.zeros(model.wv.vector_size)
    for word in words:
        # Movie text must be embedded with the movie model (`model`). The
        # previous version looked words up in `model2`, the show model.
        desc_vec += model.wv[word]
    desc_vec /= len(words)

    movie_counts_vectors[i] = desc_vec

In [25]:
# Unweighted mean of the show-model word vectors of each show description.
n_shows = len(all_sentences_tokenized_joined_shows)
shows_counts_vectors = np.zeros((n_shows, 300))

for row, joined_tokens in enumerate(tqdm(all_sentences_tokenized_joined_shows)):
    tokens = joined_tokens.split()

    # Sum the vectors one token at a time, then divide by the token count.
    running_sum = np.zeros(300)
    for token in tokens:
        running_sum += model2.wv[token]
    running_sum /= len(tokens)

    shows_counts_vectors[row] = running_sum

100%|██████████| 17751/17751 [00:03<00:00, 4919.07it/s]


In [78]:
# Store one mean-vector embedding per row. list(...) splits each 2-D array
# into its rows, which are assigned positionally.
movies["count_embedding"] = list(movie_counts_vectors)
shows["count_embedding"] = list(shows_counts_vectors)

In [79]:
# Write both frames, including the embedding columns added above, to JSON
# files in the working directory.
movies.to_json("movies_we.json")
shows.to_json("shows_we.json")

## Doc2Vec Separate Models

In [28]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [29]:
# Doc2Vec training input: each token list is tagged with its position in the
# corpus, so tag i refers to the i-th tokenized description.
movie_documents = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(all_sentences_tokenized_movies)
]
show_documents = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(all_sentences_tokenized_shows)
]

In [30]:
# Train and persist the movie Doc2Vec model.
# NOTE(review): this rebinds `model`, which earlier cells use for the movie
# Word2Vec model; re-running those cells afterwards needs the Word2Vec reload.
model = Doc2Vec(
    movie_documents,
    vector_size=200,
    window=5,
    workers=6,
    epochs=150,
    hs=1,
)
model.save("doc2vec-cinema.model")


In [31]:
# Train and persist the show Doc2Vec model.
# NOTE(review): this rebinds `model2`, which earlier cells use for the show
# Word2Vec model; re-running those cells afterwards needs the Word2Vec reload.
model2 = Doc2Vec(
    show_documents,
    vector_size=200,
    window=5,
    workers=6,
    epochs=150,
    hs=1,
)
model2.save("doc2vec-show.model")

In [53]:
# Reload the Doc2Vec models from disk so the cells below can run without
# retraining; `model` is the movie model and `model2` the show model.
model= Doc2Vec.load("doc2vec-cinema.model")
model2= Doc2Vec.load("doc2vec-show.model")

In [54]:
# Trim both frames to the columns used from here on. The mean-vector embedding
# is kept; every other column, including `tfidf_embedding`, is dropped.
movie_columns = [
    "Title", "Year", "Genre", "Director", "Plot", "description",
    "Poster", "Ratings", "Language", "Type", "count_embedding",
]
show_columns = [
    "Title", "Year", "Genre", "Plot", "description",
    "Poster", "Ratings", "Language", "Type", "count_embedding",
]

movies = movies.loc[:, movie_columns]
shows = shows.loc[:, show_columns]

In [55]:
# Renumber rows 0..n-1 and discard the old index left over from filtering.
movies = movies.reset_index(drop=True)
shows = shows.reset_index(drop=True)