In [1]:
import gensim
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
import itertools
import json
import pickle
from nltk.tokenize import word_tokenize
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm



In [2]:
def load_json(file_path): 
    '''
    Load a json file into a dictionary

    Input: 
        - file_path: path to json file containing dataset

    Returns: 
        - data: the parsed contents of the file (whatever json.load produces
                for it; the callers in this notebook index it like a dictionary)
    '''
    # read explicitly as UTF-8 instead of relying on the platform's default
    # text encoding, which is not UTF-8 everywhere
    with open(file_path, 'r', encoding='utf-8') as f: 
        data = json.load(f)
    return data



In [3]:
def _model_vocabulary(model): 
    '''Returns the container used to test whether a word is known to the model'''
    # gensim >= 4 exposes the vocabulary as `key_to_index`; older releases use `vocab`
    vocabulary = getattr(model, 'key_to_index', None)
    if vocabulary is None: 
        vocabulary = model.vocab
    return vocabulary


def get_closest_tropes_to_keyword(keyword, word_to_trope, model, top_k = 5): 
    '''
    Returns the k tropes closest to the keyword 

    The words of word_to_trope that the model knows are ranked by their distance
    to the keyword. The trope lists of the top_k closest words are concatenated
    in that order (without de-duplication) and the first top_k tropes are returned.

    Inputs: 
        - keyword: the keyword to search
        - word_to_trope: dictionary mapping word to set of tropes containing the word
        - model: the pretrained gensim model
        - top_k: number of tropes to return

    Returns: 
        - trope_matches: list of at most k tropes that are closest to the keyword;
                         empty if the keyword, or every word of word_to_trope,
                         is missing from the model vocabulary
    '''
    vocabulary = _model_vocabulary(model)

    # check that keyword is in vocabulary 
    if keyword not in vocabulary: 
        print('`{}` not in model vocabulary, cannot enhance search with keyword'.format(keyword))
        return []

    # only words known to the model can be compared with the keyword
    all_words = [word for word in word_to_trope if word in vocabulary]

    # model.distances treats an empty word list as "compare against the whole
    # vocabulary", which would yield indices that do not belong to all_words
    if not all_words: 
        print('no trope words found in model vocabulary, cannot enhance search with keyword `{}`'.format(keyword))
        return []

    # compute distance between query and all trope words
    dists = model.distances(keyword, all_words)

    # sort by similarity in ascending order (0 = perfect similarity)
    sorted_indices = np.argsort(dists)
    sorted_keyword_match = [all_words[idx] for idx in sorted_indices[:top_k]]

    print('\ntop {} matches most similar to `{}`'.format(top_k, keyword))
    for word in sorted_keyword_match: 
        print('`{}` : {}'.format(word, word_to_trope[word]))

    # flatten the trope lists of the matched words, closest word first
    trope_matches = list(itertools.chain.from_iterable(word_to_trope[word] for word in sorted_keyword_match))[:top_k]
    print('\nenhancing search with : {}'.format(trope_matches))

    return trope_matches
    

In [4]:
def best_titles_by_tropes_enhanced(title, from_dataset, keyword, word_to_trope, vectorizer, to_tf_idf_matrix, model): 
    '''
    Scores every candidate title against a title + keyword query using tropes.

    The query is made of the tropes of the queried title plus the tropes that are
    closest to the keyword. The keyword contributes half as many tropes as the
    title has (eg. a title with 10 tropes is enhanced with the top 5 keyword tropes).

    Inputs: 
        - title: the title being queried
        - from_dataset: the dataset corresponding to the title (eg. book dataset if book title),
                        indexed by title to obtain that title's tropes
        - keyword: the keyword to search
        - word_to_trope: dictionary mapping word to set of tropes containing the word
        - vectorizer: object whose transform() turns the space-joined query tropes into a query vector
        - to_tf_idf_matrix: tf-idf representation of tropes associated with each title of desired return type
        - model: the pretrained gensim model

    Returns: 
        - similarity_scores: numpy array of similarity scores where the index in the array corresponds
                             to the index in the dataset of the media type being recommended
    '''
    tropes_of_title = from_dataset[title]
    print('tropes for title `{}` : {}'.format(title, tropes_of_title))

    # the keyword side of the query gets half as many tropes as the title side
    keyword_trope_count = len(tropes_of_title) // 2
    keyword_tropes = get_closest_tropes_to_keyword(keyword, word_to_trope, model, keyword_trope_count)

    # keyword tropes first, then the title's own tropes
    query_tropes = keyword_tropes + tropes_of_title
    print('\ntropes used for final query : {}'.format(query_tropes))

    # the vectorizer expects documents, so the tropes are joined into a single one
    query_document = ' '.join(query_tropes)
    query_vector = vectorizer.transform([query_document])

    # one cosine similarity per row of the tf-idf matrix
    return cosine_similarity(query_vector, to_tf_idf_matrix).flatten()


In [5]:
def print_results(title, keyword, similarity_scores, to_dataset, top_k_titles = 10): 
    '''
    Prints the best scoring titles for a query as (score, title) pairs, best first

    Inputs: 
        - title: the title that was queried (only used in the printed header)
        - keyword: the keyword that was queried (only used in the printed header)
        - similarity_scores: sequence of scores where position i belongs to the i-th key of to_dataset
        - to_dataset: dictionary whose keys are the candidate titles
        - top_k_titles: maximum number of titles to print

    Returns: 
        - None
    '''
    # sort the scores in descending order
    ranked_indices = np.argsort(similarity_scores)[::-1]
    
    # get list of titles
    to_titles = list(to_dataset.keys())

    print('\ntop {} most similar titles to `{}` by trope to keyword `{}` '.format(top_k_titles, title, keyword))

    # slicing stops cleanly when fewer than top_k_titles titles were scored,
    # where indexing with range(top_k_titles) would raise an IndexError
    for idx in ranked_indices[:max(top_k_titles, 0)]: 
        print((similarity_scores[idx], to_titles[idx]))
    

In [6]:
# load small word embedding model
# `model` is read as a module-level global by full_search.
# NOTE(review): despite the `.bin` name this goes through KeyedVectors.load rather
# than load_word2vec_format - presumably the file was written by gensim's own
# save(); confirm before swapping in a different embedding file
model = KeyedVectors.load("tbwb_model.bin")


In [7]:
# load book and movie datasets
# Both are indexed by title downstream (from_dataset[title] in
# best_titles_by_tropes_enhanced, to_dataset.keys() in print_results).
# Presumably each maps title -> list of trope names - verify against the json files
books = load_json('Literature_tropes_dataset3.json')
movies = load_json('Film_tropes_dataset3.json')


In [8]:
# load word to trope mappings
# (word -> tropes containing that word, the `word_to_trope` argument of
# get_closest_tropes_to_keyword; full_search picks the mapping of the dataset
# being recommended from)
book_word_to_trope = load_json('book_word_to_trope.json')
movie_word_to_trope = load_json('movie_word_to_trope.json')

In [9]:
# load vectorizers 
# NOTE: unpickling can execute arbitrary code - only load pickle files you trust.
# The `with` blocks close each file once it is read instead of leaking the handle.
with open("book_to_movie_vectorizer.pickle", "rb") as pickle_file: 
    book_to_movie_vectorizer = pickle.load(pickle_file)
with open("movie_to_book_vectorizer.pickle", "rb") as pickle_file: 
    movie_to_book_vectorizer = pickle.load(pickle_file)

In [10]:
# load tf-idf matrices
# NOTE: unpickling can execute arbitrary code - only load pickle files you trust.
# The `with` blocks close each file once it is read instead of leaking the handle.
with open("movie_tf_idf.pickle", "rb") as pickle_file: 
    movie_tf_idf = pickle.load(pickle_file)
with open("book_tf_idf.pickle", "rb") as pickle_file: 
    book_tf_idf = pickle.load(pickle_file)


In [11]:
def full_search(title, keyword, direction):
    '''
    FULL SEARCH ALGORITHM TO USE IN FINAL PROJECT. This returns a numpy array of similarity scores
    for a title + keyword query and prints the 10 best matching titles

    Relies on the module-level datasets, word to trope mappings, vectorizers,
    tf-idf matrices and embedding model loaded earlier in the notebook.

    Inputs: 
         - title: the title being queried
         - keyword: the keyword being queried
         - direction: either 'book to movie' or 'movie to book' 

    Returns: 
         - similarity_scores: numpy array of similarity scores where the index in the array 
                              corresponds to the index in the 'to' dataset (ie. if the query
                              direction was book to movie, the indices would correspond to the 
                              indices in the movie tropes dataset)

    Raises: 
         - ValueError: if direction is not one of the two supported values
         - KeyError: if title is not in the dataset being searched from
    '''
    if direction == 'book to movie': 
        from_dataset = books
        to_dataset = movies
        word_to_trope = movie_word_to_trope
        vectorizer = book_to_movie_vectorizer
        to_tf_idf_matrix = movie_tf_idf
    elif direction == 'movie to book': 
        from_dataset = movies
        to_dataset = books
        word_to_trope = book_word_to_trope
        vectorizer = movie_to_book_vectorizer
        to_tf_idf_matrix = book_tf_idf
    else: 
        # a mistyped direction used to be treated as 'movie to book' silently
        raise ValueError("direction must be 'book to movie' or 'movie to book', got {!r}".format(direction))
           
    similarity_scores = best_titles_by_tropes_enhanced(title, 
                                                       from_dataset, 
                                                       keyword, 
                                                       word_to_trope, 
                                                       vectorizer, 
                                                       to_tf_idf_matrix, 
                                                       model)
    
    print_results(title, keyword, similarity_scores, to_dataset, 10)

    return similarity_scores
    
    

In [12]:
# example query 
# looks up the book title in `books` and scores the movie side; tropes related to
# the keyword 'zombies' are added to the query. Per full_search's documentation,
# `scores` is indexed like the movie tropes dataset.
scores = full_search('Harry Potter and the Chamber of Secrets', 'zombies', 'book to movie')


top 101 matches most similar to `zombies`
`zombies` : ['NaziZombies', 'RoomFullOfZombies', 'EverythingsDeaderWithZombies', 'OurZombiesAreDifferent']
`zombie` : ['NoZombieCannibals', 'ZombieAdvocate', 'ZombieMooks', 'PlagueZombie', 'NinjaPirateZombieRobot', 'VoodooZombie', 'ArtificialZombie', 'NotAZombie', 'NinjaPirateRobotZombie', 'IncongruouslyDressedZombie', 'ZombieGait', 'TechnicallyLivingZombie', 'RevenantZombie', 'ZombieApocalypse', 'ZombiePukeAttack', 'ReviveKillsZombie', 'AndThenJohnWasAZombie', 'ZombieInfectee', 'AttractiveZombie', 'FriendlyZombie']
`vampires` : ['FriendlyNeighborhoodVampires', 'VampiresSleepInCoffins', 'VampiresOwnNightclubs', 'VampiresHarem', 'YourVampiresSuck', 'VampiresAreRich', 'OurVampiresAreDifferent', 'VampiresOwnNightClubs', 'VampiresHateGarlic', 'VampiresAreSexGods', 'FriendlyNeighbourhoodVampires']
`undead` : ['UndeadAuthor', 'TheUndead', 'UndeadChild', 'TurnUndead', 'UndeadTaxExemption', 'MistakenForUndead', 'UndeadBarefooter', 'BurnTheUndead', 'Sl