In [61]:
import gensim
from gensim.corpora import Dictionary
from gensim.models import LsiModel
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
import itertools
import json
from nltk.tokenize import word_tokenize
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [62]:
def load_json(file_path): 
    '''
    Load a json file into a dictionary

    Input: 
        - file_path: path to json file containing dataset

    Returns: 
        - data: the parsed JSON content (for the trope datasets loaded in this
                notebook this is expected to be a dictionary mapping titles
                to lists of tropes)
    '''
    # JSON text is UTF-8 by specification; be explicit so that titles/tropes with
    # non-ASCII characters do not depend on the platform's default encoding
    with open(file_path, 'r', encoding='utf-8') as f: 
        return json.load(f)



In [63]:
def build_representation_for_tropes(data): 
    '''
    Build a bag of words vocabulary from the words that make up trope names

    Inputs: 
        - data: dictionary mapping titles to list of tropes

    Returns: 
        - dictionary: a gensim Dictionary built from the lowercased trope words,
                      with one document per title
        - word_to_trope: mapping from individual (lowercased) word to the set of
                         tropes that contain the word
    '''
    # a trope name is split into pieces that each start at an uppercase letter,
    # e.g. 'PetTheDog' -> 'Pet', 'The', 'Dog'
    piece_pattern = re.compile('[A-Z][^A-Z]*')

    word_to_trope = {}
    corpus = []
    for tropes in data.values(): 
        words_for_title = []
        for trope in tropes: 
            pieces = [piece.lower() for piece in piece_pattern.findall(trope)]
            words_for_title += pieces
            for piece in pieces: 
                word_to_trope.setdefault(piece, set()).add(trope)
        corpus.append(words_for_title)

    print('{} titles'.format(len(corpus)))

    # the Dictionary is built over the per-title word lists
    return Dictionary(corpus), word_to_trope


In [64]:
def make_tf_idf(data): 
    '''
    Build a vectorizer and tf-idf matrix corresponding to a dataset

    Inputs: 
        - data: dictionary mapping titles to tropes
    
    Returns: 
        - vectorizer: the TfidfVectorizer fitted on the tropes of the dataset
        - tf_idf_matrix: a tf-idf matrix corresponding to the tropes in the dataset
                         (fit on one list of tropes per title, in the iteration
                         order of `data`)
    '''
    # every title is already a list of tropes, so the tokenizer is the identity and
    # each whole trope name is one token; lowercase = False keeps the trope names intact.
    # token_pattern = None states explicitly that the default regex is not used, which
    # avoids scikit-learn's "token_pattern will not be used" warning
    vectorizer = TfidfVectorizer(analyzer = 'word',
                                 tokenizer = lambda x : x, 
                                 token_pattern = None,
                                 lowercase = False)
    tf_idf_matrix = vectorizer.fit_transform(list(data.values()))

    return vectorizer, tf_idf_matrix
    

In [65]:
def get_closest_tropes_to_keyword(keyword, dictionary, word_to_trope, model, top_k = 5): 
    '''
    Returns the k tropes closest to the keyword 

    Inputs: 
        - keyword: the keyword to search
        - dictionary: dictionary of trope words in dataset to search against 
        - word_to_trope: dictionary mapping word to set of tropes containing the word
        - model: the pretrained gensim model
        - top_k: number of tropes to return

    Returns: 
        - trope_matches: list of at most k tropes that are closest to the keyword.
                         The tropes are gathered word by word (closest word first) and,
                         within one word, in alphabetical order so that the result is
                         the same on every run. An empty list is returned when the
                         keyword or all of the trope words are unknown to the model.
    '''
    # membership is tested on the model itself; gensim keyed vectors support
    # `word in model` both before and after the `vocab` attribute was removed
    if keyword not in model: 
        print('`{}` not in model vocabulary, cannot enhance search with keyword'.format(keyword))
        return []

    # only trope words the model has a vector for can be compared to the keyword
    all_words = [word for word in dictionary.values() if word in model]
    if not all_words: 
        # gensim's `distances` treats an empty word list as "compare against the whole
        # vocabulary", which would yield indices that do not belong to all_words
        print('no trope words in model vocabulary, cannot enhance search with keyword')
        return []

    # compute cosine distance between query and all trope words
    dists = model.distances(keyword, all_words)

    # sort by distance in ascending order (0 = perfect similarity); a stable sort
    # keeps tied words in dictionary order
    sorted_indices = np.argsort(dists, kind = 'stable')
    sorted_keyword_match = [all_words[idx] for idx in sorted_indices[:top_k]]

    print('\ntop {} matches most similar to `{}`'.format(top_k, keyword))
    for word in sorted_keyword_match: 
        # .get mirrors the membership guard used when collecting the tropes below
        print('`{}` : {}'.format(word, word_to_trope.get(word, set())))

    # sets have no defined order, so sort each word's tropes before truncating
    trope_matches = list(itertools.chain.from_iterable(
        sorted(word_to_trope.get(word, ())) for word in sorted_keyword_match))
    trope_matches = trope_matches[:top_k]
    print('\nenhancing search with : {}'.format(trope_matches))

    return trope_matches
    

In [66]:
def best_titles_by_tropes_enhanced(title, from_dataset, keyword, dictionary, word_to_trope, vectorizer, to_tf_idf_matrix, model): 
    '''
    Scores every title of the desired return type against a title + keyword query.
    The query is made of the tropes of the queried title plus the tropes most similar
    to the keyword. The number of keyword tropes requested is half the number of tropes
    of the queried title (eg. if the queried title has 10 tropes associated with it, then
    the top 5 tropes associated with the keyword are used to enhance the search. This
    value seems to result in a good balance)

    Inputs: 
        - title: the title being queried, must be a key of from_dataset
        - from_dataset: the dataset corresponding to the title (eg. book dataset if book title)
        - keyword: the keyword to search
        - dictionary: dictionary representation of trope words associated with each title of desired return type
        - word_to_trope: dictionary mapping word to set of tropes containing the word
        - vectorizer: vectorizer used to turn the query tropes into a tf-idf vector
                      (expected to be the one fitted together with to_tf_idf_matrix)
        - to_tf_idf_matrix: tf-idf representation of tropes associated with each title of desired return type
        - model: the pretrained gensim model
    
    Returns: 
        - similarity_scores: numpy array of similarity scores where the index in the array corresponds
                             to the index in the dataset of the media type being recommended
    '''
    # tropes of the queried title
    tropes_of_title = from_dataset[title]
    print('tropes for title `{}` : {}'.format(title, tropes_of_title))

    # ask for half as many keyword tropes as the title has tropes
    keyword_trope_count = len(tropes_of_title) // 2
    keyword_tropes = get_closest_tropes_to_keyword(keyword, dictionary, word_to_trope, model, keyword_trope_count)

    # the final query lists the keyword tropes first, then the title's own tropes
    query_tropes = keyword_tropes + tropes_of_title
    print('\ntropes used for final query : {}'.format(query_tropes))

    # vectorize the query as a single document and compare it with every title
    return cosine_similarity(vectorizer.transform([query_tropes]), to_tf_idf_matrix).flatten()


In [67]:
def print_results(title, keyword, similarity_scores, to_dataset, top_k_titles = 10): 
    '''
    Print the best scoring titles for a query as (score, title) tuples, best first

    Inputs: 
        - title: the title that was queried (only used in the printed header)
        - keyword: the keyword that was queried (only used in the printed header)
        - similarity_scores: numpy array of similarity scores, where index i scores
                             the i-th key of to_dataset
        - to_dataset: dictionary whose keys are the titles that were scored
        - top_k_titles: maximum number of titles to print; fewer are printed when
                        there are fewer scores

    Returns: 
        - None, the results are printed
    '''
    # sort the scores in descending order
    ranked_indices = np.argsort(similarity_scores)[::-1]
    
    # get list of titles
    to_titles = list(to_dataset.keys())

    print('\ntop {} most similar titles to `{}` by trope to keyword `{}` '.format(top_k_titles, title, keyword))
    # slicing (instead of counting up to top_k_titles) cannot run past the end of the
    # ranking when top_k_titles is larger than the number of scores
    for idx in ranked_indices[:max(top_k_titles, 0)]: 
        print((similarity_scores[idx], to_titles[idx]))
    

In [68]:
# load pretrained word embeddings from a text (binary = False) word2vec-format file
# NOTE(review): limit=50000 presumably restricts loading to the first 50,000 vectors of
# the file to keep load time and memory down; keywords outside that subset are reported
# as "not in model vocabulary" by get_closest_tropes_to_keyword — confirm the cutoff is intended
model = KeyedVectors.load_word2vec_format('./app/irsystem/controllers/DatasetInfo/gensim_glove.6B.50d.txt', binary = False, limit=50000)


In [69]:
# load the film and literature trope datasets through the load_json helper defined
# earlier in this notebook, instead of repeating the open/json.load boilerplate
movies = load_json("app/irsystem/controllers/TVTropesScraper/Film/Film_tropes_dataset3.json")
books = load_json("app/irsystem/controllers/TVTropesScraper/Literature/Literature_tropes_dataset3.json")

In [70]:
# sanity check: number of entries loaded from the film dataset
len(movies)

4797

In [71]:
# create dictionary representations of datasets
# (each call also prints the number of titles it processed)
book_dictionary, book_word_to_trope = build_representation_for_tropes(books)
movie_dictionary, movie_word_to_trope = build_representation_for_tropes(movies)

2933 titles
4797 titles


In [72]:
# build book to movie tf-idf matrix: the vectorizer is fit on the movie tropes, and
# full_search uses this pair for the 'book to movie' direction
book_to_movie_vectorizer, movie_tf_idf = make_tf_idf(movies)
# shape check of the resulting matrix
print(movie_tf_idf.shape)


(4797, 23416)


In [73]:
# build movie to book tf-idf matrix: the vectorizer is fit on the book tropes, and
# full_search uses this pair for the movie to book direction
movie_to_book_vectorizer, book_tf_idf = make_tf_idf(books)
# shape check of the resulting matrix
print(book_tf_idf.shape)

(2933, 20195)


In [74]:
def full_search(title, keyword, direction):
    '''
    FULL SEARCH ALGORITHM TO USE IN FINAL PROJECT. This returns a numpy array of similarity scores
    for a title + keyword query. The datasets, dictionaries, vectorizers, tf-idf matrices and the
    embedding model are read from the notebook's global variables.

    Inputs: 
         - title: the title being queried
         - keyword: the keyword being queried
         - direction: either 'book to movie' or 'movie to book' (any value other than
                      'book to movie' is handled as movie to book)

    Returns: 
         - similarity_scores: numpy array of similarity scores where the index in the array 
                              corresponds to the index in the 'to' dataset (ie. if the query
                              direction was book to movie, the indices would correspond to the 
                              indices in the movie tropes dataset)
    '''
    searching_for_movies = (direction == 'book to movie')

    if not searching_for_movies: 
        # movie to book: the title comes from the movies, everything else describes the books
        from_dataset, dictionary, word_to_trope = movies, book_dictionary, book_word_to_trope
        vectorizer, to_tf_idf_matrix = movie_to_book_vectorizer, book_tf_idf
    else: 
        # book to movie: the title comes from the books, everything else describes the movies
        from_dataset, dictionary, word_to_trope = books, movie_dictionary, movie_word_to_trope
        vectorizer, to_tf_idf_matrix = book_to_movie_vectorizer, movie_tf_idf

    return best_titles_by_tropes_enhanced(title, from_dataset, keyword, dictionary,
                                          word_to_trope, vectorizer, to_tf_idf_matrix, model)
    
    

In [75]:
# example query: score every movie against a book title plus the keyword 'dogs'
# NOTE(review): the search helpers print every matched word together with its full set
# of tropes, so this cell produces a very long output — consider clearing it before sharing
scores = full_search('Harry Potter and the Chamber of Secrets', 'dogs', 'book to movie')


top 101 matches most similar to `dogs`
`dogs` : {'HypnosisProofDogs', 'DogsAreDumb', 'SledDogsThroughTheSnow', 'DogsHateSquirrels', 'AllAnimalsAreDogs', 'AllDogsArePurebred', 'YankTheDogsChain', 'HeroesLoveDogs', 'ACatInAGangOfDogs'}
`cats` : {'CatsAreMagic', 'CatsAreMean', 'CatsHaveNineLives', 'AllWitchesHaveCats', 'CatsHateWater', 'CatsAreSnarkers', 'CatsAreSuperior', 'TheInternetIsForCats'}
`dog` : {'PetTheDog', 'DogFoodDiet', 'RightHandAttackDog', 'BewareOfViciousDog', 'CatDogDichotomy', 'DogDiesAtTheEnd', 'HeroicDog', 'PostApocalypticDog', 'BigFriendlyDog', 'EatTheDog', 'PuppyDogEyes', 'PhotoOpWithTheDog', 'ADogNamedDog', 'KickTheDog', 'ADogAteMyHomework', 'MistakenForDog', 'MicrowaveTheDog', 'BadgesAndDogTags', 'LickedByTheDog', 'KicktheDog', 'INeedToGoIronMyDog', 'ShaggyDogStory', 'ShooTheDog', 'OldSchoolDogFight', 'IHaveToGoIronMyDog', 'ThrowTheDogABone', 'TheDogShotFirst', 'EvenTheDogIsAshamed', 'DogPileofDoom', 'RobotDog', 'ShootTheShaggyDog', 'ScareTheDog', 'DogStereotype',

`birds` : {'BirdsOfAFeather', 'AllFlyersAreBirds', 'NobodyHereButUsBirds', 'RocBirds'}
`snakes` : {'SnakesAreSinister', 'SnakesAreEvil', 'WhyDidItHaveToBeSnakes', 'SnakesAreSexy'}
`rabbits` : {'LuckyRabbitsFoot'}
`eating` : {'CheeseEatingSurrenderMonkeys', 'EatingMachine', 'EatingShoes', 'EatingPetFood', 'EroticEating', 'EatingLunchAlone', 'EatingTheEnemy', 'TypewriterEating', 'RegretEatingMe', 'PostStressOverEating', 'EatingTheEyeCandy', 'ManEatingPlant', 'EatingContest'}
`eat` : {'EatTheBomb', 'EatTheCamera', 'EatMe', 'EatTheDog', 'JustEatGilligan', 'YouAreWhoYouEat', 'EatMyDust', 'EatTheRich', 'IfYoureSoEvilEatThisKitten', 'CyberneticsEatYourSoul', 'EatDirtCheap', 'RealMenEatMeat', 'CityPeopleEatSushi', 'JustEatHim', 'EatBrainForMemories', 'DontEatAndSwim', 'ForgetsToEat', 'EatTheEvidence', 'IsItSomethingYouEat', 'CyberneticsWillEatYourSoul', 'SapientEatSapient'}
`elephants` : {'PinkElephants', 'WarElephants', 'ElephantsChild'}
`insects` : {'HumansAreInsects'}
`pet` : {'PetTheDog', 

`them` : {'BeatThemAtTheirOwnGame', 'MustLetThemGetAway', 'GottaCatchThemAll', 'CantLiveWithThemCantLiveWithoutThem', 'IWantThemAlive', 'AliensMadeThemDoIt', 'NotWithThemForTheMoney', 'MarryThemAll', 'OnlyTheAuthorCanSaveThemNow', 'BuyThemOff', 'DumpThemAll', 'LetThemDieHappy', 'OneMythToExplainThemAll', 'BlindWithoutThem', 'KickThemWhileTheyAreDown', 'MakeThemRot', 'MakeAnExampleOfThem', 'SockItToThem', 'GivingThemTheStrip', 'YouSaidYouWouldLetThemGo', 'UnhandThemVillain', 'AProtagonistShallLeadThem', 'AllOfThem', 'KnowWhenToFoldThem', 'OneJudgeToRuleThemAll', 'AChildShallLeadThem', 'FairestOfThemAll', 'ScrewTheRulesIMakeThem', 'BreakThemByTalking', 'KillThemAll', 'ISeeThemToo', 'JustToyingWithThem', 'SeizeThem', 'PutThemAllOutOfMyMisery', 'TheyDontMakeThemLikeTheyUsedTo', 'HowManyAllOfThem', 'GottaKillThemAll', 'ScrewTheRulesTheyBrokeThemFirst', 'ThrowTheBookAtThem'}
`kill` : {'YouKillItYouBoughtIt', 'RailingKill', 'CradlingYourKill', 'ThereIsNoKillLikeOverkill', 'YouCanNotKillAnIdea

`dying` : {'DyingMomentOfAwesome', 'DyingAlone', 'DyingClue', 'DyingToBeReplaced', 'DyingAsYourself', 'DyingDealUpgrade', 'DyingCurse', 'DyingCandle', 'DyingDream', 'YouSeeImDying', 'DyingTruce', 'DyingDeclarationOfHate', 'VillainsDyingGrace', 'DyingForSymbolism', 'LikeYouWereDying', 'SorryThatImDying', 'DyingMomentofAwesome', 'DyingTown', 'DyingMessage', 'DyingRace', 'WithMyDyingBreathISummonYou', 'IAmDyingPleaseTakeMyMacguffin', 'TheDyingWalk', 'MistakenForDying', 'ImDyingPleaseTakeMyMacGuffin', 'DyingDeclarationOfLove', 'DyingSpeech', 'SecretlyDying'}
`frogs` : {'FreeTheFrogs'}
`alligator` : {'ApothecaryAlligator', 'BigLippedAlligatorMoment'}
`monkey` : {'GreaseMonkey', 'CuriousAsAMonkey', 'IronButtMonkey', 'MonkeyMoralityPose', 'KillerSpaceMonkey', 'PowersuitMonkey', 'CymbalBangingMonkey', 'WeNamedTheMonkeyJack', 'ButtMonkey'}
`flies` : {'WingedSoulFliesOffAtDeath', 'CharactersDroppingLikeFlies', 'FliesEqualsEvil', 'IfItSwimsItFlies'}
`mad` : {'ReluctantMadScientist', 'MadLibThrill



In [76]:
# example to print nice list of ranked results: the 10 best scoring movies for the
# query above, as (score, title) tuples in descending order of score
print_results('Harry Potter and the Chamber of Secrets', 'dogs', scores, movies, 10)


top 10 most similar titles to `Harry Potter and the Chamber of Secrets` by trope to keyword `dogs` 
(0.10132777456717702, 'Homeward Bound: The Incredible Journey')
(0.10051968461980247, 'Cats and Dogs')
(0.08335459540398535, 'The Secret Life of Pets')
(0.0710858068850421, 'Monsters, Inc.')
(0.06768822375750554, 'Okja')
(0.06765832591624081, 'The Aristocats')
(0.0653700663892429, 'Arachnophobia')
(0.06426243544283632, 'Beethoven')
(0.059512484121812266, 'Harry Potter and the Chamber of Secrets')
(0.058462709392773045, 'Puss in Boots')


In [38]:
# sanity check: one similarity score per entry of the movie dataset is expected
# NOTE(review): this cell's execution count is out of sequence with the cells above;
# re-run the notebook top to bottom before relying on the displayed value
len(scores)

4797