In [1]:
from html.parser import HTMLParser
import re
import sys
import os
import string
from urllib import request, error
import json
import unidecode

class TvtropesHTMLParser(HTMLParser):
    """Collect links to known horror tropes from the list items of a page.

    While the parser is inside the element whose first attribute value is
    ``'page-content'`` and inside an ``<li>``, any tag whose first attribute
    value is ``'twikilink'`` is inspected: its second attribute value is
    taken as the link target, and if that target contains ``"Main"`` and its
    last path component is a known horror trope, the pair is stored in
    ``trope_dict`` as ``{trope_name: href}``.

    Attributes are matched by *position* (first / second attribute), not by
    name, so the markup is presumably expected to look like
    ``<a class="twikilink" href="...">`` -- verify against the live pages.
    """

    # Default JSON file; only its top-level keys (the trope names) are used.
    DEFAULT_HORROR_TROPES_PATH = '/Users/adampaquette/Documents/CS 499 (Network Analysis)/networks-sp/tropes/HorrorTropes.json'

    def __init__(self, horror_tropes_path=None):
        """Create the parser and load the list of known horror tropes.

        Args:
            horror_tropes_path: path of a JSON object whose keys are trope
                names. Defaults to ``DEFAULT_HORROR_TROPES_PATH``.

        Raises:
            FileNotFoundError: if the JSON file does not exist.
        """
        super().__init__()
        self.ul_flag = False
        self.li_flag = False            # True while inside an <li> of the content div
        self.a_flag = False
        self.content_div_flag = False   # True while inside the 'page-content' div
        self.prog = re.compile(r"Main")
        if horror_tropes_path is None:
            horror_tropes_path = self.DEFAULT_HORROR_TROPES_PATH
        # Context manager so the file handle is closed once the JSON is loaded.
        with open(horror_tropes_path, 'r') as tropes_file:
            self.horror_tropes = list(json.load(tropes_file).keys())
        self.total_div = 0              # current <div> nesting depth
        self.content_div_seen = -1      # depth at which the content div was opened
        self.trope_dict = {}

    def handle_starttag(self, tag, attrs):
        """Update the state flags and record a trope link when one is seen."""
        # Value of the first attribute, or None for a tag without attributes.
        first_value = attrs[0][1] if attrs else None

        # Links are checked against the flags as they were *before* this tag.
        if self.li_flag and self.content_div_flag:
            if first_value == 'twikilink' and len(attrs) > 1:
                href = attrs[1][1]
                # href is None for a valueless attribute; skip instead of crashing.
                if href and self.prog.search(href):
                    trope = os.path.basename(href)
                    if trope in self.horror_tropes:
                        self.trope_dict[trope] = href

        if tag == 'div':
            self.total_div += 1

        if tag == 'a' and not self.a_flag:
            self.a_flag = not self.a_flag

        # Toggling (rather than setting) means a nested <li> flips the flag
        # off again until that nested item is closed.
        if tag == 'li' and self.content_div_flag:
            self.li_flag = not self.li_flag

        if tag == 'div' and first_value == 'page-content':
            self.content_div_flag = not self.content_div_flag
            self.content_div_seen = self.total_div

    def handle_endtag(self, tag):
        """Undo the flag changes made by the matching start tag."""
        if tag == 'div':
            self.total_div -= 1

        if tag == 'li' and self.content_div_flag:
            self.li_flag = not self.li_flag
            self.a_flag = not self.a_flag

        # Depth dropped below the level of the content div: we have left it.
        if self.total_div < self.content_div_seen and self.content_div_flag:
            self.content_div_flag = False
            
class ImdbHTMLParser(HTMLParser):
    """Build a ``{normalised_title: rank}`` mapping from a result listing.

    The parser becomes active when it meets a tag whose first attribute
    value is ``'lister-list'``. Inside that region:

    * every ``<img>`` sets ``current_title`` from its first attribute value
      (presumably the ``alt`` text -- verify against the page markup);
    * a tag whose first attribute value is ``'sort-num_votes-visible'``
      opens a window that lasts until the next ``</p>``;
    * inside that window, a ``<span>`` whose first attribute value is
      ``'ghost'`` stores ``current_title`` with the current rank and
      increments the rank.

    Attributes are matched by position (first attribute), not by name.
    """

    # Matches one ASCII punctuation character; compiled once for all titles.
    _PUNCTUATION_RE = re.compile('[' + re.escape(string.punctuation) + ']')

    def __init__(self, rating_start):
        """Create the parser.

        Args:
            rating_start: rank assigned to the first movie that is recorded;
                each further movie gets the next integer.
        """
        super().__init__()
        self.div_lister_flag = False
        self.ghost_flag = False  # not read by the handlers; kept for compatibility
        self.sort_num_votes_visible_flag = False
        self.div_lister_seen = -1   # <div> depth recorded when the listing opened
        self.total_div = 0          # current <div> nesting depth
        self.rating = rating_start
        self.movie_dict = {}
        self.current_title = None

    @classmethod
    def _normalise_title(cls, raw_title):
        """Title-case, drop spaces and punctuation, then transliterate to ASCII."""
        squashed = raw_title.title().replace(' ', '')
        return unidecode.unidecode(cls._PUNCTUATION_RE.sub('', squashed))

    def handle_starttag(self, tag, attrs):
        """Track the listing region and record a movie when its marker is seen."""
        # Value of the first attribute, or None for a tag without attributes.
        first_value = attrs[0][1] if attrs else None

        # The depth is captured *before* this tag is counted below.
        if first_value == 'lister-list':
            self.div_lister_flag = True
            self.div_lister_seen = self.total_div

        if tag == 'div':
            self.total_div += 1

        if self.div_lister_flag:
            # Skip images with no usable first attribute instead of raising.
            if tag == 'img' and isinstance(first_value, str):
                self.current_title = self._normalise_title(first_value)

            if first_value == 'sort-num_votes-visible':
                self.sort_num_votes_visible_flag = True

        if tag == 'span' and self.sort_num_votes_visible_flag and first_value == 'ghost':
            self.movie_dict[self.current_title] = self.rating
            self.rating += 1

    def handle_endtag(self, tag):
        """Close the vote window on ``</p>`` and the listing once its depth is left."""
        if tag == 'div':
            self.total_div -= 1

        if tag == 'p' and self.sort_num_votes_visible_flag:
            self.sort_num_votes_visible_flag = False

        if self.total_div < self.div_lister_seen and self.div_lister_flag:
            self.div_lister_flag = False

In [2]:
def get_tropes(path, movie):
    """Scrape the horror tropes of one film and cache them as JSON.

    Writes ``path + movie + '.json'`` containing ``{trope_name: href}``.
    If that file already exists, or the page request fails with an HTTP
    error, a message is printed and nothing is written.

    Args:
        path: directory prefix for the output file; it is concatenated
            directly, so it must end with a path separator.
        movie: page name appended to the TV Tropes ``Film/`` URL; also the
            stem of the output file name.

    Returns:
        None.

    Raises:
        urllib.error.URLError: for failures other than ``HTTPError``
            (only ``HTTPError`` is handled here).
    """
    cache_file = path + movie + '.json'

    # Already scraped on an earlier run: skip the download.
    if os.path.isfile(cache_file):
        print('FOUND', movie + '.json')
        return

    url = 'http://tvtropes.org/pmwiki/pmwiki.php/Film/'

    try:
        # Read the response directly so no temporary file or open handle is
        # left behind.
        with request.urlopen(url + movie) as response:
            raw_html = response.read()
    except error.HTTPError:
        print("Unable to find", movie, "on tvtropes.")
        return

    # NOTE(review): the page is decoded as mac_roman, as in the original
    # code -- confirm this matches the encoding the site actually serves.
    movie_html = raw_html.decode('mac_roman')

    parser = TvtropesHTMLParser()
    parser.feed(movie_html)

    movie_tropes = parser.trope_dict

    with open(cache_file, 'w') as file:
        json.dump(movie_tropes, file)
            
def get_movies(url, rating_start):
    """Download a listing page and return the movies found on it.

    Args:
        url: address of the page to fetch; its body is decoded as UTF-8.
        rating_start: rank given to the first movie on the page; later
            movies receive consecutive integers.

    Returns:
        dict mapping each normalised title to its rank, as collected by
        ``ImdbHTMLParser``.
    """
    # Read the response directly so no temporary file or open handle is
    # left behind.
    with request.urlopen(url) as response:
        movie_html = response.read().decode('utf-8')

    parser = ImdbHTMLParser(rating_start)
    parser.feed(movie_html)

    return parser.movie_dict

In [3]:
# Output directory -- change this to suit your own machine.
path = '/Users/adampaquette/Documents/CS 499 (Network Analysis)/networks-sp/tropes/twentytens/'

# First and last release year of the search window.
start, end = '2010', '2019'
period = '%2C'.join((start, end))
page = ','.join((start, end)) + '&page=2&ref_=adv_nxt'

# url1 is the first results page; url2 (second page) is built here but is
# not fetched below.
url1, url2 = (
    'http://www.imdb.com/search/title?genres=horror&sort=boxoffice_gross_us&title_type=feature&year=' + query
    for query in (period, page)
)

# Ranks start at 1 for the first result.
movies = dict(get_movies(url1, 1))
print(movies)

# Persist the title -> rank mapping next to the per-movie trope files.
with open(path + 'movie_list.json', 'w') as file:
    file.write(json.dumps(movies))

{'It': 1, 'WorldWarZ': 2, 'GetOut': 3, 'HotelTransylvania2': 4, 'HotelTransylvania': 5, 'Split': 6, 'TheConjuring': 7, 'ParanormalActivity3': 8, 'TheConjuring2': 9, 'AnnabelleCreation': 10, 'DonTBreathe': 11, 'ParanormalActivity2': 12, 'Annabelle': 13, 'InsidiousChapter2': 14, 'Goosebumps': 15, 'DarkShadows': 16, 'ThePurgeElectionYear': 17, 'AlienCovenant': 18, 'BooAMadeaHalloween': 19, 'ThePurgeAnarchy': 20, '10CloverfieldLane': 21, 'Mama': 22, 'InsidiousTheLastKey': 23, 'LightsOut': 24, 'AQuietPlace': 25, 'WarmBodies': 26, 'TheVisit': 27, 'ThePurge': 28, 'TheShapeOfWater': 29, 'ANightmareOnElmStreet': 30, 'UnderworldAwakening': 31, 'TheWolfman': 32, 'ResidentEvilAfterlife': 33, 'DraculaUntold': 34, 'HanselGretelWitchHunters': 35, 'HappyDeathDay': 36, 'TheShallows': 37, 'TheWomanInBlack': 38, 'EvilDead': 39, 'Insidious': 40, 'ParanormalActivity4': 41, 'TheDevilInside': 42, 'InsidiousChapter3': 43, 'Ouija': 44, 'TheDarkTower': 45, 'ThePossession': 46, 'Sinister': 47, 'Poltergeist': 48,

In [4]:
# Download (or skip, when already cached) the tropes of every listed movie.
for movie in movies:
    get_tropes(path, movie)

FOUND It.json
FOUND WorldWarZ.json
FOUND GetOut.json
FOUND HotelTransylvania2.json
FOUND HotelTransylvania.json
FOUND Split.json
FOUND TheConjuring.json
FOUND ParanormalActivity3.json
FOUND TheConjuring2.json
FOUND AnnabelleCreation.json
FOUND DonTBreathe.json
FOUND ParanormalActivity2.json
FOUND Annabelle.json
FOUND InsidiousChapter2.json
FOUND Goosebumps.json
FOUND DarkShadows.json
FOUND ThePurgeElectionYear.json
FOUND AlienCovenant.json
Unable to find BooAMadeaHalloween on tvtropes.
FOUND ThePurgeAnarchy.json
Unable to find 10CloverfieldLane on tvtropes.
FOUND Mama.json
Unable to find InsidiousTheLastKey on tvtropes.
FOUND LightsOut.json
FOUND AQuietPlace.json
FOUND WarmBodies.json
FOUND TheVisit.json
FOUND ThePurge.json
FOUND TheShapeOfWater.json
FOUND ANightmareOnElmStreet.json
FOUND UnderworldAwakening.json
FOUND TheWolfman.json
FOUND ResidentEvilAfterlife.json
FOUND DraculaUntold.json
Unable to find HanselGretelWitchHunters on tvtropes.
FOUND HappyDeathDay.json
FOUND TheShallows