In [2]:
from bs4 import BeautifulSoup
import urllib
import csv
import re

# Load every row of the CSV into a dict that maps the cleaned-up movie
# title to its release year
movie_list = {}
with open('movie_list.csv', 'r') as csvfile:
    for row in csv.DictReader(csvfile):
        # Drop , : ; ! ? and - from the title; apostrophes (as in "'s") stay
        clean_title = re.sub(r'[,:;!?-]', '', row['movie_title'])
        movie_list[clean_title] = row['release_year']

print(movie_list)

{'12 Years a Slave': '2013', "Winter's Bone": '2010', 'Crouching Tiger Hidden Dragon': '2000', 'Mad Max Fury Road': '2015'}


In [3]:
def get_imdbid(movie_list):
    '''
    Retrieve IMDb IDs from movie list via OMDb API

    movie_list -- dict mapping each movie title to its release year
                  (both are sent to OMDb as the t= and y= query parameters)

    Returns a dict mapping each title to the imdbID attribute of the
    <movie> element in the XML reply. A title whose reply has no <movie>
    element, or whose <movie> has no imdbID, is left out of the result and
    reported through warnings.warn instead of aborting the whole lookup.
    '''
    import warnings

    imdbid = {}
    for title, year in movie_list.items():
        # Let urlencode build the query: spaces still become '+', and
        # characters such as "'" or '&' in a title are percent-escaped
        # instead of corrupting the query string
        query = urllib.urlencode([('t', title), ('y', year),
                                  ('plot', 'short'), ('r', 'xml')])
        response = urllib.urlopen('http://www.omdbapi.com/?' + query)
        try:
            r = response.read()
        finally:
            # Release the connection even if reading fails
            response.close()
        soup = BeautifulSoup(r, 'xml')
        # Extract IMDb IDs
        movie = soup.movie
        imdb_id = movie.get('imdbID') if movie is not None else None
        if imdb_id is None:
            warnings.warn('No IMDb ID found for %r (%s)' % (title, year))
            continue
        imdbid[title] = imdb_id
    return imdbid

# Resolve every title in the movie list to its IMDb ID and show the mapping
imdbid = get_imdbid(movie_list)
print(imdbid)

{'Mad Max Fury Road': u'tt1392190', '12 Years a Slave': u'tt2024544', "Winter's Bone": u'tt1399683', 'Crouching Tiger Hidden Dragon': u'tt0190332'}


In [40]:
def get_awards(imdbid, award_list, dataset=None):
    '''
    Scrape list of awards nominated and won for each movie title

    imdbid     -- dict mapping movie title to IMDb ID
    award_list -- award names to look for in the h3 headings of each
                  movie's IMDb awards page; names are matched literally
                  against a whole line of the heading text
    dataset    -- ignored (the result is always built from scratch); kept,
                  now optional, so existing three-argument calls still work

    Returns a new dict {title: {award: 0 or 1}}. Every award in award_list
    is present for every title; the value is 1 when the first <td><b> cell
    of the element following a matching heading reads "Won", else 0.
    '''
    # One compiled pattern per award, built once. re.escape keeps
    # punctuation in an award name from being read as regex syntax, and the
    # optional whitespace lets an indented heading line still match.
    patterns = [(award, re.compile(r'(?m)^\s*' + re.escape(award) + r'\s*$'))
                for award in award_list]

    results = {}
    for title, imdb_id in imdbid.items():
        # Every requested award starts at 0 and is flipped to 1 when won
        results[title] = dict.fromkeys(award_list, 0)
        response = urllib.urlopen('http://www.imdb.com/title/' + imdb_id + '/awards?ref_=tt_awd')
        try:
            r = response.read()
        finally:
            # Release the connection even if reading fails
            response.close()
        soup = BeautifulSoup(r, 'lxml')
        awards = soup.find("div", class_="article listo")
        if awards is None:
            # No awards container on the page: leave all awards at 0
            continue
        for a in awards.contents:
            # Only h3 tags contain the award title; skip everything else
            if getattr(a, 'name', None) != "h3":
                continue
            for award, pattern in patterns:
                if not a.find(string=pattern):
                    continue
                # Search award results in the heading's next sibling,
                # tolerating a missing sibling / td / b
                table = a.find_next_sibling()
                cell = table.td if table is not None else None
                result = cell.b if cell is not None else None
                # If award is won, record award = 1
                if result is not None and result.get_text() == "Won":
                    results[title][award] = 1
    return results
    
award_list = ["Screen Actors Guild Awards", "Directors Guild"]
# Pass a fresh dict as the third argument: no earlier cell shown here defines
# `dataset`, so passing the name itself fails on a freshly started kernel,
# and get_awards builds its result from scratch regardless of what is passed
dataset = get_awards(imdbid, award_list, {})
print(dataset)

{'12 Years a Slave': {'Screen Actors Guild Awards': 0, 'Directors Guild': 0}, "Winter's Bone": {'Screen Actors Guild Awards': 0, 'Directors Guild': 0}, 'Crouching Tiger Hidden Dragon': {'Screen Actors Guild Awards': 0, 'Directors Guild': 0}, 'Mad Max Fury Road': {'Screen Actors Guild Awards': 0, 'Directors Guild': 0}}


In [14]:
# Show the award flags recorded for a single title
print(dataset['Mad Max Fury Road'])

{'Screen Actors Guild Awards': 1}
