## Scrape Validation Dataset (2015 movies)

In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
import re
from tqdm import tqdm
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

In [2]:
newlist = pd.read_csv("new_movies.csv")
newlist = newlist.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)
newlist.head()

Unnamed: 0,titles_new,wiki_url_new,imdb_link_new,rt_link_new,imdb_id
0,Pinnokam,/wiki/Pinnokam,,,
1,20 Once Again,/wiki/20_Once_Again,http://www.imdb.com/title/tt4344878/,,tt4344878
2,Blackhat,/wiki/Blackhat_(film),http://www.imdb.com/title/tt2717822/,http://www.rottentomatoes.com/m/blackhat/,tt2717822
3,I,/wiki/I_(film),http://www.imdb.com/title/tt2302966/,,tt2302966
4,Wild Card,/wiki/Wild_Card_(2015_film),http://www.imdb.com/title/tt2231253/,http://www.rottentomatoes.com/m/heat_2013/,tt2231253


In [3]:
newlist = newlist[newlist.imdb_id.isnull()!=1]
print "Number of new movies in 2015:", len(newlist)
#newlist[1:10]

Number of new movies in 2015: 305


In [4]:
### Need to run this block of code before reading them all from api...
import omdb
import json
trash = []
req = omdb.request(i=newlist.imdb_id.values[1], type='movie', tomatoes=True)
temp = json.loads(req.text)
trash.append(temp)
req = omdb.request(i=newlist.imdb_id.values[2], type='movie', tomatoes=True)
temp = json.loads(req.text)
trash.append(temp)
len(trash)
pd.DataFrame(trash)

Unnamed: 0,Actors,Awards,BoxOffice,Country,DVD,Director,Genre,Language,Metascore,Plot,Poster,Production,Rated,Released,Response,Runtime,Title,Type,Website,Writer,Year,imdbID,imdbRating,imdbVotes,tomatoConsensus,tomatoFresh,tomatoImage,tomatoMeter,tomatoRating,tomatoReviews,tomatoRotten,tomatoUserMeter,tomatoUserRating,tomatoUserReviews
0,"Chris Hemsworth, Leehom Wang, Wei Tang, Viola ...",1 nomination.,$7.8M,USA,12 May 2015,Michael Mann,"Action, Crime, Drama","English, Mandarin, Spanish",51.0,A furloughed convict and his American and Chin...,http://ia.media-imdb.com/images/M/MV5BMTg1NDUy...,Universal Studios,R,16 Jan 2015,True,133 min,Blackhat,movie,http://www.blackhatthemovie.com/,Morgan Davis Foehl,2015,tt2717822,5.4,32017,"Thematically timely but dramatically inert, Bl...",53.0,rotten,34.0,4.9,157.0,104.0,24,2.5,19310
1,"'Chiyaan' Vikram, Amy Jackson, Suresh Gopi, Up...",,,India,,S. Shankar,"Action, Romance, Thriller","Tamil, Hindi, Telugu",,"Lingesan, a deformed hunchback who was once a ...",http://ia.media-imdb.com/images/M/MV5BMTQyNDk3...,Aascar Films,,14 Jan 2015,True,170 min,i,movie,,"A.N. Balakrishnan (dialogue), S. Shankar (stor...",2015,tt2302966,7.7,11351,,,,,,,,55,3.3,139


In [5]:
import requests
import json

movieslist = []
movienotfound = []
i = 0
for t in tqdm(newlist.imdb_id.values, leave=True):
    try:
        time.sleep(0.5)
        #url = "http://www.omdbapi.com/?t={}&y=&plot=full&r=json".format(t)
        #temp = json.loads(requests.get(url).text)
        req = omdb.request(i=t, type='movie', tomatoes=True)
        temp = json.loads(req.text)
        if str(temp['Response'])== 'False':
            movienotfound.append(t)
        if str(temp['Response'])== 'True':
            movieslist.append(temp)
    except: 
        movienotfound.append(t)      

        pass

100%|██████████| 305/305 [03:27<00:00,  1.56it/s]


In [6]:
print "Number of movies not found:", len(movieslist) - len(newlist)

Number of movies not found: 0


In [7]:
movies = pd.DataFrame(movieslist)
#print movies.Awards[1]
print "Make sure all the movie type is 'movie':", movies[movies.Type!='movie'].shape[0]
print "Make sure all the movie is found in the api portal:", movies[movies.Response=='False'].shape[0]

## remove games, series and NA types.
movies = movies[movies.Type=='movie']
movies = movies[movies.Response=='True']

def runtime(s):
    ss = s.split()
    if len(ss) > 1 and int(ss[0]) > 60:
        return int(ss[0])
    else:
        return None
    
movies['Runtime'] = movies.apply(lambda r: runtime(r.Runtime), axis=1)

Make sure all the movie type is 'movie': 0
Make sure all the movie is found in the api portal: 0


We need to extract the correspondeing information from award and put them into individual columns.

In [8]:
def numNomiOscar(s):
    re1='(Nominated)'
    re2='.*?'
    re3='(\\d+)'
    re4='.*?'
    re5='(Oscar)'
    re6='.*?'

    rg = re.compile(re1+re2+re3+re4+re5+re6,re.IGNORECASE|re.DOTALL)
    
    m = rg.search(s)
    if m:
        return int(m.group(2))
    else:
        return 0

def numWonOscar(s):
    re1='(Won)'
    re2='.*?'
    re3='(\\d+)'
    re4='.*?'
    re5='(Oscar)'
    re6='.*?'
    
    rg = re.compile(re1+re2+re3+re4+re5+re6,re.IGNORECASE|re.DOTALL)
    m = rg.search(s)
    if m:
        return int(m.group(2))
    else:
        return 0
    
def anotherWins(s):
    re1='(Another)'
    re2='.*?'
    re3='(\\d+)'
    re4='.*?'
    re5='(win)'
    re6='.*?'
    
    rg = re.compile(re1+re2+re3+re4+re5+re6,re.IGNORECASE|re.DOTALL)
    m = rg.search(s)
    if m:
        return int(m.group(2))
    else:
        return 0

def anotherNoms(s):
    re1='.*?'	# Non-greedy match on filler
    re2='\\d+'	# Uninteresting: int
    re3='.*?'	# Non-greedy match on filler
    re4='\\d+'	# Uninteresting: int
    re5='.*?'	# Non-greedy match on filler
    re6='(\\d+)'	# Integer Number 1
    re7='.*?'	# Non-greedy match on filler
    re8='(nomination)'	# Word 1

    rg = re.compile(re1+re2+re3+re4+re5+re6+re7+re8,re.IGNORECASE|re.DOTALL)
    m = rg.search(s)
    if m:
        return int(m.group(1))
    else:
        return 0


In [9]:
movies['Nominated Oscars'] = movies.apply(lambda r: numNomiOscar(r.Awards), axis=1)
movies['Won Oscars'] = movies.apply(lambda r: numWonOscar(r.Awards), axis=1)
movies['Other wins'] = movies.apply(lambda r: anotherWins(r.Awards), axis=1)
movies['Other nominations'] = movies.apply(lambda r: anotherNoms(r.Awards), axis=1)
import datetime
from dateutil import parser
movies['Released'] = movies.apply(lambda r: parser.parse(r.Released) if r.Released != 'N/A' else False, axis=1)
movies = movies.drop(['Plot', 'Poster', 'Writer', 'Type', 'Response'], axis=1)
movies = movies
#cols = list(movies.columns.values)
#print cols
movies = movies[[u'imdbID', u'Title', u'Released', u'Awards', u'Director', u'Actors', u'Country', u'Genre', u'Language',
                 u'Rated', u'Runtime', u'Year', u'Metascore', u'imdbRating', u'imdbVotes', u'tomatoConsensus', u'tomatoFresh',
                 u'tomatoMeter', u'tomatoRating', u'tomatoReviews', u'tomatoRotten', u'tomatoUserMeter',
                 u'tomatoUserRating', u'tomatoUserReviews', 'Nominated Oscars', 'Won Oscars', 'Other wins', 'Other nominations']]  

Note that the **Actors** and **Director** field may contain more than one person, and soon we would like to incorprate information of actors and directors to build new quantitative feature to measure the qualaity of a movie, so it will be more handy if we could explode the actor list and director list so that we would have multiple rows of under the same movie title but the actor and director field is different.

In [10]:
rows = []
_ = movies.apply(lambda row: [rows.append([row['imdbID'], row['Title'], row['Released'], row['Awards'], row['Director'],
                                           nn.strip(), row['Country'], row['Language'], row[u'Genre'],row['Rated'],
                                           row['Runtime'], row['Year'], row['Metascore'], row['imdbRating'],
                                           row['imdbVotes'], row['tomatoConsensus'], row['tomatoFresh'], row['tomatoMeter'],
                                           row['tomatoRating'], row['tomatoReviews'], row['tomatoRotten'], row['tomatoUserMeter'],
                                           row['tomatoUserRating'], row['tomatoUserReviews'], row['Nominated Oscars'],
                                           row['Won Oscars'], row['Other wins'], row['Other nominations']]) 
                         for nn in row.Actors.strip().split(',')], axis=1)
#rows
movies = pd.DataFrame(rows, columns=movies.columns)
movies

Unnamed: 0,imdbID,Title,Released,Awards,Director,Actors,Country,Genre,Language,Rated,Runtime,Year,Metascore,imdbRating,imdbVotes,tomatoConsensus,tomatoFresh,tomatoMeter,tomatoRating,tomatoReviews,tomatoRotten,tomatoUserMeter,tomatoUserRating,tomatoUserReviews,Nominated Oscars,Won Oscars,Other wins,Other nominations
0,tt4344878,Miss Granny,2015-01-16 00:00:00,1 win & 2 nominations.,Leste Chen,Zishan Yang,"China, South Korea",Mandarin,Comedy,,131,2015,,6.8,473,,,,,,,89,4.3,86,0,0,0,0
1,tt4344878,Miss Granny,2015-01-16 00:00:00,1 win & 2 nominations.,Leste Chen,Ya-Lei Kuei,"China, South Korea",Mandarin,Comedy,,131,2015,,6.8,473,,,,,,,89,4.3,86,0,0,0,0
2,tt4344878,Miss Granny,2015-01-16 00:00:00,1 win & 2 nominations.,Leste Chen,Bo-lin Chen,"China, South Korea",Mandarin,Comedy,,131,2015,,6.8,473,,,,,,,89,4.3,86,0,0,0,0
3,tt4344878,Miss Granny,2015-01-16 00:00:00,1 win & 2 nominations.,Leste Chen,Deshun Wang,"China, South Korea",Mandarin,Comedy,,131,2015,,6.8,473,,,,,,,89,4.3,86,0,0,0,0
4,tt2717822,Blackhat,2015-01-16 00:00:00,1 nomination.,Michael Mann,Chris Hemsworth,USA,"English, Mandarin, Spanish","Action, Crime, Drama",R,133,2015,51,5.4,32017,"Thematically timely but dramatically inert, Bl...",53,34,4.9,157,104,24,2.5,19310,0,0,0,0
5,tt2717822,Blackhat,2015-01-16 00:00:00,1 nomination.,Michael Mann,Leehom Wang,USA,"English, Mandarin, Spanish","Action, Crime, Drama",R,133,2015,51,5.4,32017,"Thematically timely but dramatically inert, Bl...",53,34,4.9,157,104,24,2.5,19310,0,0,0,0
6,tt2717822,Blackhat,2015-01-16 00:00:00,1 nomination.,Michael Mann,Wei Tang,USA,"English, Mandarin, Spanish","Action, Crime, Drama",R,133,2015,51,5.4,32017,"Thematically timely but dramatically inert, Bl...",53,34,4.9,157,104,24,2.5,19310,0,0,0,0
7,tt2717822,Blackhat,2015-01-16 00:00:00,1 nomination.,Michael Mann,Viola Davis,USA,"English, Mandarin, Spanish","Action, Crime, Drama",R,133,2015,51,5.4,32017,"Thematically timely but dramatically inert, Bl...",53,34,4.9,157,104,24,2.5,19310,0,0,0,0
8,tt2302966,i,2015-01-14 00:00:00,,S. Shankar,'Chiyaan' Vikram,India,"Tamil, Hindi, Telugu","Action, Romance, Thriller",,170,2015,,7.7,11351,,,,,,,55,3.3,139,0,0,0,0
9,tt2302966,i,2015-01-14 00:00:00,,S. Shankar,Amy Jackson,India,"Tamil, Hindi, Telugu","Action, Romance, Thriller",,170,2015,,7.7,11351,,,,,,,55,3.3,139,0,0,0,0


In [11]:
rows = []
_ = movies.apply(lambda row: [rows.append([row['imdbID'], row['Title'], row['Released'], row['Awards'], nn.strip(),
                                           row['Actors'], row['Country'], row['Language'], row[u'Genre'],row['Rated'],
                                           row['Runtime'], row['Year'], row['Metascore'], row['imdbRating'],
                                           row['imdbVotes'], row['tomatoConsensus'], row['tomatoFresh'], row['tomatoMeter'],
                                           row['tomatoRating'], row['tomatoReviews'], row['tomatoRotten'], row['tomatoUserMeter'],
                                           row['tomatoUserRating'], row['tomatoUserReviews'], row['Nominated Oscars'],row['Won Oscars'],
                                           row['Other wins'], row['Other nominations']]) 
                         for nn in row.Director.strip().split(',')], axis=1)
#rows
movies = pd.DataFrame(rows, columns=movies.columns).set_index('imdbID')
movies

Unnamed: 0_level_0,Title,Released,Awards,Director,Actors,Country,Genre,Language,Rated,Runtime,Year,Metascore,imdbRating,imdbVotes,tomatoConsensus,tomatoFresh,tomatoMeter,tomatoRating,tomatoReviews,tomatoRotten,tomatoUserMeter,tomatoUserRating,tomatoUserReviews,Nominated Oscars,Won Oscars,Other wins,Other nominations
imdbID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
tt4344878,Miss Granny,2015-01-16 00:00:00,1 win & 2 nominations.,Leste Chen,Zishan Yang,"China, South Korea",Comedy,Mandarin,,131,2015,,6.8,473,,,,,,,89,4.3,86,0,0,0,0
tt4344878,Miss Granny,2015-01-16 00:00:00,1 win & 2 nominations.,Leste Chen,Ya-Lei Kuei,"China, South Korea",Comedy,Mandarin,,131,2015,,6.8,473,,,,,,,89,4.3,86,0,0,0,0
tt4344878,Miss Granny,2015-01-16 00:00:00,1 win & 2 nominations.,Leste Chen,Bo-lin Chen,"China, South Korea",Comedy,Mandarin,,131,2015,,6.8,473,,,,,,,89,4.3,86,0,0,0,0
tt4344878,Miss Granny,2015-01-16 00:00:00,1 win & 2 nominations.,Leste Chen,Deshun Wang,"China, South Korea",Comedy,Mandarin,,131,2015,,6.8,473,,,,,,,89,4.3,86,0,0,0,0
tt2717822,Blackhat,2015-01-16 00:00:00,1 nomination.,Michael Mann,Chris Hemsworth,USA,"Action, Crime, Drama","English, Mandarin, Spanish",R,133,2015,51,5.4,32017,"Thematically timely but dramatically inert, Bl...",53,34,4.9,157,104,24,2.5,19310,0,0,0,0
tt2717822,Blackhat,2015-01-16 00:00:00,1 nomination.,Michael Mann,Leehom Wang,USA,"Action, Crime, Drama","English, Mandarin, Spanish",R,133,2015,51,5.4,32017,"Thematically timely but dramatically inert, Bl...",53,34,4.9,157,104,24,2.5,19310,0,0,0,0
tt2717822,Blackhat,2015-01-16 00:00:00,1 nomination.,Michael Mann,Wei Tang,USA,"Action, Crime, Drama","English, Mandarin, Spanish",R,133,2015,51,5.4,32017,"Thematically timely but dramatically inert, Bl...",53,34,4.9,157,104,24,2.5,19310,0,0,0,0
tt2717822,Blackhat,2015-01-16 00:00:00,1 nomination.,Michael Mann,Viola Davis,USA,"Action, Crime, Drama","English, Mandarin, Spanish",R,133,2015,51,5.4,32017,"Thematically timely but dramatically inert, Bl...",53,34,4.9,157,104,24,2.5,19310,0,0,0,0
tt2302966,i,2015-01-14 00:00:00,,S. Shankar,'Chiyaan' Vikram,India,"Action, Romance, Thriller","Tamil, Hindi, Telugu",,170,2015,,7.7,11351,,,,,,,55,3.3,139,0,0,0,0
tt2302966,i,2015-01-14 00:00:00,,S. Shankar,Amy Jackson,India,"Action, Romance, Thriller","Tamil, Hindi, Telugu",,170,2015,,7.7,11351,,,,,,,55,3.3,139,0,0,0,0


We then explode the **Genre** list as we did for homework 1, and create an indicator variable for each genre.

In [12]:
Genre = set()
for m in movies.Genre:
    Genre.update(g.strip() for g in str(m).split(',') if g != '')
Genre = sorted(Genre)
#make a column for each genre
for genre in Genre:
    movies[genre] = [genre.strip() in str(m).split(', ') for m in movies.Genre]
movies = movies.drop('Genre', axis=1)
movies = movies.sort_index(axis=0)

movies.head(5)

Unnamed: 0_level_0,Title,Released,Awards,Director,Actors,Country,Language,Rated,Runtime,Year,Metascore,imdbRating,imdbVotes,tomatoConsensus,tomatoFresh,tomatoMeter,tomatoRating,tomatoReviews,tomatoRotten,tomatoUserMeter,tomatoUserRating,tomatoUserReviews,Nominated Oscars,Won Oscars,Other wins,Other nominations,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Musical,Mystery,N/A,Romance,Sci-Fi,Sport,Thriller,War,Western
imdbID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1
tt0369610,Jurassic World,2015-06-12 00:00:00,1 win & 5 nominations.,Colin Trevorrow,Chris Pratt,USA,English,PG-13,124,2015,59,7.2,314481,Jurassic World can't match the original for sh...,194,71,6.7,274,80,80,3.9,206851,0,0,0,0,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
tt0369610,Jurassic World,2015-06-12 00:00:00,1 win & 5 nominations.,Colin Trevorrow,Irrfan Khan,USA,English,PG-13,124,2015,59,7.2,314481,Jurassic World can't match the original for sh...,194,71,6.7,274,80,80,3.9,206851,0,0,0,0,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
tt0369610,Jurassic World,2015-06-12 00:00:00,1 win & 5 nominations.,Colin Trevorrow,Vincent D'Onofrio,USA,English,PG-13,124,2015,59,7.2,314481,Jurassic World can't match the original for sh...,194,71,6.7,274,80,80,3.9,206851,0,0,0,0,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
tt0369610,Jurassic World,2015-06-12 00:00:00,1 win & 5 nominations.,Colin Trevorrow,Bryce Dallas Howard,USA,English,PG-13,124,2015,59,7.2,314481,Jurassic World can't match the original for sh...,194,71,6.7,274,80,80,3.9,206851,0,0,0,0,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
tt0470752,Ex Machina,2015-04-24 00:00:00,2 wins & 7 nominations.,Alex Garland,Domhnall Gleeson,UK,English,R,108,2015,78,7.7,194256,Ex Machina leans heavier on ideas than effects...,199,92,8.0,216,17,85,4.0,61929,0,0,0,0,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,True,False,False,False,False


In [13]:
movies.shape

(1304, 48)

In [14]:
movies.to_csv('movies2015.csv', encoding='utf-8')

### We then merge actors and directors information into the movie dataframe.

In [15]:
actors = pd.read_csv("actordf2015.csv", usecols=['Birthplace', 'DOB', 'Name', 'credits', 'wins', 'age'])
actors.rename(columns={'Birthplace': 'A_birthplace', 'DOB': 'A_DOB', 'Name': 'A_name', 'credits': 'A_credits', 'wins': 'A_wins',
                       'age': 'A_age' }, inplace=True)
directors = pd.read_csv("directordf2015.csv", usecols=['Birthplace', 'DOB', 'Name', 'credits', 'wins', 'age'])
directors.rename(columns={'Birthplace': 'D_birthplace', 'DOB': 'D_DOB','Name': 'D_name', 'credits': 'D_credits', 'wins': 'D_wins',
                       'age': 'D_age' }, inplace=True)
movies = pd.read_csv("movies2015.csv")


In [16]:
merge1 = pd.merge(movies, directors, left_on='Director', right_on='D_name', how='left')
merge2 = pd.merge(merge1, actors, left_on='Actors', right_on='A_name', how='left')
merge2.dropna(subset=['A_name'], how='all', inplace=True)
merge2.dropna(subset=['D_name'], how='all', inplace=True)
merge2
#merge2 = merge2.drop(['Director', 'Actors', 'movie_id'], axis=1)


Unnamed: 0,imdbID,Title,Released,Awards,Director,Actors,Country,Language,Rated,Runtime,Year,Metascore,imdbRating,imdbVotes,tomatoConsensus,tomatoFresh,tomatoMeter,tomatoRating,tomatoReviews,tomatoRotten,tomatoUserMeter,tomatoUserRating,tomatoUserReviews,Nominated Oscars,Won Oscars,Other wins,Other nominations,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Musical,Mystery,N/A,Romance,Sci-Fi,Sport,Thriller,War,Western,D_birthplace,D_DOB,D_name,D_credits,D_wins,D_age,A_birthplace,A_DOB,A_name,A_credits,A_wins,A_age
0,tt0369610,Jurassic World,2015-06-12 00:00:00,1 win & 5 nominations.,Colin Trevorrow,Chris Pratt,USA,English,PG-13,124,2015,59,7.2,314481,Jurassic World can't match the original for sh...,194,71,6.7,274,80,80,3.9,206851,0,0,0,0,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,"San Francisco, California, USA",1976-9-13,Colin Trevorrow,7,6 nominations.,39.192334,"Virginia, Minnesota, USA",1979-6-21,Chris Pratt,43,8 wins & 21 nominations.,36.424367
1,tt0369610,Jurassic World,2015-06-12 00:00:00,1 win & 5 nominations.,Colin Trevorrow,Irrfan Khan,USA,English,PG-13,124,2015,59,7.2,314481,Jurassic World can't match the original for sh...,194,71,6.7,274,80,80,3.9,206851,0,0,0,0,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,"San Francisco, California, USA",1976-9-13,Colin Trevorrow,7,6 nominations.,39.192334,"Jaipur, Rajasthan, India",1967-1-7,Irrfan Khan,130,15 wins & 19 nominations.,48.876112
2,tt0369610,Jurassic World,2015-06-12 00:00:00,1 win & 5 nominations.,Colin Trevorrow,Vincent D'Onofrio,USA,English,PG-13,124,2015,59,7.2,314481,Jurassic World can't match the original for sh...,194,71,6.7,274,80,80,3.9,206851,0,0,0,0,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,"San Francisco, California, USA",1976-9-13,Colin Trevorrow,7,6 nominations.,39.192334,"Brooklyn, New York City, New York, USA",1959-6-30,Vincent D'Onofrio,89,Nominated for\n 1\n Primetime Emmy.,56.399726
3,tt0369610,Jurassic World,2015-06-12 00:00:00,1 win & 5 nominations.,Colin Trevorrow,Bryce Dallas Howard,USA,English,PG-13,124,2015,59,7.2,314481,Jurassic World can't match the original for sh...,194,71,6.7,274,80,80,3.9,206851,0,0,0,0,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,"San Francisco, California, USA",1976-9-13,Colin Trevorrow,7,6 nominations.,39.192334,"Los Angeles, California, USA",1981-3-2,Bryce Dallas Howard,30,Nominated for\n 1\n Golden Globe.,34.726899
4,tt0470752,Ex Machina,2015-04-24 00:00:00,2 wins & 7 nominations.,Alex Garland,Domhnall Gleeson,UK,English,R,108,2015,78,7.7,194256,Ex Machina leans heavier on ideas than effects...,199,92,8.0,216,17,85,4.0,61929,0,0,0,0,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,True,False,False,False,False,"London, England, UK",1970-1-1,Alex Garland,2,4 wins & 10 nominations.,45.891855,"Dublin, Ireland",1983-5-12,Domhnall Gleeson,32,5 wins & 5 nominations.,32.533881
5,tt0470752,Ex Machina,2015-04-24 00:00:00,2 wins & 7 nominations.,Alex Garland,Alicia Vikander,UK,English,R,108,2015,78,7.7,194256,Ex Machina leans heavier on ideas than effects...,199,92,8.0,216,17,85,4.0,61929,0,0,0,0,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,True,False,False,False,False,"London, England, UK",1970-1-1,Alex Garland,2,4 wins & 10 nominations.,45.891855,"Gothenburg, Västra Götalands län, Sweden",1988-10-3,Alicia Vikander,27,8 wins & 8 nominations.,27.137577
6,tt0470752,Ex Machina,2015-04-24 00:00:00,2 wins & 7 nominations.,Alex Garland,Oscar Isaac,UK,English,R,108,2015,78,7.7,194256,Ex Machina leans heavier on ideas than effects...,199,92,8.0,216,17,85,4.0,61929,0,0,0,0,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,True,False,False,False,False,"London, England, UK",1970-1-1,Alex Garland,2,4 wins & 10 nominations.,45.891855,Guatemala,1980-3-9,Oscar Isaac,34,Nominated for\n 1\n Golden Globe.,35.707050
8,tt0478970,Ant-Man,2015-07-17 00:00:00,2 nominations.,Peyton Reed,Corey Stoll,USA,English,PG-13,117,2015,64,7.6,138824,"Led by a charming performance from Paul Rudd, ...",197,79,6.8,248,51,88,4.1,148048,0,0,0,0,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,"Raleigh, North Carolina, USA",1964-7-3,Peyton Reed,26,1 win & 1 nomination.,51.389459,"New York City, New York, USA",1976-3-14,Corey Stoll,49,Nominated for\n 1\n Golden Globe.,39.693361
10,tt0478970,Ant-Man,2015-07-17 00:00:00,2 nominations.,Peyton Reed,Michael Douglas,USA,English,PG-13,117,2015,64,7.6,138824,"Led by a charming performance from Paul Rudd, ...",197,79,6.8,248,51,88,4.1,148048,0,0,0,0,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,"Raleigh, North Carolina, USA",1964-7-3,Peyton Reed,26,1 win & 1 nomination.,51.389459,"New Brunswick, New Jersey, USA",1944-9-25,Michael Douglas,57,Won\n 2\n Oscars.,71.159480
11,tt0478970,Ant-Man,2015-07-17 00:00:00,2 nominations.,Peyton Reed,Paul Rudd,USA,English,PG-13,117,2015,64,7.6,138824,"Led by a charming performance from Paul Rudd, ...",197,79,6.8,248,51,88,4.1,148048,0,0,0,0,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,"Raleigh, North Carolina, USA",1964-7-3,Peyton Reed,26,1 win & 1 nomination.,51.389459,"Passaic, New Jersey, USA",1969-4-6,Paul Rudd,96,6 wins & 14 nominations.,46.631075


In [17]:
def numNomiOscar(s):
    re1='(Nominated)'
    re2='.*?'
    re3='(\\d+)'
    re4='.*?'
    re5='(Oscar)'
    re6='.*?'

    rg = re.compile(re1+re2+re3+re4+re5+re6,re.IGNORECASE|re.DOTALL)
    
    m = rg.search(s)
    if m:
        return m.group(2)
    else:
        return 0

def numWonOscar(s):
    re1='(Won)'
    re2='.*?'
    re3='(\\d+)'
    re4='.*?'
    re5='(Oscar)'
    re6='.*?'
    
    rg = re.compile(re1+re2+re3+re4+re5+re6,re.IGNORECASE|re.DOTALL)
    m = rg.search(s)
    if m:
        return m.group(2)
    else:
        return 0

In [18]:
merge2['Director Nominated Oscars'] = merge2.apply(lambda r: numNomiOscar(str(r.D_wins)), axis=1)
merge2['Director Won Oscars'] = merge2.apply(lambda r: numWonOscar(str(r.D_wins)), axis=1)
merge2['Actor Nominated Oscars'] = merge2.apply(lambda r: numNomiOscar(str(r.A_wins)), axis=1)
merge2['Actor Won Oscars'] = merge2.apply(lambda r: numWonOscar(str(r.A_wins)), axis=1)

In [19]:
print merge2.shape
print "sample size:", len(list(set(merge2.Title)))


(418, 65)
sample size: 168


### We then read in boxoffice and budget information that we scraped from wikipedia and adjust for different currecy units and inflation rate using historic CPI.

In [20]:
wiki = pd.read_csv('2015_movies_wiki.csv')
cpi = pd.read_csv('cpi.csv', usecols=['Year', 'Avg'])
wiki.columns

Index([u'Unnamed: 0', u'titles_new', u'wiki_url_new', u'imdb_link_new', u'rt_link_new', u'imdb_id', u'budget_new', u'currency_budget', u'box_office_new', u'currency_bo'], dtype='object')

In [21]:
outcome = pd.merge(merge2, wiki, left_on='imdbID', right_on='imdb_id', how='left')

Index([u'imdbID', u'Title', u'Released', u'Awards', u'Director', u'Actors', u'Country', u'Language', u'Rated', u'Runtime', u'Year', u'Metascore', u'imdbRating', u'imdbVotes', u'tomatoConsensus', u'tomatoFresh', u'tomatoMeter', u'tomatoRating', u'tomatoReviews', u'tomatoRotten', u'tomatoUserMeter', u'tomatoUserRating', u'tomatoUserReviews', u'Nominated Oscars', u'Won Oscars', u'Other wins', u'Other nominations', u'Action', u'Adventure', u'Animation', u'Biography', u'Comedy', u'Crime',
       u'Documentary', u'Drama', u'Family', u'Fantasy', u'History', u'Horror', u'Music', u'Musical', u'Mystery', u'N/A', u'Romance', u'Sci-Fi', u'Sport', u'Thriller', u'War', u'Western', u'D_birthplace', u'D_DOB', u'D_name', u'D_credits', u'D_wins', u'D_age', u'A_birthplace', u'A_DOB', u'A_name', u'A_credits', u'A_wins', u'A_age', u'Director Nominated Oscars', u'Director Won Oscars', u'Actor Nominated Oscars', u'Actor Won Oscars', u'Unnamed: 0', u'titles_new', u'wiki_url_new', u'imdb_link_new',
       u'rt

In [22]:
import requests
import json
import numbers

def convertCurrency(C_from, amount, yr):
    ref_cpi = float(cpi[cpi.Year==yr]['Avg'])
    cur_cpi = float(cpi[cpi.Year==2015]['Avg'])
    try:
        if isinstance(amount, numbers.Number):
            url = ('https://currency-api.appspot.com/api/%s/%s.json') % (C_from, 'USD')
            temp = json.loads(requests.get(url).text)
            return amount * float(temp['rate']) * cur_cpi / ref_cpi ## adjust for exchange rate and inflation
        else:
            return None
    except: 
        print (C_from, amount)
        return(C_from)
        pass

outcome['budget_USD'] = outcome.apply(lambda r: convertCurrency(r.currency_budget, r.budget_new, r.Year), axis=1)
outcome['box_USD'] = outcome.apply(lambda r: convertCurrency(r.currency_bo, r.box_office_new, r.Year), axis=1)

In [23]:
print len(list(set(outcome[outcome.budget_USD.isnull()].imdbID)))
print len(list(set(outcome[outcome.box_USD.isnull()].imdbID)))

#cols =  outcome.columns
#print cols
outcome_copy = outcome
outcome = outcome.drop([ u'Unnamed: 0', u'titles_new', u'imdb_id',
                        u'budget_new', u'currency_budget', u'box_office_new', u'currency_bo'], axis=1)

168
70


In [24]:
outcome.shape

(422, 70)

In [25]:
outcome.to_csv('merge2015.csv', encoding='utf-8')