### We merge actors and directors information into the movie dataframe.

In [2]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
from tqdm import tqdm
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

In [3]:
# Load the scraped people tables. Every column gets an "A_" (actor) or "D_"
# (director) prefix so the two sets of fields stay distinguishable once both
# are joined onto the movie rows.
actor_column_names = {
    'Birthplace': 'A_birthplace',
    'DOB': 'A_DOB',
    'Name': 'A_name',
    'credits': 'A_credits',
    'wins': 'A_wins',
    'age': 'A_age',
}
director_column_names = {
    'Birthplace': 'D_birthplace',
    'DOB': 'D_DOB',
    'Name': 'D_name',
    'credits': 'D_credits',
    'wins': 'D_wins',
    'age': 'D_age',
}

actors = pd.read_csv(
    "actordf.csv", usecols=['Birthplace', 'DOB', 'Name', 'credits', 'wins', 'age']
).rename(columns=actor_column_names)
directors = pd.read_csv(
    "directordf.csv", usecols=['Birthplace', 'DOB', 'Name', 'credits', 'wins', 'age']
).rename(columns=director_column_names)

movies = pd.read_csv("movies.csv")

In [4]:
# Inspect the renamed director table; D_wins is a free-text award summary
# (it can contain embedded newlines) rather than a numeric count.
directors.tail()

Unnamed: 0,D_birthplace,D_DOB,D_name,D_credits,D_wins,D_age
498,"New York City, New York, USA",1966-12-30,Bennett Miller,5,Nominated for\n 2\n Oscars.,48.898015
499,"Louisville, Kentucky, USA",1887-11-23,Hobart Henley,57,0,127.997262
500,"Minneapolis, Minnesota, USA",1940-11-22,Terry Gilliam,18,Nominated for\n 1\n Oscar.,75.000684
501,"New York City, New York, USA",1932-1-1,Lewis John Carlino,3,Nominated for\n 1\n Oscar.,83.893224
502,"London, England, UK",1944-9-22,Brian Gibson,16,Won\n 1\n Primetime Emmy.,71.167693


In [5]:
# Inspect the renamed actor table; A_wins is free text in several different
# phrasings ("1 win.", "Nominated for ... Golden Globe.", ...).
actors.tail()

Unnamed: 0,A_birthplace,A_DOB,A_name,A_credits,A_wins,A_age
1469,"Barry, Wales, UK",1906-6-25,Roger Livesey,66,1 win.,109.412731
1470,"Los Mochis, Sinaloa, Mexico",1964-3-3,Laura Harring,60,2 wins & 1 nomination.,51.723477
1471,"Washington, District of Columbia, USA",1971-8-10,Justin Theroux,45,1 nomination.,44.287474
1472,"Hammersmith, London, England, UK",1944-3-1,Roger Daltrey,57,Nominated for\n 1\n Golden Globe.,71.728953
1473,"Fairfax, Virginia, USA",1985-6-28,Rae'Ven Larrymore Kelly,42,4 wins & 5 nominations.,30.403833


In [6]:
# Inspect the movie table; the same imdbID appears on several rows, one per
# Director/Actors pairing.
movies.tail()

Unnamed: 0,imdbID,Title,Released,Awards,Director,Actors,Country,Language,Rated,Runtime,Year,Metascore,imdbRating,imdbVotes,tomatoConsensus,tomatoFresh,tomatoMeter,tomatoRating,tomatoReviews,tomatoRotten,tomatoUserMeter,tomatoUserRating,tomatoUserReviews,Nominated Oscars,Won Oscars,Othter wins,Othter nominations,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,Fantasy,Film-Noir,History,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
5323,tt3316960,Still Alice,2015-02-20 00:00:00,Won 1 Oscar. Another 26 wins & 25 nominations.,Richard Glatzer,Shane McRae,"USA, France",English,PG-13,101,2014,72,7.5,66329,Elevated by a gripping performance from Julian...,145,88,7.5,165,20,85,4,35264,0,1,26,25,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
5324,tt3316960,Still Alice,2015-02-20 00:00:00,Won 1 Oscar. Another 26 wins & 25 nominations.,Wash Westmoreland,Julianne Moore,"USA, France",English,PG-13,101,2014,72,7.5,66329,Elevated by a gripping performance from Julian...,145,88,7.5,165,20,85,4,35264,0,1,26,25,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
5325,tt3316960,Still Alice,2015-02-20 00:00:00,Won 1 Oscar. Another 26 wins & 25 nominations.,Richard Glatzer,Julianne Moore,"USA, France",English,PG-13,101,2014,72,7.5,66329,Elevated by a gripping performance from Julian...,145,88,7.5,165,20,85,4,35264,0,1,26,25,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
5326,tt3316960,Still Alice,2015-02-20 00:00:00,Won 1 Oscar. Another 26 wins & 25 nominations.,Wash Westmoreland,Hunter Parrish,"USA, France",English,PG-13,101,2014,72,7.5,66329,Elevated by a gripping performance from Julian...,145,88,7.5,165,20,85,4,35264,0,1,26,25,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
5327,tt3316960,Still Alice,2015-02-20 00:00:00,Won 1 Oscar. Another 26 wins & 25 nominations.,Wash Westmoreland,Shane McRae,"USA, France",English,PG-13,101,2014,72,7.5,66329,Elevated by a gripping performance from Julian...,145,88,7.5,165,20,85,4,35264,0,1,26,25,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [7]:
# Look up the director's biography for every movie row, then the actor's.
merge1 = movies.merge(directors, how='left', left_on='Director', right_on='D_name')
merge2 = merge1.merge(actors, how='left', left_on='Actors', right_on='A_name')

# A left join leaves A_name / D_name empty when the person was not found in the
# scraped tables; keep only rows where both lookups succeeded.
merge2 = merge2.dropna(subset=['A_name', 'D_name'], how='any')

### We also scraped the award information about each movie and merged these in.

In [8]:
# NOTE(review): despite its ".csv" extension this file is loaded as a pickle.
# Unpickling can execute arbitrary code, so only load it from a trusted source.
outcome = pd.read_pickle("result.csv")
outcome.head()

Unnamed: 0,bp_n,bp_w,fl_n,fl_w,fs_n,fs_w,ml_n,ml_w,movie_id,ms_n,ms_w
0,False,False,False,False,False,False,True,False,tt0070666,False,False
1,False,False,False,False,False,False,False,False,tt2582802,False,False
2,False,False,False,False,True,False,False,False,tt0172493,False,False
3,True,False,False,False,False,False,True,False,tt0045943,False,False
4,False,False,True,False,False,False,True,False,tt0055895,False,False


In [9]:
# Attach the per-movie award flags (the *_n / *_w columns) by IMDb id.
# NOTE: merge2 is rebound to the joined frame, so re-running this cell alone
# merges the award columns a second time.
merge2 = merge2.merge(outcome, how='left', left_on='imdbID', right_on='movie_id')
merge2.head()

Unnamed: 0,imdbID,Title,Released,Awards,Director,Actors,Country,Language,Rated,Runtime,Year,Metascore,imdbRating,imdbVotes,tomatoConsensus,tomatoFresh,tomatoMeter,tomatoRating,tomatoReviews,tomatoRotten,tomatoUserMeter,tomatoUserRating,tomatoUserReviews,Nominated Oscars,Won Oscars,Othter wins,Othter nominations,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,Fantasy,Film-Noir,History,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western,D_birthplace,D_DOB,D_name,D_credits,D_wins,D_age,A_birthplace,A_DOB,A_name,A_credits,A_wins,A_age,bp_n,bp_w,fl_n,fl_w,fs_n,fs_w,ml_n,ml_w,movie_id,ms_n,ms_w
0,tt0018389,A Ship Comes In,1928-01-04 00:00:00,,William K. Howard,Rudolph Schildkraut,USA,,,70,1928,,5.8,88,,,,,,,,,,0,0,0,0,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,"St. Marys, Ohio, USA",1893-6-16,William K. Howard,54,1 win.,122.433949,"Constantinople, Ottoman Empire [now Istanbul, ...",1862-4-27,Rudolph Schildkraut,26,0,153.571526,False,False,True,False,False,False,False,False,tt0018389,False,False
1,tt0018389,A Ship Comes In,1928-01-04 00:00:00,,William K. Howard,Louise Dresser,USA,,,70,1928,,5.8,88,,,,,,,,,,0,0,0,0,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,"St. Marys, Ohio, USA",1893-6-16,William K. Howard,54,1 win.,122.433949,"Evansville, Indiana, USA",1878-10-5,Louise Dresser,50,Nominated for\n 1\n Oscar.,137.130732,False,False,True,False,False,False,False,False,tt0018389,False,False
2,tt0018515,Two Arabian Knights,1927-09-23 00:00:00,,Lewis Milestone,William Boyd,USA,English,TV-G,92,1927,,7.0,505,,,,,,,71.0,3.8,124.0,0,0,0,0,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,"Kishinev, Russian Empire [now Chisinau, Moldova]",1895-9-30,Lewis Milestone,52,Won\n 2\n Oscars.,120.145106,"Hendrysburg, Ohio, USA",1895-6-5,William Boyd,142,2 wins.,120.465435,False,False,False,False,False,False,False,False,tt0018515,False,False
3,tt0018515,Two Arabian Knights,1927-09-23 00:00:00,,Lewis Milestone,Mary Astor,USA,English,TV-G,92,1927,,7.0,505,,,,,,,71.0,3.8,124.0,0,0,0,0,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,"Kishinev, Russian Empire [now Chisinau, Moldova]",1895-9-30,Lewis Milestone,52,Won\n 2\n Oscars.,120.145106,"Quincy, Illinois, USA",1906-5-3,Mary Astor,155,Won\n 1\n Oscar.,109.557837,False,False,False,False,False,False,False,False,tt0018515,False,False
4,tt0018515,Two Arabian Knights,1927-09-23 00:00:00,,Lewis Milestone,Louis Wolheim,USA,English,TV-G,92,1927,,7.0,505,,,,,,,71.0,3.8,124.0,0,0,0,0,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,"Kishinev, Russian Empire [now Chisinau, Moldova]",1895-9-30,Lewis Milestone,52,Won\n 2\n Oscars.,120.145106,"New York City, New York, USA",1880-3-28,Louis Wolheim,56,0,135.652293,False,False,False,False,False,False,False,False,tt0018515,False,False


In [10]:
def numNomiOscar(s):
    """Return the number of Oscar nominations stated in an award summary.

    Looks for the word "Nominated", followed (lazily, across newlines) by a
    run of digits, followed by the word "Oscar"; matching is case-insensitive.

    Parameters
    ----------
    s : str
        Award summary text, e.g. "Nominated for\\n 2\\n Oscars.".

    Returns
    -------
    int
        The matched number, or 0 when the text does not match. Both branches
        return an int, so a column built from this function has one type
        instead of mixing strings and integers.
    """
    # Imported locally so the function does not depend on a later cell having
    # imported `re` before it is first called.
    import re

    match = re.search(r'Nominated.*?(\d+).*?Oscar', s, re.IGNORECASE | re.DOTALL)
    if match is None:
        return 0
    return int(match.group(1))

def numWonOscar(s):
    """Return the number of Oscars won as stated in an award summary.

    Looks for the word "Won", followed (lazily, across newlines) by a run of
    digits, followed by the word "Oscar"; matching is case-insensitive.

    Parameters
    ----------
    s : str
        Award summary text, e.g. "Won\\n 4\\n Oscars.".

    Returns
    -------
    int
        The matched number, or 0 when the text does not match. Both branches
        return an int, so a column built from this function has one type
        instead of mixing strings and integers.
    """
    # Imported locally so the function does not depend on a later cell having
    # imported `re` before it is first called.
    import re

    match = re.search(r'Won.*?(\d+).*?Oscar', s, re.IGNORECASE | re.DOTALL)
    if match is None:
        return 0
    return int(match.group(1))

In [11]:
import re

# Turn the free-text award summaries into per-person Oscar counts.
# str() is applied first so that missing values become the text "nan",
# which matches neither pattern and therefore counts as 0.
director_awards = merge2['D_wins'].map(str)
actor_awards = merge2['A_wins'].map(str)

merge2['Director Nominated Oscars'] = director_awards.map(numNomiOscar)
merge2['Director Won Oscars'] = director_awards.map(numWonOscar)
merge2['Actor Nominated Oscars'] = actor_awards.map(numNomiOscar)
merge2['Actor Won Oscars'] = actor_awards.map(numWonOscar)

In [12]:
# movie_id was only the join key of the award table; imdbID stays as the movie key.
merge2 = merge2.drop('movie_id', axis=1)


In [13]:
# Report the size of the merged frame and how many films it covers.
# NOTE: this counts distinct Title values, so different films that share a
# title are counted once; imdbID would be the stricter key.
n_titles = len(set(merge2.Title))

# Single-argument print(...) and %-formatting produce the same text under both
# Python 2 and Python 3.
print(merge2.shape)
print("sample size: %d" % n_titles)

(3562, 74)
sample size: 1168


In [14]:
# Check the four derived Oscar-count columns at the right-hand end of the frame.
merge2.tail()

Unnamed: 0,imdbID,Title,Released,Awards,Director,Actors,Country,Language,Rated,Runtime,Year,Metascore,imdbRating,imdbVotes,tomatoConsensus,tomatoFresh,tomatoMeter,tomatoRating,tomatoReviews,tomatoRotten,tomatoUserMeter,tomatoUserRating,tomatoUserReviews,Nominated Oscars,Won Oscars,Othter wins,Othter nominations,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,Fantasy,Film-Noir,History,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western,D_birthplace,D_DOB,D_name,D_credits,D_wins,D_age,A_birthplace,A_DOB,A_name,A_credits,A_wins,A_age,bp_n,bp_w,fl_n,fl_w,fs_n,fs_w,ml_n,ml_w,ms_n,ms_w,Director Nominated Oscars,Director Won Oscars,Actor Nominated Oscars,Actor Won Oscars
3557,tt2267998,Gone Girl,2014-10-03 00:00:00,Nominated for 1 Oscar. Another 58 wins & 152 n...,David Fincher,Ben Affleck,USA,English,R,149,2014,79,8.2,494319,"Dark, intelligent, and stylish to a fault, Gon...",252,88,8.0,286,34,87,4.1,122789,1,0,58,152,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,"Denver, Colorado, USA",1962-8-28,David Fincher,25,Nominated for\n 2\n Oscars.,53.237509,"Berkeley, California, USA",1972-8-15,Ben Affleck,64,Won\n 2\n Oscars.,43.271732,False,False,False,False,False,False,False,False,False,False,2,0,0,2
3558,tt2334873,Blue Jasmine,2013-08-23 00:00:00,Won 1 Oscar. Another 50 wins & 67 nominations.,Woody Allen,Cate Blanchett,USA,English,PG-13,98,2013,78,7.3,136421,Woody Allen's Blue Jasmine finds the director ...,181,91,8.1,199,18,77,3.8,47563,0,1,50,67,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,"Brooklyn, New York City, New York, USA",1935-12-1,Woody Allen,52,Won\n 4\n Oscars.,79.978097,"Melbourne, Victoria, Australia",1969-5-14,Cate Blanchett,61,Won\n 2\n Oscars.,46.527036,False,False,False,False,False,False,False,False,False,False,0,4,0,2
3559,tt2431286,Philomena,2013-11-27 00:00:00,Nominated for 4 Oscars. Another 29 wins & 71 n...,Stephen Frears,Mare Winningham,"UK, USA, France",English,PG-13,98,2013,76,7.6,69361,Based on a powerful true story and led by note...,159,92,7.9,172,13,89,4.1,51105,4,0,29,71,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,"Leicester, Leicestershire, England, UK",1941-6-20,Stephen Frears,61,Nominated for\n 2\n Oscars.,74.425736,"Phoenix, Arizona, USA",1959-5-16,Mare Winningham,92,Nominated for\n 1\n Oscar.,56.52293,False,False,False,False,False,False,False,False,False,False,2,0,1,0
3560,tt2431286,Philomena,2013-11-27 00:00:00,Nominated for 4 Oscars. Another 29 wins & 71 n...,Stephen Frears,Judi Dench,"UK, USA, France",English,PG-13,98,2013,76,7.6,69361,Based on a powerful true story and led by note...,159,92,7.9,172,13,89,4.1,51105,4,0,29,71,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,"Leicester, Leicestershire, England, UK",1941-6-20,Stephen Frears,61,Nominated for\n 2\n Oscars.,74.425736,"York, North Yorkshire, England, UK",1934-12-9,Judi Dench,114,Won\n 1\n Oscar.,80.95551,False,False,False,False,False,False,False,False,False,False,2,0,0,1
3561,tt2431286,Philomena,2013-11-27 00:00:00,Nominated for 4 Oscars. Another 29 wins & 71 n...,Stephen Frears,Steve Coogan,"UK, USA, France",English,PG-13,98,2013,76,7.6,69361,Based on a powerful true story and led by note...,159,92,7.9,172,13,89,4.1,51105,4,0,29,71,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,"Leicester, Leicestershire, England, UK",1941-6-20,Stephen Frears,61,Nominated for\n 2\n Oscars.,74.425736,"Middleton, Manchester, England, UK",1965-10-14,Steve Coogan,109,Nominated for\n 2\n Oscars.,50.108145,False,False,False,False,False,False,False,False,False,False,2,0,2,0


### We then read in the box-office and budget information that we scraped from Wikipedia, and adjust it for currency differences and inflation using historical CPI.

In [15]:
# NOTE(review): the first two files carry a ".csv" extension but are loaded as
# pickles. Unpickling can execute arbitrary code, so only load trusted files.
wiki = pd.read_pickle('wiki_df_cleaned.csv')
box = pd.read_pickle('box_office.csv')
# Only the year and its average CPI value are read from the CPI table.
cpi = pd.read_csv('cpi.csv', usecols=['Year', 'Avg'])

In [16]:
# budget / box_office hold the raw scraped strings; budget_sorted and currency
# are presumably the parsed amount and its currency code (verify against the scraper).
wiki.head()

Unnamed: 0,movie_url,title,budget,box_office,imdb_link,rt_link,budget_sorted,currency,imdb_id
0,/wiki/The_Year_of_Living_Dangerously_(film),The Year of Living Dangerously,[A$6 million],"[A$2,898,000 (Australia), US$10,278,575]",http://www.imdb.com/title/tt0086617/,http://www.rottentomatoes.com/m/year_of_living...,6000000.0,AUD,tt0086617
1,/wiki/Saratoga_Trunk,Saratoga Trunk,[$1.75 million],"[$4,250,000 (US/ Canada rentals) ]",http://www.imdb.com/title/tt0038053/,,1750000.0,USD,tt0038053
2,/wiki/All_the_King%27s_Men_(1949_film),All the King's Men,[NA],[$2.4 million (US rentals)],http://www.imdb.com/title/tt041113/,http://www.rottentomatoes.com/m/1000654-all_th...,,,tt041113
3,/wiki/Romance_(1930_film),Romance,[NA],[NA],http://www.imdb.com/title/tt0021310/,,,,tt0021310
4,/wiki/An_Education,An Education,[$7.5 million],"[$26,096,852]",http://www.imdb.com/title/tt1174732/,http://www.rottentomatoes.com/m/an_education/,7500000.0,USD,tt1174732


In [17]:
# NOTE: the currency column of this frame is spelled "curreny"; later cells
# refer to it by that exact name.
box.tail()

Unnamed: 0,imdb_id,box_office,curreny
1258,tt0082416,22600000,USD
1259,tt0105107,5414619,USD
1260,tt0379725,28747570,USD
1261,tt0073812,34251525,USD
1262,tt0047849,1966000,USD


In [18]:
# The last CPI row is 2015, the year convertCurrency uses as its reference price level.
cpi.tail()

Unnamed: 0,Year,Avg
98,2011,224.939
99,2012,229.594
100,2013,232.957
101,2014,236.736
102,2015,237.0343


In [19]:
# Join the Wikipedia budget data, then the box-office data, onto the movie rows.
# NOTE: `wiki` and `outcome` are rebound to the merged frames here, so this cell
# cannot be re-run on its own. Columns present on both sides of the second join
# (box_office, imdb_id) come out with _x (Wikipedia) and _y (box-office) suffixes.
wiki = merge2.merge(wiki, how='left', left_on='imdbID', right_on='imdb_id')
outcome = wiki.merge(box, how='left', left_on='imdbID', right_on='imdb_id')

In [20]:
import requests
import json
import numbers

# Currency code -> exchange rate to USD, filled lazily so that each currency
# is requested from the web service at most once per kernel session.
_USD_RATE_CACHE = {}


def _usd_rate(C_from):
    """Return the exchange rate from C_from to USD (1.0 for USD itself)."""
    if C_from == 'USD':
        return 1.0
    if C_from not in _USD_RATE_CACHE:
        url = ('https://currency-api.appspot.com/api/%s/%s.json') % (C_from, 'USD')
        reply = json.loads(requests.get(url).text)
        _USD_RATE_CACHE[C_from] = float(reply['rate'])
    return _USD_RATE_CACHE[C_from]


def convertCurrency(C_from, amount, yr):
    """Convert an amount to USD and scale it to the 2015 price level.

    The amount is multiplied by the C_from -> USD exchange rate reported by
    the web service at call time, and by CPI(2015) / CPI(yr) taken from the
    module-level `cpi` frame (columns `Year` and `Avg`).

    Parameters
    ----------
    C_from : str
        Currency code of `amount`, e.g. 'USD' or 'AUD'.
    amount : number
        Nominal amount expressed in `C_from`.
    yr : int
        Year whose CPI value the amount refers to.

    Returns
    -------
    float or None
        The adjusted USD amount. None when `amount` is not a number, when
        `amount` or `C_from` is missing, when `yr` (or 2015) is absent from
        `cpi`, or when the exchange rate cannot be obtained. Failures never
        return the currency code, so the result column stays numeric.
    """
    # Nothing to convert: skip the CPI lookup and the network round trip.
    if not isinstance(amount, numbers.Number) or pd.isnull(amount) or pd.isnull(C_from):
        return None

    ref_cpi = cpi.loc[cpi.Year == yr, 'Avg']
    cur_cpi = cpi.loc[cpi.Year == 2015, 'Avg']
    if len(ref_cpi) == 0 or len(cur_cpi) == 0:
        return None

    try:
        rate = _usd_rate(C_from)
    except (requests.RequestException, KeyError, TypeError, ValueError):
        # Report the input that could not be converted and leave the value missing.
        print (C_from, amount)
        return None

    # Adjust for exchange rate and inflation.
    return amount * rate * float(cur_cpi.iloc[0]) / float(ref_cpi.iloc[0])

In [21]:
# Express budget and box office in inflation-adjusted USD, one row at a time.
# The budget amount and currency come from the Wikipedia frame (budget_sorted,
# currency); the box-office amount and currency come from the box-office frame
# (box_office_y after the merge, and its column spelled "curreny").
outcome['budget_USD'] = [
    convertCurrency(code, amount, year)
    for code, amount, year in zip(outcome.currency, outcome.budget_sorted, outcome.Year)
]
outcome['box_USD'] = [
    convertCurrency(code, amount, year)
    for code, amount, year in zip(outcome.curreny, outcome.box_office_y, outcome.Year)
]

In [22]:
# Number of distinct movies that still lack an adjusted budget / box-office value.
movies_missing_budget = set(outcome.loc[outcome.budget_USD.isnull(), 'imdbID'])
movies_missing_box = set(outcome.loc[outcome.box_USD.isnull(), 'imdbID'])

# Single-argument print(...) behaves the same under Python 2 and Python 3.
print(len(movies_missing_budget))
print(len(movies_missing_box))

433
84


In [23]:
# Keep a handle on the frame that still has the raw scraped columns.
# This is a second name for the same object, not a copy; it stays intact only
# because drop() below returns a new frame instead of modifying this one.
outcome_copy = outcome

# Raw and intermediate columns that the converted *_USD columns supersede.
superseded_columns = [
    u'title', u'budget', u'box_office_x', u'budget_sorted',
    u'currency', u'imdb_id_x', u'imdb_id_y', u'box_office_y', u'curreny',
]
outcome = outcome.drop(superseded_columns, axis=1)


In [24]:
# Persist the merged frame. index=False is not passed, so the row index is
# written as the first, unnamed column of the file.
outcome.to_csv('merge.csv', encoding='utf-8')

In [25]:
# Reload the saved frame from disk.
# NOTE(review): the file was written with its index, so this frame presumably
# gains an extra 'Unnamed: 0' column; pass index_col=0 if that is not wanted.
outcome = pd.read_csv('merge.csv')