In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
# pandas display options: 500-character output width, up to 100 columns shown,
# and HTML table rendering for DataFrames in the notebook.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
# Apply seaborn's "whitegrid" style and "poster" context to subsequent plots.
sns.set_style("whitegrid")
sns.set_context("poster")
import requests
import json
import bs4
from bs4 import BeautifulSoup

Now we load the merged data and perform some calculations on it to derive the features we need for the analysis.

In [2]:
# Load the merged table; the path is relative to the notebook's working directory.
merge_tab = pd.read_csv('merge2015.csv')
# Last expression of the cell: display (rows, columns) as a quick sanity check.
merge_tab.shape

(422, 71)

First, we convert the dates of birth of the actors and directors into datetime format, then subtract them from the movie's release date to get the age of each director and actor when the movie was released (these are called 'Dage' and 'Aage', respectively). We think this is an important feature because age, to some extent, reflects the experience of the director/actor.

In [3]:
import dateutil.parser as parser
from datetime import datetime


def _parse_or_nat(value):
    """Parse one date value with dateutil; return ``pd.NaT`` if it cannot be parsed."""
    try:
        return parser.parse(value)
    except (ValueError, OverflowError, TypeError):
        # ValueError/OverflowError: text dateutil cannot interpret.
        # TypeError: non-string input such as a missing (NaN) cell.
        return pd.NaT


# Rows whose 'Released' value is the string 'False' (presumably a missing
# release date -- TODO confirm) receive a fixed placeholder date.
# Assigning through .loc writes into merge_tab itself; the previous
# `merge_tab.Released[i] = ...` form is chained indexing, which triggers
# SettingWithCopyWarning and may write to a temporary copy instead.
merge_tab.loc[merge_tab.Released == 'False', 'Released'] = '1929-08-20 00:00:00'

# Parse every value independently so each list always has one entry per row.
# Skipping a row on failure (the old bare `except: pass`) would leave the
# lists shorter than, or misaligned with, the DataFrame.
RD = [_parse_or_nat(value) for value in merge_tab.Released]
DDOB = [_parse_or_nat(value) for value in merge_tab.D_DOB]
ADOB = [_parse_or_nat(value) for value in merge_tab.A_DOB]

merge_tab['RD'] = RD
merge_tab['DDOB'] = DDOB
merge_tab['ADOB'] = ADOB

# Age at release in years: elapsed days divided by 365.25 (average year length
# including leap years). Rows with an unparseable date yield NaN.
merge_tab['Dage'] = (merge_tab.RD - merge_tab.DDOB).values/np.timedelta64(1, 'D')/365.25
merge_tab['Aage'] = (merge_tab.RD - merge_tab.ADOB).values/np.timedelta64(1, 'D')/365.25



A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


We drop the unnecessary columns from the merged data frame. We also need to remove the commas in 'imdbVotes' so that it can be converted to a numeric type; this is done in a later cell below.

In [4]:
# Drop the columns that are not needed downstream, including the intermediate
# datetime columns ('RD', 'DDOB', 'ADOB') that were only used to derive the ages.
merge_tab = merge_tab.drop(
    [
        'Unnamed: 0',
        'Awards',
        'Metascore',
        'D_wins',
        'A_wins',
        'RD',
        'A_age',
        'D_age',
        'DDOB',
        'ADOB',
    ],
    axis='columns',
)

# The commas in 'imdbVotes' are stripped in a later cell.

Now that we have the ages of each director/actor in each movie, we need to calculate a weighted mean of the ages. We choose the weights to be the 'credits' of the director/actor. 'Credits' here are how many movies the actor has been the head actor for, or how many movies the director has directed. Besides age, we are also interested in the ratings of each movie. Currently, we have the user rating for each movie from both IMDB and Rotten Tomatoes. For each website we again compute an average rating weighted by that website's review counts, and then combine the two (the Rotten Tomatoes rating is doubled before the two are averaged).


In [5]:
def A_w_mean(df):
    """Return the mean of ``df.Aage`` weighted by ``df.A_credits``."""
    ages = df.Aage
    credit_weights = df.A_credits
    return np.average(ages, weights=credit_weights)
def D_w_mean(df):
    """Return the mean of ``df.Dage`` weighted by ``df.D_credits``."""
    ages = df.Dage
    credit_weights = df.D_credits
    return np.average(ages, weights=credit_weights)
def mean_rating(tab):
    """Combine the IMDB and Rotten Tomatoes user ratings in ``tab`` into one score.

    Each site's rating column is averaged using that site's vote/review counts
    (converted to float) as weights. If any ``tomatoUserRating`` value is NaN,
    the IMDB average alone is returned. Otherwise the result is the plain mean
    of the IMDB average and twice the Rotten Tomatoes average.
    """
    # Both weight vectors are built up front, so a non-numeric count raises
    # here regardless of which branch is taken below.
    imdb_weights = list(map(float, tab.imdbVotes))
    tomato_weights = list(map(float, tab.tomatoUserReviews))

    tomato_rating_missing = np.isnan(tab.tomatoUserRating).any()
    imdb_average = np.average(tab.imdbRating, weights=imdb_weights)
    if tomato_rating_missing:
        return imdb_average

    tomato_average = np.average(tab.tomatoUserRating, weights=tomato_weights)
    return (imdb_average + 2 * tomato_average) / 2

The function 'clean_table' calculates, for each movie, the weighted average ages, the mean credits and the mean rating.

In [6]:
def clean_table(tab):
    """Aggregate the rows of ``tab`` into one row per movie.

    Parameters
    ----------
    tab : pandas.DataFrame
        Table with an 'imdbID' column, the 'A_credits' and 'D_credits'
        columns, and whatever columns ``A_w_mean``, ``D_w_mean`` and
        ``mean_rating`` read.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'imdbID', with columns 'A_mean_age', 'D_mean_age',
        'A_mean_credit', 'D_mean_credit' and 'mean_rate'.
    """
    # Group the table that was passed in. The previous version grouped the
    # notebook-level ``merge_tab`` and silently ignored the ``tab`` argument.
    movies = tab.groupby('imdbID')
    A_mean_age = movies.apply(A_w_mean)
    D_mean_age = movies.apply(D_w_mean)
    # Credits are averaged without weights.
    A_mean_credit = movies.apply(lambda x: np.average(x.A_credits))
    D_mean_credit = movies.apply(lambda x: np.average(x.D_credits))
    rating_mean = movies.apply(mean_rating)
    dic = {'A_mean_age': A_mean_age, 'D_mean_age': D_mean_age, 'A_mean_credit':A_mean_credit, 'D_mean_credit':D_mean_credit,
          'mean_rate': rating_mean}
    output = pd.DataFrame(dic)
    return output

In [7]:
# Remove thousands separators from string entries of 'imdbVotes'; non-string
# entries (numbers, NaN) pass through unchanged. Replacing the whole column in
# one assignment avoids the chained `merge_tab['imdbVotes'][i] = ...` write,
# which raises SettingWithCopyWarning and may modify a temporary copy instead
# of merge_tab.
merge_tab['imdbVotes'] = merge_tab['imdbVotes'].map(
    lambda votes: votes.replace(',', '') if isinstance(votes, str) else votes
)

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [8]:
# Remove thousands separators from string entries of 'tomatoUserReviews';
# non-string entries (numbers, NaN) pass through unchanged. Replacing the whole
# column in one assignment avoids the chained
# `merge_tab['tomatoUserReviews'][i] = ...` write, which raises
# SettingWithCopyWarning and may modify a temporary copy instead of merge_tab.
merge_tab['tomatoUserReviews'] = merge_tab['tomatoUserReviews'].map(
    lambda reviews: reviews.replace(',', '') if isinstance(reviews, str) else reviews
)

The following is the result of the calculation; each row corresponds to a movie.

In [9]:
# Compute the per-movie aggregates; reset_index() turns the 'imdbID' group key
# from the index back into an ordinary column so it can be used as a merge key.
merge_tab2 = clean_table(merge_tab).reset_index()
# Preview the first rows of the aggregated table.
merge_tab2.head()

Unnamed: 0,imdbID,A_mean_age,A_mean_credit,D_mean_age,D_mean_credit,mean_rate
0,tt0369610,47.432927,73.0,38.743326,7,7.5
1,tt0470752,31.544133,31.0,45.308693,2,7.85
2,tt0478970,51.516363,67.333333,51.036277,26,7.9
3,tt0810819,30.39723,25.5,43.230664,18,6.75
4,tt0884732,32.953663,56.333333,39.129363,5,6.95


In the original table, each movie has multiple rows (typically three), and each row corresponds to a different actor/director combination, so we need to combine these rows into a single row for each movie. To do that, we merge the calculated table onto the original table, drop the columns that are specific to an individual actor or director, and then remove the resulting duplicate rows.

In [14]:
# Attach the per-movie aggregates to every original row, discard the columns
# that differ between the rows of one movie (individual actor/director
# details), and then collapse the now-identical rows into one per movie.
merge_tab3 = (
    merge_tab
    .merge(merge_tab2, on='imdbID', how='left')
    .drop(
        [
            'D_birthplace', 'D_DOB', 'D_name', 'D_credits',
            'A_birthplace', 'A_DOB', 'A_name', 'A_credits',
            'Actors', 'Director', 'Dage', 'Aage',
            'Director Nominated Oscars', 'Director Won Oscars',
            'Actor Nominated Oscars', 'Actor Won Oscars',
        ],
        axis='columns',
    )
    .drop_duplicates()
)

In [15]:
# Show a bounded preview rather than rendering the entire table in the notebook.
merge_tab3.head(10)

Unnamed: 0,imdbID,Title,Released,Country,Language,Rated,Runtime,Year,imdbRating,imdbVotes,tomatoConsensus,tomatoFresh,tomatoMeter,tomatoRating,tomatoReviews,tomatoRotten,tomatoUserMeter,tomatoUserRating,tomatoUserReviews,Nominated Oscars,Won Oscars,Other wins,Other nominations,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Musical,Mystery,N/A,Romance,Sci-Fi,Sport,Thriller,War,Western,wiki_url_new,imdb_link_new,rt_link_new,budget_USD,box_USD,A_mean_age,A_mean_credit,D_mean_age,D_mean_credit,mean_rate
0,tt0369610,Jurassic World,2015-06-12 00:00:00,USA,English,PG-13,124,2015,7.2,314481,Jurassic World can't match the original for sh...,194,71,6.7,274,80,80,3.9,206851,0,0,0,0,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,/wiki/Jurassic_World,http://www.imdb.com/title/tt0369610/,http://www.rottentomatoes.com/m/jurassic_world/,,6.521980e+08,47.432927,73.000000,38.743326,7,7.50
4,tt0470752,Ex Machina,2015-04-24 00:00:00,UK,English,R,108,2015,7.7,194256,Ex Machina leans heavier on ideas than effects...,199,92,8.0,216,17,85,4.0,61929,0,0,0,0,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,True,False,False,False,False,/wiki/Ex_Machina_(film),http://www.imdb.com/title/tt0470752/,http://www.rottentomatoes.com/m/ex_machina/,,2.544097e+07,31.544133,31.000000,45.308693,2,7.85
7,tt0478970,Ant-Man,2015-07-17 00:00:00,USA,English,PG-13,117,2015,7.6,138824,"Led by a charming performance from Paul Rudd, ...",197,79,6.8,248,51,88,4.1,148048,0,0,0,0,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,/wiki/Ant-Man_(film),http://www.imdb.com/title/tt0478970/,,,1.786361e+08,51.516363,67.333333,51.036277,26,7.90
10,tt0810819,The Danish Girl,2015-12-25 00:00:00,"UK, Germany, USA","English, German",R,120,2015,5.3,2539,,,,,,,,4.1,304,0,0,0,0,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,/wiki/The_Danish_Girl_(film),http://www.imdb.com/title/tt0810819/,http://www.rottentomatoes.com/m/The_Danish_Gir...,,,30.397230,25.500000,43.230664,18,6.75
12,tt0884732,The Wedding Ringer,2015-01-16 00:00:00,USA,English,R,101,2015,6.7,42323,Kevin Hart and Josh Gad might be two great com...,25,27,4.3,93,68,65,3.6,39352,0,0,0,0,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,/wiki/The_Wedding_Ringer,http://www.imdb.com/title/tt0884732/,http://www.rottentomatoes.com/m/the_wedding_ri...,,6.446021e+07,32.953663,56.333333,39.129363,5,6.95
15,tt0900387,Suite Française,1929-08-20 00:00:00,"UK, France, Canada, Belgium","English, German, French",,107,2014,6.8,6940,,23,77,6.0,30,7,,,,0,0,0,0,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,True,False,/wiki/Suite_Fran%C3%A7aise_(film),http://www.imdb.com/title/tt0900387/,http://www.rottentomatoes.com/m/suite_francaise/,,,-40.433073,46.333333,-38.365503,6,6.80
18,tt1014763,Child 44,2015-04-17 00:00:00,"USA, UK, Czech Republic, Romania, Russia",English,R,137,2015,6.4,28118,There's a gripping story at the heart of Child...,18,25,4.7,71,53,43,3.1,11248,0,0,0,0,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,/wiki/Child_44_(film),http://www.imdb.com/title/tt1014763/,,,1.206135e+06,36.588389,44.000000,38.067077,7,6.30
20,tt1018765,Our Brand Is Crisis,2015-10-30 00:00:00,USA,"English, Spanish",R,107,2015,5.6,1443,Our Brand Is Crisis offers sporadic amusement ...,37,33,5.3,113,76,43,3.1,5341,0,0,0,0,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,/wiki/Our_Brand_Is_Crisis_(2015_film),http://www.imdb.com/title/tt1018765/,http://www.rottentomatoes.com/m/our_brand_is_c...,,,51.355324,62.666667,40.558522,18,5.90
23,tt1029360,Poltergeist,2015-05-22 00:00:00,"USA, Canada",English,PG-13,93,2015,5.0,32812,Paying competent homage without adding anythin...,32,31,4.8,102,70,23,2.5,35883,0,0,0,0,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,/wiki/Poltergeist_(2015_film),http://www.imdb.com/title/tt1029360/,http://www.rottentomatoes.com/m/poltergeist-2009/,,4.741569e+07,45.590144,62.500000,38.595483,3,5.00
25,tt1137470,Accidental Love,2015-02-10 00:00:00,"USA, UK",English,PG-13,100,2015,4.1,3562,,2,7,3.2,29,27,16,2.0,4625,0,0,0,0,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,/wiki/Accidental_Love,http://www.imdb.com/title/tt1137470/,http://www.rottentomatoes.com/m/accidental_love/,,,31.685718,24.000000,56.476386,12,4.05


In [17]:
# Write the per-movie table to a UTF-8 CSV in the working directory.
# NOTE(review): `index` is left at its default, so the DataFrame index is
# written as an unnamed first column; readers of this file should expect it.
merge_tab3.to_csv('final_table2015.csv', encoding='utf-8')