In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
import requests
import json
import bs4
from bs4 import BeautifulSoup

Now, we get the merge data and do some calculation based on it. We then get the desired features for us to do the analysis.

In [2]:
merge_tab = pd.read_csv('merge.csv')
merge_tab.shape

(3566, 80)

First, we transform the date of brith of actors and directors into date format, then can the released date from their birthday to get the age of directors and actors when the movie was released (they are called 'Aage' and 'Dage'). We think this is an important feature because age, to some extent, implies the experience of the director/actor.

In [3]:
import dateutil.parser as parser
from datetime import datetime

for i in range(len(merge_tab.Released)):
    if merge_tab.Released[i] == 'False':
        merge_tab.Released[i] = '1929-08-20 00:00:00'

RD = []
DDOB = []
ADOB = []
for i in range(len(merge_tab.Released)):
    try:
        RD.append(parser.parse(merge_tab.Released[i]))
        DDOB.append(parser.parse(merge_tab.D_DOB[i]))
        ADOB.append(parser.parse(merge_tab.A_DOB[i]))
    except:
        pass
merge_tab['RD']=RD
merge_tab['DDOB']=DDOB
merge_tab['ADOB']=ADOB
merge_tab['Dage'] = (merge_tab.RD - merge_tab.DDOB).values/np.timedelta64(1, 'D')/365.25
merge_tab['Aage'] = (merge_tab.RD - merge_tab.ADOB).values/np.timedelta64(1, 'D')/365.25



A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


We drop the unnecessary columns from the merged data frame and remove the comma in 'imdbVotes'. The purpose of this is to transform 'imdbVotes' into numeric .

In [5]:
merge_tab = merge_tab.drop(['Awards','Metascore','D_wins','A_wins','RD','A_age','D_age',
                           'DDOB','ADOB'], axis=1)

merge_tab['imdbVotes'] = merge_tab.imdbVotes.apply(lambda x: x.replace(',',''))

Now we have the ages for each of the director/actor in each movie, we need to calculate an weighted mean of the ages. We choose the weights to be the 'credits' of the director/actor. 'Credits' here are how many movies the actor has been the head actor for, or how many movies the director has directed. Besides age, we are also interested in the ratings of each movie. Currently, we have the user rating for each movie from both IMDB and Rotten Tomatos. We, again, calculate the weighted average of these two ratings by the reviews counts from each website.


In [6]:
def A_w_mean(df):
    return np.average(df.Aage, weights=df.A_credits)
def D_w_mean(df):
    return np.average(df.Dage, weights=df.D_credits)
def mean_rating(tab):
    w1 = [float(i) for i in tab.imdbVotes]
    w2 = [float(i) for i in tab.tomatoUserReviews]
    if any(np.isnan(tab.tomatoUserRating)):
        mean_rate = np.average(tab.imdbRating,  weights=w1)
    else:
        mean_rate = (np.average(tab.imdbRating,  weights=w1) + 2*np.average(tab.tomatoUserRating,  weights=w2))/2
    return mean_rate

The function 'clean_table' calculates the weighted average of ages, credits and rating.

In [7]:
def clean_table(tab):
    movies = merge_tab.groupby('imdbID')
    A_mean_age = movies.apply(A_w_mean)
    D_mean_age = movies.apply(D_w_mean)
    A_mean_credit = movies.apply(lambda x: np.average(x.A_credits))
    D_mean_credit = movies.apply(lambda x: np.average(x.D_credits))
    rating_mean = movies.apply(mean_rating)
    dic = {'A_mean_age': A_mean_age, 'D_mean_age': D_mean_age, 'A_mean_credit':A_mean_credit, 'D_mean_credit':D_mean_credit,
          'mean_rate': rating_mean}
    output = pd.DataFrame(dic)
    return output 

The following are the result of calculation, each row corresponds to a movie.

In [8]:
merge_tab2 = clean_table(merge_tab).reset_index()
merge_tab2.head()

Unnamed: 0,imdbID,A_mean_age,A_mean_credit,D_mean_age,D_mean_credit,mean_rate
0,tt0018389,54.870204,38.0,34.548939,54,5.8
1,tt0018515,29.918217,117.666667,31.978097,52,7.3
2,tt0018674,34.211518,120.666667,43.816564,83,7.9
3,tt0018806,28.208751,34.5,34.061602,79,7.9
4,tt0019071,34.847159,102.0,33.839836,35,8.0


Since in the original table, each movie has three row, and each row corresponds to a different actor/director combination, we need to combine these rows into one single row for each movies. Therefore we remove the duplicate rows in the table. After we do that, we merge the original table and the calcualted table.

In [9]:
merge_tab3 = pd.merge(merge_tab, merge_tab2, left_on = 'imdbID', right_on='imdbID', how='left')
merge_tab3 = merge_tab3.drop(['Unnamed: 0', 'D_birthplace','D_DOB','D_name','D_credits','A_birthplace','A_DOB','A_name','A_credits','Director Nominated Oscars',
              'Director Won Oscars', 'Actor Nominated Oscars', 'Actor Won Oscars', 'Dage', 'Aage'], axis=1)
merge_tab3 = merge_tab3.drop_duplicates()

We sum up the oscar win and nominated of director and actor for each movie.

In [10]:
DOscars = merge_tab[['imdbID', 'D_name', 'A_name', 'Director Nominated Oscars', 'Director Won Oscars', 'Actor Nominated Oscars', 'Actor Won Oscars']]
DOscars11 = DOscars.drop_duplicates(subset=['imdbID', 'D_name', 'Director Nominated Oscars', 'Director Won Oscars'])
DOscars22 = DOscars.drop_duplicates(subset=['imdbID', 'A_name', 'Actor Nominated Oscars', 'Actor Won Oscars'])
Dnominated = DOscars11.groupby('imdbID').apply(lambda x: sum(x['Director Nominated Oscars']))
Dwon = DOscars11.groupby('imdbID').apply(lambda x: sum(x['Director Won Oscars']))
Anominated = DOscars22.groupby('imdbID').apply(lambda x: sum(x['Actor Nominated Oscars']))
Awon = DOscars22.groupby('imdbID').apply(lambda x: sum(x['Actor Won Oscars']))

Oscars = pd.DataFrame({'Dnominated':Dnominated, 'Dwon':Dwon, 'Anominated':Anominated, 'Awon':Awon}).reset_index()

In [11]:
Oscars.head()

Unnamed: 0,imdbID,Anominated,Awon,Dnominated,Dwon
0,tt0018389,1,0,0,0
1,tt0018515,0,1,0,2
2,tt0018674,1,0,0,0
3,tt0018806,0,0,5,0
4,tt0019071,3,1,2,0


Finally, we merge all three tables together to get the final data table that we can do analysis.

In [12]:
merge_tab4 = pd.merge(merge_tab3, Oscars, left_on = 'imdbID', right_on='imdbID', how='left')

In [14]:
merge_tab4.head()

Unnamed: 0,imdbID,Title,Released,Director,Actors,Country,Language,Rated,Runtime,Year,imdbRating,imdbVotes,tomatoConsensus,tomatoFresh,tomatoMeter,tomatoRating,tomatoReviews,tomatoRotten,tomatoUserMeter,tomatoUserRating,tomatoUserReviews,Nominated Oscars,Won Oscars,Othter wins,Othter nominations,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Family,Fantasy,Film-Noir,History,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western,bp_n,bp_w,fl_n,fl_w,fs_n,fs_w,ml_n,ml_w,ms_n,ms_w,movie_url,imdb_link,rt_link,budget_USD,box_USD,A_mean_age,A_mean_credit,D_mean_age,D_mean_credit,mean_rate,Anominated,Awon,Dnominated,Dwon
0,tt0018389,A Ship Comes In,1928-01-04 00:00:00,William K. Howard,Rudolph Schildkraut,USA,,,70,1928,5.8,88,,,,,,,,,,0,0,0,0,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,/wiki/A_Ship_Comes_In,http://www.imdb.com/title/tt0018389/,,,69308274.8538,54.870204,38.0,34.548939,54,5.8,1,0,0,0
1,tt0018389,A Ship Comes In,1928-01-04 00:00:00,William K. Howard,Louise Dresser,USA,,,70,1928,5.8,88,,,,,,,,,,0,0,0,0,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,/wiki/A_Ship_Comes_In,http://www.imdb.com/title/tt0018389/,,,69308274.8538,54.870204,38.0,34.548939,54,5.8,1,0,0,0
2,tt0018515,Two Arabian Knights,1927-09-23 00:00:00,Lewis Milestone,William Boyd,USA,English,TV-G,92,1927,7.0,505,,,,,,,71.0,3.8,124.0,0,0,0,0,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,/wiki/Two_Arabian_Knights,http://www.imdb.com/title/tt0018515/,,,91438583.1532,29.918217,117.666667,31.978097,52,7.3,0,1,0,2
3,tt0018515,Two Arabian Knights,1927-09-23 00:00:00,Lewis Milestone,Mary Astor,USA,English,TV-G,92,1927,7.0,505,,,,,,,71.0,3.8,124.0,0,0,0,0,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,/wiki/Two_Arabian_Knights,http://www.imdb.com/title/tt0018515/,,,91438583.1532,29.918217,117.666667,31.978097,52,7.3,0,1,0,2
4,tt0018515,Two Arabian Knights,1927-09-23 00:00:00,Lewis Milestone,Louis Wolheim,USA,English,TV-G,92,1927,7.0,505,,,,,,,71.0,3.8,124.0,0,0,0,0,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,/wiki/Two_Arabian_Knights,http://www.imdb.com/title/tt0018515/,,,91438583.1532,29.918217,117.666667,31.978097,52,7.3,0,1,0,2


In [15]:
merge_tab4.to_csv('final_table.csv', encoding='utf-8')