## Part 3: Reading actors and directors information for each movie from IMDB


In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from tqdm import tqdm
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
import requests
import json
import bs4
from bs4 import BeautifulSoup
import time

First, we read in the "movie" table we get from scrapingimdb_1.ipynb and get a unique list of IMDB Ids.


In [2]:
movies = pd.read_csv('movies.csv')
tts = (list(set(movies.imdbID)))### get id list to get reviewer count

We loop through the movie id list to get the director urls

In [12]:
%%time
directorlist = []
for t in tqdm(tts, leave=True):
    url = "http://www.imdb.com/title/{}/".format(t)
    t1970 = requests.get(url)
    soup = BeautifulSoup(t1970.text, "html.parser")
    tb = soup.find("table").findAll("div", attrs = {'class','txt-block'})
    if tb[0].find('a')['href'] == None:
        directorlist.append(None)
    else:
        directorlist.append(tb[0].find('a')['href'])

100%|██████████| 1241/1241 [13:50<00:00,  1.47it/s]

CPU times: user 3min 21s, sys: 3.57 s, total: 3min 24s
Wall time: 13min 50s





In [13]:
#directorlist2 = dict(zip(tts, directorlist)) 
fd=open("directorlist.json","w")
json.dump(directorlist, fd)
fd.close()

In [14]:
with open("directorlist.json") as json_file:
    directorlist = json.load(json_file)

In [15]:
len(directorlist)

1241

For each director, we get his/her name, date of birth, birthplace, number of oscar wins and nominations annd the credits(how many movies have they participated in). We store these information in a dictionay with keys: Name, DOB, Birthplace, wins and credits.

In [16]:
%%time
director_dict = {"Name":[], "DOB":[], "Birthplace":[], "wins":[], "credits":[]}

for u in tqdm(directorlist, leave=True):
    #print u
    url = 'http://www.imdb.com{}'.format(u)
    t1990=requests.get(url)
    soup=BeautifulSoup(t1990.text, "html.parser")
    try:
        divs=soup.findAll('div', attrs = {'id':'name-born-info'})[0]
        Name = soup.findAll('table', attrs = {'id':'name-overview-widget-layout'})[0].find('span',attrs={'class':'itemprop'}).get_text()
        try: 
            DOB=divs.find('time').get('datetime')
        except:
            DOB=None 
        try:
            Birthplace=divs.findAll('a')[-1].get_text()
        except:
            Birthplace=None
        try:
            wins=soup.findAll('div', attrs = {'class':'article highlighted'})[0].find_all('span')[0].get_text().strip()
        except:
            wins=0
        try:
            credits=soup.findAll('div', attrs = {'id':'filmo-head-director'})[0].get_text().strip()[22:24]
        except:
            credits=0
        if Name not in director_dict["Name"]:
            director_dict["Name"].append(Name)
            director_dict['DOB'].append(DOB)
            director_dict['Birthplace'].append(Birthplace)
            director_dict['wins'].append(wins)
            director_dict['credits'].append(credits)
    except:
        pass


100%|██████████| 1241/1241 [17:03<00:00,  2.02it/s]

CPU times: user 4min 44s, sys: 4.64 s, total: 4min 49s
Wall time: 17min 3s





We convert the dictionary to dataframe and calculate the age of the directors on 2015/11/23. We will later calculate the age of director on the released date of the movie.

In [19]:
import datetime
directordf = pd.DataFrame(director_dict) ### fix 'XXXX-0-0' date issue

for i in range(len(directordf.DOB)):
    if directordf.DOB[i] == None:
        directordf.DOB[i] = '2015-11-23'
        print directordf.DOB[i]
    if directordf.DOB[i].split('-')[1] == '0' or directordf.DOB[i].split('-')[2] == '0':
        directordf.DOB[i] = directordf.DOB[i].split('-')[0]+'-1'+'-1'
        print directordf.DOB[i]
        
        
today = datetime.datetime(2015, 11, 23)
directordf['age'] = (today - directordf.DOB.apply(pd.datetools.parse)).values/np.timedelta64(1, 'D')/365.25


1958-1-1
1966-1-1
1955-1-1
1958-1-1
1972-1-1
1964-1-1
1964-1-1
1960-1-1
1952-1-1
1964-1-1
1936-1-1
1954-1-1
1918-1-1
1946-1-1
1967-1-1
1964-1-1
1970-1-1
2015-11-23
1964-1-1
1975-1-1
1945-1-1
1942-1-1
1960-1-1
1963-1-1
1954-1-1
1951-1-1
1965-1-1
1967-1-1
1968-1-1
1960-1-1
1974-1-1
1954-1-1
1950-1-1
1967-1-1


In [20]:
#directordf.tail(50)

In [14]:
directordf.to_csv('directordf.csv', encoding='utf-8')

NameError: name 'directordf' is not defined

After we save the director info data frame, we did the similar things for all the actors in each movies. To simplify our scraping, we only scraped three main actors in each movie (refer to the code in scrapingimdb_1).

In [3]:
%%time
actorurl = {}
### return a dict, with key the movie id and values the urls for actors 
for t in tqdm(tts, leave=True):
    try:
        actorurl[t] = []
        l = []
        url = "http://www.imdb.com/title/{}/".format(t)
        t1970=requests.get(url)
        soup = BeautifulSoup(t1970.text, "html.parser")
        divs = soup.find('table').findAll('div', attrs = {'class':'txt-block', 'itemprop':'actors'})[0]
        for div in divs.find_all('a'):
            l.append(div.get('href'))
        actorurl[t] = l[:-1]
    except:
        pass
 

100%|██████████| 1241/1241 [14:12<00:00,  1.46it/s]

CPU times: user 3min 32s, sys: 4.05 s, total: 3min 36s
Wall time: 14min 12s





In [4]:
#directorlist2 = dict(zip(tts, directorlist)) 
fd=open("actorurl.json","w")
json.dump(actorurl, fd)
fd.close()

In [5]:
with open("actorurl.json") as json_file:
    actorurl = json.load(json_file)

In [6]:
len(actorurl)

1241

In [7]:
%%time
actor_dict = {"Name":[], "DOB":[], "Birthplace":[], "wins":[], "credits":[]}

for k in tqdm(actorurl.keys(), leave=True):
    url_list = actorurl[k]
    for u in url_list:
        #print u
        try:
            url = 'http://www.imdb.com{}'.format(u)
            t1990=requests.get(url)
            soup=BeautifulSoup(t1990.text, "html.parser")
            divs=soup.findAll('table', attrs = {'id':'name-overview-widget-layout'})[0].findAll('div', attrs = {'id':'name-born-info'})[0]
            try:
                Name=soup.findAll('table', attrs = {'id':'name-overview-widget-layout'})[0].find('span',attrs={'class':'itemprop'}).get_text()
            except:
                Name=None
            try:
                DOB=divs.find('time').get('datetime')
            except:
                DOB=None
            try:
                Birthplace=divs.findAll('a')[-1].get_text()
            except:
                Birthdplace=None
            try:
                wins=soup.findAll('div', attrs = {'class':'article highlighted'})[0].find_all('span')[0].get_text().strip()
            except:
                wins=0
                
            try:
                c = soup.findAll('div', attrs = {'id':'filmo-head-actor'})[0]
                c = str(c.get_text()[20:].strip())
                credit = float(''.join(x for x in c if x.isdigit()))
            except:
                c = soup.findAll('div', attrs = {'id':'filmo-head-actress'})[0]
                c = str(c.get_text()[20:].strip())
                credit = float(''.join(x for x in c if x.isdigit()))
                
            if Name not in actor_dict["Name"]:
                actor_dict['Name'].append(Name)
                actor_dict['DOB'].append(DOB)
                actor_dict['Birthplace'].append(Birthplace)
                actor_dict['wins'].append(wins)
                actor_dict['credits'].append(credit)
        except:
            pass

100%|██████████| 1241/1241 [52:06<00:00,  0.42it/s]

CPU times: user 15min 55s, sys: 14.6 s, total: 16min 9s
Wall time: 52min 6s





In [9]:
import datetime
today = datetime.datetime(2015, 11, 23)
actordf=pd.DataFrame(actor_dict)

for i in range(len(actordf.DOB)):
    if actordf.DOB[i] == None:
        actordf.DOB[i] = '2015-11-23'
        print actordf.DOB[i]
    if actordf.DOB[i].split('-')[1] == '0' or actordf.DOB[i].split('-')[2] == '0':
        actordf.DOB[i] = actordf.DOB[i].split('-')[0]+'-1'+'-1'
        print actordf.DOB[i]
    
### calculate the age
actordf['age'] = (today - actordf.DOB.apply(pd.datetools.parse)).values/np.timedelta64(1, 'D')/365.25


A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


1950-1-1
2015-11-23
1938-1-1
1990-1-1
1982-1-1
2015-11-23
1973-1-1
1968-1-1
2015-11-23
1924-1-1
2015-11-23
1973-1-1
1973-1-1
2015-11-23
2015-11-23
1955-1-1
1942-1-1
2015-11-23


In [10]:
actordf.head()

Unnamed: 0,Birthplace,DOB,Name,credits,wins,age
0,"New York City, New York, USA",1940-4-25,Al Pacino,53,Won\n 1\n Oscar.,75.578371
1,"New York City, New York, USA",1915-6-1,John Randolph,177,0,100.479124
2,"Downingtown, Pennsylvania, USA",1987-2-20,Miles Teller,22,3 wins & 19 nominations.,28.755647
3,"Detroit, Michigan, USA",1955-1-9,J.K. Simmons,163,Won\n 1\n Oscar.,60.870637
4,"Littleton, Colorado, USA",1988-10-4,Melissa Benoist,14,1 nomination.,27.134839


In [13]:
actordf.to_csv('actordf.csv', encoding='utf-8')