# Get director and actor information by scraping TMDB

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import os

In [2]:
# Root of the TMDB site; page paths are appended to this when scraping
base_url = "https://www.themoviedb.org/"
# Browser-like User-Agent string sent with every request
user_agent = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/41.0.2272.76 Safari/537.36"
)
headers = {'User-Agent': user_agent}

In [3]:
# Read the top-movies table exported earlier; the file is decoded as UTF-8
# so that non-Latin titles (e.g. Hindi, Japanese) are read correctly.
topMovies = pd.read_csv(
    filepath_or_buffer='data/rawTopMovies.csv',
    encoding='utf-8',
)
# Preview the first five rows
topMovies.head(n=5)

Unnamed: 0,id,title,original_title,overview,release_date,original_language,vote_average,vote_count
0,19404,Dilwale Dulhania Le Jayenge,दिलवाले दुल्हनिया ले जायेंगे,"Raj is a rich, carefree, happy-go-lucky second...",1995-10-20,hi,9.1,1971
1,278,The Shawshank Redemption,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,1994-09-23,en,8.6,12087
2,238,The Godfather,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",1972-03-14,en,8.6,9258
3,372058,Your Name.,君の名は。,High schoolers Mitsuha and Taki are complete s...,2016-08-26,ja,8.6,3542
4,424,Schindler's List,Schindler's List,The true story of how businessman Oskar Schind...,1993-12-15,en,8.5,7265


In [5]:
# This function extracts a person's information from that person's page on TMDB
def extractPersonInfo(personlink, movieID):
    """Scrape one person's TMDB page and return the result as a flat record.

    Parameters
    ----------
    personlink : str
        Link to the person page as found on a movie page, e.g.
        ``/person/35742-shah-rukh-khan``. The text between ``/person/`` and
        the first ``-`` is stored as the person ID.
    movieID
        ID of the movie the link was collected from; stored unchanged under
        ``'Movie ID'``.

    Returns
    -------
    dict
        Keys ``'ID'``, ``'Name'``, ``'Gender'``, ``'Birthday'``,
        ``'Place of Birth'`` and ``'Movie ID'``. Any field that cannot be
        found on the page is left as ``None`` instead of raising.
    """
    # Function-scope import so this cell does not depend on the import cell
    from urllib.parse import urljoin

    # Only these labels from the page's facts section are copied into the
    # record; this keeps a stray label from overwriting 'ID' or 'Movie ID'.
    factFields = ('Gender', 'Birthday', 'Place of Birth')

    personInfo = dict.fromkeys(['ID', 'Name', 'Gender', 'Birthday', 'Place of Birth', 'Movie ID'])
    personInfo['Movie ID'] = movieID
    # The personlink includes the person ID
    personInfo['ID'] = personlink.replace('/person/', '').split('-')[0]

    # base_url ends with '/' and personlink starts with '/'; urljoin avoids
    # the '//person/...' URL that plain concatenation would produce.
    personUrl = urljoin(base_url, personlink)
    # A timeout stops a single stalled connection from hanging the whole scrape
    response = requests.get(personUrl, headers=headers, timeout=30)
    personPage = BeautifulSoup(response.text, "html.parser")

    # find() returns None when an element is absent, so check each step
    titleBlock = personPage.find('div', {'class': 'title'})
    nameLink = titleBlock.find('a') if titleBlock is not None else None
    if nameLink is not None:
        personInfo['Name'] = nameLink.text.strip('\n')

    factsSection = personPage.find('section', {'class': 'facts left_column'})
    infoLists = factsSection.find_all('p') if factsSection is not None else []
    for info in infoLists:
        label = info.find('bdi')
        if label is None:
            continue
        bdi = label.text
        field = bdi.strip()
        if field in factFields:
            # Drop only the leading label; the rest of the paragraph is the value
            personInfo[field] = info.text.replace(bdi, '', 1).strip()
    return personInfo

#### Start getting information from the web pages

In [6]:
# Accumulators for the scraped person records (one dict per person per movie)
directors = []
actors = []

movieBaseUrl = base_url + 'movie/'

# Movie IDs come from the top-movies table downloaded through the TMDB API
for movieID in topMovies['id']:
    moviePageUrl = movieBaseUrl + str(movieID)
    # A timeout stops a single stalled connection from hanging the whole scrape
    moviePage = BeautifulSoup(requests.get(moviePageUrl, headers=headers, timeout=30).text, "html.parser")

    # Get directors' information.
    # find() returns None for a missing element, so a page without the header
    # block, or an <li> without a role/link, is skipped rather than raising.
    headerInfo = moviePage.find('div', {"class": "header_info"})
    crews = headerInfo.find_all('li') if headerInfo is not None else []
    directorLinks = []
    for crew in crews:
        role = crew.find('p', {"class": "character"})
        link = crew.find(href=True)
        if role is not None and link is not None and 'Director' in role.text:
            directorLinks.append(link['href'])
    directors.extend(extractPersonInfo(directorLink, movieID) for directorLink in directorLinks)

    # Get Top Billed Cast information; cards without a link are skipped
    castSection = moviePage.find('section', {'class': 'panel top_billed scroller'})
    billCast = castSection.find_all('li', {'class': 'card'}) if castSection is not None else []
    castAnchors = (actor.find(href=True) for actor in billCast)
    actorLinks = [anchor['href'] for anchor in castAnchors if anchor is not None]
    actors.extend(extractPersonInfo(actorLink, movieID) for actorLink in actorLinks)

In [7]:
# Tabulate the scraped director records with an explicit column order
directorsDataFrame = pd.DataFrame(
    data=directors,
    columns=['ID', 'Name', 'Gender', 'Birthday', 'Place of Birth', 'Movie ID'],
)
# Preview the first five rows
directorsDataFrame.head(n=5)

Unnamed: 0,ID,Name,Gender,Birthday,Place of Birth,Movie ID
0,35771,Aditya Chopra,-,1971-05-21,-,19404
1,4027,Frank Darabont,Male,1959-01-28,"Montbéliard, Doubs, France",278
2,1776,Francis Ford Coppola,Male,1939-04-07,"Detroit, Michigan, USA",238
3,74091,Makoto Shinkai,Male,1973-02-09,"Nagano Prefecture, Japan",372058
4,488,Steven Spielberg,Male,1946-12-18,Cincinnati - Ohio - USA,424


In [8]:
# Tabulate the scraped actor records with an explicit column order
actorsDataFrame = pd.DataFrame(
    data=actors,
    columns=['ID', 'Name', 'Gender', 'Birthday', 'Place of Birth', 'Movie ID'],
)
# Preview the first five rows
actorsDataFrame.head(n=5)

Unnamed: 0,ID,Name,Gender,Birthday,Place of Birth,Movie ID
0,35742,Shah Rukh Khan,Male,1965-11-02,"New Delhi, Delhi, India",19404
1,55061,Kajol,Female,1974-08-05,"Mumbai, Maharashtra, India",19404
2,691,Amrish Puri,Male,1932-06-22,"Jalandhar, Punjab, India",19404
3,6217,Anupam Kher,Male,1955-03-07,"Shimla, Himachal Pradesh, India",19404
4,35759,Satish Shah,Male,1951-06-25,"Bombay, India",19404


#### Save raw data

In [4]:
# Prepare the directory the CSV files are written into.
# exist_ok=True makes this a no-op when 'data' already exists, which avoids
# the separate exists() check and the race between checking and creating.
current_dir = os.getcwd()
os.makedirs(os.path.join(current_dir, 'data'), exist_ok=True)

In [17]:
# Write the scraped tables to disk without the row index.
# NOTE: the cleaning cells further down modify both frames in place, so this
# cell has to run before them for the files to hold the unprocessed data.
directorsDataFrame.to_csv(path_or_buf='data/rawDirectors.csv', index=False, encoding='utf-8')
actorsDataFrame.to_csv(path_or_buf='data/rawActors.csv', index=False, encoding='utf-8')

#### Data cleaning and auditing on directorsDataFrame and actorsDataFrame

##### Find missing data

In [10]:
# The scraped records use '-' where a fact is unknown; turn that placeholder
# into a real missing value so pandas' null handling can see it.
directorsDataFrame.replace(to_replace=['-'], value=[None], inplace=True)
# Show every director row that lacks at least one field
directorsDataFrame[directorsDataFrame.isna().any(axis='columns')]

Unnamed: 0,ID,Name,Gender,Birthday,Place of Birth,Movie ID
0,35771,Aditya Chopra,,1971-05-21,,19404
5,59918,Rodney Rothman,Male,,,324857
6,936670,Bob Persichetti,Male,,,324857
7,151007,Peter Ramsey,Male,,"Baldwin Hills, California, Stati Uniti",324857
29,567374,Carl Tibbetts,Male,,,374430
36,95456,Elio Petri,,1929-01-29,"Rome, Italy",26451
38,42274,Nick Hurran,Male,,"London, England, United Kingdom",313106
44,70235,Marco Tullio Giordana,,1950-10-01,"Milano, Italy",11659
45,32375,Mario Monicelli,,1915-05-15,"Viareggio, Lucca, Tuscany, Italy",20914
46,1624330,Benjamin Caron,Male,,,432517


In [11]:
# The scraped records use '-' where a fact is unknown; turn that placeholder
# into a real missing value so pandas' null handling can see it.
actorsDataFrame.replace(to_replace=['-'], value=[None], inplace=True)
# Show every actor row that lacks at least one field
actorsDataFrame[actorsDataFrame.isna().any(axis='columns')]

Unnamed: 0,ID,Name,Gender,Birthday,Place of Birth,Movie ID
13,3086,Richard S. Castellano,,1933-09-04,"The Bronx, New York, U.S",238
24,6693,Jonathan Sagall,,1959-04-23,"Toronto, Ontario, Canada",424
29,226366,Brian Tyree Henry,Male,,,324857
35,19587,Rumi Hiiragi,,1987-08-01,"Tokyo, Japan",129
39,19594,Yumi Tamai,,,,129
49,9238,Amerigo Fontani,,1955-06-15,"Florence, italy",637
85,77927,Megumi Ogata,,,,18491
87,77931,Kotono Mitsuishi,Female,1967-12-08,,18491
89,77934,Fumihiko Tachiki,,,,18491
112,11478,Lorraine Bracco,,1954-10-02,Bay Ridge - Brooklyn - New York City - New Yor...,769


##### Remove rows with missing data

In [12]:
# Discard, in place, every row in which any column is missing
directorsDataFrame.dropna(axis=0, how='any', inplace=True)
actorsDataFrame.dropna(axis=0, how='any', inplace=True)

In [13]:
# Summarise the cleaned director table: entry count, per-column non-null counts and dtypes
directorsDataFrame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 734 entries, 1 to 844
Data columns (total 6 columns):
ID                734 non-null object
Name              734 non-null object
Gender            734 non-null object
Birthday          734 non-null object
Place of Birth    734 non-null object
Movie ID          734 non-null int64
dtypes: int64(1), object(5)
memory usage: 40.1+ KB


In [14]:
# Summarise the cleaned actor table: entry count, per-column non-null counts and dtypes
actorsDataFrame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3243 entries, 0 to 3933
Data columns (total 6 columns):
ID                3243 non-null object
Name              3243 non-null object
Gender            3243 non-null object
Birthday          3243 non-null object
Place of Birth    3243 non-null object
Movie ID          3243 non-null int64
dtypes: int64(1), object(5)
memory usage: 177.4+ KB


#### Save processed data

In [16]:
# Write the cleaned tables (rows with missing values removed) without the row index
directorsDataFrame.to_csv(path_or_buf='data/Directors.csv', index=False, encoding='utf-8')
actorsDataFrame.to_csv(path_or_buf='data/Actors.csv', index=False, encoding='utf-8')