# Read actors File

In [1]:
import pandas as pd
from os.path import exists
import glob
# Load the raw actor/title table; header=0 takes the column names from the first row of the CSV.
actors = pd.read_csv('../files/actorsOrig.csv', header=0)

# Apply Filters

In [2]:
# Report the row count of the table as loaded.
print("Actor Count before Filter:",len(actors))
# Renumber the index from 0 and discard the old one.
# NOTE(review): no row filter is applied in this cell, so both printed counts are equal.
actors = actors.reset_index(drop=True)
print(len(actors))
# Preview the first rows.
actors.head()

Actor Count before Filter: 57041
57041


Unnamed: 0,nconst,primaryName,tconst
0,nm0000004,John Belushi,tt0077621
1,nm0005460,Mary Steenburgen,tt0077621
2,nm0000004,John Belushi,tt0077975
3,nm0000261,Karen Allen,tt0077975
4,nm0001371,Tom Hulce,tt0077975


# Check Samples

# Get Unique Titles for Actors

In [3]:
# Distinct title ids referenced by the actors table, in ascending string order.
uniqTitlesList = sorted(set(actors['tconst']))
print(len(uniqTitlesList))

18657


# Scrape Related Functions

In [4]:
from bs4 import BeautifulSoup
from selenium import webdriver 
from selenium.webdriver.chrome.options import Options
import re 
import urllib
import time

In [5]:
# Create a webdriver object configured for headless browsing.
options = Options()
# Pass the Chrome flag directly: the `options.headless` property was deprecated
# and later removed from Selenium, while add_argument works on old and new releases.
options.add_argument('--headless')
# NOTE(review): passing the chromedriver path positionally is only accepted by
# older Selenium releases; newer ones expect a Service object - confirm the installed version.
driver = webdriver.Chrome('../chromedriver',options=options)

In [6]:
def get_js_soup(url,driver):
    """Load `url` in the given webdriver and parse the resulting page body.

    The body HTML is read back through a JavaScript call after the page has
    been opened, so the parsed markup is what the browser currently holds
    rather than the raw HTTP response.

    Returns a BeautifulSoup object built with the 'html.parser' backend.
    """
    driver.get(url)
    rendered_body = driver.execute_script('return document.body.innerHTML')
    return BeautifulSoup(rendered_body, 'html.parser')

def process_mov(mov):
    """Tidy text extracted from a page.

    Drops every non-ASCII character and collapses each run of whitespace
    (spaces, tabs, newlines) into a single space. Leading and trailing
    whitespace is collapsed to one space, not removed.

    Returns the cleaned string.
    """
    mov = mov.encode('ascii',errors='ignore').decode('utf-8')       #removes non-ascii characters
    # Raw string: '\s' inside a plain literal is an invalid escape sequence that
    # triggers a warning on current Python versions.
    mov = re.sub(r'\s+',' ',mov)       #replaces repeated whitespace characters with single space
    return mov

def remove_script(soup):
    """Strip <script> and <style> elements from a parsed page.

    Text extracted from an HTML page can otherwise contain JavaScript code and
    style rules. Every matching element is removed from `soup` in place, and
    the same `soup` object is returned.
    """
    for tag in soup(["script", "style"]):
        tag.decompose()
    return soup


def is_valid_homepage(mov_url,dir_url):
    """Return True when `mov_url` resolves to a page different from `dir_url`.

    Returns False for links ending in '.pdf', for URLs that cannot be opened,
    and when the resolved (post-redirect) URL equals `dir_url` once a leading
    scheme and a leading 'www.' are ignored.
    """
    # Import the submodule explicitly: a bare `import urllib` does not guarantee
    # that `urllib.request` is loaded, and the resulting AttributeError would be
    # swallowed by the handler below, making every URL look invalid.
    import urllib.request

    if mov_url.endswith('.pdf'): #we're not parsing pdfs
        return False
    try:
        # Follow redirects to learn where the link really points; the context
        # manager closes the response instead of leaving it open.
        with urllib.request.urlopen(mov_url) as response:
            ret_url = response.geturl()
    except Exception:
        # Best effort: any failure to open the URL means it is not usable.
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        return False
    # Strip only a leading scheme and/or 'www.'; the dot is escaped and the
    # pattern anchored so text later in the URL is never altered.
    urls = [re.sub(r'^(https?://)?(www\.)?','',url) for url in [ret_url,dir_url]]
    return urls[0] != urls[1]

def scrape_movie_page(mov_url,driver):
    """Fetch a plot-summary page and return its cleaned text.

    Returns a `(mov_url, mov)` tuple. `mov` is the tidied text of the page's
    <section class="article listo"> element, or '' when no such element is
    found (a "Skipping" message is printed in that case).
    """
    soup = get_js_soup(mov_url,driver)
    profile_sec = soup.find('section',class_='article listo')
    if profile_sec is None:   # identity test is the idiomatic None check
        mov = ''
        print("Skipping mov : ",mov_url)
    else:
        mov = process_mov(profile_sec.get_text(separator=' '))
    return mov_url,mov

In [7]:
def write_lst(lst,file_):
    """Strip plot-page header boilerplate from the text `lst` and write it to `file_`.

    The fragments below are removed one after another, in the listed order;
    each removal deletes every occurrence of that fragment. The file is opened
    in text write mode, so an existing file is overwritten.
    """
    # NOTE(review): only the 0-3 item headers are removed whole. For other
    # counts the generic fragments strip the words but leave the digits and a
    # closing parenthesis in the saved text.
    boilerplate = (
        ' Plot Showing all 0 items Jump to: ',
        ' Plot Showing all 1 items Jump to: Summaries (1) ',
        ' Plot Showing all 2 items Jump to: Summaries (2) ',
        ' Plot Showing all 3 items Jump to: Summaries (3) ',
        ' Plot Showing all ',
        ' items Jump to: Summaries (',
        ') Synopsis (',
        ' Synopsis',
        ' Summaries',
        'Edit ',
    )
    for fragment in boilerplate:
        lst = lst.replace(fragment, '')

    with open(file_,'w') as out:
        out.write(lst)

# Scrape Movie plot summary and store in "movieFile" dir 

- Skip already processed files in case of a Restart

In [8]:
# Resume support: every title whose numeric id is not greater than the largest
# id already saved in ../movieFile is treated as processed and skipped.
fileList = glob.glob("../movieFile/*.txt")
fileListVals = [
    int(path.replace('../movieFile/','').replace('.txt','').replace('tt',''))
    for path in fileList
]

# max() raises ValueError on an empty list, so fall back to 0 on a first run
# (no files saved yet) instead of crashing before the scrape can start.
last_scraped = max(fileListVals, default=0)
print(len(fileListVals),last_scraped)

# Keep the original tconst strings rather than rebuilding them from integers:
# 'tt' + str(77621) would drop the zero padding of 'tt0077621', which breaks the
# page URL, the saved file name and the later merge on tconst.
uniqTitles = sorted(
    (t for t in uniqTitlesList if int(t.replace('tt','')) > last_scraped),
    key=lambda t: int(t.replace('tt','')),
)
# Numeric ids of the remaining titles, in the same order as uniqTitles.
uniqTitlesInt = [int(t.replace('tt','')) for t in uniqTitles]

print('tt' + str(last_scraped))
print(len(uniqTitles))


8813 9916362
tt9916362
490


In [9]:
# Scrape the plot-summary page of every remaining title and save the long ones.
# Pages whose cleaned text is shorter than this many characters are not saved.
# The length is measured before write_lst strips the header boilerplate.
MIN_PLOT_CHARS = 2000
tot_urls = len(uniqTitles)
try:
    for i,title_id in enumerate(uniqTitles):
        link="https://www.imdb.com/title/"+title_id+"/plotsummary"
        print ('-'*20,'Scraping movie url {}/{}'.format(i+1,tot_urls),'-'*20)
        print(link)
        mov_url,mov = scrape_movie_page(link,driver)
        if len(mov) < MIN_PLOT_CHARS:
            print("SKIPPING")
        elif mov.strip()!= '' and mov_url.strip()!='':
            mov_file = '../movieFile/'+title_id+'.txt'
            write_lst(mov,mov_file)
finally:
    # quit() ends the whole browser session and its driver process, whereas
    # close() only closes the current window. Running it in `finally` releases
    # the browser even when a page load raises part-way through the loop.
    driver.quit()

-------------------- Scraping movie url 1/490 --------------------
https://www.imdb.com/title/tt10003008/plotsummary
-------------------- Scraping movie url 2/490 --------------------
https://www.imdb.com/title/tt10006006/plotsummary
SKIPPING
-------------------- Scraping movie url 3/490 --------------------
https://www.imdb.com/title/tt10008784/plotsummary
SKIPPING
-------------------- Scraping movie url 4/490 --------------------
https://www.imdb.com/title/tt10009030/plotsummary
SKIPPING
-------------------- Scraping movie url 5/490 --------------------
https://www.imdb.com/title/tt10016180/plotsummary
-------------------- Scraping movie url 6/490 --------------------
https://www.imdb.com/title/tt10023024/plotsummary
SKIPPING
-------------------- Scraping movie url 7/490 --------------------
https://www.imdb.com/title/tt10037014/plotsummary
SKIPPING
-------------------- Scraping movie url 8/490 --------------------
https://www.imdb.com/title/tt10039344/plotsummary
-------------------

-------------------- Scraping movie url 67/490 --------------------
https://www.imdb.com/title/tt10272534/plotsummary
SKIPPING
-------------------- Scraping movie url 68/490 --------------------
https://www.imdb.com/title/tt10274284/plotsummary
-------------------- Scraping movie url 69/490 --------------------
https://www.imdb.com/title/tt10276470/plotsummary
SKIPPING
-------------------- Scraping movie url 70/490 --------------------
https://www.imdb.com/title/tt10279362/plotsummary
SKIPPING
-------------------- Scraping movie url 71/490 --------------------
https://www.imdb.com/title/tt10280276/plotsummary
SKIPPING
-------------------- Scraping movie url 72/490 --------------------
https://www.imdb.com/title/tt10280296/plotsummary
SKIPPING
-------------------- Scraping movie url 73/490 --------------------
https://www.imdb.com/title/tt10283902/plotsummary
SKIPPING
-------------------- Scraping movie url 74/490 --------------------
https://www.imdb.com/title/tt10287954/plotsummary
SK

SKIPPING
-------------------- Scraping movie url 133/490 --------------------
https://www.imdb.com/title/tt10483386/plotsummary
SKIPPING
-------------------- Scraping movie url 134/490 --------------------
https://www.imdb.com/title/tt10492998/plotsummary
SKIPPING
-------------------- Scraping movie url 135/490 --------------------
https://www.imdb.com/title/tt10508838/plotsummary
SKIPPING
-------------------- Scraping movie url 136/490 --------------------
https://www.imdb.com/title/tt10514222/plotsummary
-------------------- Scraping movie url 137/490 --------------------
https://www.imdb.com/title/tt10515086/plotsummary
SKIPPING
-------------------- Scraping movie url 138/490 --------------------
https://www.imdb.com/title/tt10515526/plotsummary
SKIPPING
-------------------- Scraping movie url 139/490 --------------------
https://www.imdb.com/title/tt10515988/plotsummary
SKIPPING
-------------------- Scraping movie url 140/490 --------------------
https://www.imdb.com/title/tt105210

SKIPPING
-------------------- Scraping movie url 198/490 --------------------
https://www.imdb.com/title/tt10805432/plotsummary
-------------------- Scraping movie url 199/490 --------------------
https://www.imdb.com/title/tt10808832/plotsummary
SKIPPING
-------------------- Scraping movie url 200/490 --------------------
https://www.imdb.com/title/tt10816484/plotsummary
SKIPPING
-------------------- Scraping movie url 201/490 --------------------
https://www.imdb.com/title/tt10831086/plotsummary
-------------------- Scraping movie url 202/490 --------------------
https://www.imdb.com/title/tt10843306/plotsummary
SKIPPING
-------------------- Scraping movie url 203/490 --------------------
https://www.imdb.com/title/tt10883302/plotsummary
SKIPPING
-------------------- Scraping movie url 204/490 --------------------
https://www.imdb.com/title/tt10883506/plotsummary
SKIPPING
-------------------- Scraping movie url 205/490 --------------------
https://www.imdb.com/title/tt10885444/plotsu

-------------------- Scraping movie url 264/490 --------------------
https://www.imdb.com/title/tt11127256/plotsummary
SKIPPING
-------------------- Scraping movie url 265/490 --------------------
https://www.imdb.com/title/tt11127690/plotsummary
SKIPPING
-------------------- Scraping movie url 266/490 --------------------
https://www.imdb.com/title/tt11140488/plotsummary
SKIPPING
-------------------- Scraping movie url 267/490 --------------------
https://www.imdb.com/title/tt11142762/plotsummary
SKIPPING
-------------------- Scraping movie url 268/490 --------------------
https://www.imdb.com/title/tt11161374/plotsummary
SKIPPING
-------------------- Scraping movie url 269/490 --------------------
https://www.imdb.com/title/tt11161474/plotsummary
-------------------- Scraping movie url 270/490 --------------------
https://www.imdb.com/title/tt11169050/plotsummary
SKIPPING
-------------------- Scraping movie url 271/490 --------------------
https://www.imdb.com/title/tt11176322/plotsu

SKIPPING
-------------------- Scraping movie url 329/490 --------------------
https://www.imdb.com/title/tt11576124/plotsummary
SKIPPING
-------------------- Scraping movie url 330/490 --------------------
https://www.imdb.com/title/tt11580854/plotsummary
SKIPPING
-------------------- Scraping movie url 331/490 --------------------
https://www.imdb.com/title/tt11581174/plotsummary
SKIPPING
-------------------- Scraping movie url 332/490 --------------------
https://www.imdb.com/title/tt11591306/plotsummary
SKIPPING
-------------------- Scraping movie url 333/490 --------------------
https://www.imdb.com/title/tt11591424/plotsummary
SKIPPING
-------------------- Scraping movie url 334/490 --------------------
https://www.imdb.com/title/tt11604676/plotsummary
SKIPPING
-------------------- Scraping movie url 335/490 --------------------
https://www.imdb.com/title/tt11614912/plotsummary
SKIPPING
-------------------- Scraping movie url 336/490 --------------------
https://www.imdb.com/title

SKIPPING
-------------------- Scraping movie url 394/490 --------------------
https://www.imdb.com/title/tt12676326/plotsummary
-------------------- Scraping movie url 395/490 --------------------
https://www.imdb.com/title/tt12677092/plotsummary
-------------------- Scraping movie url 396/490 --------------------
https://www.imdb.com/title/tt12680508/plotsummary
SKIPPING
-------------------- Scraping movie url 397/490 --------------------
https://www.imdb.com/title/tt12687276/plotsummary
SKIPPING
-------------------- Scraping movie url 398/490 --------------------
https://www.imdb.com/title/tt12731980/plotsummary
-------------------- Scraping movie url 399/490 --------------------
https://www.imdb.com/title/tt12735338/plotsummary
SKIPPING
-------------------- Scraping movie url 400/490 --------------------
https://www.imdb.com/title/tt12758600/plotsummary
SKIPPING
-------------------- Scraping movie url 401/490 --------------------
https://www.imdb.com/title/tt12763920/plotsummary
---

SKIPPING
-------------------- Scraping movie url 459/490 --------------------
https://www.imdb.com/title/tt13598976/plotsummary
SKIPPING
-------------------- Scraping movie url 460/490 --------------------
https://www.imdb.com/title/tt13698928/plotsummary
SKIPPING
-------------------- Scraping movie url 461/490 --------------------
https://www.imdb.com/title/tt13717980/plotsummary
SKIPPING
-------------------- Scraping movie url 462/490 --------------------
https://www.imdb.com/title/tt13723064/plotsummary
SKIPPING
-------------------- Scraping movie url 463/490 --------------------
https://www.imdb.com/title/tt13804084/plotsummary
SKIPPING
-------------------- Scraping movie url 464/490 --------------------
https://www.imdb.com/title/tt13834006/plotsummary
SKIPPING
-------------------- Scraping movie url 465/490 --------------------
https://www.imdb.com/title/tt13846542/plotsummary
SKIPPING
-------------------- Scraping movie url 466/490 --------------------
https://www.imdb.com/title

In [10]:
# Titles that have a saved plot file: the file name without directory and
# extension is the title id.
fileList = glob.glob("../movieFile/*.txt")
activeMovs = [path.replace('../movieFile/','').replace('.txt','') for path in fileList]

# Name the column at construction time. Building the frame from a bare list and
# assigning `.columns` afterwards raises a length-mismatch ValueError when the
# list is empty, because the frame then has no columns to rename.
df_act_movs = pd.DataFrame({'tconst': activeMovs})
df_act_movs.head()

Unnamed: 0,tconst
0,tt0250687
1,tt0405393
2,tt2318092
3,tt0118647
4,tt0101316


In [11]:
# Keep only the actor rows whose title has a saved plot file (inner join on the
# columns the two frames share); print the row count before and after.
print(len(actors))
actors = pd.merge(actors, df_act_movs, how='inner')
print(len(actors))

57041
28176


# Store actors list with the scraped movies

In [12]:
# Row count of the table about to be saved.
print(len(actors))
# Write the table with a header row and without the DataFrame index column.
actors.to_csv('../files/actors.csv',header=True,index=False)
# Preview the first rows.
actors.head()

28176


Unnamed: 0,nconst,primaryName,tconst
0,nm0000004,John Belushi,tt0077975
1,nm0000261,Karen Allen,tt0077975
2,nm0001371,Tom Hulce,tt0077975
3,nm0299122,Stephen Furst,tt0077975
4,nm0000004,John Belushi,tt0078723
