# Read actors File

In [10]:
import pandas as pd
from os.path import exists
import glob
# Load the actor/title table; the first row of the CSV supplies the column names.
# NOTE(review): this same path is written back by actors.to_csv(...) further
# down, so a later run from the top reads the already-filtered file.
actors = pd.read_csv('../files/actors.csv', header=0)

# Apply Filters

In [11]:
# Report the row count, renumber the index from 0, and preview the frame.
# No filter is active; to work on a subset while developing, slice the frame
# before the reset_index call (for example actors[0:1000]).
print(f"Actor Count before Filter: {len(actors)}")
actors = actors.reset_index(drop=True)
print(len(actors))
actors.head()

Actor Count before Filter: 28176
28176


Unnamed: 0,nconst,primaryName,tconst
0,nm0000004,John Belushi,tt0077975
1,nm0000261,Karen Allen,tt0077975
2,nm0001371,Tom Hulce,tt0077975
3,nm0299122,Stephen Furst,tt0077975
4,nm0000004,John Belushi,tt0078723


# Check Samples

# Get Unique Titles for Actors

In [12]:
# Distinct values of the 'tconst' column, as an ascending sorted list.
uniqTitlesList = sorted(set(actors['tconst']))
print(len(uniqTitlesList))

8905


# Scrape Related Functions

In [13]:
from bs4 import BeautifulSoup
from selenium import webdriver 
from selenium.webdriver.chrome.options import Options
import re 
import urllib
import time

In [14]:
# Create a Chrome webdriver object configured for headless browsing
# (no visible browser window).
options = Options()
# NOTE(review): the `headless` attribute is version-dependent in Selenium;
# presumably `options.add_argument('--headless')` is the portable form — confirm
# against the installed Selenium release.
options.headless = True
# The chromedriver executable is expected one directory above the notebook.
# NOTE(review): passing the driver path positionally relies on the older
# Selenium constructor signature — verify before upgrading Selenium.
driver = webdriver.Chrome('../chromedriver',options=options)


In [15]:
def get_js_soup(url,driver):
    """Open ``url`` in the webdriver and parse the resulting page body.

    The body markup is read back through a JavaScript call after navigation,
    which is how dynamically loaded content ends up in the parsed result.

    Args:
        url: Address of the page to open.
        driver: Webdriver object used to navigate and to run the script.

    Returns:
        A BeautifulSoup object built from ``document.body.innerHTML`` using
        the ``html.parser`` backend.
    """
    driver.get(url)
    rendered_body = driver.execute_script('return document.body.innerHTML')
    return BeautifulSoup(rendered_body,'html.parser')

def process_mov(mov):
    """Tidy text extracted from a page.

    Every non-ASCII character is dropped, then each run of whitespace
    (spaces, tabs, newlines) is replaced by one space. Leading and trailing
    whitespace is reduced to a single space rather than stripped.

    Args:
        mov: Raw text.

    Returns:
        The cleaned, ASCII-only text.
    """
    # Encoding with errors='ignore' silently discards anything outside ASCII.
    mov = mov.encode('ascii',errors='ignore').decode('utf-8')
    # Raw string: '\s' inside a plain literal is an invalid escape sequence,
    # which newer Python versions warn about.
    mov = re.sub(r'\s+',' ',mov)
    return mov

def remove_script(soup):
    """Remove script and style elements from a parsed page.

    Text extracted from an HTML page may otherwise contain JavaScript code
    and style rules. Every ``script`` and ``style`` element found in ``soup``
    is destroyed in place.

    Args:
        soup: Parsed page; calling it with a list of tag names must return
            the matching elements.

    Returns:
        The same ``soup`` object, after the elements have been removed.
    """
    unwanted = soup(["script", "style"])
    for element in unwanted:
        element.decompose()
    return soup


def is_valid_homepage(mov_url,dir_url):
    """Report whether ``mov_url`` leads to a page other than ``dir_url``.

    Args:
        mov_url: Address to check; it is opened and redirects are followed.
        dir_url: Address of the listing page that ``mov_url`` is compared to.

    Returns:
        False when ``mov_url`` ends with '.pdf', when it cannot be opened, or
        when the address it finally resolves to equals ``dir_url`` once
        'http://', 'https://' and 'www.' are removed from both. True otherwise.
    """
    # Imported here because a bare `import urllib` does not guarantee that the
    # `urllib.request` submodule is loaded.
    import urllib.request

    if mov_url.endswith('.pdf'): #we're not parsing pdfs
        return False
    try:
        # geturl() gives the address after redirects; a link may simply point
        # back at the listing page, which must not count as a separate page.
        # The context manager closes the connection once the address is known.
        with urllib.request.urlopen(mov_url) as response:
            ret_url = response.geturl()
    except Exception:
        # Best effort: any failure to open the address means "not usable".
        # Exception (not a bare except) lets KeyboardInterrupt propagate.
        return False
    # Strip the scheme and a literal 'www.' so cosmetic differences are ignored.
    urls = [re.sub(r'(https?://)|(www\.)','',url) for url in [ret_url,dir_url]]
    return urls[0] != urls[1]

def scrape_movie_page(mov_url,driver):
    """Fetch a page and return the cleaned text of its main content block.

    The page is loaded through ``get_js_soup`` and the first ``div`` with
    class ``article listo`` is looked up; its text (pieces joined by single
    spaces) is passed through ``process_mov``.

    Args:
        mov_url: Address of the page to scrape.
        driver: Webdriver object handed on to ``get_js_soup``.

    Returns:
        Tuple ``(mov_url, mov)``. ``mov`` is the cleaned text, or '' when the
        expected ``div`` is not on the page (a message is printed in that case).
    """
    soup = get_js_soup(mov_url,driver)
    profile_sec = soup.find('div',class_='article listo')
    if profile_sec is None:
        # Identity test: find() returns None when nothing matches.
        print("Skipping mov : ",mov_url)
        return mov_url,''
    return mov_url,process_mov(profile_sec.get_text(separator=' '))

In [16]:
def write_lst(lst,file_):
    """Remove known page boilerplate from ``lst`` and write it to ``file_``.

    Args:
        lst: Scraped page text (a single string).
        file_: Path of the text file to create or overwrite.

    Returns:
        None. The cleaned text is written to ``file_``.
    """
    # (text to find, replacement) pairs. They are applied strictly in this
    # order, each one on the result of the previous one.
    substitutions = (
        (' Plot Keywords Sort By: Relevance Alphabetical Showing all ',' '),
        (' found this relevant Relevant? Yes No ',' '),
        (' Is this relevant? Relevant? Yes No ',' '),
        (' 1 of 1 ',' '),
        (' 0 of 1 ',' '),
        (' 2 of 2 ',' '),
        (' 0 of 2 ',' '),
        (' 1 of 2 ',' '),
        (' 3 of 3 ',' '),
        (' 0 of 3 ',' '),
        (' 1 of 3 ',' '),
        (' 2 of 3 ',' '),
        (' Plot Showing all ',''),
        (' items Jump to: Summaries (',''),
        ('  ',' '),
        ('   ',' '),
        ('Edit ',''),
    )

    cleaned = lst
    for needle,replacement in substitutions:
        cleaned = cleaned.replace(needle,replacement)

    with open(file_,'w') as out:
        out.write(cleaned)

# Scrape Movie Tags and store in "movieFileTags" dir 

In [17]:
# Work out which titles still need scraping, based on the files already saved.
fileList = glob.glob("../movieFileTags/*.txt")

# Numeric part of every saved "tt<digits>.txt" file name. Matching on the end
# of the path (re is imported in the scraping-imports cell above) works with
# any path separator and skips .txt files that are not title dumps.
fileListVals = []
for path in fileList:
    name_match = re.search(r'tt(\d+)\.txt$', path)
    if name_match:
        fileListVals.append(int(name_match.group(1)))

# default=0 covers the first run, when nothing has been saved yet; calling
# max() on an empty list would raise ValueError.
last_scraped = max(fileListVals, default=0)
print(len(fileListVals),last_scraped)

# NOTE(review): resuming from the largest saved id means ids below it that
# produced no file are not retried — confirm this is intended.
# The original id strings are kept (ordered by numeric value) instead of being
# rebuilt from integers, so zero padding such as 'tt0077975' is preserved.
uniqTitles = sorted(
    (t for t in uniqTitlesList if int(t.replace('tt','')) > last_scraped),
    key=lambda t: int(t.replace('tt','')),
)
uniqTitlesInt = [int(t.replace('tt','')) for t in uniqTitles]

print('tt' + str(last_scraped))
print(len(uniqTitles))

7618 4591310
tt4591310
853


In [18]:
# Scrape the keywords page of every pending title and save its cleaned text
# as ../movieFileTags/<title id>.txt.
tot_urls = len(uniqTitles)
try:
    for i,l in enumerate(uniqTitles):
        link="https://www.imdb.com/title/"+l+"/keywords"
        print ('-'*20,'Scraping movie url {}/{}'.format(i+1,tot_urls),'-'*20)
        print(link)
        mov_url,mov = scrape_movie_page(link,driver)
        if len(mov) < 10:
            # Fewer than 10 characters (including the empty string returned
            # when the content block is missing): nothing worth saving.
            print("SKIPPING")
        elif mov.strip()!= '' and mov_url.strip()!='':
            mov_file = '../movieFileTags/'+l+'.txt'
            write_lst(mov,mov_file)
finally:
    # Runs even when the loop is interrupted or a page raises, so the browser
    # is not left behind. quit() ends the whole webdriver session, whereas
    # close() only closes the current window.
    driver.quit()

-------------------- Scraping movie url 1/853 --------------------
https://www.imdb.com/title/tt4595882/keywords
-------------------- Scraping movie url 2/853 --------------------
https://www.imdb.com/title/tt4600952/keywords
-------------------- Scraping movie url 3/853 --------------------
https://www.imdb.com/title/tt4621630/keywords
-------------------- Scraping movie url 4/853 --------------------
https://www.imdb.com/title/tt4622512/keywords
-------------------- Scraping movie url 5/853 --------------------
https://www.imdb.com/title/tt4624424/keywords
-------------------- Scraping movie url 6/853 --------------------
https://www.imdb.com/title/tt4629266/keywords
-------------------- Scraping movie url 7/853 --------------------
https://www.imdb.com/title/tt4630562/keywords
-------------------- Scraping movie url 8/853 --------------------
https://www.imdb.com/title/tt4633690/keywords
-------------------- Scraping movie url 9/853 --------------------
https://www.imdb.com/title/tt

-------------------- Scraping movie url 73/853 --------------------
https://www.imdb.com/title/tt4846340/keywords
-------------------- Scraping movie url 74/853 --------------------
https://www.imdb.com/title/tt4849438/keywords
-------------------- Scraping movie url 75/853 --------------------
https://www.imdb.com/title/tt4853102/keywords
-------------------- Scraping movie url 76/853 --------------------
https://www.imdb.com/title/tt4857264/keywords
-------------------- Scraping movie url 77/853 --------------------
https://www.imdb.com/title/tt4858674/keywords
-------------------- Scraping movie url 78/853 --------------------
https://www.imdb.com/title/tt4865436/keywords
-------------------- Scraping movie url 79/853 --------------------
https://www.imdb.com/title/tt4866448/keywords
-------------------- Scraping movie url 80/853 --------------------
https://www.imdb.com/title/tt4876134/keywords
-------------------- Scraping movie url 81/853 --------------------
https://www.imdb.com

-------------------- Scraping movie url 145/853 --------------------
https://www.imdb.com/title/tt5143226/keywords
-------------------- Scraping movie url 146/853 --------------------
https://www.imdb.com/title/tt5144174/keywords
-------------------- Scraping movie url 147/853 --------------------
https://www.imdb.com/title/tt5153288/keywords
-------------------- Scraping movie url 148/853 --------------------
https://www.imdb.com/title/tt5155780/keywords
-------------------- Scraping movie url 149/853 --------------------
https://www.imdb.com/title/tt5157052/keywords
-------------------- Scraping movie url 150/853 --------------------
https://www.imdb.com/title/tt5157682/keywords
-------------------- Scraping movie url 151/853 --------------------
https://www.imdb.com/title/tt5158522/keywords
-------------------- Scraping movie url 152/853 --------------------
https://www.imdb.com/title/tt5164214/keywords
-------------------- Scraping movie url 153/853 --------------------
https://www

-------------------- Scraping movie url 217/853 --------------------
https://www.imdb.com/title/tt5580266/keywords
-------------------- Scraping movie url 218/853 --------------------
https://www.imdb.com/title/tt5580390/keywords
-------------------- Scraping movie url 219/853 --------------------
https://www.imdb.com/title/tt5592248/keywords
-------------------- Scraping movie url 220/853 --------------------
https://www.imdb.com/title/tt5592878/keywords
-------------------- Scraping movie url 221/853 --------------------
https://www.imdb.com/title/tt5598292/keywords
-------------------- Scraping movie url 222/853 --------------------
https://www.imdb.com/title/tt5600714/keywords
-------------------- Scraping movie url 223/853 --------------------
https://www.imdb.com/title/tt5606664/keywords
-------------------- Scraping movie url 224/853 --------------------
https://www.imdb.com/title/tt5607096/keywords
-------------------- Scraping movie url 225/853 --------------------
https://www

-------------------- Scraping movie url 289/853 --------------------
https://www.imdb.com/title/tt5886440/keywords
-------------------- Scraping movie url 290/853 --------------------
https://www.imdb.com/title/tt5892746/keywords
-------------------- Scraping movie url 291/853 --------------------
https://www.imdb.com/title/tt5913798/keywords
-------------------- Scraping movie url 292/853 --------------------
https://www.imdb.com/title/tt5918982/keywords
-------------------- Scraping movie url 293/853 --------------------
https://www.imdb.com/title/tt5929750/keywords
-------------------- Scraping movie url 294/853 --------------------
https://www.imdb.com/title/tt5935704/keywords
-------------------- Scraping movie url 295/853 --------------------
https://www.imdb.com/title/tt5941692/keywords
-------------------- Scraping movie url 296/853 --------------------
https://www.imdb.com/title/tt5956100/keywords
-------------------- Scraping movie url 297/853 --------------------
https://www

-------------------- Scraping movie url 361/853 --------------------
https://www.imdb.com/title/tt6285944/keywords
-------------------- Scraping movie url 362/853 --------------------
https://www.imdb.com/title/tt6288124/keywords
-------------------- Scraping movie url 363/853 --------------------
https://www.imdb.com/title/tt6288250/keywords
-------------------- Scraping movie url 364/853 --------------------
https://www.imdb.com/title/tt6288694/keywords
-------------------- Scraping movie url 365/853 --------------------
https://www.imdb.com/title/tt6292852/keywords
-------------------- Scraping movie url 366/853 --------------------
https://www.imdb.com/title/tt6294822/keywords
-------------------- Scraping movie url 367/853 --------------------
https://www.imdb.com/title/tt6294892/keywords
-------------------- Scraping movie url 368/853 --------------------
https://www.imdb.com/title/tt6296236/keywords
-------------------- Scraping movie url 369/853 --------------------
https://www

-------------------- Scraping movie url 433/853 --------------------
https://www.imdb.com/title/tt6735740/keywords
-------------------- Scraping movie url 434/853 --------------------
https://www.imdb.com/title/tt6738136/keywords
-------------------- Scraping movie url 435/853 --------------------
https://www.imdb.com/title/tt6742252/keywords
-------------------- Scraping movie url 436/853 --------------------
https://www.imdb.com/title/tt6749318/keywords
-------------------- Scraping movie url 437/853 --------------------
https://www.imdb.com/title/tt6751668/keywords
-------------------- Scraping movie url 438/853 --------------------
https://www.imdb.com/title/tt6769280/keywords
-------------------- Scraping movie url 439/853 --------------------
https://www.imdb.com/title/tt6772802/keywords
-------------------- Scraping movie url 440/853 --------------------
https://www.imdb.com/title/tt6772950/keywords
-------------------- Scraping movie url 441/853 --------------------
https://www

-------------------- Scraping movie url 505/853 --------------------
https://www.imdb.com/title/tt7126948/keywords
-------------------- Scraping movie url 506/853 --------------------
https://www.imdb.com/title/tt7131622/keywords
-------------------- Scraping movie url 507/853 --------------------
https://www.imdb.com/title/tt7131870/keywords
-------------------- Scraping movie url 508/853 --------------------
https://www.imdb.com/title/tt7134096/keywords
-------------------- Scraping movie url 509/853 --------------------
https://www.imdb.com/title/tt7137380/keywords
-------------------- Scraping movie url 510/853 --------------------
https://www.imdb.com/title/tt7137846/keywords
-------------------- Scraping movie url 511/853 --------------------
https://www.imdb.com/title/tt7146812/keywords
-------------------- Scraping movie url 512/853 --------------------
https://www.imdb.com/title/tt7153766/keywords
-------------------- Scraping movie url 513/853 --------------------
https://www

-------------------- Scraping movie url 577/853 --------------------
https://www.imdb.com/title/tt7668870/keywords
-------------------- Scraping movie url 578/853 --------------------
https://www.imdb.com/title/tt7689906/keywords
-------------------- Scraping movie url 579/853 --------------------
https://www.imdb.com/title/tt7690638/keywords
-------------------- Scraping movie url 580/853 --------------------
https://www.imdb.com/title/tt7690670/keywords
-------------------- Scraping movie url 581/853 --------------------
https://www.imdb.com/title/tt7691572/keywords
-------------------- Scraping movie url 582/853 --------------------
https://www.imdb.com/title/tt7698468/keywords
-------------------- Scraping movie url 583/853 --------------------
https://www.imdb.com/title/tt7703924/keywords
-------------------- Scraping movie url 584/853 --------------------
https://www.imdb.com/title/tt7713068/keywords
-------------------- Scraping movie url 585/853 --------------------
https://www

-------------------- Scraping movie url 649/853 --------------------
https://www.imdb.com/title/tt8332666/keywords
-------------------- Scraping movie url 650/853 --------------------
https://www.imdb.com/title/tt8332802/keywords
-------------------- Scraping movie url 651/853 --------------------
https://www.imdb.com/title/tt8332922/keywords
-------------------- Scraping movie url 652/853 --------------------
https://www.imdb.com/title/tt8333746/keywords
-------------------- Scraping movie url 653/853 --------------------
https://www.imdb.com/title/tt8350360/keywords
-------------------- Scraping movie url 654/853 --------------------
https://www.imdb.com/title/tt8361028/keywords
-------------------- Scraping movie url 655/853 --------------------
https://www.imdb.com/title/tt8364368/keywords
-------------------- Scraping movie url 656/853 --------------------
https://www.imdb.com/title/tt8366502/keywords
-------------------- Scraping movie url 657/853 --------------------
https://www

-------------------- Scraping movie url 721/853 --------------------
https://www.imdb.com/title/tt9214832/keywords
-------------------- Scraping movie url 722/853 --------------------
https://www.imdb.com/title/tt9243804/keywords
-------------------- Scraping movie url 723/853 --------------------
https://www.imdb.com/title/tt9243946/keywords
-------------------- Scraping movie url 724/853 --------------------
https://www.imdb.com/title/tt9264728/keywords
-------------------- Scraping movie url 725/853 --------------------
https://www.imdb.com/title/tt9308382/keywords
-------------------- Scraping movie url 726/853 --------------------
https://www.imdb.com/title/tt9340860/keywords
-------------------- Scraping movie url 727/853 --------------------
https://www.imdb.com/title/tt9347730/keywords
-------------------- Scraping movie url 728/853 --------------------
https://www.imdb.com/title/tt9354842/keywords
-------------------- Scraping movie url 729/853 --------------------
https://www

-------------------- Scraping movie url 792/853 --------------------
https://www.imdb.com/title/tt10481868/keywords
-------------------- Scraping movie url 793/853 --------------------
https://www.imdb.com/title/tt10514222/keywords
-------------------- Scraping movie url 794/853 --------------------
https://www.imdb.com/title/tt10530176/keywords
-------------------- Scraping movie url 795/853 --------------------
https://www.imdb.com/title/tt10539608/keywords
-------------------- Scraping movie url 796/853 --------------------
https://www.imdb.com/title/tt10589914/keywords
-------------------- Scraping movie url 797/853 --------------------
https://www.imdb.com/title/tt10618286/keywords
-------------------- Scraping movie url 798/853 --------------------
https://www.imdb.com/title/tt10633456/keywords
-------------------- Scraping movie url 799/853 --------------------
https://www.imdb.com/title/tt10665338/keywords
-------------------- Scraping movie url 800/853 --------------------
htt

In [19]:
# Build a one-column frame holding the title id of every saved text file.
fileList = glob.glob("../movieFileTags/*.txt")

# File name without directory prefix and extension, e.g. 'tt0250687'.
activeMovs = [
    path.replace('../movieFileTags/','').replace('.txt','')
    for path in fileList
]

# Naming the column in the constructor also works when no file exists yet;
# assigning .columns afterwards fails on an empty frame (0 columns, 1 name).
df_act_movs = pd.DataFrame(activeMovs, columns=['tconst'])
df_act_movs.head()

Unnamed: 0,tconst
0,tt0250687
1,tt0405393
2,tt2318092
3,tt0118647
4,tt0101316


In [20]:
# Keep only the actor rows whose title has a saved text file: inner join on
# the columns the two frames share. Row counts are printed before and after.
print(len(actors))
actors = pd.merge(actors, df_act_movs, how='inner')
print(len(actors))

28176
26755


# Prepare actors list for scraped movies

In [21]:
# Save the filtered actor table (header row written, index column omitted)
# and preview it. This replaces the file the notebook loaded at the top.
print(len(actors))
actors.to_csv('../files/actors.csv', index=False)
actors.head()

26755


Unnamed: 0,nconst,primaryName,tconst
0,nm0000004,John Belushi,tt0077975
1,nm0000261,Karen Allen,tt0077975
2,nm0001371,Tom Hulce,tt0077975
3,nm0299122,Stephen Furst,tt0077975
4,nm0000004,John Belushi,tt0078723


In [22]:
# Number of rows in df_act_movs, i.e. one per text file found in ../movieFileTags.
print(len(df_act_movs))

8471
