# Download name.basics.tsv ad Read

In [1]:
import pandas as pd
actors = pd.read_csv('/Users/apillai2/Downloads/name.basics.tsv', sep='\t', header=0)

In [2]:
actors.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0031983,tt0072308,tt0050419,tt0053137"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0037382,tt0117057,tt0038355,tt0071877"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0057345,tt0056404,tt0054452,tt0049189"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0072562,tt0080455,tt0078723,tt0077975"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050976,tt0060827,tt0083922,tt0050986"


In [3]:
len(actors)

11359935

# Apply Filters

In [4]:
actors = actors[(actors.birthYear > '1960') & (actors.primaryProfession.str.contains('act'))]
actors['titleCount'] = actors['knownForTitles'].str.count(',')+1
actors = actors[(actors.titleCount > 3)]
print(len(actors))
actors = actors[0:100]
actors = actors.reset_index(drop=True)
print(len(actors))
actors.head()

771701
100


Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles,titleCount
0,nm0000084,Gong Li,1965,\N,actress,"tt0101640,tt0430357,tt0397535,tt0473444",4
1,nm0000093,Brad Pitt,1963,\N,"actor,producer,soundtrack","tt0114746,tt1210166,tt0356910,tt2935510",4
2,nm0000096,Gillian Anderson,1968,\N,"actress,producer,soundtrack","tt0106179,tt2294189,tt0455590,tt0442632",4
3,nm0000097,Pamela Anderson,1967,\N,"actress,producer,director","tt0115624,tt0267913,tt0306047,tt0426592",4
4,nm0000098,Jennifer Aniston,1969,\N,"actress,producer,soundtrack","tt1723121,tt0279113,tt0108778,tt3442006",4


# Check Samples

In [5]:
actors[actors.primaryName.str.contains('Brad Pitt')]

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles,titleCount
1,nm0000093,Brad Pitt,1963,\N,"actor,producer,soundtrack","tt0114746,tt1210166,tt0356910,tt2935510",4


In [6]:
actors[actors.primaryName.str.contains('Chris Nolan')]

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles,titleCount


# Get Unique Titles for Actors

In [7]:
allTitles = ['tt0118571']
for i in actors['knownForTitles'].tolist():
    allTitles=allTitles+i.split(',')

uniqTitles = list(set(allTitles))
len(uniqTitles)

362

# Scrape Related Fuctions

In [8]:
from bs4 import BeautifulSoup
from selenium import webdriver 
from selenium.webdriver.chrome.options import Options
import re 
import urllib
import time

In [9]:
#create a webdriver object and set options for headless browsing
options = Options()
options.headless = True
driver = webdriver.Chrome('./chromedriver',options=options)
#driver = webdriver.Chrome("/usr/local/bin/chromedriver",options=options)

In [10]:
#uses webdriver object to execute javascript code and get dynamically loaded webcontent
def get_js_soup(url,driver):
    driver.get(url)
    res_html = driver.execute_script('return document.body.innerHTML')
    soup = BeautifulSoup(res_html,'html.parser') #beautiful soup object to be used for parsing html content
    return soup

#tidies extracted text 
def process_bio(bio):
    bio = bio.encode('ascii',errors='ignore').decode('utf-8')       #removes non-ascii characters
    bio = re.sub('\s+',' ',bio)       #repalces repeated whitespace characters with single space
    return bio

''' More tidying
Sometimes the text extracted HTML webpage may contain javascript code and some style elements. 
This function removes script and style tags from HTML so that extracted text does not contain them.
'''
def remove_script(soup):
    for script in soup(["script", "style"]):
        script.decompose()
    return soup


#Checks if bio_url is a valid faculty homepage
def is_valid_homepage(bio_url,dir_url):
    if bio_url.endswith('.pdf'): #we're not parsing pdfs
        return False
    try:
        #sometimes the homepage url points to the same page as the faculty profile page
        #which should be treated differently from an actual homepage
        ret_url = urllib.request.urlopen(bio_url).geturl() 
    except:
        return False       #unable to access bio_url
    urls = [re.sub('((https?://)|(www.))','',url) for url in [ret_url,dir_url]] #removes url scheme (https,http or www) 
    return not(urls[0]== urls[1])

def scrape_movie_page(fac_url,driver):
    soup = get_js_soup(fac_url,driver)
    homepage_found = False
    bio_url = fac_url
    #profile_sec = soup.find('section',class_='main-content')
    profile_sec = soup.find('section',class_='article listo')
    if profile_sec == None:
        bio = ''
        print("Skipping bio : ",bio_url)
    else:
        bio = process_bio(profile_sec.get_text(separator=' '))
    #print("bio is ",bio_url)
    return bio_url,bio

In [11]:
def write_lst(lst,file_):
    lst = lst.replace(' Plot Showing all 0 items Jump to: ','')
    lst = lst.replace(' Plot Showing all 1 items Jump to: Summaries (1) ','')
    lst = lst.replace(' Plot Showing all 2 items Jump to: Summaries (2) ','')
    lst = lst.replace(' Plot Showing all 3 items Jump to: Summaries (3) ','')
    lst = lst.replace(' Plot Showing all ','').replace(' items Jump to: Summaries (','')
    lst = lst.replace(') Synopsis (','').replace(' Synopsis','').replace(' Summaries','')
    #lst = lst[6:]
    lst = lst.replace('Edit ','')
    #lst = lst.replace(' Summaries ','\n\nSummaries\n')
    #lst = lst.replace(' Synopsis ','\n\nSynopsis\n')

    with open(file_,'w') as f:
        f.write(lst)

# Scrape Movie plot summary and store in "movieFile" dir 

In [12]:
#Scrape homepages of all urls
tot_urls = len(uniqTitles)
for i,l in enumerate(uniqTitles):
    link="https://www.imdb.com/title/"+l+"/plotsummary"
    print ('-'*20,'Scraping movie url {}/{}'.format(i+1,tot_urls),'-'*20)
    print(link)
    bio_urls, bios = [],[]
    bio_url,bio = scrape_movie_page(link,driver)
    if bio.strip()!= '' and bio_url.strip()!='':  
        bio_file = 'movieFile/'+l+'.txt'
        write_lst(bio,bio_file)
driver.close()

-------------------- Scraping movie url 1/362 --------------------
https://www.imdb.com/title/tt0100405/plotsummary
-------------------- Scraping movie url 2/362 --------------------
https://www.imdb.com/title/tt0095169/plotsummary
-------------------- Scraping movie url 3/362 --------------------
https://www.imdb.com/title/tt0134119/plotsummary
-------------------- Scraping movie url 4/362 --------------------
https://www.imdb.com/title/tt0096018/plotsummary
-------------------- Scraping movie url 5/362 --------------------
https://www.imdb.com/title/tt0250797/plotsummary
-------------------- Scraping movie url 6/362 --------------------
https://www.imdb.com/title/tt0087538/plotsummary
-------------------- Scraping movie url 7/362 --------------------
https://www.imdb.com/title/tt1024648/plotsummary
-------------------- Scraping movie url 8/362 --------------------
https://www.imdb.com/title/tt0091790/plotsummary
-------------------- Scraping movie url 9/362 --------------------
https

-------------------- Scraping movie url 72/362 --------------------
https://www.imdb.com/title/tt0111438/plotsummary
-------------------- Scraping movie url 73/362 --------------------
https://www.imdb.com/title/tt0083929/plotsummary
-------------------- Scraping movie url 74/362 --------------------
https://www.imdb.com/title/tt0132347/plotsummary
-------------------- Scraping movie url 75/362 --------------------
https://www.imdb.com/title/tt0119654/plotsummary
-------------------- Scraping movie url 76/362 --------------------
https://www.imdb.com/title/tt0947798/plotsummary
-------------------- Scraping movie url 77/362 --------------------
https://www.imdb.com/title/tt2935510/plotsummary
-------------------- Scraping movie url 78/362 --------------------
https://www.imdb.com/title/tt0189142/plotsummary
-------------------- Scraping movie url 79/362 --------------------
https://www.imdb.com/title/tt0119141/plotsummary
-------------------- Scraping movie url 80/362 -----------------

-------------------- Scraping movie url 142/362 --------------------
https://www.imdb.com/title/tt0093779/plotsummary
-------------------- Scraping movie url 143/362 --------------------
https://www.imdb.com/title/tt0111257/plotsummary
-------------------- Scraping movie url 144/362 --------------------
https://www.imdb.com/title/tt2209418/plotsummary
-------------------- Scraping movie url 145/362 --------------------
https://www.imdb.com/title/tt0098936/plotsummary
-------------------- Scraping movie url 146/362 --------------------
https://www.imdb.com/title/tt0180093/plotsummary
-------------------- Scraping movie url 147/362 --------------------
https://www.imdb.com/title/tt0810788/plotsummary
-------------------- Scraping movie url 148/362 --------------------
https://www.imdb.com/title/tt0124298/plotsummary
-------------------- Scraping movie url 149/362 --------------------
https://www.imdb.com/title/tt0131857/plotsummary
-------------------- Scraping movie url 150/362 --------

-------------------- Scraping movie url 212/362 --------------------
https://www.imdb.com/title/tt0098258/plotsummary
-------------------- Scraping movie url 213/362 --------------------
https://www.imdb.com/title/tt0343660/plotsummary
-------------------- Scraping movie url 214/362 --------------------
https://www.imdb.com/title/tt0267913/plotsummary
-------------------- Scraping movie url 215/362 --------------------
https://www.imdb.com/title/tt0162222/plotsummary
-------------------- Scraping movie url 216/362 --------------------
https://www.imdb.com/title/tt0086827/plotsummary
-------------------- Scraping movie url 217/362 --------------------
https://www.imdb.com/title/tt0110912/plotsummary
-------------------- Scraping movie url 218/362 --------------------
https://www.imdb.com/title/tt0315327/plotsummary
-------------------- Scraping movie url 219/362 --------------------
https://www.imdb.com/title/tt0119229/plotsummary
-------------------- Scraping movie url 220/362 --------

-------------------- Scraping movie url 282/362 --------------------
https://www.imdb.com/title/tt0449088/plotsummary
-------------------- Scraping movie url 283/362 --------------------
https://www.imdb.com/title/tt5503686/plotsummary
-------------------- Scraping movie url 284/362 --------------------
https://www.imdb.com/title/tt0120632/plotsummary
-------------------- Scraping movie url 285/362 --------------------
https://www.imdb.com/title/tt0450385/plotsummary
-------------------- Scraping movie url 286/362 --------------------
https://www.imdb.com/title/tt0169547/plotsummary
-------------------- Scraping movie url 287/362 --------------------
https://www.imdb.com/title/tt0105793/plotsummary
-------------------- Scraping movie url 288/362 --------------------
https://www.imdb.com/title/tt0111400/plotsummary
-------------------- Scraping movie url 289/362 --------------------
https://www.imdb.com/title/tt1219827/plotsummary
-------------------- Scraping movie url 290/362 --------

-------------------- Scraping movie url 352/362 --------------------
https://www.imdb.com/title/tt1596350/plotsummary
-------------------- Scraping movie url 353/362 --------------------
https://www.imdb.com/title/tt0144084/plotsummary
-------------------- Scraping movie url 354/362 --------------------
https://www.imdb.com/title/tt0115736/plotsummary
-------------------- Scraping movie url 355/362 --------------------
https://www.imdb.com/title/tt0114436/plotsummary
-------------------- Scraping movie url 356/362 --------------------
https://www.imdb.com/title/tt0172493/plotsummary
-------------------- Scraping movie url 357/362 --------------------
https://www.imdb.com/title/tt0120738/plotsummary
-------------------- Scraping movie url 358/362 --------------------
https://www.imdb.com/title/tt0109040/plotsummary
-------------------- Scraping movie url 359/362 --------------------
https://www.imdb.com/title/tt0115963/plotsummary
-------------------- Scraping movie url 360/362 --------

# Cocatenate movies for an actor and store as a single document

In [13]:
for i in range(len(actors)):
    fl_1 = actors['nconst'][i]
    #dataAct=actors['primaryName'][i]+':\n'
    dataAct=actors['nconst'][i]+':\n'
    for fl_2 in actors['knownForTitles'][i].split(','):
        fl_2 = 'movieFile/'+fl_2+'.txt'
        with open(fl_2, 'r') as file:
            data = file.read().replace('\n \n', '')
            dataAct = dataAct+'\n'+data
    act_file = 'actorFile/'+fl_1+'.txt'
    write_lst(dataAct,act_file) 

In [14]:
actors.to_csv('actors.csv',header=True,index=False)
actors.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles,titleCount
0,nm0000084,Gong Li,1965,\N,actress,"tt0101640,tt0430357,tt0397535,tt0473444",4
1,nm0000093,Brad Pitt,1963,\N,"actor,producer,soundtrack","tt0114746,tt1210166,tt0356910,tt2935510",4
2,nm0000096,Gillian Anderson,1968,\N,"actress,producer,soundtrack","tt0106179,tt2294189,tt0455590,tt0442632",4
3,nm0000097,Pamela Anderson,1967,\N,"actress,producer,director","tt0115624,tt0267913,tt0306047,tt0426592",4
4,nm0000098,Jennifer Aniston,1969,\N,"actress,producer,soundtrack","tt1723121,tt0279113,tt0108778,tt3442006",4
