# Scrape VIFF films to Excel

Script to scrape film information from the Vancouver International Film Festival (VIFF) website

In [22]:
from platform import python_version
python_version()

'2.7.15'

In [7]:
# Python script to scrape film information from the
# Vancouver International Film Festival (VIFF) website
# using lxml

import pandas as pd
import numpy as np
import requests
import pickle
from lxml import html

try:
    # For Python 3
    from urllib.parse import urlparse
except ImportError:
    # For Python 2
    from urlparse import urlparse

# build the lxml tree from the chosen website
# Used this as guide:
# https://docs.python-guide.org/scenarios/scrape/

# Enter the base URL where the A-Z list of films is here (from
# the viff.org home page find the 'Search by Title' option):
start_page = "https://www.viff.org/Online/default.asp?" \
             "doWork::WScontent::loadArticle=Load&BOparam::" \
             "WScontent::loadArticle::" \
             "article_id=D5FA11B0-61FD-4217-AAF7-1FC44D897DA1"

start_page_parse_result = urlparse(start_page)
start_page_parse_result

ParseResult(scheme='https', netloc='www.viff.org', path='/Online/default.asp', params='', query='doWork::WScontent::loadArticle=Load&BOparam::WScontent::loadArticle::article_id=D5FA11B0-61FD-4217-AAF7-1FC44D897DA1', fragment='')

In [8]:
page = requests.get(start_page_parse_result.geturl())
tree = html.fromstring(page.text)

In [9]:
# Now extract the information we want

# Get a list of URLs to the VIFF pages of each film
film_elements = tree.xpath('/html/body//div[@class="article-container main-article-body"]//div/a')
film_page_links = {
    el.text.strip(): el.attrib['href'] for el in film_elements
}

print("Found links for %d film pages." % len(film_page_links))

Found links for 355 film pages.


In [11]:
def convert_link_to_url(link, scheme=start_page_parse_result.scheme,
                        netloc=start_page_parse_result.netloc,
                        path=start_page_parse_result.path):
    
    """Converts a relative link such as 'default.asp' into an absolute url
    such as 'https://www.viff.org/Online/default.asp' using the scheme,
    netloc, and path specified."""
    
    parse_result = urlparse(link)

    if parse_result.netloc is '':
        parse_result = parse_result._replace(netloc=netloc)
    if parse_result.scheme is '':
        parse_result = parse_result._replace(scheme=scheme)
    if start_page_parse_result.path.endswith(parse_result.path):
        parse_result = parse_result._replace(path=path)
    
    return parse_result.geturl()

def get_page_lxml_tree(url):
    """Request page from url and convert contents into lxml tree"""

    page = requests.get(url)

    return html.fromstring(page.content)

In [6]:
# Prepare dictionary to collect the film information
year = pd.datetime.now().date().year
data_filename = "viff_data_%d.pickle" % year
try:
    with open("viff_data_%d.pickle" % year, 'rb') as f:
        films = pickle.load(f)
except:
    films = {}
else:
    print("Data from %d films found in data file." % len(films))

Data from 10 films found in data file.


In [7]:
# Now load each film page and extract the information we are looking for

# Set number of films to parse each time (None for all)
batch = 10

already_done = list(films.keys())
total_count = len(films)

print("\nReading information on each film")

for title, link in film_page_links.items():
    
    # Skip if already in films dictionary
    if title in already_done:
        continue

    # Convert link to a complete url
    page_url = convert_link_to_url(link)

    # Request page and convert contents to lxml tree
    film_page_tree = get_page_lxml_tree(page_url)

    # Get film title (xpath: '//*[@class="movie-title"]')
    try:
        film_title = film_page_tree.find('.//h1[@class="movie-title"]').text.strip()
    except:
        print("WARNING: Getting title for film '%s' failed -> Skipped." % film_title)
        continue

    # Get film information
    film_info = {}
    try:
        film_information_elements = film_page_tree.find('.//div[@class="movie-information"]').getchildren()
    except:
        print("WARNING: Getting information for film '%s' failed." % film_title)
    else:
        # Put film information into a dictionary
        labels = ['Director', 'Year:', 'Country of Origin:', 
                  'Running Time:', 'Language:']
        for e in film_information_elements:
            text = e.text_content().strip()

            for label in labels:
                if text[0:len(label)] == label:

                    # remove ':' if there is one
                    if label[-1] == ':':
                        key = label[0:-1]
                    else:
                        key = label

                    film_info[key] = text[text.find(label) + len(label):].strip()

    # Get film description
    try:
        film_description = film_page_tree.find('.//div[@class="movie-description"]').text_content().strip()
    except:
        film_description = None
        print("WARNING: Description for film '%s' missing." % film_title)

    print(" %4d: %s -> %d info records found." % (total_count, film_title, len(film_info)))

    films[film_title] = {
        'Information': film_info,
        'Description': film_description
    }

    total_count += 1
    if batch:
        batch -= 1
        if batch < 1:
            break



Reading information on each film
   10: At War -> 5 info records found.
   11: Where -> 4 info records found.
   12: The Favourite -> 4 info records found.
   13: The Hymns of Muscovy -> 5 info records found.
   14: ante mis ojos -> 5 info records found.
   15: The Front Runner -> 4 info records found.
   16: Prawn -> 4 info records found.
   17: Colette -> 4 info records found.
   18: Daughter of Mine -> 5 info records found.
   19: Birds of Passage -> 5 info records found.


In [8]:
print("\nInformation on %d films now acquired." % len(films))

# Now save the film records to an excel file
with open(data_filename, 'wb') as f:
    pickle.dump(films, f)
print("Results saved to file '%s'." % data_filename)


Information on 20 films now acquired.
Results saved to file 'viff_data_2018.pickle'.


In [17]:
# Move data into pandas dataframe
columns = set()
data = dict()

for film in films: 
    for key in films[film]['Information']:
        columns = columns.union([key])

for column in columns:
    data[column] = []

data['Description'] = []
data['Title'] = []

for film in films:
    data['Title'].append(film)
    for column in columns:
        try:
            data[column].append(films[film]['Information'][column])
        except KeyError:
            data[column].append(np.nan)
    data['Description'].append(films[film]['Description'])

# Dataframe from data
df = pd.DataFrame(data=data)

# Re-order so title column is first
new_columns = list(df.columns.values)
new_columns.remove('Title')
new_columns = ['Title'] + new_columns
df = df[new_columns]

df.head()

In [19]:
excel_output_filename = "VIFF%d_all.xls" % year
print("Saving results to Excel file '%s'" % excel_output_filename)

# Save to excel
df.to_excel(excel_output_filename)

Saving results to Excel file 'VIFF2018_all.xls'


## Alternative script using Selenium to load page information

This version is able to read movie screening times

In [1]:
# Python script to scrape film information from the
# Vancouver International Film Festival (VIFF) website
# using Selenium

import pandas as pd
import numpy as np
import requests
import pickle
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

try:
    # For Python 3
    from urllib.parse import urlparse
except ImportError:
    # For Python 2
    from urlparse import urlparse

print("\n -------------- viff_scrape2.py --------------\n")

print("Scrape information on this year's films from 'https://www.viff.org'")
year = pd.datetime.now().date().year

print("Launching Selenium web-page driver...")
driver = webdriver.Safari()  # Other options include Chrome, Firefox

# Enter the base URL where the A-Z list of films is here (from
# the viff.org home page find the 'Search by Title' option):
start_page_url = "https://www.viff.org/Online/default.asp?" \
                 "doWork::WScontent::loadArticle=Load&BOparam::" \
                 "WScontent::loadArticle::" \
                 "article_id=D5FA11B0-61FD-4217-AAF7-1FC44D897DA1"

start_page_parse_result = urlparse(start_page_url)
start_page_parse_result


 -------------- viff_scrape2.py --------------

Scrape information on this year's films from 'https://www.viff.org'
Launching Selenium web-page driver...


ParseResult(scheme='https', netloc='www.viff.org', path='/Online/default.asp', params='', query='doWork::WScontent::loadArticle=Load&BOparam::WScontent::loadArticle::article_id=D5FA11B0-61FD-4217-AAF7-1FC44D897DA1', fragment='')

In [2]:
print("Opening Viff film list web-page...")
# Open webpage in driver
driver.get(start_page_url)

Opening Viff film list web-page...


In [3]:
# Now extract the information we want

# Get a list of URLs to the VIFF pages of each film
xp = '/html/body//div[@class="article-container main-article-body"]//div/a'
film_link_elements = driver.find_elements_by_xpath(xp)
film_page_links = [
    {'text': el.text.strip(), "href": el.get_attribute('href')} 
    for el in film_link_elements
]
print("Found links for %d film pages." % len(film_page_links))

Found links for 355 film pages.


In [4]:
def convert_link_to_url(link, scheme=start_page_parse_result.scheme,
                        netloc=start_page_parse_result.netloc,
                        path=start_page_parse_result.path):
    """Converts a relative link such as 'default.asp' into an absolute url
    such as 'https://www.viff.org/Online/default.asp' using the scheme,
    netloc, and path specified."""
    
    parse_result = urlparse(link)

    if parse_result.netloc is '':
        parse_result = parse_result._replace(netloc=netloc)
    if parse_result.scheme is '':
        parse_result = parse_result._replace(scheme=scheme)
    if start_page_parse_result.path.endswith(parse_result.path):
        parse_result = parse_result._replace(path=path)
    
    return parse_result.geturl()

def get_page_lxml_tree(url):
    """Request page from url and convert contents into lxml tree"""

    page = requests.get(url)

    return html.fromstring(page.content)

In [5]:
def get_film_details(driver, max_screenings=5):
    """Reads film information from Selenium web-driver."""
    
    # Get movie title
    xp = './/h1[@class="movie-title"]'
    try:
        film_title = driver.find_element_by_xpath(xp).text.strip()
    except:
        print("WARNING: Title for film '%s' not found on page " \
              "-> skipped." % film_title)
        return None

    # Get film information
    film_info = {}
    xp = './/div[@class="movie-information"]/*'
    try:
        film_info_elements = driver.find_elements_by_xpath(xp)
    except:
        print("WARNING: Getting information for film '%s' failed." % film_title)
    else:
        # Put film information into a dictionary
        labels = ['Director', 'Year:', 'Country of Origin:', 
                  'Running Time:', 'Language:']
        for e in film_info_elements:
            text = e.text.strip()

            for label in labels:
                if text[0:len(label)] == label:

                    # remove ':' if there is one
                    if label[-1] == ':':
                        key = label[0:-1]
                    else:
                        key = label

                    film_info[key] = text[text.find(label) + len(label):].strip()
    
    # Get film description
    xp = '//div[@class="movie-description"]'
    try:
        film_description = driver.find_element_by_xpath(xp).text.strip()
    except:
        film_description = None
        print("WARNING: Description for film '%s' missing." % film_title)
    
    # Get screening times
    xp = '//div[@class="movie-tickets"]/div[@name="avWidget"]' \
         '//div[@class="item-description result-box-item-details"]'
    search_result_elements = driver.find_elements_by_xpath(xp)
    screenings = []
    
    if len(search_result_elements) > max_screenings:
        print("WARNING: Film '%s' has more than %d screening times."
              "Only 4 will be saved" % (film_title, max_screenings))

    for element in search_result_elements:
        
        start_date_string = element.find_element_by_class_name('item-start-date') \
                            .find_element_by_class_name('start-date').text
        venue = element.find_element_by_class_name('item-venue').text
        
        screenings.append({
                'Start date': pd.to_datetime(start_date_string),
                'Venue': venue
            })
    
    film_details = {
        'Title': film_title,
        'Information': film_info,
        'Description': film_description,
        'Screenings': screenings
    }
    
    return film_details

In [6]:
# Prepare dictionary to collect the film information
year = pd.datetime.now().date().year
data_filename = "viff_data_%d_2.pickle" % year
try:
    with open(data_filename, 'rb') as f:
        films = pickle.load(f)
except:
    films = {}
else:
    print("Data from %d films found in data file." % len(films))

Data from 355 films found in data file.


In [7]:
# ----------------- MAIN LOOP -----------------
print("loading each film page to extract information...")

# Set number of films to parse each time (None for all)
batch = 10

already_done = list(films.keys())
total_count = len(films)
max_screenings = 3

film_page_links[2]
for page_link in film_page_links:
    
    title, link = page_link['text'], page_link['href']

    # Skip if already in films dictionary
    if title in already_done:
        continue

    # Convert link to a complete url
    page_url = convert_link_to_url(link)
    
    # Get film information
    driver.get(page_url)
    film_details = get_film_details(driver)
    
    if film_details is None:
        # Skip this film
        continue

    if film_details['Title'] != title:
        print("WARNING: Title for film '%s' does not match link." % title)
    
    films[title] = film_details
    print(" %4d: %s" % (total_count, film_details['Title']))

    total_count += 1
    if batch:
        batch -= 1
        if batch < 1:
            break

loading each film page to extract information...


In [8]:
print("\nInformation on %d films now acquired." % len(films))

# Now save the film records to an excel file
with open(data_filename, 'wb') as f:
    pickle.dump(films, f)
print("Results saved to file '%s'." % data_filename)

print("Closing Selenium web-page driver...")
driver.close()


Information on 355 films now acquired.
Results saved to file 'viff_data_2018_2.pickle'.
Closing Selenium web-page driver...


In [9]:
# Move data into pandas dataframe
info_cols = set()
data = dict()

for film in films: 
    for key in films[film]['Information']:
        info_cols = info_cols.union([key])

max_screenings = max([len(details['Screenings']) for 
                      details in films.values()])

date_cols = set(["Screening date %d" % (i + 1) for i in range(max_screenings)])
venue_cols = set(["Screening venue %d" % (i + 1) for i in range(max_screenings)])

# Add an empty list to dictionary for each series expected
for column in info_cols.union(date_cols).union(venue_cols):
    data[column] = []
data['Description'] = []
data['Title'] = []

for film in films:
    data['Title'].append(film)
    for col in info_cols:
        try:
            data[col].append(films[film]['Information'][col])
        except KeyError:
            data[col].append(np.nan)
    data['Description'].append(films[film]['Description'])
    for i, cols in enumerate(zip(date_cols, venue_cols)):
        try:
            data[cols[0]].append(films[film]['Screenings'][i]['Start date'])
            data[cols[1]].append(films[film]['Screenings'][i]['Venue'])
        except IndexError:
            data[cols[0]].append(np.nan)
            data[cols[1]].append(np.nan)

# Dataframe from data
df = pd.DataFrame(data=data)

# Re-order so title column is first
new_columns = list(df.columns.values)
new_columns.remove('Title')
new_columns = ['Title'] + new_columns
df = df[new_columns]

df.head()

Unnamed: 0,Title,Country of Origin,Description,Director,Language,Running Time,Screening date 1,Screening date 2,Screening date 3,Screening venue 1,Screening venue 2,Screening venue 3,Year
0,Unearthing. In Conversation,Austria,Traversing the intersection of theory and art ...,Belinda Kazeem-Kamiński,,13 mins,2018-10-02 18:15:00,2018-10-07 14:00:00,NaT,Vancity Theatre,Vancity Theatre,,2017
1,Microhabitat,South Korea,"If your rent goes up, do you give up 1) cigare...",Jeon Gowoon,In Korean with English subtitles,106 mins,2018-09-30 18:30:00,2018-10-02 16:00:00,NaT,International Village 9,International Village 9,,2017
2,Winter Flies,Czech Republic,Stand By Me meets Y Tu Mamá También on a stret...,Olmo Omerzu,In Czech with English subtitles,85 mins,2018-10-05 18:45:00,2018-10-11 10:45:00,NaT,SFU Goldcorp,SFU Goldcorp,,2018
3,First Generation,USA,"A teen Vietnamese-American girl, confused by m...",Jeannie Nguyen,,9 mins,2018-10-04 11:00:00,2018-10-07 18:00:00,NaT,International Village 9,International Village 9,,2017
4,Carmine Street Guitars,Canada,"Nestled in Greenwich Village, Carmine Street G...",Ron Mann,,80 mins,2018-10-02 18:15:00,2018-10-07 16:30:00,NaT,Rio Theatre,SFU Goldcorp,,2018


In [10]:
excel_output_filename = "VIFF%d_all_with_dates.xls" % year
print("Saving results to Excel file '%s'" % excel_output_filename)

# Save to excel
df.to_excel(excel_output_filename)

print("Finished.")

Saving results to Excel file 'VIFF2018_all_with_dates.xls'
Finished.


## Do some fun analysis

In [17]:
len(films)

355

In [52]:
title_text = ' '.join([film for film in films])

In [53]:
words = title_text.split()
len(words)

982

In [93]:
word_tally = {}
for word in words:
    word_lower = word.lower().strip('()-_:[]{}& ')
    if word_lower:
        word_tally[word_lower] = word_tally.get(word_lower, 0) + 1


In [96]:
exclude_words = ['the', 'of', 'a', 'and', 'in', 'is', 'at', 'an', 'what', 'on', 'la', 'to', '1', '2', '3']

for word in exclude_words:
    if word in word_tally:
        del(word_tally[word])

print(', '.join(exclude_words))

the, of, a, and, in, is, at, an, what, on, la, to, 1, 2, 3


In [97]:
pd.Series(word_tally).sort_values(ascending=False).head(20)

my                7
waves             6
war               6
shock             6
house             5
room              4
life              4
street            4
under             4
you               4
trilogy           4
quantification    4
flor              3
wild              3
happy             3
story             3
up                3
dreaming          3
part              3
song              3
dtype: int64

In [99]:
top10 = pd.Series(word_tally).sort_values(ascending=False).head(10).index.values.tolist()
print(', '.join(top10))

my, waves, war, shock, house, room, life, street, under, you


In [100]:
bytes(pd.Series(word_tally).sort_values(ascending=False).head(20).index.values.tolist()[14])

'happy'

In [101]:
for word in top10:
    print("\n%s:" % word)
    print(' ' + '\n '.join([film.strip('()-_:[]{} ') for film in films if (word in film.lower())]))


my:
 The Whistleblower of My Lai
 Shock Waves: Diary of My Mind
 Inside My Heart
 My Life Is a Joke
 In My Room (Germany
 In My Room (Israel
 My Clayey Conception

waves:
 Shock Waves: Diary of My Mind
 Shock Waves: The Valley
 Shock Waves: First Name Mathieu
 Shock Waves: Sirius
 Shock Waves: Program 1 & 2
 Shock Waves: Program 3 & 4

war:
 At War
 Theatre of War
 Woman at War
 A Private War
 Memoir of War
 Cold War

shock:
 Shock Waves: Diary of My Mind
 Shock Waves: The Valley
 Shock Waves: First Name Mathieu
 Shock Waves: Sirius
 Shock Waves: Program 1 & 2
 Shock Waves: Program 3 & 4

house:
 A Dreaming House
 Open House
 Julio Iglesias's House
 The Wolf House
 The House That Jack Built

room:
 A Room with a Coconut View
 Maybe if It Were a Nice Room
 In My Room (Germany
 In My Room (Israel

life:
 Another Day of Life
 Bergman - A Year in a Life
 This Mountain Life
 My Life Is a Joke

street:
 Carmine Street Guitars
 The Beetle at the End of the Street
 From Across the Street and 

In [75]:
films.keys()[0:10]

[u'Unearthing. In Conversation',
 u'Microhabitat',
 u'Winter Flies',
 u'First Generation',
 u'Carmine Street Guitars',
 u'Piazza Vittorio',
 u'Another Day of Life',
 u'Parallel',
 u'Sharkwater Extinction',
 u'LHB']

### Trying to fix encoding/decoding issues

In [11]:
'//*[@id="avWidget_F265B724-7400-4814-97B9-70ADA2D69B87_IDY1ISUEB1M3LBBE3WJI5ZR4D4SNOIU0LAGBK2J5K4VSTH1D2QFACO"]/div/div[1]/form/div'

'//*[@id="avWidget_F265B724-7400-4814-97B9-70ADA2D69B87_IDY1ISUEB1M3LBBE3WJI5ZR4D4SNOIU0LAGBK2J5K4VSTH1D2QFACO"]/div/div[1]/form/div'

In [12]:
desc = films['Animal Behaviour']['Description']
desc

u'Dealing with what comes naturally isn\u2019t easy, especially for animals. Five such beasts gather to discuss their inner angst in a group therapy session.'

In [13]:
desc.encode('utf8')

'Dealing with what comes naturally isn\xe2\x80\x99t easy, especially for animals. Five such beasts gather to discuss their inner angst in a group therapy session.'

In [14]:
print('isn`t easy')

isn`t easy


In [15]:
desc

u'Dealing with what comes naturally isn\u2019t easy, especially for animals. Five such beasts gather to discuss their inner angst in a group therapy session.'

In [16]:
bytes(desc)

UnicodeEncodeError: 'ascii' codec can't encode character u'\u2019' in position 37: ordinal not in range(128)

In [None]:
desc.decode("utf-8", "strict")  

In [None]:
link

In [None]:
link = 'https://www.viff.org/Online/default.asp?BOparam::WScontent::loadArticle::permalink=f30436-animal-behaviour'

In [None]:
get_page_lxml_tree(link)

In [None]:
xp = '//div[@id="tickets"]'
film_page_tree.xpath(xp)

In [None]:
film_page_tree.xpath(xp)[0].values()

In [None]:
xp2 = '//*[@name="avWidget"]'
film_page_tree.xpath(xp2)[0].text_content()

In [None]:
xp3 = '//*[@name="avWidget"]'
film_page_tree.xpath(xp3)[0].getchildren()[0].attrib

In [None]:
'//*[@name="avWidget"]/div/div[1]/form/div/div[1]/div[2]/div[2]/span'

In [None]:
dir(film_page_tree.xpath(xp)[0])