### Web Scraping

In [None]:
# Download the Criterion Channel film index and parse it into a soup
import requests
from bs4 import BeautifulSoup
# A timeout keeps the notebook from hanging forever on a stalled connection
request = requests.get('https://films.criterionchannel.com/', timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page
request.raise_for_status()
soup = BeautifulSoup(request.content, 'html.parser')

In [None]:
# Collect the text of every title cell, with tabs and line breaks removed
titles = [
    cell.get_text().replace('\t', '').replace('\n', '')
    for cell in soup.find_all(class_="criterion-channel__td criterion-channel__td--title")
]
print(len(titles))

In [None]:
# Gather the target of every anchor tag that carries an href
urls = [anchor.get('href') for anchor in soup.find_all('a', href=True)]
# Only the links between the first 4 and the last 21 correspond to films
urls = urls[4:-21]
print(len(urls))

In [None]:
# Collect the text of every director cell, with tabs and line breaks removed
directors = [
    cell.get_text().replace('\t', '').replace('\n', '')
    for cell in soup.find_all(class_='criterion-channel__td criterion-channel__td--director')
]
print(len(directors))

In [None]:
# Collect the text of every country cell, with tabs and line breaks removed
# and the final character (a trailing comma) cut off
countries = [
    cell.get_text().replace('\t', '').replace('\n', '')[:-1]
    for cell in soup.find_all(class_='criterion-channel__td criterion-channel__td--country')
]
print(len(countries))

In [None]:
# Collect the text of every year cell, with tabs and line breaks removed
years = [
    cell.get_text().replace('\t', '').replace('\n', '')
    for cell in soup.find_all(class_='criterion-channel__td criterion-channel__td--year')
]
print(len(years))

In [None]:
# Assemble the scraped lists into one table, one row per film link
import pandas as pd
data = pd.DataFrame({
    'Title': titles,
    'Director': directors,
    'Country': countries,
    'Year': years,
    'Url': urls,
})
# Urls containing '/videos/' are later parts of a multi-part film and have
# no duration of their own, so those rows are dropped
data = data.loc[~data['Url'].str.contains('/videos/')]
# Remove two rows with urls that don't work
# ....
data = data.reset_index(drop=True)
print(len(data))

In [None]:
# # Check for broken links, do not run this, it takes a long time
# fourohfour = []
# for url in data['Url']:
#     # 200 = working, 404 = broken
#     fourohfour.append(requests.get(url))
#     print(url)
# print(len(fourohfour))
# # Save as text file (Excel often incorrectly reformats csv files upon opening)
# with open('data/Fourohfour.txt', 'w') as file:
#     for line in fourohfour:
#         file.write("%s\n" % line)
# print(len(fourohfour))

In [None]:
# Open pre-scraped 404 file. A forward slash is used so the path also
# resolves outside Windows and avoids the invalid '\F' escape sequence.
with open('data/Fourohfour.txt') as file:
    fourohfour = file.read().splitlines()
# Insert 404 column (raises if the file does not have one line per row)
data.insert(5, '404', fourohfour)
# Make sure every entry is a string before pattern matching on it
data['404'] = data['404'].astype(str)
# Keep only the rows whose saved response text does not mention 404
data = data[~data['404'].str.contains('404')]
print(len(data)) # Removed 52 broken links

In [None]:
# Filtering left gaps in the row labels; renumber them consecutively from 0
data = data.reset_index(drop=True)

In [None]:
# Preview the first rows of the table
data.head()

In [None]:
# # Scrape durations, do not run this, it takes a long time
# durations = []
# for url in data['Url']:
#     request = requests.get(url)
#     soup = BeautifulSoup(request.content, 'html.parser')
#     for duration in soup.findAll(class_ = 'duration-container')[:1]:
#         durations.append(duration.get_text())
#     print(url)
# # Save as text file
# with open('data/Durations.txt', 'w') as file:
#     for line in durations:
#         file.write("%s\n" % line)
# print(len(durations))

In [None]:
# Open pre-scraped duration file, one list entry per text line. A forward
# slash is used so the path also resolves outside Windows and avoids the
# invalid '\D' escape sequence.
with open('data/Durations.txt') as file:
    durations = file.read().splitlines()

In [None]:
# Keep every third line starting from the second one, then trim the
# surrounding spaces from each kept line
durations = [line.strip(' ') for line in durations[1::3]]

In [None]:
# Insert duration column. The explicit guard keeps the cell safe to re-run
# while still surfacing real errors (e.g. a length mismatch), which a bare
# `except: pass` would hide.
if 'Duration' not in data.columns:
    data.insert(4, 'Duration', durations)

In [None]:
# Cut the last three characters (the seconds part) off every duration,
# leaving only hours and minutes
data['Duration'] = data['Duration'].str.slice(stop=-3)

In [None]:
# Prepend '0:' to every duration that has no colon, i.e. films under one hour
# that are not formatted consistently with the rest of the data.
# A boolean mask selects those rows by label, so the update does not depend
# on the index being 0..n-1 and does not modify the column while iterating it.
lacks_hours = ~data['Duration'].str.contains(':', regex=False)
data.loc[lacks_hours, 'Duration'] = '0:' + data.loc[lacks_hours, 'Duration']

In [None]:
# Break each duration at the colon into separate columns
# (column 0 holds the hours part, column 1 the minutes part)
hours_minutes = data['Duration'].str.split(pat=':', expand=True)

In [None]:
# Add 'Hours' at position 5 and 'Minutes' at position 6, converting each
# column to integers right after it is inserted
for offset, column in enumerate(('Hours', 'Minutes')):
    data.insert(5 + offset, column, hours_minutes[offset])
    data[column] = data[column].astype(int)

In [None]:
# Calculate and insert total hours: whole hours plus minutes expressed as a
# fraction of an hour, rounded to two decimals. Computed on whole columns
# instead of row by row. The guard keeps the cell safe to re-run once the
# helper columns have been dropped.
if 'Total Hours' not in data.columns:
    total_hours = (data['Hours'].astype(int) + data['Minutes'].astype(int) / 60).round(2)
    data.insert(7, 'Total Hours', total_hours)
# Drop old columns; errors='ignore' skips any that are already gone without
# hiding unrelated failures the way a bare `except: pass` would
data = data.drop(columns=['Minutes', 'Hours', '404'], errors='ignore')

In [None]:
# # Scrape descriptions, do not run this, it takes a long time
# descriptions = []
# for url in data['Url']:
#     request = requests.get(url)
#     soup = BeautifulSoup(request.content, 'html.parser')
#     paragraphs = soup.findAll('p')
#     # Select paragraph containing the description
#     paragraphs = paragraphs[1]
#     string = []
#     for x in paragraphs:
#         string.append(str(x))
#     descriptions.append(string[0])
#     print(url)
# # Save as csv rather than as a text file (the list does not load back correctly from plain text)
# descriptions = pd.DataFrame({'Description': descriptions})
# descriptions.to_csv('data/Descriptions.csv', index = False)

In [None]:
# Open pre-scraped description file. A forward slash is used so the path
# also resolves outside Windows and avoids the invalid '\D' escape sequence.
descriptions = pd.read_csv('data/Descriptions.csv')

In [None]:
# Place the loaded descriptions into the table as column position 5
data.insert(loc=5, column='Description', value=descriptions)

In [None]:
# Keep only entries running strictly longer than one hour; anything at or
# below that is mostly shorts rather than feature films
data = data[data['Total Hours'].gt(1)]

In [None]:
# Create decade column, e.g. year 1965 -> '1960s'
import numpy as np
# Integer arithmetic yields the decade directly, with no float-to-string
# round trip and no '.' pattern that str.replace could treat as a regex
decade = (data['Year'].astype(int) // 10 * 10).astype(str) + 's'
if 'Decade' in data.columns:
    # Re-running the cell refreshes the column instead of appending another 's'
    data['Decade'] = decade
else:
    data.insert(4, 'Decade', decade)

In [None]:
# Swap every missing value for the literal text 'None'
data = data.replace(to_replace=np.nan, value='None', regex=True)

In [None]:
# Save to csv without the row index. A forward slash is used so the file
# lands inside the data directory on every platform and the invalid '\C'
# escape sequence is avoided.
data.to_csv('data/Criterion.csv', index=False)