In [15]:
# Make soup: download the film index page and parse it
import requests
from bs4 import BeautifulSoup
# The timeout stops the cell from waiting indefinitely if the server stops responding
request = requests.get('https://films.criterionchannel.com/', timeout = 30)
# Stop here on an HTTP error status instead of parsing an error page,
# which would leave every list scraped from `soup` empty
request.raise_for_status()
soup = BeautifulSoup(request.content, 'html.parser')

In [16]:
# Scrape titles
# Take the text of every title cell and drop its tab and newline characters
title_cells = soup.findAll(class_ = "criterion-channel__td criterion-channel__td--title")
titles = [cell.get_text().replace('\t', '').replace('\n', '') for cell in title_cells]
# Sanity check: how many titles were found, and the last one
print(len(titles))
print(titles[-1])

2311
Zorns Lemma


In [17]:
# Scrape urls
# Collect the href of every link on the index page
urls = [link.get('href') for link in soup.findAll('a', href = True)]
# The first four links are skipped, exactly as the earlier hard-coded slices did
# (presumably they are navigation links rather than films - verify against the page)
first_film_link = 4
# Keep one link per scraped title instead of a hard-coded end index,
# so the slice still lines up with `titles` when the catalogue size changes
urls = urls[first_film_link:first_film_link + len(titles)]
# Sanity check: number of urls kept, and the last one
print(len(urls))
print(urls[-1])

2311
https://www.criterionchannel.com/zorns-lemma


In [18]:
# Scrape directors
# Text of every director cell, with tab and newline characters removed
director_cells = soup.findAll(class_ = 'criterion-channel__td criterion-channel__td--director')
raw_directors = (cell.get_text() for cell in director_cells)
directors = [text.replace('\t', '').replace('\n', '') for text in raw_directors]
# Sanity check: number of directors found, and the last one
print(len(directors))
print(directors[-1])

2311
Hollis Frampton


In [19]:
# Scrape countries
countries = []
for cell in soup.findAll(class_ = 'criterion-channel__td criterion-channel__td--country'):
    # Drop tab and newline characters from the cell text
    text = cell.get_text().replace('\t', '').replace('\n', '')
    # Drop the final character (a trailing comma, going by the original variable name)
    countries.append(text[:-1])
# Sanity check: number of countries found, and the last one
print(len(countries))
print(countries[-1])

2311
United States


In [20]:
# Scrape years
# Translation table that deletes tab and newline characters
drop_tabs_newlines = str.maketrans('', '', '\t\n')
year_cells = soup.findAll(class_ = 'criterion-channel__td criterion-channel__td--year')
years = [cell.get_text().translate(drop_tabs_newlines) for cell in year_cells]
# Sanity check: number of years found, and the last one
print(len(years))
print(years[-1])

2311
1970


In [21]:
# Create dataframe
import pandas as pd
data = pd.DataFrame({'Title': titles, 'Director': directors, 'Country': countries, 'Year': years, 'Url': urls})
# Url fragments whose rows are dropped
unwanted_urls = [
    # Rows without durations (parts > 1 of a film)
    '/videos/',
    # Two rows with urls that don't work
    'https://www.criterionchannel.com/yotsuya-kaidan-part-one',
    'https://www.criterionchannel.com/yotsuya-kaidan-part-two',
]
for fragment in unwanted_urls:
    # regex = False matches the fragment literally; otherwise every '.' in a url
    # would be a regex wildcard that matches any character
    data = data[~data['Url'].str.contains(fragment, regex = False)]
# Renumber the rows 0..n-1 after filtering
data = data.reset_index(drop = True)
print(len(data))

2059


In [None]:
# Check for broken links
# `fourohfour` keeps every response, one per url, in the order of data['Url']
fourohfour = []
# (url, status code) for every request that came back with an error status
broken_links = []
for url in data['Url']:
    response = requests.get(url)
    fourohfour.append(response)
    # Response.ok is False for 4xx and 5xx status codes
    if not response.ok:
        broken_links.append((url, response.status_code))
    print(url)
# Report the result of the check instead of only listing the urls visited
print(len(broken_links), 'broken link(s):', broken_links)

https://www.criterionchannel.com/2-or-3-things-i-know-about-her
https://www.criterionchannel.com/les-3-boutons
https://www.criterionchannel.com/3-faces
https://www.criterionchannel.com/4-months-3-weeks-and-2-days
https://www.criterionchannel.com/the-5-000-fingers-of-dr-t
https://www.criterionchannel.com/5x2
https://www.criterionchannel.com/the-vi-olympic-winter-games-oslo-1952
https://www.criterionchannel.com/7-p-cuis-s-de-b-a-saisir
https://www.criterionchannel.com/81-2
https://www.criterionchannel.com/8th-continent
https://www.criterionchannel.com/the-ix-olympiad-in-amsterdam
https://www.criterionchannel.com/ix-olympic-winter-games-innsbruck-1964
https://www.criterionchannel.com/12-o-clock-boys
https://www.criterionchannel.com/13-days-in-france
https://www.criterionchannel.com/xivth-olympiad-the-glory-of-sport
https://www.criterionchannel.com/16-days-of-glory
https://www.criterionchannel.com/21-days
https://www.criterionchannel.com/24-frames
https://www.criterionchannel.com/24-hours-

https://www.criterionchannel.com/being-two-isn-t-easy
https://www.criterionchannel.com/belle-de-jour
https://www.criterionchannel.com/la-belle-noiseuse
https://www.criterionchannel.com/benny-s-video
https://www.criterionchannel.com/bergman-island
https://www.criterionchannel.com/berlin-alexanderplatz
https://www.criterionchannel.com/best-offer
https://www.criterionchannel.com/la-bete-humaine
https://www.criterionchannel.com/betty-tells-her-story
https://www.criterionchannel.com/between-the-lines
https://www.criterionchannel.com/beware-of-a-holy-whore
https://www.criterionchannel.com/beyond-all-barriers
https://www.criterionchannel.com/beyond-the-hills
https://www.criterionchannel.com/beyond-the-law
https://www.criterionchannel.com/bezhin-meadow
https://www.criterionchannel.com/bicycle-thieves
https://www.criterionchannel.com/il-bidone
https://www.criterionchannel.com/the-bigamist
https://www.criterionchannel.com/the-big-city
https://www.criterionchannel.com/big-city-blues
https://www.c

In [28]:
# Scrape durations, do not run this, it takes a long time
import os
durations = []
# Urls whose page had no 'duration-container' element
missing_durations = []
for url in data['Url']:
    # Page-local names, so the index-page `request` and `soup` that the
    # title/director/country/year cells read are not overwritten
    page = requests.get(url)
    page_soup = BeautifulSoup(page.content, 'html.parser')
    # Only the first duration element on the page is used
    duration = page_soup.find(class_ = 'duration-container')
    if duration is not None:
        durations.append(duration.get_text())
    else:
        missing_durations.append(url)
    print(url)
# A skipped page leaves `durations` shorter than `data`, so say which ones were skipped
if missing_durations:
    print('No duration found for:', missing_durations)
# Save as text file (Excel often incorrectly reformats csv files upon opening)
# Create the output folder if it does not exist yet
os.makedirs('data', exist_ok = True)
with open('data/Durations.txt', 'w') as file:
    file.writelines("%s\n" % line for line in durations)
print(len(durations))
print(durations[-1])

https://www.criterionchannel.com/2-or-3-things-i-know-about-her
https://www.criterionchannel.com/les-3-boutons
https://www.criterionchannel.com/3-faces
https://www.criterionchannel.com/4-months-3-weeks-and-2-days
https://www.criterionchannel.com/the-5-000-fingers-of-dr-t
https://www.criterionchannel.com/5x2
https://www.criterionchannel.com/the-vi-olympic-winter-games-oslo-1952
https://www.criterionchannel.com/7-p-cuis-s-de-b-a-saisir
https://www.criterionchannel.com/81-2
https://www.criterionchannel.com/8th-continent
https://www.criterionchannel.com/the-ix-olympiad-in-amsterdam
https://www.criterionchannel.com/ix-olympic-winter-games-innsbruck-1964
https://www.criterionchannel.com/12-o-clock-boys
https://www.criterionchannel.com/13-days-in-france
https://www.criterionchannel.com/xivth-olympiad-the-glory-of-sport
https://www.criterionchannel.com/16-days-of-glory
https://www.criterionchannel.com/21-days
https://www.criterionchannel.com/24-frames
https://www.criterionchannel.com/24-hours-

https://www.criterionchannel.com/being-two-isn-t-easy
https://www.criterionchannel.com/belle-de-jour
https://www.criterionchannel.com/la-belle-noiseuse
https://www.criterionchannel.com/benny-s-video
https://www.criterionchannel.com/bergman-island
https://www.criterionchannel.com/berlin-alexanderplatz
https://www.criterionchannel.com/best-offer
https://www.criterionchannel.com/la-bete-humaine
https://www.criterionchannel.com/betty-tells-her-story
https://www.criterionchannel.com/between-the-lines
https://www.criterionchannel.com/beware-of-a-holy-whore
https://www.criterionchannel.com/beyond-all-barriers
https://www.criterionchannel.com/beyond-the-hills
https://www.criterionchannel.com/beyond-the-law
https://www.criterionchannel.com/bezhin-meadow
https://www.criterionchannel.com/bicycle-thieves
https://www.criterionchannel.com/il-bidone
https://www.criterionchannel.com/the-bigamist
https://www.criterionchannel.com/the-big-city
https://www.criterionchannel.com/big-city-blues
https://www.c

https://www.criterionchannel.com/charlotte-s-web
https://www.criterionchannel.com/charulata
https://www.criterionchannel.com/cheatin
https://www.criterionchannel.com/checking-out
https://www.criterionchannel.com/che-1
https://www.criterionchannel.com/the-chicken
https://www.criterionchannel.com/chiefs
https://www.criterionchannel.com/la-chienne
https://www.criterionchannel.com/the-children-are-watching-us
https://www.criterionchannel.com/children-of-nagasaki
https://www.criterionchannel.com/children-of-paradise
https://www.criterionchannel.com/children-of-the-century
https://www.criterionchannel.com/chimes-at-midnight
https://www.criterionchannel.com/chinese-odyssey-2002
https://www.criterionchannel.com/chinese-roulette
https://www.criterionchannel.com/chris-and-bernie
https://www.criterionchannel.com/a-christmas-tale
https://www.criterionchannel.com/christo-in-paris
https://www.criterionchannel.com/christo-s-valley-curtain
https://www.criterionchannel.com/chronicle-of-a-summer
https:/

https://www.criterionchannel.com/double-suicide
https://www.criterionchannel.com/douce
https://www.criterionchannel.com/down-by-law
https://www.criterionchannel.com/downhill
https://www.criterionchannel.com/down-there
https://www.criterionchannel.com/dragnet-girl
https://www.criterionchannel.com/dragon-inn
https://www.criterionchannel.com/dreams-1
https://www.criterionchannel.com/dr-jack
https://www.criterionchannel.com/the-drum
https://www.criterionchannel.com/drunken-angel
https://www.criterionchannel.com/dry-summer
https://www.criterionchannel.com/a-dry-white-season
https://www.criterionchannel.com/dry-wood
https://www.criterionchannel.com/duck-soup
https://www.criterionchannel.com/du-cote-de-la-cote
https://www.criterionchannel.com/dumbland
https://www.criterionchannel.com/dying-at-grace
https://www.criterionchannel.com/the-eagle-shooting-heroes
https://www.criterionchannel.com/the-ear-1
https://www.criterionchannel.com/early-spring
https://www.criterionchannel.com/early-summer
htt

https://www.criterionchannel.com/from-the-other-side
https://www.criterionchannel.com/the-front-page
https://www.criterionchannel.com/frownland-a-film-by-ronand-bronstein
https://www.criterionchannel.com/fry-day
https://www.criterionchannel.com/a-fuller-life
https://www.criterionchannel.com/the-funeral
https://www.criterionchannel.com/funny-games
https://www.criterionchannel.com/fun-with-dick-and-jane
https://www.criterionchannel.com/the-future
https://www.criterionchannel.com/gai-dimanche
https://www.criterionchannel.com/gallipoli
https://www.criterionchannel.com/the-games-of-the-v-olympiad-stockholm-1912
https://www.criterionchannel.com/games-of-the-xxi-olympiad
https://www.criterionchannel.com/ganja-hess
https://www.criterionchannel.com/gap-toothed-women
https://www.criterionchannel.com/the-garden-of-women
https://www.criterionchannel.com/garlic-is-as-good-as-ten-mothers
https://www.criterionchannel.com/gate-of-flesh
https://www.criterionchannel.com/gate-of-hell
https://www.criterio

https://www.criterionchannel.com/hot-biskits
https://www.criterionchannel.com/hot-dog
https://www.criterionchannel.com/hotel-du-nord
https://www.criterionchannel.com/hotel-monterey
https://www.criterionchannel.com/hot-pepper
https://www.criterionchannel.com/hour-of-the-wolf
https://www.criterionchannel.com/the-hours
https://www.criterionchannel.com/house-1
https://www.criterionchannel.com/the-housemaid
https://www.criterionchannel.com/how-some-jellyfish-are-born
https://www.criterionchannel.com/how-to-get-ahead-in-advertising
https://www.criterionchannel.com/how-to-kiss
https://www.criterionchannel.com/how-to-make-love-to-a-woman
https://www.criterionchannel.com/hua-yang-de-nian-hua
https://www.criterionchannel.com/humain-trop-humain
https://www.criterionchannel.com/the-human-condition-i
https://www.criterionchannel.com/the-human-condition-ii
https://www.criterionchannel.com/the-human-condition-iii
https://www.criterionchannel.com/l-humanite
https://www.criterionchannel.com/humanity-an

https://www.criterionchannel.com/kung-fu-master
https://www.criterionchannel.com/kuroneko
https://www.criterionchannel.com/kwaidan
https://www.criterionchannel.com/lacombe-lucien
https://www.criterionchannel.com/the-lady-and-the-beard
https://www.criterionchannel.com/lady-snowblood
https://www.criterionchannel.com/lady-snowblood-love-song-of-vengeance
https://www.criterionchannel.com/the-lady-vanishes
https://www.criterionchannel.com/lamb
https://www.criterionchannel.com/the-lamp
https://www.criterionchannel.com/land-makar
https://www.criterionchannel.com/land-of-milk-and-honey
https://www.criterionchannel.com/langlois
https://www.criterionchannel.com/last-holiday
https://www.criterionchannel.com/last-hurrah-for-chivalry
https://www.criterionchannel.com/the-last-metro
https://www.criterionchannel.com/the-last-wave
https://www.criterionchannel.com/late-autumn
https://www.criterionchannel.com/late-chrysanthemums
https://www.criterionchannel.com/late-spring
https://www.criterionchannel.co

https://www.criterionchannel.com/the-masseurs-and-a-woman
https://www.criterionchannel.com/a-master-builder
https://www.criterionchannel.com/master-of-the-house
https://www.criterionchannel.com/the-match-factory-girl
https://www.criterionchannel.com/maurice
https://www.criterionchannel.com/max-by-marcel
https://www.criterionchannel.com/maxwell-s-demon
https://www.criterionchannel.com/may-fools
https://www.criterionchannel.com/me-and-you-and-everyone-we-know
https://www.criterionchannel.com/meantime
https://www.criterionchannel.com/the-melbourne-rendez-vous
https://www.criterionchannel.com/memories-of-the-olympic-summer-of-1952
https://www.criterionchannel.com/memories-of-underdevelopment
https://www.criterionchannel.com/memory-for-max-claire-ida-and-company
https://www.criterionchannel.com/men-are-not-gods
https://www.criterionchannel.com/the-men-who-tread-on-the-tiger-s-tail
https://www.criterionchannel.com/the-merchant-of-four-seasons
https://www.criterionchannel.com/mercy-the-mummy-

https://www.criterionchannel.com/olympia-1
https://www.criterionchannel.com/olympic-games-1956
https://www.criterionchannel.com/the-olympic-games-amsterdam-1928
https://www.criterionchannel.com/the-olympic-games-as-they-were-practiced-in-ancient-greece
https://www.criterionchannel.com/the-olympic-games-held-at-chamonix-in-1924
https://www.criterionchannel.com/the-olympic-games-in-paris-1924
https://www.criterionchannel.com/olympic-glory-2
https://www.criterionchannel.com/the-olympics-in-mexico
https://www.criterionchannel.com/olympic-spirit
https://www.criterionchannel.com/once-a-thief
https://www.criterionchannel.com/on-demande-une-brute
https://www.criterionchannel.com/one-day-pina-asked
https://www.criterionchannel.com/one-hundred-and-one-nights
https://www.criterionchannel.com/one-light-one-world
https://www.criterionchannel.com/one-of-those-days
https://www.criterionchannel.com/one-sings-the-other-doesn-t
https://www.criterionchannel.com/one-way-ticket-to-love
https://www.criterio

https://www.criterionchannel.com/rebels-of-the-neon-god
https://www.criterionchannel.com/a-reckless-rover
https://www.criterionchannel.com/record-of-a-tenement-gentleman
https://www.criterionchannel.com/the-red-balloon
https://www.criterionchannel.com/red-beard
https://www.criterionchannel.com/red-desert
https://www.criterionchannel.com/redes
https://www.criterionchannel.com/red-road
https://www.criterionchannel.com/the-red-shoes
https://www.criterionchannel.com/the-red-tree
https://www.criterionchannel.com/regeneration
https://www.criterionchannel.com/rehearsals-for-extinct-anatomies
https://www.criterionchannel.com/rembrandt
https://www.criterionchannel.com/remorques
https://www.criterionchannel.com/rendez-vous-1
https://www.criterionchannel.com/les-rendez-vous-d-anna
https://www.criterionchannel.com/repast
https://www.criterionchannel.com/reponse-de-femmes
https://www.criterionchannel.com/a-report-on-the-party-and-guests
https://www.criterionchannel.com/the-return-of-bulldog-drummon

https://www.criterionchannel.com/sing-bing-sing
https://www.criterionchannel.com/sing-young-people
https://www.criterionchannel.com/sisters-1
https://www.criterionchannel.com/sisters-of-the-gion
https://www.criterionchannel.com/six-men-getting-sick
https://www.criterionchannel.com/skunk
https://www.criterionchannel.com/sleepwalk
https://www.criterionchannel.com/a-slightly-pregnant-man
https://www.criterionchannel.com/the-slow-escape
https://www.criterionchannel.com/the-small-back-room
https://www.criterionchannel.com/smiles-of-a-summer-night
https://www.criterionchannel.com/smithereens
https://www.criterionchannel.com/snow-canon
https://www.criterionchannel.com/the-snow-flurry
https://www.criterionchannel.com/snows-of-grenoble
https://www.criterionchannel.com/snow-trail
https://www.criterionchannel.com/socrates
https://www.criterionchannel.com/soft-fiction
https://www.criterionchannel.com/the-soft-skin
https://www.criterionchannel.com/solaris
https://www.criterionchannel.com/solar-walk

https://www.criterionchannel.com/those-were-the-days
https://www.criterionchannel.com/a-thousand-suns
https://www.criterionchannel.com/three-cases-of-murder
https://www.criterionchannel.com/three-colors-blue
https://www.criterionchannel.com/three-colors-red
https://www.criterionchannel.com/three-colors-white
https://www.criterionchannel.com/three-daughters
https://www.criterionchannel.com/three-examples-of-myself-as-a-queen
https://www.criterionchannel.com/three-on-a-match
https://www.criterionchannel.com/three-outlaw-samurai
https://www.criterionchannel.com/the-threepenny-opera
https://www.criterionchannel.com/three-resurrected-drunkards
https://www.criterionchannel.com/throne-of-blood
https://www.criterionchannel.com/through-a-glass-darkly
https://www.criterionchannel.com/through-the-olive-trees
https://www.criterionchannel.com/thru-the-wire
https://www.criterionchannel.com/thus-another-day
https://www.criterionchannel.com/tidy-up
https://www.criterionchannel.com/tie-me-up-tie-me-dow

https://www.criterionchannel.com/a-well-spent-life
https://www.criterionchannel.com/we-re-going-to-the-zoo
https://www.criterionchannel.com/werner-herzog-eats-his-shoe
https://www.criterionchannel.com/western
https://www.criterionchannel.com/westfront-1918
https://www.criterionchannel.com/we-won-t-grow-old-together
https://www.criterionchannel.com/what-did-the-lady-forget
https://www.criterionchannel.com/what-s-a-nice-girl-like-you-doing-in-a-place-like-this
https://www.criterionchannel.com/when-angels-fall
https://www.criterionchannel.com/when-a-woman-ascends-the-stairs
https://www.criterionchannel.com/when-it-rains
https://www.criterionchannel.com/when-pigs-fly
https://www.criterionchannel.com/when-we-lived-in-miami
https://www.criterionchannel.com/where-is-the-friend-s-house
https://www.criterionchannel.com/where-now-are-the-dreams-of-youth
https://www.criterionchannel.com/where-the-world-meets
https://www.criterionchannel.com/the-white-angel
https://www.criterionchannel.com/the-whi

In [None]:
# Open pre-scraped duration file
# Forward slash works on every operating system and is the same path the scraping cell writes to
with open('data/Durations.txt') as file:
    # One duration per line, without the line endings
    durations = file.read().splitlines()

In [None]:
# Insert duration column
# Placed at column position 4; raises if a 'Duration' column already exists
data.insert(loc = 4, column = 'Duration', value = durations)

In [None]:
# Remove whitespace
# Regex replacement that deletes every space character inside each duration string
spaceless = data['Duration'].replace(to_replace = ' ', value = '', regex = True)
data['Duration'] = spaceless

In [None]:
# Remove seconds
# Cuts the last three characters off every duration (presumably ':SS' - confirm against the file)
data['Duration'] = data['Duration'].str.slice(stop = -3)

In [None]:
# Add '0:' to indicate 0 hours for all films < 1 hour
# Boolean mask of the durations that have no colon left, i.e. no hours part
under_an_hour = ~data['Duration'].str.contains(':', regex = False)
# One vectorised assignment through the mask; it does not depend on the
# row labels being 0..n-1 the way a positional loop counter with .loc does
data.loc[under_an_hour, 'Duration'] = '0:' + data.loc[under_an_hour, 'Duration']

In [None]:
# Split duration by colon
# expand = True returns a frame with one column per piece (column 0, column 1, ...)
hours_minutes = data['Duration'].str.split(pat = ':', expand = True)

In [None]:
# Insert hours and minutes columns
# Piece 0 of the split becomes 'Hours' at position 5, piece 1 becomes 'Minutes' at position 6
for piece, column in enumerate(('Hours', 'Minutes')):
    data.insert(5 + piece, column, hours_minutes[piece])
    # The pieces are strings; convert them to integers
    data[column] = data[column].astype(int)

In [None]:
# Calculate and insert total hours
# Whole-column arithmetic: hours plus minutes as a fraction of an hour, rounded to 2 decimals.
# This replaces one row lookup per film and does not depend on the row labels being 0..n-1.
total_hours = (data['Hours'].astype(int) + data['Minutes'].astype(int) / 60).round(2).tolist()
data.insert(7, 'Total Hours', total_hours)
# Drop old columns
data = data.drop(['Duration', 'Minutes', 'Hours'], axis = 1)

In [None]:
# # Scrape descriptions, do not run this, it takes a long time
# descriptions = []
# for url in data['Url']:
#     request = requests.get(url)
#     soup = BeautifulSoup(request.content, 'html.parser')
#     paragraphs = soup.findAll('p')
#     paragraphs = paragraphs[1]
#     string = []
#     for x in paragraphs:
#         string.append(str(x))
#     descriptions.append(string[0])
#     print(url)
# # Save to csv (list is incorrectly loaded as text file)
# descriptions = pd.DataFrame({'Description': descriptions})
# descriptions.to_csv('data/Descriptions.csv', index = False)

In [None]:
# Open pre-scraped description file
# Forward slash works on every operating system; a backslash separator only resolves on Windows
descriptions = pd.read_csv('data/Descriptions.csv')

In [None]:
# Insert description column
# Placed at column position 5; raises if a 'Description' column already exists
data.insert(loc = 5, column = 'Description', value = descriptions)

In [None]:
# Keep only films longer than 1 hour
# (the comparison is strict, so films of exactly 1 hour are removed as well)
longer_than_an_hour = data['Total Hours'].gt(1)
data = data[longer_than_an_hour]

In [None]:
# Save to csv
# Forward slash works on every operating system; with a backslash, non-Windows
# systems write a file literally named 'data\Criterion.csv' into the working directory
data.to_csv('data/Criterion.csv', index = False)

In [None]:
# Preview the first five rows of the finished table
data.head(n = 5)

In [None]:
# Director figure: the ten directors with the most films
from matplotlib import pyplot
import seaborn
dim = (20, 5)
fig, ax = pyplot.subplots(figsize = dim)
# Draw on the axes created above explicitly instead of relying on the implicit current axes
ax = seaborn.countplot(x = data.Director, order = data.Director.value_counts().iloc[:10].index, ax = ax) # Sort by value_count
# Rotate the tick labels after plotting, once the category ticks exist
ax.tick_params(axis = 'x', labelrotation = 22.5)

In [None]:
# Country figure: the ten countries with the most films
dim = (20, 5)
fig, ax = pyplot.subplots(figsize = dim)
# Draw on the axes created above explicitly instead of relying on the implicit current axes
ax = seaborn.countplot(x = data.Country, order = data.Country.value_counts().iloc[:10].index, ax = ax) # Sort by value_count
# Rotate the tick labels after plotting, once the category ticks exist
ax.tick_params(axis = 'x', labelrotation = 22.5)

In [None]:
# Create decade column
import numpy as np
# Year divided by ten and rounded down, kept as a float
# NOTE(review): this yields values such as 197.0 for 1970, not 1970 itself - confirm that is intended
year_tenths = data['Year'].astype(int) / 10
data.insert(4, 'Decade', np.floor(year_tenths))

In [None]:
# Decade figure: the ten decades with the most films
dim = (20, 5)
fig, ax = pyplot.subplots(figsize = dim)
# Draw on the axes created above explicitly instead of relying on the implicit current axes
ax = seaborn.countplot(x = data.Decade, order = data.Decade.value_counts().iloc[:10].index, ax = ax) # Sort by value_count
# Rotate the tick labels after plotting, once the category ticks exist
ax.tick_params(axis = 'x', labelrotation = 22.5)