In [1]:
from datetime import date, datetime, timedelta
from itertools import cycle
import re
import time
import urllib

from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import requests
from titlecase import titlecase

# to render HTML in Pandas DataFrame output
from IPython.display import display, HTML

## Brattle

In [2]:
url = 'https://brattlefilm.org/coming-soon/'
response = requests.get(url)
text = response.text

soup = BeautifulSoup(text, 'html.parser')

show_details = soup.find_all('div', 'show-details')

In [3]:
get_dates_raw = lambda x: [' '.join(d.text.split()[1:]) for d in x.find_all('li', 'show-date')]
get_dates = lambda x: [datetime.strptime(d, "%b %d").strftime('%m-%d') for d in get_dates_raw(x)]
get_times = lambda x: [t.text.strip().split('\t')[0].upper() for t in x.find_all('a', 'showtime')]
get_title = lambda x: x.find('a', 'title').text
get_link = lambda x: x.find('a', 'title')['href']
get_poster = lambda x: x.find('div', 'show-poster').img['src']

In [4]:
brattle_df = pd.DataFrame([{'theater': 'Brattle',
                            'title': get_title(x),
                            'date': d,
                            'time': t,
                            'link': get_link(x),
                            'poster': get_poster(x)}
                           for x in show_details for d,t in list(zip(cycle(get_dates(x)), get_times(x)))])

brattle_df.to_csv('data/brattle.csv', index=False)

In [5]:
brattle_df = pd.read_csv('data/brattle.csv')
brattle_df.head()

Unnamed: 0,theater,title,date,time,link,poster
0,Brattle,Judgment at Nuremberg,08-22,3:00 PM,https://brattlefilm.org/movies/judgment-at-nur...,https://s3.amazonaws.com/nightjarprod/content/...
1,Brattle,Judgment at Nuremberg,08-22,6:30 PM,https://brattlefilm.org/movies/judgment-at-nur...,https://s3.amazonaws.com/nightjarprod/content/...
2,Brattle,A Child Is Waiting,08-23,4:30 PM,https://brattlefilm.org/movies/a-child-is-wait...,https://s3.amazonaws.com/nightjarprod/content/...
3,Brattle,A Child Is Waiting,08-23,6:45 PM,https://brattlefilm.org/movies/a-child-is-wait...,https://s3.amazonaws.com/nightjarprod/content/...
4,Brattle,Harvard Book Store presents R.F. Kuang,08-24,6:00 PM,https://brattlefilm.org/special_events/harvard...,https://s3.amazonaws.com/nightjarprod/content/...


# Coolidge

In [6]:
N = 60 # Number of days out to scrape
start_date = date.today()
cur_date = start_date
dates = [(start_date + timedelta(days=n)).strftime('%Y-%m-%d') for n in range(N)]
coolidge_url = 'https://coolidge.org/showtimes?date={date}'

In [7]:
def film_soup(url):
    c_results = requests.get(url)
    c_soup = BeautifulSoup(c_results.text, 'html.parser')
    c_films = (c_soup.find_all('div', 'film-card'))
    # filter out weird 'ResultSet' objects appearing in results
    c_films = [f for f in c_films if (str(type(f)) == "<class 'bs4.element.Tag'>")] 
    return c_films

In [8]:
get_title_raw = lambda x: x.find('div', 'film-card__title').text.strip()
get_title = lambda x: re.sub(r'Masked Matinees:', '', get_title_raw(x)).strip()
get_times = lambda x: [t.text.strip() for t in x.find_all('span', 'showtime-ticket__time')]
get_links = lambda x: [t['href'] for t in x.find_all('a', 'showtime-ticket__button')]
get_poster = lambda x: f"https://www.coolidge.org{x.picture.find_all('source')[-1]['srcset'].split()[0]}"

def get_director(x):
    info_link = x.a['href']
    # events have a 'wrapper' page around the movie info itself so update the link
    if info_link.startswith('/event'):
        event_page = requests.get('http://coolidge.org' + info_link)
        event_soup = BeautifulSoup(event_page.text, 'html.parser')
        # return empty string if updating link fails
        try:
            info_link = event_soup.find(class_='film-card__link')['href']
            assert info_link != 'view' # this happens when the Coolidge has dead links!
        except:
            print('Director not found:', info_link)
            return ''
    info_page = requests.get('http://coolidge.org' + info_link)
    info_soup = BeautifulSoup(info_page.text, 'html.parser')
    # return director's name or empty string if not found
    try:
        director = info_soup.find(class_='field--name-field-film-director').find_all('div')[-1].text.strip()
    except:
        print('Director not found:', info_link)
        return ''
    return director

In [9]:
coolidge_shows = []
for d in dates:
    url =  coolidge_url.format(date=d)
    soup = film_soup(url)
    coolidge_shows.extend([{'theater': 'Coolidge',
                            'title': get_title(x),
                            'date': datetime.strptime(d, "%Y-%m-%d").strftime('%m-%d'),
                            'time': re.sub('(am|pm)', ' \g<1>', t).upper(),
                            'link': l,
                            'poster': get_poster(x),
                            'info': get_director(x)} # used to help RT query
                           for x in soup for t, l in zip(get_times(x), get_links(x))])
    time.sleep(0.25)
                           # for t in get_times(x) for x in soup])

Director not found: /node/43661
Director not found: /node/43661
Director not found: view


In [10]:
coolidge_df = pd.DataFrame(coolidge_shows)
coolidge_df.to_csv('data/coolidge.csv', index=False)

In [11]:
coolidge_df = pd.read_csv('data/coolidge.csv')
coolidge_df.head()

Unnamed: 0,theater,title,date,time,link,poster,info
0,Coolidge,Nope,08-22,3:30 PM,https://store.coolidge.org/websales/pages/tick...,https://www.coolidge.org/sites/default/files/s...,Jordan Peele
1,Coolidge,Nope,08-22,6:45 PM,https://store.coolidge.org/websales/pages/tick...,https://www.coolidge.org/sites/default/files/s...,Jordan Peele
2,Coolidge,Nope,08-22,9:30 PM,https://store.coolidge.org/websales/pages/tick...,https://www.coolidge.org/sites/default/files/s...,Jordan Peele
3,Coolidge,Emily the Criminal,08-22,4:15 PM,https://store.coolidge.org/websales/pages/tick...,https://www.coolidge.org/sites/default/files/s...,John Patton Ford
4,Coolidge,Emily the Criminal,08-22,6:30 PM,https://store.coolidge.org/websales/pages/tick...,https://www.coolidge.org/sites/default/files/s...,John Patton Ford


## Somerville Theater

In [12]:
s_url = 'https://www.somervilletheatre.com/wp-content/themes/somerville/showtimes.xml'
s_results = requests.get(s_url)

s_soup = BeautifulSoup(s_results.text, 'xml')
s_soup = s_soup.find_all('filmtitle')

In [13]:
get_title_raw = lambda x: titlecase(x.find('name').text)
get_title = lambda x: re.sub(r'(\d\dm{1,2})|(4k)|(Masked Matinees:)', '', get_title_raw(x)).strip()
get_time_raw = lambda x: x.find('show').find('time').text 
get_time = lambda x: datetime.strptime(str(int(get_time_raw(x))//100), '%H').strftime('%l:%M %p').strip()
get_date_raw = lambda x: x.find('show').find('date').text 
get_date = lambda x: datetime.strptime(get_date_raw(x), '%m%d%Y').strftime('%m-%d')
get_link = lambda x: x.show.salelink.text

In [14]:
somerville_df = pd.DataFrame([{'theater': 'Somerville',
                                 'title': get_title(x), 
                                 'date': get_date(x),
                                 'time': get_time(x),
                                 'link': get_link(x),
                                } for x in s_soup])

somerville_df.to_csv('data/somerville.csv', index=False)

In [15]:
somerville_df = pd.read_csv('data/somerville.csv')
somerville_df

Unnamed: 0,theater,title,date,time,link
0,Somerville,Bodies Bodies Bodies,08-22,7:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...
1,Somerville,Crooklyn,08-24,9:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...
2,Somerville,Dirty Dancing,08-22,7:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...
3,Somerville,Do the Right Thing,08-24,7:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...
4,Somerville,Friday the 13th,08-27,11:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...
5,Somerville,Midsommar,08-28,1:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...
6,Somerville,Nashville,08-30,7:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...
7,Somerville,Nope,08-22,7:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...
8,Somerville,Now and Then,08-22,9:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...
9,Somerville,On Golden Pond,08-26,7:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...


# Merging 

In [16]:
df = pd.concat([brattle_df, coolidge_df, somerville_df]).sort_values(['date', 'time'])
df['info'] = df['info'].fillna('')
df[:25]

Unnamed: 0,theater,title,date,time,link,poster,info
0,Brattle,Judgment at Nuremberg,08-22,3:00 PM,https://brattlefilm.org/movies/judgment-at-nur...,https://s3.amazonaws.com/nightjarprod/content/...,
0,Coolidge,Nope,08-22,3:30 PM,https://store.coolidge.org/websales/pages/tick...,https://www.coolidge.org/sites/default/files/s...,Jordan Peele
3,Coolidge,Emily the Criminal,08-22,4:15 PM,https://store.coolidge.org/websales/pages/tick...,https://www.coolidge.org/sites/default/files/s...,John Patton Ford
6,Coolidge,Bodies Bodies Bodies,08-22,4:45 PM,https://store.coolidge.org/websales/pages/tick...,https://www.coolidge.org/sites/default/files/s...,Halina Reijn
1,Brattle,Judgment at Nuremberg,08-22,6:30 PM,https://brattlefilm.org/movies/judgment-at-nur...,https://s3.amazonaws.com/nightjarprod/content/...,
4,Coolidge,Emily the Criminal,08-22,6:30 PM,https://store.coolidge.org/websales/pages/tick...,https://www.coolidge.org/sites/default/files/s...,John Patton Ford
1,Coolidge,Nope,08-22,6:45 PM,https://store.coolidge.org/websales/pages/tick...,https://www.coolidge.org/sites/default/files/s...,Jordan Peele
9,Coolidge,Labyrinth,08-22,7:00 PM,https://store.coolidge.org/websales/pages/tick...,https://www.coolidge.org/sites/default/files/s...,Jim Henson
0,Somerville,Bodies Bodies Bodies,08-22,7:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...,,
2,Somerville,Dirty Dancing,08-22,7:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...,,


## Rotten Tomates for poster art and rating

The movie nameentires on the theater sites are sometimes sloppy and the RT search functionality is very limited.
Output below tries to identify (some) possible false matches.

The is very hacky and still grabs some wrongs posters and ratings, usually when there are multiple (rated) movies with the same title.

RT has their search results sorted by year. This stinks. Querying "The Thing" doesn't even show the cult classic 1986 Carpenter movie until the 2nd page of results! 

I'm probably just going to throw in the towel trying to scrape this info in the future and instead try an API or leverarge work other people have already done like this: https://pypi.org/project/rotten_tomatoes_client/

In [17]:
rt_df = pd.read_csv('data/rt.csv')
# rt_df.head()

In [18]:
def get_rt_info(query):
    N = 10 # number of entries to consider
    query_parsed = urllib.parse.quote(query)
    url = "https://rottentomatoes.com/search?search='{query_parsed}'"
    url = url.format(query_parsed=query_parsed)
    result = requests.get(url)
    soup = BeautifulSoup(result.content, 'html.parser')
    movie_section = soup.find('h2', attrs={'data-qa': 'search-result-title'}, text='Movies')
    listings = movie_section.find_next_sibling('ul').find_all('search-page-media-row')
    scores = [int(m['tomatometerscore']) if m['tomatometerscore'].isnumeric() else 0 for m in listings]
    # accept first exact title match that has a rating or highest rated listing if no exact match
    try:
        rated_listings = [x[0] for x in sorted(zip(listings[:N], scores), key=lambda x: x[1], reverse=True) if x[1] > 0]
        titles = [x.text.strip().lower() for x in rated_listings]
        entry = rated_listings[titles.index(query.split('|')[0].strip().lower())]
        exact_match = True
    except:
        entry = listings[np.argmax(scores[:N])]
        exact_match = False
        print(f"No exact match for \"{query}\", usings \"{entry.text.strip()}\" instead")
    try:
        rt_info = {
            'title': entry.text.strip(),
            'poster': entry.img['src'],
            'year': entry['releaseyear'],
            'cast': entry['cast'],
            'score': entry['tomatometerscore'], # technically redundant
            'cert': entry['tomatometerstate'],
            'rt_link': entry.a['href'],
            'exact_match': exact_match,
            'query_str': query # the string query that resulted in this info (for caching)
            }
    except:
        return None
    return rt_info

In [19]:
%%time
# example
get_rt_info("Vengeance | Novak")

CPU times: user 98.2 ms, sys: 266 µs, total: 98.4 ms
Wall time: 443 ms


{'title': 'Vengeance',
 'poster': 'https://resizing.flixster.com/YQlnTBaOXZazET7DPeIiueFGsDE=/fit-in/80x126/v2/https://resizing.flixster.com/xrx35GfPi6i2xPLrmj30cYIKO4c=/ems.cHJkLWVtcy1hc3NldHMvbW92aWVzLzdiZTdkNDdlLWE5M2EtNDNhNC1hYWE2LTljYzJjYjc3ZDc2OC5qcGc=',
 'year': '2022',
 'cast': 'B.J. Novak,Issa Rae,Ashton Kutcher',
 'score': '79',
 'cert': 'certified-fresh',
 'rt_link': 'https://www.rottentomatoes.com/m/vengeance_2022',
 'exact_match': True,
 'query_str': 'Vengeance | Novak'}

In [20]:
%%time
SLEEP_TIME = 0.1 # let's be nice and not hammer the servers too hard
no_dupes = df.drop_duplicates(['title', 'info'])
rt_data = []
for title, info in zip(no_dupes['title'], no_dupes['info']):
    # title and info delimited by '|' so it can be decomposed later
    query = f'{title} | {info}'.strip()
    try:
        d = get_rt_info(query)
        d.update(listing_title=title)
        rt_data.append(d)
    except:
        print(title, info)
    time.sleep(SLEEP_TIME)

No exact match for "Harvard Book Store presents R.F. Kuang |", usings "Hello, Bookstore" instead
No exact match for "Kill Bill Vol. 1 and Vol. 2 | Quentin Tarantino", usings "A One and a Two..." instead
No exact match for "Rock ‘n’ Roll High School |", usings "Rock 'n' Roll High School" instead
No exact match for "The Origins of American Independent Cinema



Part of The Origins of American Independent Cinema |", usings "The Decline of Western Civilization Part II: The Metal Years" instead
No exact match for "X (2022) | Ti West", usings "X" instead
No exact match for "Road Warrior (aka Mad Max 2) |", usings "Mad Max: Fury Road" instead
No exact match for "The Texas Chainsaw Massacre 2 | Tobe Hooper", usings "The Texas Chainsaw Massacre Part 2" instead
No exact match for "Season of the Witch '72 |", usings "Season of the Devil (Ang Panahon ng Halimaw)" instead
No exact match for "Straight Line Crazy | Nicholas Hytner", usings "The Lady in the Van" instead
No exact match for "Shaft (1971

In [21]:
rt_df = pd.DataFrame(rt_data)
rt_df.to_csv('data/rt.csv', index=False)

In [22]:
rt_df = pd.read_csv('data/rt.csv')
rt_df.head()

Unnamed: 0,title,poster,year,cast,score,cert,rt_link,exact_match,query_str,listing_title
0,Judgment at Nuremberg,https://resizing.flixster.com/aOTyYtddK0xgxGIT...,1961,"Spencer Tracy,Burt Lancaster,Richard Widmark",92,fresh,https://www.rottentomatoes.com/m/judgment_at_n...,True,Judgment at Nuremberg |,Judgment at Nuremberg
1,Nope,https://resizing.flixster.com/RqUUghr84Wd1Mdr_...,2022,"Daniel Kaluuya,Keke Palmer,Steven Yeun",82,certified-fresh,https://www.rottentomatoes.com/m/nope,True,Nope | Jordan Peele,Nope
2,Emily the Criminal,https://resizing.flixster.com/i7JSTWsBKyKdjQKi...,2022,"Aubrey Plaza,Theo Rossi,Jonathan Avigdori",93,certified-fresh,https://www.rottentomatoes.com/m/emily_the_cri...,True,Emily the Criminal | John Patton Ford,Emily the Criminal
3,Bodies Bodies Bodies,https://resizing.flixster.com/wUsCy5Orzb86Dexq...,2022,"Amandla Stenberg,Maria Bakalova,Myha'la Herrold",88,certified-fresh,https://www.rottentomatoes.com/m/bodies_bodies...,True,Bodies Bodies Bodies | Halina Reijn,Bodies Bodies Bodies
4,Labyrinth,https://resizing.flixster.com/jNnX_xp8w6xznQPx...,1986,"David Bowie,Jennifer Connelly,Toby Froud",75,fresh,https://www.rottentomatoes.com/m/labyrinth,True,Labyrinth | Jim Henson,Labyrinth


In [38]:
merged = pd.merge(df, rt_df, left_on='title', right_on='listing_title')

In [39]:
merged.columns

Index(['theater', 'title_x', 'date', 'time', 'link', 'poster_x', 'info',
       'title_y', 'poster_y', 'year', 'cast', 'score', 'cert', 'rt_link',
       'exact_match', 'query_str', 'listing_title'],
      dtype='object')

In [40]:
merged = merged.drop(['listing_title', 'poster_x', 'title_y'], axis=1)\
               .rename(columns={'title_x': 'title', 'poster_y': 'poster'})

In [46]:
output = merged[['theater', 'poster', 'exact_match', 'title', 'year', 'date', 'time', 'link', 'score', 'rt_link']].sort_values(['date','time'])
output= output.drop_duplicates()

In [47]:
# add HTML to poster and ticket & RT link strings so they 
# work when displayed in the DataFrame
output['exact_match'] = output.exact_match.apply(lambda x: '?' if not x else ' ')
output['poster']=  '''<img src="''' + output['poster'] + '''">''' 
output['link'] = output.link.apply(lambda x: f"<a href={x}>🎟️</a>")
output['rt_link'] = output.rt_link.apply(lambda x: f"<a href={x}>🍅</a>")

In [48]:
output.to_csv("data/showtimes.csv")

# Showtimes 

In [50]:
html_output = r'<link rel="stylesheet" type="text/css" media="screen" href="stylesheets/main.css" />' + '\n'

html_output += output.to_html(render_links=True, escape=False, index=False, justify='center')
with open('docs/index.html', 'w') as f:
    f.write(html_output)

In [51]:
HTML(output.to_html(render_links=True, escape=False, index=False, justify='center'))

theater,poster,exact_match,title,year,date,time,link,score,rt_link
Brattle,,,Judgment at Nuremberg,1961,08-22,3:00 PM,🎟️,92,🍅
Coolidge,,,Nope,2022,08-22,3:30 PM,🎟️,82,🍅
Coolidge,,,Emily the Criminal,2022,08-22,4:15 PM,🎟️,93,🍅
Coolidge,,,Bodies Bodies Bodies,2022,08-22,4:45 PM,🎟️,88,🍅
Brattle,,,Judgment at Nuremberg,1961,08-22,6:30 PM,🎟️,92,🍅
Coolidge,,,Emily the Criminal,2022,08-22,6:30 PM,🎟️,93,🍅
Coolidge,,,Nope,2022,08-22,6:45 PM,🎟️,82,🍅
Somerville,,,Nope,2022,08-22,7:00 PM,🎟️,82,🍅
Somerville,,,Bodies Bodies Bodies,2022,08-22,7:00 PM,🎟️,88,🍅
Coolidge,,,Labyrinth,1986,08-22,7:00 PM,🎟️,75,🍅


**Different movies playing today**

In [31]:
df[df['date'] == date.today().strftime('%m-%d')].title.unique()

array(['Judgment at Nuremberg', 'Nope', 'Emily the Criminal',
       'Bodies Bodies Bodies', 'Labyrinth', 'Dirty Dancing', 'Vengeance',
       'Now and Then'], dtype=object)