# [Jump to Movie Showtimes](#Showtimes)

In [1]:
import html
import re
import time
import urllib
import urllib.parse
from datetime import date, datetime, timedelta

import numpy as np
import pandas as pd
import requests
import titlecase
from bs4 import BeautifulSoup
from bs4.element import Tag
from IPython.display import display, HTML

In [2]:
# Download the Brattle "coming soon" listing page.
url = 'https://brattlefilm.org/coming-soon/'
# The timeout keeps a stalled connection from hanging the notebook forever, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [3]:
# Decoded body of the Brattle response; show the first 300 characters as a sanity check.
text = response.text
text[:300]

'<!DOCTYPE html>\n<html lang="en-US">\n\n<head>\n\n\n\t<meta charset="UTF-8">\n\t<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">\n\t<meta name="viewport" content="width=device-width, initial-scale=1">\n\t<link rel="profile" href="http://gmpg.org/xfn/11">\n\t<link rel="pingback" href="https://brattlef'

In [4]:
# Parse the Brattle HTML into a searchable tree.
soup = BeautifulSoup(text, 'html.parser')

In [5]:
# Every <div> with CSS class 'show-details'; each one is treated below as one show.
show_details = soup.find_all('div', 'show-details')

In [6]:
# Brattle field extractors. Each takes one 'show-details' element.
# NOTE: get_title, get_times, get_link and get_poster are re-bound further down
# for the other theaters, so re-run this cell before re-building brattle_df.

def get_dates_raw(x):
    """Return the text of each 'show-date' list item with its first word dropped."""
    return [' '.join(d.text.split()[1:]) for d in x.find_all('li', 'show-date')]


def get_dates(x):
    """Return the show dates as zero-padded 'MM-DD' strings.

    The listing text carries no year. A fixed leap year is appended before
    parsing so that "Feb 29" is accepted: strptime's implicit year (1900) is
    not a leap year and would reject it.
    """
    return [datetime.strptime(f"{d} 2000", "%b %d %Y").strftime('%m-%d')
            for d in get_dates_raw(x)]


def get_times(x):
    """Return the upper-cased text before the first tab of each 'showtime' link."""
    return [t.text.strip().split('\t')[0].upper() for t in x.find_all('a', 'showtime')]


def get_title(x):
    """Return the text of the show's 'title' link."""
    return x.find('a', 'title').text


def get_link(x):
    """Return the href of the show's 'title' link."""
    return x.find('a', 'title')['href']


def get_poster(x):
    """Return the src of the image inside the show's 'show-poster' div."""
    return x.find('div', 'show-poster').img['src']

In [7]:
# Flatten the scraped shows into one row per (date, time) pair. zip() pairs
# the two lists positionally and stops at the shorter one.
brattle_df = pd.DataFrame([
    dict(theater='Brattle',
         title=get_title(show),
         date=show_date,
         time=show_time,
         link=get_link(show),
         poster=get_poster(show))
    for show in show_details
    for show_date, show_time in zip(get_dates(show), get_times(show))
])

In [8]:
# Save the scrape; index=False leaves the row index out of the file.
# NOTE(review): assumes the data/ directory already exists — TODO confirm.
brattle_df.to_csv('data/brattle.csv', index=False)

In [9]:
# Reload the saved CSV and preview the first rows.
brattle_df = pd.read_csv('data/brattle.csv')
brattle_df.head()

Unnamed: 0,theater,title,date,time,link,poster
0,Brattle,Lost Highway,07-03,9:30 PM,https://brattlefilm.org/movies/lost-highway/,https://s3.amazonaws.com/nightjarprod/content/...
1,Brattle,Pee-wee’s Big Adventure,07-04,1:00 PM,https://brattlefilm.org/movies/pee-wees-big-ad...,https://s3.amazonaws.com/nightjarprod/content/...
2,Brattle,Midnight Run,07-05,3:45 PM,https://brattlefilm.org/movies/midnight-run/,https://s3.amazonaws.com/nightjarprod/content/...
3,Brattle,Harvard Book Store presents Patrick Radden Keefe,07-06,6:00 PM,https://brattlefilm.org/special_events/harvard...,https://s3.amazonaws.com/nightjarprod/content/...
4,Brattle,Queen & Slim,07-07,3:45 PM,https://brattlefilm.org/movies/queen-slim/,https://s3.amazonaws.com/nightjarprod/content/...


# Coolidge

In [10]:
N = 60 # Number of days to scrape, starting today
start_date = date.today()
cur_date = start_date  # NOTE(review): not referenced anywhere else in the visible notebook
# 'YYYY-MM-DD' strings for today and the following N-1 days.
dates = [(start_date + timedelta(days=n)).strftime('%Y-%m-%d') for n in range(N)]
# Per-day showtimes URL; {date} is filled in with one of the strings above.
coolidge_url = 'https://coolidge.org/showtimes?date={date}'

In [11]:
def film_soup(url, timeout=30):
    """Download a showtimes page and return its film-card elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the server before giving up, so that one
            stalled request cannot hang a scrape that loops over many days.

    Returns:
        A list of the ``<div class="film-card">`` tags found on the page
        (empty when there are none).

    Raises:
        requests.exceptions.RequestException: if the request fails or times out.
    """
    c_results = requests.get(url, timeout=timeout)
    c_soup = BeautifulSoup(c_results.text, 'html.parser')
    # Keep real tags only: stray 'ResultSet' objects have been seen among the results.
    return [f for f in c_soup.find_all('div', 'film-card') if isinstance(f, Tag)]

In [14]:
# Coolidge field extractors. Each takes one 'film-card' element.

def get_title_raw(x):
    """Return the stripped text of the card's 'film-card__title' div."""
    title_div = x.find('div', 'film-card__title')
    return title_div.text.strip()


def get_title(x):
    """Return the card title with any 'Masked Matinees:' label removed."""
    without_label = re.sub(r'Masked Matinees:', '', get_title_raw(x))
    return without_label.strip()


def get_times(x):
    """Return the stripped text of every 'showtime-ticket__time' span."""
    times = []
    for span in x.find_all('span', 'showtime-ticket__time'):
        times.append(span.text.strip())
    return times


def get_links(x):
    """Return the href of every 'showtime-ticket__button' link."""
    links = []
    for button in x.find_all('a', 'showtime-ticket__button'):
        links.append(button['href'])
    return links


def get_poster(x):
    """Return an absolute URL built from the first srcset entry of the card's last <source>."""
    last_source = x.picture.find_all('source')[-1]
    first_candidate = last_source['srcset'].split()[0]
    return f"https://www.coolidge.org{first_candidate}"

In [15]:
# Scrape one showtimes page per day and flatten each into one row per
# (film, showtime) pair.
coolidge_shows = []
for d in dates:
    url = coolidge_url.format(date=d)
    # Named film_cards (not `soup`) so the parsed Brattle page is not overwritten.
    film_cards = film_soup(url)
    # The page date is the same for every row of the day, so convert it once.
    show_date = datetime.strptime(d, "%Y-%m-%d").strftime('%m-%d')
    coolidge_shows.extend(
        {'theater': 'Coolidge',
         'title': get_title(x),
         'date': show_date,
         # Inserts a space before am/pm ("7:00pm" -> "7:00 PM"). The replacement
         # is a raw string because '\g' is not a valid str escape sequence.
         'time': re.sub('(am|pm)', r' \g<1>', t).upper(),
         'link': l,
         'poster': get_poster(x)}
        for x in film_cards for t, l in zip(get_times(x), get_links(x)))

In [16]:
# Collect the scraped rows into a DataFrame and save them without the row index.
# NOTE(review): assumes the data/ directory already exists — TODO confirm.
coolidge_df = pd.DataFrame(coolidge_shows)
coolidge_df.to_csv('data/coolidge.csv', index=False)

In [17]:
# Reload the saved CSV and preview the first rows.
coolidge_df = pd.read_csv('data/coolidge.csv')
coolidge_df.head()

Unnamed: 0,theater,title,date,time,link,poster
0,Coolidge,Marcel the Shell with Shoes On,07-03,11:00 AM,https://store.coolidge.org/websales/pages/Tick...,https://www.coolidge.org/sites/default/files/s...
1,Coolidge,Mr. Malcolm’s List,07-03,11:30 AM,https://store.coolidge.org/websales/pages/Tick...,https://www.coolidge.org/sites/default/files/s...
2,Coolidge,Mr. Malcolm’s List,07-03,2:00 PM,https://store.coolidge.org/websales/pages/Tick...,https://www.coolidge.org/sites/default/files/s...
3,Coolidge,Mr. Malcolm’s List,07-03,4:30 PM,https://store.coolidge.org/websales/pages/Tick...,https://www.coolidge.org/sites/default/files/s...
4,Coolidge,Mr. Malcolm’s List,07-03,7:00 PM,https://store.coolidge.org/websales/pages/Tick...,https://www.coolidge.org/sites/default/files/s...


## Somerville Theatre

In [18]:
# Download the Somerville showtimes XML feed.
s_url = 'https://www.somervilletheatre.com/wp-content/themes/somerville/showtimes.xml'
# The timeout keeps a stalled connection from hanging the notebook forever, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
s_results = requests.get(s_url, timeout=30)
s_results.raise_for_status()

In [19]:
# Parse the feed with BeautifulSoup's 'xml' backend.
s_soup = BeautifulSoup(s_results.text, 'xml')
# NOTE: s_soup is re-bound here from the parsed document to the list of its <filmtitle> elements.
s_soup = s_soup.find_all('filmtitle')

In [20]:
# Somerville field extractors. Each takes one <filmtitle> element of the feed.
# NOTE: get_title and get_link re-bind names defined earlier for the other theaters.
# NOTE(review): the time/date/link helpers read only the first <show> child of a
# film, so any further showings of the same film are not picked up — confirm intended.
# TODO: change titlecase import

def get_title_raw(x):
    """Return the film's <name> text converted to title case."""
    return titlecase.titlecase(x.find('name').text)


def get_title(x):
    """Return the title with two-digit gauge tags (e.g. '35mm'), '4k' and 'Masked Matinees:' removed."""
    return re.sub(r'(\d\dm{1,2})|(4k)|(Masked Matinees:)', '', get_title_raw(x)).strip()


def get_time_raw(x):
    """Return the raw <time> text of the film's first <show>."""
    return x.find('show').find('time').text


def get_time(x):
    """Return the first show's start time formatted like '7:30 PM'.

    The raw value is read as a 24-hour HHMM number (leading zeros optional).
    Both hour and minute are kept; dividing by 100 and parsing only the hour
    would turn a 19:30 show into "7:00 PM".
    """
    hours, minutes = divmod(int(get_time_raw(x)), 100)
    # %I is the portable zero-padded 12-hour field (%l is a platform extension),
    # so the leading zero is stripped by hand.
    return datetime(1900, 1, 1, hours, minutes).strftime('%I:%M %p').lstrip('0')


def get_date_raw(x):
    """Return the raw <date> text of the film's first <show>."""
    return x.find('show').find('date').text


def get_date(x):
    """Return the first show's date ('MMDDYYYY' in the feed) as 'MM-DD'."""
    return datetime.strptime(get_date_raw(x), '%m%d%Y').strftime('%m-%d')


def get_link(x):
    """Return the text of the first show's <salelink>."""
    return x.show.salelink.text

In [21]:
# Build one row per <filmtitle> element of the feed, then save the table.
somerville_df = pd.DataFrame([
    dict(theater='Somerville',
         title=get_title(film),
         date=get_date(film),
         time=get_time(film),
         link=get_link(film))
    for film in s_soup
])

somerville_df.to_csv('data/somerville.csv', index=False)

In [22]:
# Reload the saved CSV and display the whole table.
somerville_df = pd.read_csv('data/somerville.csv')
somerville_df

Unnamed: 0,theater,title,date,time,link
0,Somerville,Airport,07-03,7:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...
1,Somerville,Blazing Saddles,07-30,11:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...
2,Somerville,Camp,07-05,9:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...
3,Somerville,Coffy,07-12,5:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...
4,Somerville,Crooklyn,08-24,9:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...
5,Somerville,Dirty Dancing,08-22,7:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...
6,Somerville,Do the Right Thing,08-24,7:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...
7,Somerville,Dolemite Is My Name,07-23,11:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...
8,Somerville,Elvis,07-03,1:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...
9,Somerville,Friday the 13th,08-27,11:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...


# Merging 

In [23]:
# Combine the three theaters and order the listings chronologically.
# 'time' holds 12-hour strings such as "7:00 PM"; sorted as plain text,
# "7:00 PM" would land before "9:30 AM", so sort on a parsed helper column
# instead (values that do not parse become NaT and sort last within their date).
# NOTE(review): 'date' is a year-less 'MM-DD' string, so a scrape window that
# crosses New Year would still list December after January — confirm acceptable.
df = (
    pd.concat([brattle_df, coolidge_df, somerville_df])
    .assign(_sort_time=lambda frame: pd.to_datetime(frame['time'], format='%I:%M %p', errors='coerce'))
    .sort_values(['date', '_sort_time'])
    .drop(columns='_sort_time')
)
df[:50]

Unnamed: 0,theater,title,date,time,link,poster
0,Coolidge,Marcel the Shell with Shoes On,07-03,11:00 AM,https://store.coolidge.org/websales/pages/Tick...,https://www.coolidge.org/sites/default/files/s...
1,Coolidge,Mr. Malcolm’s List,07-03,11:30 AM,https://store.coolidge.org/websales/pages/Tick...,https://www.coolidge.org/sites/default/files/s...
6,Coolidge,Elvis,07-03,12:30 PM,https://store.coolidge.org/websales/pages/Tick...,https://www.coolidge.org/sites/default/files/s...
9,Coolidge,Official Competition,07-03,1:00 PM,https://store.coolidge.org/websales/pages/Tick...,https://www.coolidge.org/sites/default/files/s...
8,Somerville,Elvis,07-03,1:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...,
38,Somerville,Top Gun Maverick,07-03,1:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...,
2,Coolidge,Mr. Malcolm’s List,07-03,2:00 PM,https://store.coolidge.org/websales/pages/Tick...,https://www.coolidge.org/sites/default/files/s...
30,Somerville,Spartacus,07-03,2:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...,
11,Coolidge,Marcel the Shell with Shoes On,07-03,2:15 PM,https://store.coolidge.org/websales/pages/Tick...,https://www.coolidge.org/sites/default/files/s...
15,Coolidge,Everything Everywhere All at Once,07-03,3:45 PM,https://store.coolidge.org/websales/pages/Tick...,https://www.coolidge.org/sites/default/files/s...


In [45]:
def get_rt_info(query, n_candidates=4, timeout=30):
    """Look up a film on Rotten Tomatoes and return details of the best match.

    Fetches the rottentomatoes.com search page for ``query``, looks at the
    first ``n_candidates`` rows of the "Movies" result section and picks the
    one with the highest Tomatometer score (a missing or non-numeric score
    counts as 0).

    Args:
        query: Film title to search for.
        n_candidates: How many leading search results to consider.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        A dict with the keys 'title', 'poster', 'year', 'cast', 'score',
        'cert' and 'rt_link' (all values as scraped strings), or None when
        the page has no movie results or the chosen row lacks a field.

    Raises:
        requests.exceptions.RequestException: if the request fails or times out.
    """
    # The single quotes around the search term are kept exactly as the notebook has always sent them.
    url = "https://rottentomatoes.com/search?search='{query}'"
    url = url.format(query=urllib.parse.quote(query))
    result = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(result.content, 'html.parser')

    movie_section = soup.find('h2', attrs={'data-qa': 'search-result-title'}, text='Movies')
    if movie_section is None:
        return None  # the search page has no "Movies" section
    result_list = movie_section.find_next_sibling('ul')
    if result_list is None:
        return None
    listings = result_list.find_all('search-page-media-row')[:n_candidates]
    if not listings:
        return None  # np.argmax would raise on an empty list

    raw_scores = [m.get('tomatometerscore', '') for m in listings]
    scores = [int(s) if s.isnumeric() else 0 for s in raw_scores]
    entry = listings[int(np.argmax(scores))]
    try:
        return {
            'title': entry.text.strip(),
            'poster': entry.img['src'],
            'year': entry['releaseyear'],
            'cast': entry['cast'],
            'score': entry['tomatometerscore'],  # technically redundant
            'cert': entry['tomatometerstate'],
            'rt_link': entry.a['href'],
        }
    except (KeyError, TypeError):
        # KeyError: an attribute is absent; TypeError: the row has no <img>/<a> child.
        return None

In [46]:
%%time
# Quick check with an inexact query (note the misspelling and missing hyphen).
get_rt_info("Pee wee's big aventure")

CPU times: user 99.5 ms, sys: 0 ns, total: 99.5 ms
Wall time: 1.05 s


{'title': "Pee-wee's Big Adventure",
 'poster': 'https://resizing.flixster.com/dX5asNZ4f_a6UPp8E9xS_IrZXqE=/fit-in/80x126/v2/https://flxt.tmsimg.com/assets/p8533_p_v8_ab.jpg',
 'year': '1985',
 'cast': 'Paul Reubens,Elizabeth Daily,Mark Holton',
 'score': '87',
 'cert': 'certified-fresh',
 'rt_link': 'https://www.rottentomatoes.com/m/peewees_big_adventure'}

In [47]:
%%time
# Look up every distinct listing title on Rotten Tomatoes, one request each.
titles = df.title.unique()
rt_data = []
for t in titles:
    try:
        d = get_rt_info(t)
    except Exception:
        # Deliberately not a bare `except:` — that would also swallow
        # KeyboardInterrupt and make this long loop impossible to stop.
        d = None
    if d is None:
        print(t)  # no usable match; list the title for manual follow-up
    else:
        d.update(listing_title=t)
        rt_data.append(d)
    time.sleep(0.25)  # brief pause between requests

Judgment at Nuremberg
CPU times: user 17 s, sys: 517 ms, total: 17.5 s
Wall time: 7min 45s


In [48]:
# Collect the lookups into a DataFrame and save them without the row index.
# NOTE(review): assumes the data/ directory already exists — TODO confirm.
rt_df = pd.DataFrame(rt_data)
rt_df.to_csv('data/rt.csv', index=False)

In [49]:
# Reload the saved CSV and preview the first rows.
rt_df = pd.read_csv('data/rt.csv')
rt_df.head()

Unnamed: 0,title,poster,year,cast,score,cert,rt_link,listing_title
0,Marcel the Shell with Shoes On,https://resizing.flixster.com/Zd9kpD_evBsdOqXQ...,2021,"Jenny Slate,Rosa Salazar,Thomas Mann",100.0,certified-fresh,https://www.rottentomatoes.com/m/marcel_the_sh...,Marcel the Shell with Shoes On
1,Mr. Malcolm's List,https://resizing.flixster.com/brGS1xAm7lsZweH7...,2022,"Freida Pinto,Sope Dirisu,Oliver Jackson-Cohen",80.0,fresh,https://www.rottentomatoes.com/m/mr_malcolms_list,Mr. Malcolm’s List
2,Elvis,https://resizing.flixster.com/Jyfwrykf_WmlzzjE...,2022,"Austin Butler,Tom Hanks,Helen Thomson",78.0,certified-fresh,https://www.rottentomatoes.com/m/elvis,Elvis
3,Official Competition,https://resizing.flixster.com/dTWt1cvRxbQfrGC_...,2021,"Penélope Cruz,Antonio Banderas,Oscar Martínez",96.0,certified-fresh,https://www.rottentomatoes.com/m/official_comp...,Official Competition
4,Top Gun: Maverick,https://resizing.flixster.com/mUdgPGdb-0EmWSiG...,2022,"Tom Cruise,Miles Teller,Jennifer Connelly",96.0,certified-fresh,https://www.rottentomatoes.com/m/top_gun_maverick,Top Gun Maverick


In [50]:
# Attach the Rotten Tomatoes details to each showtime by matching on the listing title.
# NOTE(review): pd.merge defaults to an inner join, so showtimes whose title has
# no row in rt_df are dropped here — confirm that is intended.
merged = pd.merge(df, rt_df, left_on='title', right_on='listing_title')

In [51]:
# Inspect the merged column names; columns present in both frames carry _x/_y suffixes.
merged.columns

Index(['theater', 'title_x', 'date', 'time', 'link', 'poster_x', 'title_y',
       'poster_y', 'year', 'cast', 'score', 'cert', 'rt_link',
       'listing_title'],
      dtype='object')

In [52]:
# Keep the theater's title and the Rotten Tomatoes poster; drop their duplicates
# and the join key.
merged = (
    merged
    .drop(columns=['listing_title', 'poster_x', 'title_y'])
    .rename(columns={'title_x': 'title', 'poster_y': 'poster'})
)

In [53]:
# Select the display columns and order the rows chronologically.
# 'time' holds 12-hour strings such as "7:00 PM"; sorted as plain text,
# "7:00 PM" would land before "9:30 AM", so sort on a parsed helper column
# (values that do not parse become NaT and sort last within their date).
output = (
    merged[['theater', 'poster', 'title', 'year', 'date', 'time', 'link', 'score', 'rt_link']]
    .assign(_sort_time=lambda frame: pd.to_datetime(frame['time'], format='%I:%M %p', errors='coerce'))
    .sort_values(['date', '_sort_time'])
    .drop(columns='_sort_time')
)

In [54]:
# Turn the URL columns into HTML snippets for the rendered table.
# NOTE: this rewrites `output` in place, so re-running the cell wraps the markup twice.
output['poster']=  '''<img src="''' + output['poster'] + '''">''' 
# href values are quoted and escaped: the ticket URLs carry query strings, and
# an unquoted attribute value may not safely contain characters such as '=' or '&'.
output['link'] = output.link.apply(lambda x: f'<a href="{html.escape(str(x), quote=True)}">🎟️</a>')
output['rt_link'] = output.rt_link.apply(lambda x: f'<a href="{html.escape(str(x), quote=True)}">🍅</a>')

# Showtimes 

In [55]:
# escape=False lets the <img>/<a> markup stored in the columns render as HTML rather than text.
HTML(output.to_html(render_links=True, escape=False, index=False, justify='center'))

theater,poster,title,year,date,time,link,score,rt_link
Coolidge,,Marcel the Shell with Shoes On,2021,07-03,11:00 AM,🎟️,100.0,🍅
Coolidge,,Mr. Malcolm’s List,2022,07-03,11:30 AM,🎟️,80.0,🍅
Coolidge,,Elvis,2022,07-03,12:30 PM,🎟️,78.0,🍅
Somerville,,Elvis,2022,07-03,1:00 PM,🎟️,78.0,🍅
Coolidge,,Official Competition,2021,07-03,1:00 PM,🎟️,96.0,🍅
Somerville,,Top Gun Maverick,2022,07-03,1:00 PM,🎟️,96.0,🍅
Coolidge,,Mr. Malcolm’s List,2022,07-03,2:00 PM,🎟️,80.0,🍅
Somerville,,Spartacus,1960,07-03,2:00 PM,🎟️,94.0,🍅
Coolidge,,Marcel the Shell with Shoes On,2021,07-03,2:15 PM,🎟️,100.0,🍅
Coolidge,,Everything Everywhere All at Once,2022,07-03,3:45 PM,🎟️,95.0,🍅


# Testing...

In [None]:
# Scratch value: a sample <img> tag with an unquoted poster URL.
s = '<img src=https://resizing.flixster.com/Zd9kpD_evBsdOqXQuVj8t0s4JYE=/fit-in/80x126/v2/https://resizing.flixster.com/y629Pkvr6tTHrzCEOd5XgLtGuxE=/ems.cHJkLWVtcy1hc3NldHMvbW92aWVzLzMyODliMzFlLTMwZmEtNDZjOC04Zjk5LWMwNTIyNGEzYjVlNC5qcGc=>'

In [None]:
# Scratch: fetch and parse a raw Rotten Tomatoes search page for manual inspection.
# NOTE: this re-binds the module-level name `soup`.
q = "The Wizard of Oz"
results = requests.get(f'https://rottentomatoes.com/search?search={q}')
soup = BeautifulSoup(results.content, 'html.parser')

In [None]:

# soup.find('ul', slot='list')

# soup.find('search-page-media-row')

**Different movies playing today**

In [None]:
# Distinct titles whose 'MM-DD' date equals today's date.
df[df['date'] == date.today().strftime('%m-%d')].title.unique()