In [1]:
from datetime import date, datetime, timedelta
import time
import re
import urllib
import numpy as np
import requests
import titlecase
from bs4 import BeautifulSoup
import pandas as pd
from IPython.display import display, HTML

## Brattle

In [2]:
url = 'https://brattlefilm.org/coming-soon/'
response = requests.get(url)
text = response.text

soup = BeautifulSoup(text, 'html.parser')

show_details = soup.find_all('div', 'show-details')

In [3]:
get_dates_raw = lambda x: [' '.join(d.text.split()[1:]) for d in x.find_all('li', 'show-date')]
get_dates = lambda x: [datetime.strptime(d, "%b %d").strftime('%m-%d') for d in get_dates_raw(x)]
get_times = lambda x: [t.text.strip().split('\t')[0].upper() for t in x.find_all('a', 'showtime')]
get_title = lambda x: x.find('a', 'title').text
get_link = lambda x: x.find('a', 'title')['href']
get_poster = lambda x: x.find('div', 'show-poster').img['src']

In [4]:
brattle_df = pd.DataFrame([{'theater': 'Brattle',
                            'title': get_title(x),
                            'date': d,
                            'time': t,
                            'link': get_link(x),
                            'poster': get_poster(x)}
                           for x in show_details for d,t in list(zip(get_dates(x), get_times(x)))])

brattle_df.to_csv('data/brattle.csv', index=False)

In [5]:
brattle_df = pd.read_csv('data/brattle.csv')
brattle_df.head(20)

Unnamed: 0,theater,title,date,time,link,poster
0,Brattle,El Topo,08-11,7:00 PM,https://brattlefilm.org/movies/el-topo/,https://s3.amazonaws.com/nightjarprod/content/...
1,Brattle,The Holy Mountain,08-11,9:45 PM,https://brattlefilm.org/movies/the-holy-mountain/,https://s3.amazonaws.com/nightjarprod/content/...
2,Brattle,Morocco,08-12,5:00 PM,https://brattlefilm.org/movies/morocco/,https://s3.amazonaws.com/nightjarprod/content/...
3,Brattle,Dishonored,08-12,7:00 PM,https://brattlefilm.org/movies/dishonored/,https://s3.amazonaws.com/nightjarprod/content/...
4,Brattle,Blonde Venus,08-13,3:00 PM,https://brattlefilm.org/movies/blonde-venus/,https://s3.amazonaws.com/nightjarprod/content/...
5,Brattle,Shanghai Express,08-13,5:00 PM,https://brattlefilm.org/movies/shanghai-express/,https://s3.amazonaws.com/nightjarprod/content/...
6,Brattle,The Devil Is a Woman,08-14,12:30 PM,https://brattlefilm.org/movies/the-devil-is-a-...,https://s3.amazonaws.com/nightjarprod/content/...
7,Brattle,The Scarlet Empress,08-14,2:30 PM,https://brattlefilm.org/movies/the-scarlet-emp...,https://s3.amazonaws.com/nightjarprod/content/...
8,Brattle,For Me and My Gal,08-15,4:15 PM,https://brattlefilm.org/movies/for-me-and-my-gal/,https://s3.amazonaws.com/nightjarprod/content/...
9,Brattle,Summer Stock,08-15,6:30 PM,https://brattlefilm.org/movies/summer-stock/,https://s3.amazonaws.com/nightjarprod/content/...


# Coolidge

In [6]:
N = 60 # Number of days scrape
start_date = date.today()
cur_date = start_date
dates = [(start_date + timedelta(days=n)).strftime('%Y-%m-%d') for n in range(N)]
coolidge_url = 'https://coolidge.org/showtimes?date={date}'

In [7]:
def film_soup(url):
    c_results = requests.get(url)
    c_soup = BeautifulSoup(c_results.text, 'html.parser')
    c_films = (c_soup.find_all('div', 'film-card'))
    # filter out weird 'ResultSet' objects appearing in results
    c_films = [f for f in c_films if (str(type(f)) == "<class 'bs4.element.Tag'>")] 
    return c_films

In [8]:
get_title_raw = lambda x: x.find('div', 'film-card__title').text.strip()
get_title = lambda x: re.sub(r'Masked Matinees:', '', get_title_raw(x)).strip()
get_times = lambda x: [t.text.strip() for t in x.find_all('span', 'showtime-ticket__time')]
get_links = lambda x: [t['href'] for t in x.find_all('a', 'showtime-ticket__button')]
get_poster = lambda x: f"https://www.coolidge.org{x.picture.find_all('source')[-1]['srcset'].split()[0]}"

In [9]:
coolidge_shows = []
for d in dates:
    url =  coolidge_url.format(date=d)
    soup = film_soup(url)
    coolidge_shows.extend([{'theater': 'Coolidge',
                            'title': get_title(x),
                            'date': datetime.strptime(d, "%Y-%m-%d").strftime('%m-%d'),
                            'time': re.sub('(am|pm)', ' \g<1>', t).upper(),
                            'link': l,
                            'poster': get_poster(x)}
                           for x in soup for t, l in zip(get_times(x), get_links(x))])
                           # for t in get_times(x) for x in soup])

In [10]:
coolidge_df = pd.DataFrame(coolidge_shows)
coolidge_df.to_csv('data/coolidge.csv', index=False)

In [11]:
coolidge_df = pd.read_csv('data/coolidge.csv')
coolidge_df.head(20)

Unnamed: 0,theater,title,date,time,link,poster
0,Coolidge,Nope,08-11,3:30 PM,https://store.coolidge.org/websales/pages/Tick...,https://www.coolidge.org/sites/default/files/s...
1,Coolidge,Nope,08-11,6:30 PM,https://store.coolidge.org/websales/pages/Tick...,https://www.coolidge.org/sites/default/files/s...
2,Coolidge,Nope,08-11,9:45 PM,https://store.coolidge.org/websales/pages/Tick...,https://www.coolidge.org/sites/default/files/s...
3,Coolidge,Bullet Train,08-11,4:00 PM,https://store.coolidge.org/websales/pages/Tick...,https://www.coolidge.org/sites/default/files/s...
4,Coolidge,Bullet Train,08-11,7:15 PM,https://store.coolidge.org/websales/pages/Tick...,https://www.coolidge.org/sites/default/files/s...
5,Coolidge,Bullet Train,08-11,9:55 PM,https://store.coolidge.org/websales/pages/Tick...,https://www.coolidge.org/sites/default/files/s...
6,Coolidge,The Origins of American Independent Cinema\n\n...,08-11,6:30 PM,https://store.coolidge.org/websales/pages/entr...,https://www.coolidge.org/sites/default/files/s...
7,Coolidge,Rhinestone,08-11,7:00 PM,https://store.coolidge.org/websales/pages/Tick...,https://www.coolidge.org/sites/default/files/s...
8,Coolidge,Vengeance,08-11,9:15 PM,https://store.coolidge.org/websales/pages/Tick...,https://www.coolidge.org/sites/default/files/s...
9,Coolidge,Everything Everywhere All at Once,08-11,9:30 PM,https://store.coolidge.org/websales/pages/Tick...,https://www.coolidge.org/sites/default/files/s...


## Somerville Theater

In [12]:
s_url = 'https://www.somervilletheatre.com/wp-content/themes/somerville/showtimes.xml'
s_results = requests.get(s_url)

s_soup = BeautifulSoup(s_results.text, 'xml')
s_soup = s_soup.find_all('filmtitle')

In [13]:
# should use the `titlecase` package
# TODO: change titlecase import
get_title_raw = lambda x: titlecase.titlecase(x.find('name').text)
get_title = lambda x: re.sub(r'(\d\dm{1,2})|(4k)|(Masked Matinees:)', '', get_title_raw(x)).strip()
get_time_raw = lambda x: x.find('show').find('time').text 
get_time = lambda x: datetime.strptime(str(int(get_time_raw(x))//100), '%H').strftime('%l:%M %p').strip()
get_date_raw = lambda x: x.find('show').find('date').text 
get_date = lambda x: datetime.strptime(get_date_raw(x), '%m%d%Y').strftime('%m-%d')
get_link = lambda x: x.show.salelink.text

In [14]:
somerville_df = pd.DataFrame([{'theater': 'Somerville',
                                 'title': get_title(x), 
                                 'date': get_date(x),
                                 'time': get_time(x),
                                 'link': get_link(x),
                                } for x in s_soup])

somerville_df.to_csv('data/somerville.csv', index=False)

In [15]:
somerville_df = pd.read_csv('data/somerville.csv')
somerville_df

Unnamed: 0,theater,title,date,time,link
0,Somerville,Bodies Bodies Bodies,08-12,5:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...
1,Somerville,Bullet Train,08-11,5:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...
2,Somerville,Crooklyn,08-24,9:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...
3,Somerville,Dirty Dancing,08-22,7:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...
4,Somerville,Do the Right Thing,08-24,7:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...
5,Somerville,Friday the 13th,08-27,11:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...
6,Somerville,I Love My Dad,08-11,5:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...
7,Somerville,Ingrid Goes West,08-17,7:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...
8,Somerville,Mermaids,08-15,7:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...
9,Somerville,Midsommar,08-28,1:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...


# Merging 

In [16]:
df = pd.concat([brattle_df, coolidge_df, somerville_df]).sort_values(['date', 'time'])
df[:50]

Unnamed: 0,theater,title,date,time,link,poster
0,Coolidge,Nope,08-11,3:30 PM,https://store.coolidge.org/websales/pages/Tick...,https://www.coolidge.org/sites/default/files/s...
3,Coolidge,Bullet Train,08-11,4:00 PM,https://store.coolidge.org/websales/pages/Tick...,https://www.coolidge.org/sites/default/files/s...
1,Somerville,Bullet Train,08-11,5:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...,
6,Somerville,I Love My Dad,08-11,5:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...,
11,Somerville,Nope,08-11,5:00 PM,https://20163.formovietickets.com:2235/T.ASP?W...,
1,Coolidge,Nope,08-11,6:30 PM,https://store.coolidge.org/websales/pages/Tick...,https://www.coolidge.org/sites/default/files/s...
6,Coolidge,The Origins of American Independent Cinema\n\n...,08-11,6:30 PM,https://store.coolidge.org/websales/pages/entr...,https://www.coolidge.org/sites/default/files/s...
0,Brattle,El Topo,08-11,7:00 PM,https://brattlefilm.org/movies/el-topo/,https://s3.amazonaws.com/nightjarprod/content/...
7,Coolidge,Rhinestone,08-11,7:00 PM,https://store.coolidge.org/websales/pages/Tick...,https://www.coolidge.org/sites/default/files/s...
4,Coolidge,Bullet Train,08-11,7:15 PM,https://store.coolidge.org/websales/pages/Tick...,https://www.coolidge.org/sites/default/files/s...


## Rotten Tomates for poster art and rating

In [17]:
def get_rt_info(query):
    N = 4 # number of entries to consider
    query = urllib.parse.quote(query)
    url = "https://rottentomatoes.com/search?search='{query}'"
    url = url.format(query=query)
    result = requests.get(url)
    soup = BeautifulSoup(result.content, 'html.parser')
    movie_section = soup.find('h2', attrs={'data-qa': 'search-result-title'}, text='Movies')
    listings = movie_section.find_next_sibling('ul').find_all('search-page-media-row')
    scores = [int(m['tomatometerscore']) if m['tomatometerscore'].isnumeric() else 0 for m in listings]
    entry = listings[np.argmax(scores[:4])]
    try:
        rt_info = {
            'title': entry.text.strip(),
            'poster': entry.img['src'],
            'year': entry['releaseyear'],
            'cast': entry['cast'],
            'score': entry['tomatometerscore'], # technically redundant
            'cert': entry['tomatometerstate'],
            'rt_link': entry.a['href']
            }
    except:
        return None
    return rt_info

In [18]:
%%time
# example
get_rt_info("Pee wee's big aventure")

CPU times: user 105 ms, sys: 7.73 ms, total: 113 ms
Wall time: 2.3 s


{'title': "Pee-wee's Big Adventure",
 'poster': 'https://resizing.flixster.com/dX5asNZ4f_a6UPp8E9xS_IrZXqE=/fit-in/80x126/v2/https://flxt.tmsimg.com/assets/p8533_p_v8_ab.jpg',
 'year': '1985',
 'cast': 'Paul Reubens,Elizabeth Daily,Mark Holton',
 'score': '87',
 'cert': 'certified-fresh',
 'rt_link': 'https://www.rottentomatoes.com/m/peewees_big_adventure'}

In [19]:
%%time
titles = df.title.unique()
rt_data = []
for t in titles:
    try:
        d = get_rt_info(t)
        d.update(listing_title=t)
        rt_data.append(d)
    except:
        print(t)
    time.sleep(0.1)

CPU times: user 11 s, sys: 256 ms, total: 11.3 s
Wall time: 1min 45s


In [20]:
rt_df = pd.DataFrame(rt_data)
rt_df.to_csv('data/rt.csv', index=False)

In [21]:
rt_df = pd.read_csv('data/rt.csv')
rt_df.head()

Unnamed: 0,title,poster,year,cast,score,cert,rt_link,listing_title
0,Nope,https://resizing.flixster.com/RqUUghr84Wd1Mdr_...,2022,"Daniel Kaluuya,Keke Palmer,Steven Yeun",82.0,certified-fresh,https://www.rottentomatoes.com/m/nope,Nope
1,Bullet Train,https://resizing.flixster.com/5_452AiySCc1Qedp...,2022,"Brad Pitt,Joey King,Aaron Taylor-Johnson",54.0,rotten,https://www.rottentomatoes.com/m/bullet_train_...,Bullet Train
2,I Love My Dad,https://resizing.flixster.com/jPHL1K2L_bj5NfXF...,2022,"Patton Oswalt,James Morosini,Claudia Sulewski",70.0,fresh,https://www.rottentomatoes.com/m/i_love_my_dad,I Love My Dad
3,The Decline of Western Civilization Part II: T...,https://resizing.flixster.com/QQiD-pdoCgPxeXyH...,1988,"Steven Tyler,Joe Perry,Alice Cooper",91.0,fresh,https://www.rottentomatoes.com/m/decline_of_we...,The Origins of American Independent Cinema\n\n...
4,Tinker Tailor Soldier Spy,https://resizing.flixster.com/JXiB5VkW7LCXm-V_...,2011,"Gary Oldman,Kathy Burke,Benedict Cumberbatch",84.0,certified-fresh,https://www.rottentomatoes.com/m/tinker_tailor...,El Topo


In [22]:
merged = pd.merge(df, rt_df, left_on='title', right_on='listing_title')

In [23]:
merged.columns

Index(['theater', 'title_x', 'date', 'time', 'link', 'poster_x', 'title_y',
       'poster_y', 'year', 'cast', 'score', 'cert', 'rt_link',
       'listing_title'],
      dtype='object')

In [24]:
merged = merged.drop(['listing_title', 'poster_x', 'title_y'], axis=1)\
               .rename(columns={'title_x': 'title', 'poster_y': 'poster'})

In [25]:
output = merged[['theater', 'poster', 'title', 'year', 'date', 'time', 'link', 'score', 'rt_link']].sort_values(['date','time'])

In [26]:
output['poster']=  '''<img src="''' + output['poster'] + '''">''' 
output['link'] = output.link.apply(lambda x: f"<a href={x}>🎟️</a>")
output['rt_link'] = output.rt_link.apply(lambda x: f"<a href={x}>🍅</a>")

In [33]:
output.to_csv("data/showtimes.csv")

# Showtimes 

In [27]:
HTML(output.to_html(render_links=True, escape=False, index=False, justify='center'))

theater,poster,title,year,date,time,link,score,rt_link
Coolidge,,Nope,2022,08-11,3:30 PM,🎟️,82.0,🍅
Coolidge,,Bullet Train,2022,08-11,4:00 PM,🎟️,54.0,🍅
Somerville,,Nope,2022,08-11,5:00 PM,🎟️,82.0,🍅
Somerville,,Bullet Train,2022,08-11,5:00 PM,🎟️,54.0,🍅
Somerville,,I Love My Dad,2022,08-11,5:00 PM,🎟️,70.0,🍅
Coolidge,,Nope,2022,08-11,6:30 PM,🎟️,82.0,🍅
Coolidge,,The Origins of American Independent Cinema\n\n\n\nPart of The Origins of American Independent Cinema,1988,08-11,6:30 PM,🎟️,91.0,🍅
Brattle,,El Topo,2011,08-11,7:00 PM,🎟️,84.0,🍅
Coolidge,,Rhinestone,1984,08-11,7:00 PM,🎟️,14.0,🍅
Coolidge,,Bullet Train,2022,08-11,7:15 PM,🎟️,54.0,🍅


# Testing...

In [28]:
s = '<img src=https://resizing.flixster.com/Zd9kpD_evBsdOqXQuVj8t0s4JYE=/fit-in/80x126/v2/https://resizing.flixster.com/y629Pkvr6tTHrzCEOd5XgLtGuxE=/ems.cHJkLWVtcy1hc3NldHMvbW92aWVzLzMyODliMzFlLTMwZmEtNDZjOC04Zjk5LWMwNTIyNGEzYjVlNC5qcGc=>'

In [29]:
q = "The Wizard of Oz"
results = requests.get(f'https://rottentomatoes.com/search?search={q}')
soup = BeautifulSoup(results.content, 'html.parser')

In [30]:

# soup.find('ul', slot='list')

# soup.find('search-page-media-row')

**Different movies playing today**

In [31]:
df[df['date'] == date.today().strftime('%m-%d')].title.unique()

array(['Nope', 'Bullet Train', 'I Love My Dad',
       'The Origins of American Independent Cinema\n\n\n\nPart of\xa0The Origins of American Independent Cinema',
       'El Topo', 'Rhinestone', 'Vengeance',
       'Everything Everywhere All at Once', 'The Holy Mountain'],
      dtype=object)