In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression
from bs4 import BeautifulSoup
import requests
import dateutil.parser

def to_date(datestring):
    date = dateutil.parser.parse(datestring)
    return date

def money_to_int(moneystring):
    moneystring = moneystring.replace('$', '').replace(',', '')
    return int(moneystring)



In [3]:
movie_source_21 = requests.get('https://www.boxofficemojo.com/daily/2021/?view=year')
movie_source_20 = requests.get('https://www.boxofficemojo.com/daily/2020/?view=year')
soup_21 = BeautifulSoup(movie_source_21.text, 'html5lib')
soup_20 = BeautifulSoup(movie_source_20.text, 'html5lib')

In [4]:
table_21 = soup_21.find('table')
table_20 = soup_20.find('table')
rows21 = [row for row in table_21.find_all('tr')]
rows20 = [row for row in table_20.find_all('tr')]

In [5]:
rows21[1]
links21 = []
for i in range(len(rows21)):
    if i != 0:
        link = rows21[i].find_all('td')[1].find('a')['href']
        links21.append(link)
links21
links20 = []
for i in range(len(rows20)):
    if i != 0:
        link = rows20[i].find_all('td')[1].find('a')['href']
        links20.append(link)
lonk = rows21[4].find_all('td')[1].find('a')['href']
links_total = links21 + links20
links_total

In [6]:
def get_summary_dict(link_list):
    '''
    From BoxOfficeMojo link stub, request movie html, parse with BeautifulSoup, and
    collect 
        - title 
        - daily domestic gross
        - ranking and past ranking
        - percent gross change per day/week
        - day of gross
        - theaters released in and average gross per theater
        - distributer
    Return information as a dictionary.
    '''
    
    base_url = 'https://www.boxofficemojo.com'
    
   
    
    headers = ['Date', 'rank', 'rank_yest',
               'movie_title', 'daily_gross', 'gross_change_day', 
               'gross_change_week', 'num_of_theaters', 'avg_gross_per_theater', 
               'gross_to_date', 'days_in_release', 'distributor' ]
    ranks = []
    titles = []
    rank_yests = []
    daily_grosss = []
    gross_day_pcts = []
    gross_week_pcts = []
    theaterss = []
    avg_theaters = []
    gross_dates = []
    days_rels = []
    distribs = []
    dates = []
    for link in link_list:
        #Create full url to scrape
        url = base_url + link
    
        #Request HTML and parse
        response = requests.get(url)
        page = response.text
        soup = BeautifulSoup(page,"lxml")

    
    
        date = soup.find('h1').text.replace('Domestic Box Office For ', '')
    
        for tr in soup.find_all('tr')[1:]:
            tds = tr.find_all('td')
        
            titles.append(tds[2].text)
        
            ranks.append(tds[0].text)
        
            rank_yests.append(tds[1].text)
        
            raw_daily_gross = tds[3].text
            daily_grosss.append(raw_daily_gross)
        
            gross_day_pcts.append(tds[4].text)
        
            gross_week_pcts.append(tds[5].text)
        
            theaterss.append(tds[6].text)
        
            avg_theater = tds[7].text
            avg_theaters.append(avg_theater)
        
            gross_date_raw = tds[8].text
            gross_dates.append(gross_date_raw)
        
            days_rels.append(tds[9].text)

            distribs.append(tds[10].text.strip())
            dates.append(date)


  
                                    
                
    
    #Create movie dictionary and return
    movie_dict = dict(zip(headers, [dates, ranks, rank_yests, titles, daily_grosss, 
                                    gross_day_pcts, gross_week_pcts, theaterss, avg_theaters, 
                                    gross_dates, days_rels , distribs]))
    
    return movie_dict

In [7]:
movies_dict = get_summary_dict(links_total)

1 of 1000, waited 2.411 sec.
2 of 1000, waited 2.454 sec.
3 of 1000, waited 0.5816 sec.
4 of 1000, waited 0.8997 sec.
5 of 1000, waited 2.471 sec.
6 of 1000, waited 1.119 sec.
7 of 1000, waited 1.416 sec.
8 of 1000, waited 1.448 sec.
9 of 1000, waited 0.8083 sec.
10 of 1000, waited 1.352 sec.
11 of 1000, waited 1.118 sec.
12 of 1000, waited 0.5331 sec.
13 of 1000, waited 1.131 sec.
14 of 1000, waited 1.491 sec.
15 of 1000, waited 1.608 sec.
16 of 1000, waited 1.822 sec.
17 of 1000, waited 1.345 sec.
18 of 1000, waited 2.295 sec.
19 of 1000, waited 1.096 sec.
20 of 1000, waited 0.8153 sec.
21 of 1000, waited 2.004 sec.
22 of 1000, waited 1.107 sec.
23 of 1000, waited 0.5572 sec.
24 of 1000, waited 0.5154 sec.
25 of 1000, waited 2.377 sec.
26 of 1000, waited 0.5184 sec.
27 of 1000, waited 2.14 sec.
28 of 1000, waited 2.201 sec.
29 of 1000, waited 2.028 sec.
30 of 1000, waited 0.8768 sec.
31 of 1000, waited 0.7019 sec.
32 of 1000, waited 0.9785 sec.
33 of 1000, waited 0.7798 sec.
34 of 10

268 of 1000, waited 2.387 sec.
269 of 1000, waited 0.7522 sec.
270 of 1000, waited 2.069 sec.
271 of 1000, waited 0.8971 sec.
272 of 1000, waited 1.149 sec.
273 of 1000, waited 2.378 sec.
274 of 1000, waited 1.765 sec.
275 of 1000, waited 1.528 sec.
276 of 1000, waited 2.201 sec.
277 of 1000, waited 1.107 sec.
278 of 1000, waited 0.8175 sec.
279 of 1000, waited 2.103 sec.
280 of 1000, waited 2.25 sec.
281 of 1000, waited 1.778 sec.
282 of 1000, waited 0.7763 sec.
283 of 1000, waited 1.205 sec.
284 of 1000, waited 0.9816 sec.
285 of 1000, waited 1.099 sec.
286 of 1000, waited 1.319 sec.
287 of 1000, waited 1.473 sec.
288 of 1000, waited 1.552 sec.
289 of 1000, waited 0.8574 sec.
290 of 1000, waited 0.9925 sec.
291 of 1000, waited 0.6402 sec.
292 of 1000, waited 1.588 sec.
293 of 1000, waited 2.436 sec.
294 of 1000, waited 1.015 sec.
295 of 1000, waited 1.868 sec.
296 of 1000, waited 0.915 sec.
297 of 1000, waited 0.7554 sec.
298 of 1000, waited 0.6413 sec.
299 of 1000, waited 0.7473 sec

ConnectionError: HTTPSConnectionPool(host='www.boxofficemojo.com', port=443): Max retries exceeded with url: /title/tt2660888/?ref_=bo_cso_table_146 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7ff4fe627c40>: Failed to establish a new connection: [Errno 60] Operation timed out'))

In [None]:
movie_df = pd.DataFrame.from_dict(movies_dict)
movie_df.info()

In [None]:
movie_df['Date']= pd.to_datetime(movie_df['Date'])
movie_df.head()


In [None]:
cdc_data = pd.read_csv('COVID-19_Vaccinations_in_the_United_States_Jurisdiction.csv') #Data downloaded from https://data.cdc.gov/Vaccinations/COVID-19-Vaccinations-in-the-United-States-Jurisdi/unsk-b7fc/data

In [None]:
cdc_data['Date'] = pd.to_datetime(cdc_data['Date'])
cdc_data_new = cdc_data.groupby(['Date','MMWR_week'])['Series_Complete_Yes'].sum().reset_index()
cdc_data_new.tail()

In [None]:
movies_vaccinesdf = pd.merge(movie_df, cdc_data, on='Date')
movies_vaccinesdf.columns

In [None]:


movies_vaccinesdf.to_csv('Movies_and_CDC_Data.csv', header=['Date', 'rank', 'rank_yest', 'movie_title', 'daily_gross',
       'gross_change_day', 'gross_change_week', 'num_of_theaters',
       'avg_gross_per_theater', 'gross_to_date', 'days_in_release',
       'distributor', 'MMWR_week', 'Location', 'Distributed',
       'Distributed_Janssen', 'Distributed_Moderna', 'Distributed_Pfizer',
       'Distributed_Unk_Manuf', 'Dist_Per_100K', 'Distributed_Per_100k_12Plus',
       'Distributed_Per_100k_18Plus', 'Distributed_Per_100k_65Plus',
       'Administered', 'Administered_12Plus', 'Administered_18Plus',
       'Administered_65Plus', 'Administered_Janssen', 'Administered_Moderna',
       'Administered_Pfizer', 'Administered_Unk_Manuf', 'Administered_Fed_LTC',
       'Administered_Fed_LTC_Residents', 'Administered_Fed_LTC_Staff',
       'Administered_Fed_LTC_Unk', 'Administered_Fed_LTC_Dose1',
       'Administered_Fed_LTC_Dose1_Residents',
       'Administered_Fed_LTC_Dose1_Staff', 'Administered_Fed_LTC_Dose1_Unk',
       'Admin_Per_100K', 'Admin_Per_100k_12Plus', 'Admin_Per_100k_18Plus',
       'Admin_Per_100k_65Plus', 'Recip_Administered',
       'Administered_Dose1_Recip', 'Administered_Dose1_Pop_Pct',
       'Administered_Dose1_Recip_12Plus',
       'Administered_Dose1_Recip_12PlusPop_Pct',
       'Administered_Dose1_Recip_18Plus',
       'Administered_Dose1_Recip_18PlusPop_Pct',
       'Administered_Dose1_Recip_65Plus',
       'Administered_Dose1_Recip_65PlusPop_Pct', 'Series_Complete_Yes',
       'Series_Complete_Pop_Pct', 'Series_Complete_12Plus',
       'Series_Complete_12PlusPop_Pct', 'Series_Complete_18Plus',
       'Series_Complete_18PlusPop_Pct', 'Series_Complete_65Plus',
       'Series_Complete_65PlusPop_Pct', 'Series_Complete_Janssen',
       'Series_Complete_Moderna', 'Series_Complete_Pfizer',
       'Series_Complete_Unk_Manuf', 'Series_Complete_Janssen_12Plus',
       'Series_Complete_Moderna_12Plus', 'Series_Complete_Pfizer_12Plus',
       'Series_Complete_Unk_Manuf_12Plus', 'Series_Complete_Janssen_18Plus',
       'Series_Complete_Moderna_18Plus', 'Series_Complete_Pfizer_18Plus',
       'Series_Complete_Unk_Manuf_18Plus', 'Series_Complete_Janssen_65Plus',
       'Series_Complete_Moderna_65Plus', 'Series_Complete_Pfizer_65Plus',
       'Series_Complete_Unk_Manuf_65Plus', 'Series_Complete_FedLTC',
       'Series_Complete_FedLTC_Residents', 'Series_Complete_FedLTC_Staff',
       'Series_Complete_FedLTC_Unknown'], index=False)

