In [28]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import random
import pickle
from collections import defaultdict

variable names:
* title = movie title
* desc = description
* gross = dict( Gross income)

In [11]:
#https://www.boxofficemojo.com/year/2019/?ref_=bo_yl_table_2
#Need Todo/ Nice to have.  Choose link based on condition that opening theaters is >1000
url = "https://www.boxofficemojo.com/release/rl3059975681/?ref_=bo_yld_table_1"

response = requests.get(url)

page = response.text

soup = BeautifulSoup(page)


In [154]:
def get_movie_value(soup, field_name):
    
    '''Grab a value from Box Office Mojo HTML
    
    Takes a string attribute of a movie on the page and returns the string in
    the next sibling object (the value for that attribute) or None if nothing is found.
    '''
    
    obj = soup.find(text=re.compile(field_name))
    
    if not obj: 
        return None
    
    # this works for most of the values
    next_element = obj.findNext()
    
    if next_element:
        return next_element.text 
    else:
        return None

In [16]:
#The only thing with h1 is the movie title!
soup.find_all('h1')

[<h1 class="a-size-extra-large">Avengers: Endgame</h1>]

In [157]:
#Need to get links from main page
#2019 
url = "https://www.boxofficemojo.com/year/2019/?ref_=bo_yl_table_2"
response = requests.get(url)
page = response.text

soup = BeautifulSoup(page, 'lxml')

In [195]:
rows = soup.find('table').find_all('tr')

In [284]:
#Need to get links from main page
#2019 
#url = "https://www.boxofficemojo.com/year/2019/?ref_=bo_yl_table_2"

def get_links(url, year):
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page, 'lxml')
    rows = soup.find('table').find_all('tr')
    curr_dict = {}
    
    for row in rows[1:]:

        try:
            theaters = int(row.find_all('td')[6].text.replace(',',''))
        except:
            theaters = 0

        if theaters < 500:
            continue
        else:
            pass
        title = row.find_all('td')[1].text
        #Update Release Date for EVERY YEAR
        release_date = row.find_all('td')[8].text+', '+year
        link_stub = row.find_all('td')[1].find('a')['href']

        curr_dict[title] = [link_stub, release_date, theaters]
    
    df = pd.DataFrame(curr_dict).T
    df.columns = ['link_stub', 'release', 'theaters']
    return df

In [337]:
def get_movie_dict(link):
    """ 
    Takes Links from existing dataframe and parses data from each page
    input: link url
    returns: (dict)
    """
    base_url = "https://www.boxofficemojo.com"
    url = base_url+link
    
    response = requests.get(url)
    print('Status_code: {}'.format(response.status_code))
    page = response.text
    soup = BeautifulSoup(page)

    
    #get movie title
    title = soup.find('h1').text

    #get description
    desc = soup.find('p', {'class':'a-size-medium'}).text

    #get gross income values
    keys = ['gross_dom', 'gross_inter', 'gross_world']
    moneys =[]
    for span in soup.find(class_='mojo-performance-summary-table').find_all('span', class_='money'):
        moneys.append(span.text)
    gross = dict(zip(keys,moneys))

    #get movie distributor
    distr = get_movie_value(soup, 'Distri')

    #get opening income
    opening = get_movie_value(soup, 'Open')

    #get budget
    budget = get_movie_value(soup, 'Budget')

    #get ratings
    mpaa = get_movie_value(soup, 'MPA')

    #get runtime
    runtime = get_movie_value(soup, 'Run')

    #get genres
    genres = list(get_movie_value(soup, 'Genres').split())
    s = ''
    for each in genres:
        s+= each
        s+= ', '
    genres = s[:-2]

    #compile all the above to dict
    headers = ['title', 'desc', 'distr', 'opening', 'budget', 'mpaa', 'runtime', 'genres']
    moviedict = dict(zip(headers, [title, desc, distr, opening, budget, mpaa, runtime, genres]))
    moviedict.update(gross)
    
    return moviedict

In [275]:
import time

In [287]:
movie_stubs = pd.DataFrame()
def get_stubs()
    for year in range(2000,2020):
        url = 'https://www.boxofficemojo.com/year/{}/?grossesOption=calendarGrosses'.format(year)
        movie_stubs = pd.concat([get_links(url, str(year)), movie_stubs])
        time.sleep(2)
#get_stubs()

In [288]:
movie_stubs

Unnamed: 0,link_stub,release,theaters
Avengers: Endgame,/release/rl3059975681/?ref_=bo_yld_table_1,"Apr 26, 2019",4662
The Lion King,/release/rl3321923073/?ref_=bo_yld_table_2,"Jul 19, 2019",4802
Toy Story 4,/release/rl3798500865/?ref_=bo_yld_table_3,"Jun 21, 2019",4575
Frozen II,/release/rl2424210945/?ref_=bo_yld_table_4,"Nov 22, 2019",4440
Captain Marvel,/release/rl3009644033/?ref_=bo_yld_table_5,"Mar 8, 2019",4310
...,...,...,...
Tarzan,/release/rl2742388225/?ref_=bo_yld_table_337,"Jun 16, 2000",3131
Music of the Heart,/release/rl761562625/?ref_=bo_yld_table_344,"Oct 29, 2000",1353
The Messenger: The Story of Joan of Arc,/release/rl1986168321/?ref_=bo_yld_table_350,"Nov 12, 2000",2147
Snatch,/release/rl141592065/?ref_=bo_yld_table_401,"Dec 8, 2000",1444


In [297]:
list(movie_stubs.link_stub.items()

[('Avengers: Endgame', '/release/rl3059975681/?ref_=bo_yld_table_1'),
 ('The Lion King', '/release/rl3321923073/?ref_=bo_yld_table_2'),
 ('Toy Story 4', '/release/rl3798500865/?ref_=bo_yld_table_3'),
 ('Frozen II', '/release/rl2424210945/?ref_=bo_yld_table_4'),
 ('Captain Marvel', '/release/rl3009644033/?ref_=bo_yld_table_5'),
 ('Star Wars: Episode IX - The Rise of Skywalker',
  '/release/rl3305145857/?ref_=bo_yld_table_6'),
 ('Spider-Man: Far from Home', '/release/rl3791750657/?ref_=bo_yld_table_7'),
 ('Aladdin', '/release/rl3246360065/?ref_=bo_yld_table_8'),
 ('Joker', '/release/rl252151297/?ref_=bo_yld_table_9'),
 ('It Chapter Two', '/release/rl1107461633/?ref_=bo_yld_table_10'),
 ('Jumanji: The Next Level', '/release/rl755467777/?ref_=bo_yld_table_11'),
 ('Us', '/release/rl1711506945/?ref_=bo_yld_table_12'),
 ('Fast & Furious Presents: Hobbs & Shaw',
  '/release/rl2919400961/?ref_=bo_yld_table_13'),
 ('John Wick: Chapter 3 - Parabellum',
  '/release/rl1476560385/?ref_=bo_yld_table_

In [304]:
movie_stubs.iloc[5:]

Unnamed: 0,link_stub,release,theaters
Star Wars: Episode IX - The Rise of Skywalker,/release/rl3305145857/?ref_=bo_yld_table_6,"Dec 20, 2019",4406
Spider-Man: Far from Home,/release/rl3791750657/?ref_=bo_yld_table_7,"Jul 2, 2019",4634
Aladdin,/release/rl3246360065/?ref_=bo_yld_table_8,"May 24, 2019",4476
Joker,/release/rl252151297/?ref_=bo_yld_table_9,"Oct 4, 2019",4374
It Chapter Two,/release/rl1107461633/?ref_=bo_yld_table_10,"Sep 6, 2019",4570
...,...,...,...
Tarzan,/release/rl2742388225/?ref_=bo_yld_table_337,"Jun 16, 2000",3131
Music of the Heart,/release/rl761562625/?ref_=bo_yld_table_344,"Oct 29, 2000",1353
The Messenger: The Story of Joan of Arc,/release/rl1986168321/?ref_=bo_yld_table_350,"Nov 12, 2000",2147
Snatch,/release/rl141592065/?ref_=bo_yld_table_401,"Dec 8, 2000",1444


In [307]:
movie_stubs.iloc[0:5]

Unnamed: 0,link_stub,release,theaters
Avengers: Endgame,/release/rl3059975681/?ref_=bo_yld_table_1,"Apr 26, 2019",4662
The Lion King,/release/rl3321923073/?ref_=bo_yld_table_2,"Jul 19, 2019",4802
Toy Story 4,/release/rl3798500865/?ref_=bo_yld_table_3,"Jun 21, 2019",4575
Frozen II,/release/rl2424210945/?ref_=bo_yld_table_4,"Nov 22, 2019",4440
Captain Marvel,/release/rl3009644033/?ref_=bo_yld_table_5,"Mar 8, 2019",4310


In [335]:
movie_info = []

In [332]:
2+2*random.random()

3.1253290959290547

In [347]:
def scrape_boxofficemojo():
    for i, link in enumerate(movie_stubs.link_stub):
        try:
            movie_info.append(get_movie_dict(link))
        except:
            last_index = i
            print('The last index location was {}'.format(i))
        time.sleep(2+2*random.random())
        print("Current Iteration: {}".format(i))


Status_code: 200
Current Iteration: 0
Status_code: 200
Current Iteration: 1
Status_code: 200
Current Iteration: 2
Status_code: 200
Current Iteration: 3
Status_code: 200
Current Iteration: 4
Status_code: 200
Current Iteration: 5
Status_code: 200
Current Iteration: 6
Status_code: 200
Current Iteration: 7
Status_code: 200
Current Iteration: 8
Status_code: 200
Current Iteration: 9
Status_code: 200
Current Iteration: 10
Status_code: 200
Current Iteration: 11
Status_code: 200
Current Iteration: 12
Status_code: 200
Current Iteration: 13
Status_code: 200
Current Iteration: 14
Status_code: 200
Current Iteration: 15
Status_code: 200
Current Iteration: 16
Status_code: 200
Current Iteration: 17
Status_code: 200
Current Iteration: 18
Status_code: 200
Current Iteration: 19
Status_code: 200
Current Iteration: 20
Status_code: 200
Current Iteration: 21
Status_code: 200
Current Iteration: 22
Status_code: 200
Current Iteration: 23
Status_code: 200
Current Iteration: 24
Status_code: 200
Current Iteration:

Status_code: 200
Current Iteration: 207
Status_code: 200
The last index location was 208
Current Iteration: 208
Status_code: 200
Current Iteration: 209
Status_code: 200
Current Iteration: 210
Status_code: 200
Current Iteration: 211
Status_code: 200
Current Iteration: 212
Status_code: 200
Current Iteration: 213
Status_code: 200
Current Iteration: 214
Status_code: 200
Current Iteration: 215
Status_code: 200
Current Iteration: 216
Status_code: 200
Current Iteration: 217
Status_code: 200
Current Iteration: 218
Status_code: 200
Current Iteration: 219
Status_code: 200
Current Iteration: 220
Status_code: 200
Current Iteration: 221
Status_code: 200
Current Iteration: 222
Status_code: 200
Current Iteration: 223
Status_code: 200
Current Iteration: 224
Status_code: 200
Current Iteration: 225
Status_code: 200
Current Iteration: 226
Status_code: 200
Current Iteration: 227
Status_code: 200
Current Iteration: 228
Status_code: 200
Current Iteration: 229
Status_code: 200
Current Iteration: 230
Status_c

Current Iteration: 405
Status_code: 200
Current Iteration: 406
Status_code: 200
Current Iteration: 407
Status_code: 200
Current Iteration: 408
Status_code: 200
Current Iteration: 409
Status_code: 200
Current Iteration: 410
Status_code: 200
Current Iteration: 411
Status_code: 200
Current Iteration: 412
Status_code: 200
Current Iteration: 413
Status_code: 200
Current Iteration: 414
Status_code: 200
Current Iteration: 415
Status_code: 200
Current Iteration: 416
Status_code: 200
Current Iteration: 417
Status_code: 200
Current Iteration: 418
Status_code: 200
Current Iteration: 419
Status_code: 200
Current Iteration: 420
Status_code: 200
Current Iteration: 421
Status_code: 200
Current Iteration: 422
Status_code: 200
Current Iteration: 423
Status_code: 200
Current Iteration: 424
Status_code: 200
Current Iteration: 425
Status_code: 200
Current Iteration: 426
Status_code: 200
Current Iteration: 427
Status_code: 200
Current Iteration: 428
Status_code: 200
Current Iteration: 429
Status_code: 200


Current Iteration: 595
Status_code: 200
Current Iteration: 596
Status_code: 200
Current Iteration: 597
Status_code: 200
Current Iteration: 598
Status_code: 200
Current Iteration: 599
Status_code: 200
Current Iteration: 600
Status_code: 200
Current Iteration: 601
Status_code: 200
Current Iteration: 602
Status_code: 200
Current Iteration: 603
Status_code: 200
Current Iteration: 604
Status_code: 200
Current Iteration: 605
Status_code: 200
Current Iteration: 606
Status_code: 200
Current Iteration: 607
Status_code: 200
Current Iteration: 608
Status_code: 200
Current Iteration: 609
Status_code: 200
Current Iteration: 610
Status_code: 200
Current Iteration: 611
Status_code: 200
Current Iteration: 612
Status_code: 200
Current Iteration: 613
Status_code: 200
Current Iteration: 614
Status_code: 200
Current Iteration: 615
Status_code: 200
Current Iteration: 616
Status_code: 200
Current Iteration: 617
Status_code: 200
Current Iteration: 618
Status_code: 200
Current Iteration: 619
Status_code: 200


Current Iteration: 800
Status_code: 200
Current Iteration: 801
Status_code: 200
Current Iteration: 802
Status_code: 200
Current Iteration: 803
Status_code: 200
Current Iteration: 804
Status_code: 200
Current Iteration: 805
Status_code: 200
Current Iteration: 806
Status_code: 200
Current Iteration: 807
Status_code: 200
Current Iteration: 808
Status_code: 200
Current Iteration: 809
Status_code: 200
Current Iteration: 810
Status_code: 200
Current Iteration: 811
Status_code: 200
Current Iteration: 812
Status_code: 200
Current Iteration: 813
Status_code: 200
Current Iteration: 814
Status_code: 200
Current Iteration: 815
Status_code: 200
Current Iteration: 816
Status_code: 200
Current Iteration: 817
Status_code: 200
Current Iteration: 818
Status_code: 200
Current Iteration: 819
Status_code: 200
Current Iteration: 820
Status_code: 200
Current Iteration: 821
Status_code: 200
Current Iteration: 822
Status_code: 200
Current Iteration: 823
Status_code: 200
Current Iteration: 824
Status_code: 200


Current Iteration: 1005
Status_code: 200
Current Iteration: 1006
Status_code: 200
Current Iteration: 1007
Status_code: 200
Current Iteration: 1008
Status_code: 200
Current Iteration: 1009
Status_code: 200
Current Iteration: 1010
Status_code: 200
Current Iteration: 1011
Status_code: 200
Current Iteration: 1012
Status_code: 200
Current Iteration: 1013
Status_code: 200
Current Iteration: 1014
Status_code: 200
Current Iteration: 1015
Status_code: 200
Current Iteration: 1016
Status_code: 200
Current Iteration: 1017
Status_code: 200
Current Iteration: 1018
Status_code: 200
Current Iteration: 1019
Status_code: 200
Current Iteration: 1020
Status_code: 200
Current Iteration: 1021
Status_code: 200
Current Iteration: 1022
Status_code: 200
Current Iteration: 1023
Status_code: 200
Current Iteration: 1024
Status_code: 200
Current Iteration: 1025
Status_code: 200
Current Iteration: 1026
Status_code: 200
Current Iteration: 1027
Status_code: 200
Current Iteration: 1028
Status_code: 200
Current Iteratio

Current Iteration: 1205
Status_code: 200
Current Iteration: 1206
Status_code: 200
Current Iteration: 1207
Status_code: 200
Current Iteration: 1208
Status_code: 200
Current Iteration: 1209
Status_code: 200
Current Iteration: 1210
Status_code: 200
Current Iteration: 1211
Status_code: 200
Current Iteration: 1212
Status_code: 200
Current Iteration: 1213
Status_code: 200
Current Iteration: 1214
Status_code: 200
Current Iteration: 1215
Status_code: 200
Current Iteration: 1216
Status_code: 200
Current Iteration: 1217
Status_code: 200
Current Iteration: 1218
Status_code: 200
Current Iteration: 1219
Status_code: 200
Current Iteration: 1220
Status_code: 200
Current Iteration: 1221
Status_code: 200
Current Iteration: 1222
Status_code: 200
Current Iteration: 1223
Status_code: 200
Current Iteration: 1224
Status_code: 200
Current Iteration: 1225
Status_code: 200
Current Iteration: 1226
Status_code: 200
Current Iteration: 1227
Status_code: 200
Current Iteration: 1228
Status_code: 200
Current Iteratio

Status_code: 200
Current Iteration: 1405
Status_code: 200
Current Iteration: 1406
Status_code: 200
Current Iteration: 1407
Status_code: 200
Current Iteration: 1408
Status_code: 200
Current Iteration: 1409
Status_code: 200
Current Iteration: 1410
Status_code: 200
Current Iteration: 1411
Status_code: 200
Current Iteration: 1412
Status_code: 200
Current Iteration: 1413
Status_code: 200
Current Iteration: 1414
Status_code: 200
Current Iteration: 1415
Status_code: 200
Current Iteration: 1416
Status_code: 200
Current Iteration: 1417
Status_code: 200
Current Iteration: 1418
Status_code: 200
Current Iteration: 1419
Status_code: 200
Current Iteration: 1420
Status_code: 200
Current Iteration: 1421
Status_code: 200
Current Iteration: 1422
Status_code: 200
Current Iteration: 1423
Status_code: 200
Current Iteration: 1424
Status_code: 200
Current Iteration: 1425
Status_code: 200
Current Iteration: 1426
Status_code: 200
Current Iteration: 1427
Status_code: 200
Current Iteration: 1428
Status_code: 200

Status_code: 200
Current Iteration: 1605
Status_code: 200
Current Iteration: 1606
Status_code: 200
Current Iteration: 1607
Status_code: 200
Current Iteration: 1608
Status_code: 200
Current Iteration: 1609
Status_code: 200
Current Iteration: 1610
Status_code: 200
Current Iteration: 1611
Status_code: 200
Current Iteration: 1612
Status_code: 200
Current Iteration: 1613
Status_code: 200
Current Iteration: 1614
Status_code: 200
Current Iteration: 1615
Status_code: 200
Current Iteration: 1616
Status_code: 200
Current Iteration: 1617
Status_code: 200
Current Iteration: 1618
Status_code: 200
Current Iteration: 1619
Status_code: 200
Current Iteration: 1620
Status_code: 200
Current Iteration: 1621
Status_code: 200
Current Iteration: 1622
Status_code: 200
Current Iteration: 1623
Status_code: 200
Current Iteration: 1624
Status_code: 200
Current Iteration: 1625
Status_code: 200
Current Iteration: 1626
Status_code: 200
Current Iteration: 1627
Status_code: 200
Current Iteration: 1628
Status_code: 200

Status_code: 200
Current Iteration: 1805
Status_code: 200
Current Iteration: 1806
Status_code: 200
Current Iteration: 1807
Status_code: 200
Current Iteration: 1808
Status_code: 200
Current Iteration: 1809
Status_code: 200
Current Iteration: 1810
Status_code: 200
Current Iteration: 1811
Status_code: 200
Current Iteration: 1812
Status_code: 200
Current Iteration: 1813
Status_code: 200
Current Iteration: 1814
Status_code: 200
Current Iteration: 1815
Status_code: 200
Current Iteration: 1816
Status_code: 200
Current Iteration: 1817
Status_code: 200
Current Iteration: 1818
Status_code: 200
Current Iteration: 1819
Status_code: 200
Current Iteration: 1820
Status_code: 200
Current Iteration: 1821
Status_code: 200
Current Iteration: 1822
Status_code: 200
Current Iteration: 1823
Status_code: 200
Current Iteration: 1824
Status_code: 200
Current Iteration: 1825
Status_code: 200
Current Iteration: 1826
Status_code: 200
Current Iteration: 1827
Status_code: 200
Current Iteration: 1828
Status_code: 200

Status_code: 200
Current Iteration: 2005
Status_code: 200
Current Iteration: 2006
Status_code: 200
Current Iteration: 2007
Status_code: 200
Current Iteration: 2008
Status_code: 200
Current Iteration: 2009
Status_code: 200
Current Iteration: 2010
Status_code: 200
Current Iteration: 2011
Status_code: 200
Current Iteration: 2012
Status_code: 200
Current Iteration: 2013
Status_code: 200
Current Iteration: 2014
Status_code: 200
Current Iteration: 2015
Status_code: 200
Current Iteration: 2016
Status_code: 200
Current Iteration: 2017
Status_code: 200
Current Iteration: 2018
Status_code: 200
Current Iteration: 2019
Status_code: 200
Current Iteration: 2020
Status_code: 200
Current Iteration: 2021
Status_code: 200
Current Iteration: 2022
Status_code: 200
Current Iteration: 2023
Status_code: 200
Current Iteration: 2024
Status_code: 200
Current Iteration: 2025
Status_code: 200
Current Iteration: 2026
Status_code: 200
Current Iteration: 2027
Status_code: 200
Current Iteration: 2028
Status_code: 200

Current Iteration: 2204
Status_code: 200
Current Iteration: 2205
Status_code: 200
Current Iteration: 2206
Status_code: 200
Current Iteration: 2207
Status_code: 200
Current Iteration: 2208
Status_code: 200
Current Iteration: 2209
Status_code: 200
Current Iteration: 2210
Status_code: 200
Current Iteration: 2211
Status_code: 200
Current Iteration: 2212
Status_code: 200
Current Iteration: 2213
Status_code: 200
Current Iteration: 2214
Status_code: 200
Current Iteration: 2215
Status_code: 200
Current Iteration: 2216
Status_code: 200
Current Iteration: 2217
Status_code: 200
Current Iteration: 2218
Status_code: 200
Current Iteration: 2219
Status_code: 200
Current Iteration: 2220
Status_code: 200
Current Iteration: 2221
Status_code: 200
Current Iteration: 2222
Status_code: 200
Current Iteration: 2223
Status_code: 200
Current Iteration: 2224
Status_code: 200
Current Iteration: 2225
Status_code: 200
Current Iteration: 2226
Status_code: 200
Current Iteration: 2227
Status_code: 200
Current Iteratio

Current Iteration: 2404
Status_code: 200
Current Iteration: 2405
Status_code: 200
Current Iteration: 2406
Status_code: 200
Current Iteration: 2407
Status_code: 200
Current Iteration: 2408
Status_code: 200
Current Iteration: 2409
Status_code: 200
Current Iteration: 2410
Status_code: 200
Current Iteration: 2411
Status_code: 200
Current Iteration: 2412
Status_code: 200
Current Iteration: 2413
Status_code: 200
Current Iteration: 2414
Status_code: 200
Current Iteration: 2415
Status_code: 200
Current Iteration: 2416
Status_code: 200
Current Iteration: 2417
Status_code: 200
Current Iteration: 2418
Status_code: 200
Current Iteration: 2419
Status_code: 200
Current Iteration: 2420
Status_code: 200
Current Iteration: 2421
Status_code: 200
Current Iteration: 2422
Status_code: 200
Current Iteration: 2423
Status_code: 200
Current Iteration: 2424
Status_code: 200
Current Iteration: 2425
Status_code: 200
Current Iteration: 2426
Status_code: 200
Current Iteration: 2427
Status_code: 200
Current Iteratio

Current Iteration: 2604
Status_code: 200
Current Iteration: 2605
Status_code: 200
Current Iteration: 2606
Status_code: 200
Current Iteration: 2607
Status_code: 200
Current Iteration: 2608
Status_code: 200
Current Iteration: 2609
Status_code: 200
Current Iteration: 2610
Status_code: 200
Current Iteration: 2611
Status_code: 200
Current Iteration: 2612
Status_code: 200
Current Iteration: 2613
Status_code: 200
Current Iteration: 2614
Status_code: 200
Current Iteration: 2615
Status_code: 200
Current Iteration: 2616
Status_code: 200
Current Iteration: 2617
Status_code: 200
Current Iteration: 2618
Status_code: 200
Current Iteration: 2619
Status_code: 200
Current Iteration: 2620
Status_code: 200
Current Iteration: 2621
Status_code: 200
Current Iteration: 2622
Status_code: 200
Current Iteration: 2623
Status_code: 200
Current Iteration: 2624
Status_code: 200
Current Iteration: 2625
Status_code: 200
Current Iteration: 2626
Status_code: 200
Current Iteration: 2627
Status_code: 200
Current Iteratio

Current Iteration: 2804
Status_code: 200
Current Iteration: 2805
Status_code: 200
Current Iteration: 2806
Status_code: 200
Current Iteration: 2807
Status_code: 200
Current Iteration: 2808
Status_code: 200
Current Iteration: 2809
Status_code: 200
Current Iteration: 2810
Status_code: 200
Current Iteration: 2811
Status_code: 200
Current Iteration: 2812
Status_code: 200
Current Iteration: 2813
Status_code: 200
Current Iteration: 2814
Status_code: 200
Current Iteration: 2815
Status_code: 200
Current Iteration: 2816
Status_code: 200
Current Iteration: 2817
Status_code: 200
Current Iteration: 2818
Status_code: 200
Current Iteration: 2819
Status_code: 200
Current Iteration: 2820
Status_code: 200
Current Iteration: 2821
Status_code: 200
Current Iteration: 2822
Status_code: 200
Current Iteration: 2823
Status_code: 200
Current Iteration: 2824
Status_code: 200
Current Iteration: 2825
Status_code: 200
Current Iteration: 2826
Status_code: 200
Current Iteration: 2827
Status_code: 200
Current Iteratio

Current Iteration: 3004
Status_code: 200
Current Iteration: 3005
Status_code: 200
Current Iteration: 3006
Status_code: 200
Current Iteration: 3007
Status_code: 200
Current Iteration: 3008
Status_code: 200
Current Iteration: 3009
Status_code: 200
Current Iteration: 3010
Status_code: 200
Current Iteration: 3011
Status_code: 200
Current Iteration: 3012
Status_code: 200
Current Iteration: 3013
Status_code: 200
Current Iteration: 3014
Status_code: 200
Current Iteration: 3015
Status_code: 200
Current Iteration: 3016
Status_code: 200
Current Iteration: 3017
Status_code: 200
Current Iteration: 3018
Status_code: 200
Current Iteration: 3019
Status_code: 200
Current Iteration: 3020
Status_code: 200
Current Iteration: 3021
Status_code: 200
Current Iteration: 3022
Status_code: 200
Current Iteration: 3023
Status_code: 200
Current Iteration: 3024
Status_code: 200
Current Iteration: 3025
Status_code: 200
Current Iteration: 3026
Status_code: 200
Current Iteration: 3027
Status_code: 200
Current Iteratio

Current Iteration: 3204
Status_code: 200
Current Iteration: 3205
Status_code: 200
Current Iteration: 3206
Status_code: 200
Current Iteration: 3207
Status_code: 200
Current Iteration: 3208
Status_code: 200
Current Iteration: 3209
Status_code: 200
Current Iteration: 3210
Status_code: 200
Current Iteration: 3211
Status_code: 200
Current Iteration: 3212
Status_code: 200
Current Iteration: 3213
Status_code: 200
Current Iteration: 3214
Status_code: 200
Current Iteration: 3215
Status_code: 200
Current Iteration: 3216
Status_code: 200
Current Iteration: 3217
Status_code: 200
Current Iteration: 3218
Status_code: 200
Current Iteration: 3219
Status_code: 200
Current Iteration: 3220
Status_code: 200
Current Iteration: 3221
Status_code: 200
Current Iteration: 3222
Status_code: 200
Current Iteration: 3223
Status_code: 200
Current Iteration: 3224
Status_code: 200
Current Iteration: 3225
Status_code: 200
Current Iteration: 3226
Status_code: 200
Current Iteration: 3227
Status_code: 200
Current Iteratio

Current Iteration: 3404
Status_code: 200
Current Iteration: 3405
Status_code: 200
Current Iteration: 3406
Status_code: 200
Current Iteration: 3407
Status_code: 200
Current Iteration: 3408
Status_code: 200
Current Iteration: 3409
Status_code: 200
Current Iteration: 3410
Status_code: 200
Current Iteration: 3411
Status_code: 200
Current Iteration: 3412
Status_code: 200
Current Iteration: 3413
Status_code: 200
Current Iteration: 3414
Status_code: 200
Current Iteration: 3415
Status_code: 200
Current Iteration: 3416
Status_code: 200
Current Iteration: 3417
Status_code: 200
Current Iteration: 3418
Status_code: 200
Current Iteration: 3419
Status_code: 200
Current Iteration: 3420
Status_code: 200
Current Iteration: 3421
Status_code: 200
Current Iteration: 3422
Status_code: 200
Current Iteration: 3423
Status_code: 200
Current Iteration: 3424
Status_code: 200
Current Iteration: 3425
Status_code: 200
Current Iteration: 3426
Status_code: 200
Current Iteration: 3427
Status_code: 200
Current Iteratio

Current Iteration: 3604
Status_code: 200
Current Iteration: 3605
Status_code: 200
Current Iteration: 3606
Status_code: 200
Current Iteration: 3607
Status_code: 200
Current Iteration: 3608
Status_code: 200
Current Iteration: 3609
Status_code: 200
Current Iteration: 3610
Status_code: 200
Current Iteration: 3611
Status_code: 200
Current Iteration: 3612
Status_code: 200
Current Iteration: 3613
Status_code: 200
Current Iteration: 3614
Status_code: 200
Current Iteration: 3615
Status_code: 200
Current Iteration: 3616
Status_code: 200
Current Iteration: 3617
Status_code: 200
Current Iteration: 3618
Status_code: 200
Current Iteration: 3619
Status_code: 200
Current Iteration: 3620
Status_code: 200
Current Iteration: 3621
Status_code: 200
Current Iteration: 3622
Status_code: 200
Current Iteration: 3623
Status_code: 200
Current Iteration: 3624
Status_code: 200
Current Iteration: 3625
Status_code: 200
Current Iteration: 3626
Status_code: 200
Current Iteration: 3627
Status_code: 200
Current Iteratio

Current Iteration: 3804
Status_code: 200
Current Iteration: 3805
Status_code: 200
Current Iteration: 3806
Status_code: 200
Current Iteration: 3807
Status_code: 200
Current Iteration: 3808
Status_code: 200
Current Iteration: 3809
Status_code: 200
Current Iteration: 3810
Status_code: 200
Current Iteration: 3811
Status_code: 200
Current Iteration: 3812
Status_code: 200
Current Iteration: 3813
Status_code: 200
Current Iteration: 3814
Status_code: 200
Current Iteration: 3815
Status_code: 200
Current Iteration: 3816
Status_code: 200
Current Iteration: 3817
Status_code: 200
Current Iteration: 3818
Status_code: 200
Current Iteration: 3819
Status_code: 200
Current Iteration: 3820
Status_code: 200
Current Iteration: 3821
Status_code: 200
Current Iteration: 3822
Status_code: 200
Current Iteration: 3823
Status_code: 200
Current Iteration: 3824
Status_code: 200
Current Iteration: 3825
Status_code: 200
Current Iteration: 3826
Status_code: 200
Current Iteration: 3827
Status_code: 200
Current Iteratio

Status_code: 200
Current Iteration: 4004
Status_code: 200
Current Iteration: 4005
Status_code: 200
Current Iteration: 4006
Status_code: 200
Current Iteration: 4007
Status_code: 200
Current Iteration: 4008
Status_code: 200
Current Iteration: 4009
Status_code: 200
Current Iteration: 4010
Status_code: 200
Current Iteration: 4011
Status_code: 200
Current Iteration: 4012
Status_code: 200
Current Iteration: 4013
Status_code: 200
Current Iteration: 4014
Status_code: 200
Current Iteration: 4015
Status_code: 200
Current Iteration: 4016
Status_code: 200
Current Iteration: 4017
Status_code: 200
Current Iteration: 4018
Status_code: 200
Current Iteration: 4019
Status_code: 200
Current Iteration: 4020
Status_code: 200
Current Iteration: 4021
Status_code: 200
Current Iteration: 4022
Status_code: 200
Current Iteration: 4023
Status_code: 200
Current Iteration: 4024
Status_code: 200
Current Iteration: 4025
Status_code: 200
Current Iteration: 4026
Status_code: 200
Current Iteration: 4027
Status_code: 200

Status_code: 200
Current Iteration: 4204
Status_code: 200
Current Iteration: 4205
Status_code: 200
Current Iteration: 4206
Status_code: 200
Current Iteration: 4207
Status_code: 200
Current Iteration: 4208
Status_code: 200
Current Iteration: 4209
Status_code: 200
Current Iteration: 4210
Status_code: 200
Current Iteration: 4211
Status_code: 200
Current Iteration: 4212
Status_code: 200
Current Iteration: 4213
Status_code: 200
Current Iteration: 4214
Status_code: 200
Current Iteration: 4215
Status_code: 200
Current Iteration: 4216
Status_code: 200
Current Iteration: 4217
Status_code: 200
Current Iteration: 4218
Status_code: 200
Current Iteration: 4219
Status_code: 200
Current Iteration: 4220
Status_code: 200
Current Iteration: 4221
Status_code: 200
Current Iteration: 4222
Status_code: 200
Current Iteration: 4223
Status_code: 200
Current Iteration: 4224
Status_code: 200
Current Iteration: 4225
Status_code: 200
Current Iteration: 4226
Status_code: 200
Current Iteration: 4227
Status_code: 200

In [338]:
movie_info

[{'title': 'Avengers: Endgame',
  'desc': "After the devastating events of Avengers: Infinity War, the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe.",
  'distr': 'Walt Disney Studios Motion PicturesSee full company information\n\n',
  'opening': '$357,115,0074,662\n            theaters',
  'budget': '$356,000,000',
  'mpaa': 'PG-13',
  'runtime': '3 hr 1 min',
  'genres': 'Action, Adventure, Drama, Sci-Fi',
  'gross_dom': '$858,373,000',
  'gross_inter': '$1,939,427,564',
  'gross_world': '$2,797,800,564'},
 {'title': 'The Lion King',
  'desc': 'After the murder of his father, a young lion prince flees his kingdom only to learn the true meaning of responsibility and bravery.',
  'distr': 'Walt Disney Studios Motion PicturesSee full company information\n\n',
  'opening': '$191,770,7594,725\n            theaters',
  'budget': '$260,000,000',
  'mpaa': 'PG',
  'runtime': '1 

In [348]:
with open('movie_info.pickle', 'wb') as to_write:
    pickle.dump(movie_info, to_write)

In [349]:
ls

Data_Scraping_Prototype.ipynb  movie_info.pickle  movie_stubs.pickle  README.md


In [350]:
movie_info

[{'title': 'Avengers: Endgame',
  'desc': "After the devastating events of Avengers: Infinity War, the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe.",
  'distr': 'Walt Disney Studios Motion PicturesSee full company information\n\n',
  'opening': '$357,115,0074,662\n            theaters',
  'budget': '$356,000,000',
  'mpaa': 'PG-13',
  'runtime': '3 hr 1 min',
  'genres': 'Action, Adventure, Drama, Sci-Fi',
  'gross_dom': '$858,373,000',
  'gross_inter': '$1,939,427,564',
  'gross_world': '$2,797,800,564'},
 {'title': 'The Lion King',
  'desc': 'After the murder of his father, a young lion prince flees his kingdom only to learn the true meaning of responsibility and bravery.',
  'distr': 'Walt Disney Studios Motion PicturesSee full company information\n\n',
  'opening': '$191,770,7594,725\n            theaters',
  'budget': '$260,000,000',
  'mpaa': 'PG',
  'runtime': '1 

In [352]:
movie_info_df = pd.DataFrame(movie_info)

In [355]:
movie_info_df.set_index(['title'], inplace=True)

In [356]:
movie_info_df

Unnamed: 0_level_0,desc,distr,opening,budget,mpaa,runtime,genres,gross_dom,gross_inter,gross_world
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Avengers: Endgame,After the devastating events of Avengers: Infi...,Walt Disney Studios Motion PicturesSee full co...,"$357,115,0074,662\n theaters","$356,000,000",PG-13,3 hr 1 min,"Action, Adventure, Drama, Sci-Fi","$858,373,000","$1,939,427,564","$2,797,800,564"
The Lion King,"After the murder of his father, a young lion p...",Walt Disney Studios Motion PicturesSee full co...,"$191,770,7594,725\n theaters","$260,000,000",PG,1 hr 58 min,"Adventure, Animation, Drama, Family, Musical","$543,638,043","$1,113,305,351","$1,656,943,394"
Toy Story 4,"When a new toy called ""Forky"" joins Woody and ...",Walt Disney Studios Motion PicturesSee full co...,"$120,908,0654,575\n theaters","$200,000,000",G,1 hr 40 min,"Adventure, Animation, Comedy, Family, Fantasy","$434,038,008","$639,356,585","$1,073,394,593"
Frozen II,"Anna, Elsa, Kristoff, Olaf and Sven leave Aren...",Walt Disney Studios Motion PicturesSee full co...,"$130,263,3584,440\n theaters","$150,000,000",PG,1 hr 43 min,"Adventure, Animation, Comedy, Family, Fantasy,...","$477,373,578","$972,653,355","$1,450,026,933"
Captain Marvel,Carol Danvers becomes one of the universe's mo...,Walt Disney Studios Motion PicturesSee full co...,"$153,433,4234,310\n theaters","$160,000,000",PG-13,2 hr 3 min,"Action, Adventure, Sci-Fi","$426,829,839","$701,444,955","$1,128,274,794"
...,...,...,...,...,...,...,...,...,...,...
Tarzan,A man raised by gorillas must decide where he ...,Walt Disney Studios Motion PicturesSee full co...,"$34,221,9683,005\n theaters","$130,000,000",G,1 hr 28 min,"Adventure, Animation, Family, Romance","$171,091,819","$277,100,000","$448,191,819"
Music of the Heart,Story of a schoolteacher's struggle to teach v...,MiramaxSee full company information\n\n,"$3,653,2811,349\n theaters","$27,000,000",PG,2 hr 4 min,"Drama, Music","$14,859,394","$14,859,394",
The Messenger: The Story of Joan of Arc,A young girl receives a vision that drives her...,Sony Pictures Entertainment (SPE)See full comp...,"$6,360,9682,147\n theaters","$85,000,000",R,2 hr 38 min,"Adventure, Biography, Drama, History, War","$14,276,317","$52,700,000","$66,976,317"
Snatch,"Unscrupulous boxing promoters, violent bookmak...",Screen GemsSee full company information\n\n,"$27,9321\n theaters",,R,1 hr 44 min,"Comedy, Crime","$30,328,156","$53,229,716","$83,557,872"


In [357]:
movie_stubs.merge(movie_info_df, left_index=True, right_index=True)

Unnamed: 0,link_stub,release,theaters,desc,distr,opening,budget,mpaa,runtime,genres,gross_dom,gross_inter,gross_world
10 Cloverfield Lane,/release/rl1329956353/?ref_=bo_yld_table_44,"Mar 11, 2016",3427,"After getting in a car accident, a woman is he...",Paramount PicturesSee full company information...,"$24,727,4373,391\n theaters",,PG-13,1 hr 43 min,"Action, Drama, Horror, Mystery, Sci-Fi, Thriller","$72,082,998","$38,134,000","$110,216,998"
"10,000 BC",/release/rl977372673/?ref_=bo_yld_table_26,"Mar 7, 2008",3454,"In the prehistoric past, D'Leh is a mammoth hu...",Warner Bros.See full company information\n\n,"$35,867,4883,410\n theaters","$105,000,000",PG-13,1 hr 49 min,"Action, Adventure, Drama, Fantasy, History","$94,784,201","$175,000,000","$269,784,201"
102 Dalmatians,/release/rl927041025/?ref_=bo_yld_table_137,"Nov 22, 2001",2704,Cruella DeVil gets out of prison and goes afte...,Walt Disney Studios Motion PicturesSee full co...,"$19,883,3512,704\n theaters","$85,000,000",,1 hr 40 min,"Adventure, Comedy, Family","$66,957,026","$116,654,745","$183,611,771"
102 Dalmatians,/release/rl927041025/?ref_=bo_yld_table_137,"Nov 22, 2001",2704,Cruella DeVil gets out of prison and goes afte...,Walt Disney Studios Motion PicturesSee full co...,"$19,883,3512,704\n theaters","$85,000,000",,1 hr 40 min,"Adventure, Comedy, Family","$66,957,026","$116,654,745","$183,611,771"
102 Dalmatians,/release/rl927041025/?ref_=bo_yld_table_44,"Nov 22, 2000",2704,Cruella DeVil gets out of prison and goes afte...,Walt Disney Studios Motion PicturesSee full co...,"$19,883,3512,704\n theaters","$85,000,000",,1 hr 40 min,"Adventure, Comedy, Family","$66,957,026","$116,654,745","$183,611,771"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
xXx: State of the Union,/release/rl1467647489/?ref_=bo_yld_table_96,"Apr 29, 2005",3480,"Darius Stone, a new agent in the xXx program, ...",Revolution StudiosSee full company information...,"$12,712,2723,480\n theaters",,PG-13,1 hr 41 min,"Action, Adventure, Crime, Sci-Fi, Thriller","$26,873,932","$44,148,761","$71,022,693"
Æon Flux,/release/rl1212188161/?ref_=bo_yld_table_270,"Dec 2, 2006",2608,Aeon Flux is a mysterious assassin working for...,Paramount PicturesSee full company information...,"$12,661,1122,608\n theaters","$62,000,000",PG-13,1 hr 33 min,"Action, Adventure, Sci-Fi, Thriller","$25,874,337","$27,447,336","$53,321,673"
Æon Flux,/release/rl1212188161/?ref_=bo_yld_table_270,"Dec 2, 2006",2608,Aeon Flux is a mysterious assassin working for...,Paramount PicturesSee full company information...,"$12,661,1122,608\n theaters","$62,000,000",PG-13,1 hr 33 min,"Action, Adventure, Sci-Fi, Thriller","$25,874,337","$27,447,336","$53,321,673"
Æon Flux,/release/rl1212188161/?ref_=bo_yld_table_102,"Dec 2, 2005",2608,Aeon Flux is a mysterious assassin working for...,Paramount PicturesSee full company information...,"$12,661,1122,608\n theaters","$62,000,000",PG-13,1 hr 33 min,"Action, Adventure, Sci-Fi, Thriller","$25,874,337","$27,447,336","$53,321,673"


In [358]:
ls

Data_Scraping_Prototype.ipynb  movie_info.pickle  movie_stubs.pickle  README.md


In [3]:
#need to get Cast Info
movie_df = pickle.load(open('clean_movie_df.pickle', 'rb'))
cast_stubs = movie_df[['link_stub']]

In [9]:
cast_stubs.reset_index(inplace=True)

In [37]:
url = "https://www.boxofficemojo.com/title/tt1179933/?ref_=bo_rl_ti"

response = requests.get(url)

page = response.text

soup = BeautifulSoup(page)


In [38]:
soup.find(id_="tabs")

In [7]:
soup.find('a',class_='a-link-normal mojo-title-link refiner-display-highlight')['href']

'/title/tt1179933/?ref_=bo_tt_ti'

In [39]:
title_summary = {}
for link in cast_stubs.link_stub:
    base_url = "https://www.boxofficemojo.com"
    url = base_url+link
    
    response = requests.get(url)
    title = soup.find('h1').text
    
    title_sum_link = soup.find('a',class_='a-link-normal mojo-title-link refiner-display-highlight')['href']
    title_summary[title] = title_sum_link
    
    


In [21]:
cast_stubs.rename(columns ={'index':'title'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [40]:
title_summary

{'10 Cloverfield Lane (2016)': '/title/tt1179933/?ref_=bo_tt_ti'}

In [1]:
ls

Data_Scraping_Prototype.ipynb           cast_info_list_start_to_1500.pickle
Data_Scraping_Prototype_Selenium.ipynb  clean_movie_df.pickle
Feature Engineering.ipynb               cleaning_prototype.ipynb
README.md                               geckodriver.log
Regression prototype-V2.ipynb           movie_info.pickle
Regression prototype.ipynb              movie_stubs.pickle
Untitled.ipynb                          processed_movie_df.pickle
cast_info.pickle                        usholidays.csv
cast_info_list1500_to_end.pickle


In [2]:
import pickle

In [3]:
movie_stubs = pickle.load(open('movie_stubs.pickle', 'rb'))

In [4]:
movie_stubs.head()

Unnamed: 0,link_stub,release,theaters
Avengers: Endgame,/release/rl3059975681/?ref_=bo_yld_table_1,"Apr 26, 2019",4662
The Lion King,/release/rl3321923073/?ref_=bo_yld_table_2,"Jul 19, 2019",4802
Toy Story 4,/release/rl3798500865/?ref_=bo_yld_table_3,"Jun 21, 2019",4575
Frozen II,/release/rl2424210945/?ref_=bo_yld_table_4,"Nov 22, 2019",4440
Captain Marvel,/release/rl3009644033/?ref_=bo_yld_table_5,"Mar 8, 2019",4310


In [6]:
url = "https://www.boxofficemojo.com/release/rl3059975681/?ref_=bo_yld_table_1"
response = requests.get(url)
page = response.text

soup = BeautifulSoup(page, 'lxml')

In [9]:
newlink = soup.find('a', class_="a-link-normal mojo-title-link refiner-display-highlight")['href']

In [11]:
newlink.replace('?ref_=bo_rl_ti','credits/?ref_=bo_tt_tab#tabs')

'/title/tt4154796/credits/?ref_=bo_tt_tab#tabs'

In [22]:
#get cast Crew info from each site:

def get_cast_info2(stub):
    
    base = 'https://www.boxofficemojo.com'
    url = base+stub
    
    response = requests.get(url)
    page = response.text
    
    soup = BeautifulSoup(page, 'lxml')
    
    newlink = soup.find('a', class_="a-link-normal mojo-title-link refiner-display-highlight")['href']    
    newlink = newlink.replace('?ref_=bo_rl_ti','credits/?ref_=bo_tt_tab#tabs')
    url = base+newlink
    
    response = requests.get(url)
    page = response.text 
    soup = BeautifulSoup(page, 'lxml')
    
    crew_dict = defaultdict(list)
    
    #find Table Element 
    tables = soup.find_all('table')

    crew_rows = tables[0].find_all('tr')
    for row in crew_rows[1:]:
        curr_row = row.find_all('td')
        role = curr_row[1].text.rstrip()
        name = curr_row[0].text.rstrip()
        #add to crew dictionairy
        crew_dict[role].append(name)


    actor_rows = tables[1].find_all('tr')
    for row in actor_rows[1:]:
        curr_row = row.find_all('td')
        name = curr_row[0].text.rstrip()
        crew_dict['Actors'].append(name)

    cast_dict = {}
    title = soup.find('h1').text
    cast_dict['title'] = title
    
    for key in crew_dict:
        cast_dict[key] = ', '.join(crew_dict[key])
        
    return cast_dict


In [16]:
movie_stubs.link_stub

Avengers: Endgame                            /release/rl3059975681/?ref_=bo_yld_table_1
The Lion King                                /release/rl3321923073/?ref_=bo_yld_table_2
Toy Story 4                                  /release/rl3798500865/?ref_=bo_yld_table_3
Frozen II                                    /release/rl2424210945/?ref_=bo_yld_table_4
Captain Marvel                               /release/rl3009644033/?ref_=bo_yld_table_5
                                                               ...                     
Tarzan                                     /release/rl2742388225/?ref_=bo_yld_table_337
Music of the Heart                          /release/rl761562625/?ref_=bo_yld_table_344
The Messenger: The Story of Joan of Arc    /release/rl1986168321/?ref_=bo_yld_table_350
Snatch                                      /release/rl141592065/?ref_=bo_yld_table_401
The Gift                                    /release/rl945063425/?ref_=bo_yld_table_411
Name: link_stub, Length: 4279, d

In [36]:
cast_info_list = []

In [38]:
c = 0
for link in movie_stubs.link_stub:
    try:
        cast_info_list.append(get_cast_info2(link))
    except:
        c+=1
        print('{} rows missed', end="\r", flush=True)
    if len(cast_info_list)%10==0:
        with open('complete_cast_info.pickle', 'wb') as to_write:
            pickle.dump(cast_info_list, to_write)
    time.sleep(0.1)
    print('{} rows completed'.format(len(cast_info_list)), end='\r', flush=True)

4255 rows completed

In [39]:
with open('complete_cast_info.pickle', 'wb') as to_write:
    pickle.dump(cast_info_list, to_write)

In [40]:
cast_info_list

[{'title': 'Avengers: Endgame (2019)',
  'Director': 'Anthony Russo, Joe Russo',
  'Writer': 'Christopher Markus, Stephen McFeely, Stan Lee, Jack Kirby, Joe Simon, Jack Kirby, Steve Englehart, Steve Gan, Bill Mantlo, Keith Giffen, Jim Starlin, Stan Lee, Larry Lieber, Jack Kirby, Steve Englehart, Don Heck',
  'Producer': 'Kevin Feige',
  'Composer': 'Alan Silvestri',
  'Cinematographer': 'Trent Opaloch',
  'Editor': 'Jeffrey Ford, Matthew Schmidt',
  'Production Designer': 'Charles Wood',
  'Actors': 'Robert Downey Jr., Chris Evans, Mark Ruffalo, Chris Hemsworth'},
 {'title': 'The Lion King (2019)',
  'Director': 'Jon Favreau',
  'Writer': 'Jeff Nathanson, Irene Mecchi, Jonathan Roberts, Linda Woolverton',
  'Producer': 'Jon Favreau, Karen Gilchrist, Jeffrey Silver',
  'Composer': 'Hans Zimmer',
  'Cinematographer': 'Caleb Deschanel',
  'Editor': 'Adam Gerstel, Mark Livolsi',
  'Production Designer': 'James Chinlund',
  'Actors': 'Donald Glover, Beyoncé, Seth Rogen, Chiwetel Ejiofor'},
