# Disney Movies Wikipedia Scraper

This webscraper consists of scraping the list of Walt Disney picture films on wikipedia and its corresponding extra information needed for each film.

## Webscraping

### bs4 set up

In [1]:
from bs4 import BeautifulSoup
import requests

In [2]:
# website to be scraped
url = 'https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films'
website = requests.get(url).content
soup = BeautifulSoup(website, 'lxml')

### start of by selecting a movie and scraping the info box

In [3]:
url = 'https://en.wikipedia.org/wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)'
website = requests.get(url).text
soup_movie = BeautifulSoup(website, 'lxml')

In [4]:
# soup of the info box
info_box = soup_movie.find(class_='infobox vevent')
info_rows = info_box.select('tr')

In [5]:
# create empty dictionary to scrape title and respective information for movie
movie_info = {}

In [6]:
for index, row in enumerate(info_rows):
    if index == 0:
        movie_info['title'] = row.find('th').text
    elif index == 1:
        continue
    else:
        content_key = row.find('th').text
        content_value = row.find('td').text
        movie_info[content_key] = content_value

In [7]:
movie_info

{'title': 'Snow White and the Seven Dwarfs',
 'Directed by': 'Supervising Director \nDavid Hand\n\nSequence Directors \nWilliam Cottrell\nWilfred Jackson\nLarry Morey\nPerce Pearce\nBen Sharpsteen\n',
 'Written by': '\nTed Sears\nRichard Creedon\nOtto Englander\nDick Rickard\nEarl Hurd\nMerrill De Maris\nDorothy Ann Blank\nWebb Smith\n',
 'Based on': 'Snow Whiteby The Brothers Grimm',
 'Produced by': 'Walt Disney',
 'Starring': '\nAdriana Caselotti\nLucille La Verne\nHarry Stockwell\nRoy Atwell\nPinto Colvig\nOtis Harlan\nScotty Mattraw\nBilly Gilbert\nEddie Collins\nMoroni Olsen\nStuart Buchanan\n',
 'Music by': '\nFrank Churchill\nPaul Smith\nLeigh Harline\n',
 'Productioncompany': 'Walt Disney Productions',
 'Distributed by': 'RKO Radio Pictures',
 'Release dates': '\nDecember\xa021,\xa01937\xa0(1937-12-21) (Carthay Circle Theatre)\nFebruary\xa04,\xa01938\xa0(1938-02-04) (United States)\n',
 'Running time': '83 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget'

In [8]:
# clean dictionary: get rid of \xa0 using replace, [1] using regex,'productioncompany', strip in get text to get rid of white space, deal with <br> tags
import re

def get_content_value(row_data):
    if row.find('li'):
        return [re.sub(r'\[\d*?\]', '', li.get_text(' ', strip=True).replace('\xa0', ' ')) for li in row_data.find_all('li')]
    else:
        return re.sub(r'\[\d*?\]', '', row_data.get_text(' ', strip=True).replace('\xa0', ' '))
    
for index, row in enumerate(info_rows): # provides an index to each table row that has to be looped through
    if index == 0: # first index is a title
        movie_info['title'] = row.find('th').get_text(' ', strip=True)
    elif index == 1: # second index is useless
        continue
    else: # index 3- is what we need
        content_key = row.find('th').get_text(' ', strip=True)        
        content_value = get_content_value(row.find('td'))
        movie_info[content_key] = content_value

movie_info

{'title': 'Snow White and the Seven Dwarfs',
 'Directed by': ['David Hand',
  'William Cottrell',
  'Wilfred Jackson',
  'Larry Morey',
  'Perce Pearce',
  'Ben Sharpsteen'],
 'Written by': ['Ted Sears',
  'Richard Creedon',
  'Otto Englander',
  'Dick Rickard',
  'Earl Hurd',
  'Merrill De Maris',
  'Dorothy Ann Blank',
  'Webb Smith'],
 'Based on': 'Snow White by The Brothers Grimm',
 'Produced by': 'Walt Disney',
 'Starring': ['Adriana Caselotti',
  'Lucille La Verne',
  'Harry Stockwell',
  'Roy Atwell',
  'Pinto Colvig',
  'Otis Harlan',
  'Scotty Mattraw',
  'Billy Gilbert',
  'Eddie Collins',
  'Moroni Olsen',
  'Stuart Buchanan'],
 'Music by': ['Frank Churchill', 'Paul Smith', 'Leigh Harline'],
 'Productioncompany': 'Walt Disney Productions',
 'Distributed by': 'RKO Radio Pictures',
 'Release dates': ['December 21, 1937 ( 1937-12-21 ) ( Carthay Circle Theatre )',
  'February 4, 1938 ( 1938-02-04 ) (United States)'],
 'Running time': '83 minutes',
 'Country': 'United States',
 '

### Make sure to scrape for each movie by getting each link

In [9]:
# empty dict to append to
movies = {}

In [10]:
# gives a list of all the movies and links
links = soup.select('table > tbody > tr > td > i > a')

movies['title'] = [link.get_text(' ', strip=True) for link in links]
movies['link'] = [link.get('href') for link in links]

In [11]:
for index, link in enumerate(movies['link']):
    movies['link'][index] = 'https://en.wikipedia.org' + link

In [12]:
movies['link'][1]

'https://en.wikipedia.org/wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)'

More cleaning of data

In [13]:
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    elif row_data.find("br"):
        return [text for text in row_data.stripped_strings]
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ")

# get rids of number tags, re was creating problems with the string list. Span gets rid of the extra date in brackets.
def clean_tags(soup):
    for tag in soup.find_all(["sup", "span"]):
        tag.decompose()

def get_info_box(url):
    website = requests.get(url).content
    soup = BeautifulSoup(website, 'lxml')
    info_box = soup.find(class_="infobox vevent")
    info_rows = info_box.select("tr")
    
    clean_tags(soup)
        
    movie_info = {}
    for index, row in enumerate(info_rows):
        if index == 0:
            movie_info['title'] = row.find("th").get_text(" ", strip=True)
        else:
            header = row.find('th') # deals with some links not having .text
            if header:
                content_key = row.find("th").get_text(" ", strip=True)
                content_value = get_content_value(row.find("td"))
                movie_info[content_key] = content_value
            
    return movie_info    

In [14]:
get_info_box('https://en.wikipedia.org/wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)')

{'title': 'Snow White and the Seven Dwarfs',
 'Directed by': ['David Hand',
  'William Cottrell',
  'Wilfred Jackson',
  'Larry Morey',
  'Perce Pearce',
  'Ben Sharpsteen'],
 'Written by': ['Ted Sears',
  'Richard Creedon',
  'Otto Englander',
  'Dick Rickard',
  'Earl Hurd',
  'Merrill De Maris',
  'Dorothy Ann Blank',
  'Webb Smith'],
 'Based on': ['Snow White', 'by The', 'Brothers Grimm'],
 'Produced by': 'Walt Disney',
 'Starring': ['Adriana Caselotti',
  'Lucille La Verne',
  'Harry Stockwell',
  'Roy Atwell',
  'Pinto Colvig',
  'Otis Harlan',
  'Scotty Mattraw',
  'Billy Gilbert',
  'Eddie Collins',
  'Moroni Olsen',
  'Stuart Buchanan'],
 'Music by': ['Frank Churchill', 'Paul Smith', 'Leigh Harline'],
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'RKO Radio Pictures',
 'Release dates': ['December 21, 1937 ( Carthay Circle Theatre )',
  'February 4, 1938 (United States)'],
 'Running time': '83 minutes',
 'Country': 'United States',
 'Language': 'English',

### Run script for every link

In [15]:
url = 'https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films'
website = requests.get(url).content
soup = BeautifulSoup(website, 'lxml')

movie_list = []

for index, movie in enumerate(movies['link']):
    try:
        movie_list.append(get_info_box(movie))
    except Exception as e:
        print(movie)
        print(e)

https://en.wikipedia.org/wiki/Zorro_(1957_TV_series)#Theatrical
'NoneType' object has no attribute 'find'
https://en.wikipedia.org/wiki/Zorro_(1957_TV_series)#Theatrical
'NoneType' object has no attribute 'find'
https://en.wikipedia.org/wiki/The_Omega_Connection
'NoneType' object has no attribute 'find'
https://en.wikipedia.org/wiki/The_Beatles:_Get_Back#The_Beatles:_Get_Back_–_The_Rooftop_Concert
'NoneType' object has no attribute 'find'
https://en.wikipedia.org/wiki/Chris_Paul
'NoneType' object has no attribute 'select'
https://en.wikipedia.org/wiki/All_Night_Long_(All_Night)
'NoneType' object has no attribute 'find'
https://en.wikipedia.org/wiki/Keeper_of_the_Lost_Cities#Film_adaptation
'NoneType' object has no attribute 'select'
https://en.wikipedia.org/wiki/Jim_Henson#Legacy
'NoneType' object has no attribute 'select'
https://en.wikipedia.org/wiki/Sister_Act_3
'NoneType' object has no attribute 'find'
https://en.wikipedia.org/wiki/The_Thief_(Turner_novel)
'NoneType' object has no 

In [16]:
total_links = len(movies['link'])

In [17]:
print(f'{len(movie_list)}/{total_links} scraped.')

520/535 scraped.


## Export file

In [18]:
import json

def save_data(title, data):
    with open (title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
        
def load_data(title):
    with open(title, encoding='utf-8') as f:
        return json.load(f)

In [19]:
save_data("disney_data.json", movie_list)

## Load Data In

In [20]:
movie_raw = load_data("disney_data.json")

In [21]:
movie_raw[1:5]

[{'title': 'Snow White and the Seven Dwarfs',
  'Directed by': ['David Hand',
   'William Cottrell',
   'Wilfred Jackson',
   'Larry Morey',
   'Perce Pearce',
   'Ben Sharpsteen'],
  'Written by': ['Ted Sears',
   'Richard Creedon',
   'Otto Englander',
   'Dick Rickard',
   'Earl Hurd',
   'Merrill De Maris',
   'Dorothy Ann Blank',
   'Webb Smith'],
  'Based on': ['Snow White', 'by The', 'Brothers Grimm'],
  'Produced by': 'Walt Disney',
  'Starring': ['Adriana Caselotti',
   'Lucille La Verne',
   'Harry Stockwell',
   'Roy Atwell',
   'Pinto Colvig',
   'Otis Harlan',
   'Scotty Mattraw',
   'Billy Gilbert',
   'Eddie Collins',
   'Moroni Olsen',
   'Stuart Buchanan'],
  'Music by': ['Frank Churchill', 'Paul Smith', 'Leigh Harline'],
  'Production company': 'Walt Disney Productions',
  'Distributed by': 'RKO Radio Pictures',
  'Release dates': ['December 21, 1937 ( Carthay Circle Theatre )',
   'February 4, 1938 (United States)'],
  'Running time': '83 minutes',
  'Country': 'Unit

## Further cleaning

__Cleaning__:

- Convert time to int
- Convert date to datetime
- Convert revenue, budget, etc to int

In [22]:
# for movie in movies, get the running time, if it does not have one, make it equal to N/A
x = [movie.get('Running time', 'N/A') for movie in movie_raw]
x[20:40]

['77 minutes',
 '92 minutes',
 '69 minutes',
 '81 minutes',
 ['60 minutes (VHS version)', '71 minutes (original)'],
 '127 minutes',
 '93 minutes',
 '76 minutes',
 '75 minutes',
 '73 minutes',
 '85 minutes',
 '81 minutes',
 '70 minutes',
 '90 min.',
 '80 minutes',
 '75 minutes',
 '83 minutes',
 '83 minutes',
 '72 minutes',
 '97 minutes']

- Convert time to int

In [23]:
def convert_time(running_time):
    if running_time == 'N/A': # if it has no value
        return None
    elif isinstance(running_time, list): # deals with lists by only taking first value
        entry = running_time[0]
        return int(entry.split(" ")[0])
    else:
        return int(running_time.split(" ")[0])

In [24]:
for movie in movie_raw:
    # add a new key to the dictionary with fixed values
    movie['Running time (int)'] = convert_time(movie.get('Running time', 'N/A'))

In [25]:
movie_raw[-1]

{'title': 'The Rocketeer',
 'Directed by': 'Joe Johnston',
 'Screenplay by': ['Danny Bilson', 'Paul De Meo'],
 'Story by': ['Danny Bilson', 'Paul De Meo', 'William Dear'],
 'Based on': ['The Rocketeer', 'by', 'Dave Stevens'],
 'Produced by': ['Charles Gordon', 'Lawrence Gordon', 'Lloyd Levin'],
 'Starring': ['Bill Campbell',
  'Alan Arkin',
  'Jennifer Connelly',
  'Paul Sorvino',
  'Timothy Dalton'],
 'Cinematography': 'Hiro Narita',
 'Edited by': ['Michael A. Stevenson', 'Arthur Schmidt'],
 'Music by': 'James Horner',
 'Production companies': ['Walt Disney Pictures',
  'Touchstone Pictures',
  'Silver Screen Partners IV',
  'Gordon Company'],
 'Distributed by': 'Buena Vista Pictures Distribution',
 'Release date': ['June 21, 1991'],
 'Running time': '108 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$35-40 million',
 'Box office': '$46.7 million (USA)',
 'Running time (int)': 108}

- Convert revenue, budget, etc to int

In [26]:
[movie.get('Budget', 'N/A') for movie in movie_raw]

['N/A',
 '$1.49 million',
 '$2.6 million',
 '$2.28 million',
 '$600,000',
 '$950,000',
 '$858,000',
 'N/A',
 '$788,000',
 'N/A',
 '$1.35 million',
 '$2.125 million',
 'N/A',
 '$1.5 million',
 '$1.5 million',
 'N/A',
 '$2.2 million',
 '$1,800,000',
 '$3 million',
 'N/A',
 '$4 million',
 '$2 million',
 '$300,000',
 '$1.8 million',
 'N/A',
 '$5 million',
 'N/A',
 '$4 million',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 '$700,000',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 '$6 million',
 'under $1 million or $1,250,000',
 'N/A',
 '$2 million',
 'N/A',
 'N/A',
 '$2.5 million',
 'N/A',
 'N/A',
 '$4 million',
 '$3.6 million',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 '$3 million',
 'N/A',
 '$3 million',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 '$3 million',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 '$4.4–6 million',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 '$4 million',
 'N/A',
 '$5 million',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 '

In [27]:
[movie.get('Box office', 'N/A') for movie in movie_raw]

['$45.472',
 '$418 million',
 '$164 million',
 '$76.4–$83.3 million (United States and Canada)',
 '$960,000 (worldwide rentals)',
 '>$1.3 million (est. United States/Canada rentals, 1941)',
 '$267.4 million',
 '$1.135 million (worldwide rentals)',
 '$799,000',
 '$3.355 million (worldwide rentals)',
 '$3.275 million (worldwide rentals)',
 '$65 million',
 '$3.165 million (worldwide rentals)',
 '$2.56 million (worldwide rentals)',
 '$3.7 million (U.S. rental) $575,000 (foreign rental)',
 '$1.625 million (worldwide rentals)',
 '$182 million',
 '$4,100,000 (worldwide rentals)',
 ['$2.4 million (1951, domestic)', '$3.5 million (1974, domestic)'],
 '$2.1 million (US rentals)',
 '$87.4 million (United States and Canada)',
 '$1 million (US)',
 '$2.6 million (US)',
 'N/A',
 '$1.75 million (US and Canadian rentals)',
 '$28.2 million',
 '$2,150,000 (US)',
 '$187 million',
 '$2.1 million (US)',
 '$1.6 million (US)',
 '$1.7 million (US)',
 'N/A',
 'N/A',
 '$2.75 million (US)',
 'N/A',
 '$1.75 millio

In [28]:
import re

amounts = r"thousand|million|billion"
number = r"\d+(,\d{3})*\.*\d*"

word_re = rf"\${number}(-|\sto\s|–|\s-|\s-\s)?({number})?\s({amounts})"
value_re = rf"\${number}"

# \d+  one or more digits 
# (,\d{3})*  a comma followed by three digits and the star means zero or more of this combo
# \.*  a decimal point, zero or more 
# \d* zero or more numbers
# \${number} makes it so that you have the dollar sign followed by a number 
# \s space
# (-|\sto\s|–) - or, space to space, or -
# ? it exists or it does not exist


def word_to_value(word):
    value_dict = {"thousand": 1000, "million": 1000000, "billion": 1000000000}
    return value_dict[word]

def parse_word_syntax(string):
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    word = re.search(amounts, string, flags=re.I).group().lower()
    word_value = word_to_value(word)
    return value*word_value

def parse_value_syntax(string):
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    return value

'''
money_conversion("$12.2 million") --> 12200000 ## Word syntax
money_conversion("$790,000") --> 790000        ## Value syntax
'''

def money_conversion(money):
    if money == "N/A":
        return None

    if isinstance(money, list):
        money = money[0]
        
    word_syntax = re.search(word_re, money, flags=re.I)
    value_syntax = re.search(value_re, money)

    if word_syntax:
        return parse_word_syntax(word_syntax.group())

    elif value_syntax:
        return parse_value_syntax(value_syntax.group())

    else:
        return None

In [29]:
money_conversion("$36 - 39.6 million")

36000000.0

In [30]:
for movie in movie_raw:
    movie['Budget (float)'] = money_conversion(movie.get('Budget', "N/A"))
    movie['Box office (float)'] = money_conversion(movie.get('Box office', "N/A"))

In [31]:
print([movie['Budget (float)'] for movie in movie_raw[0:10]])

[None, 1490000.0, 2600000.0, 2280000.0, 600000.0, 950000.0, 858000.0, None, 788000.0, None]


In [32]:
print([movie['Box office (float)'] for movie in movie_raw[0:10]])

[45.472, 418000000.0, 164000000.0, 83300000.0, 960000.0, 1300000.0, 267399999.99999997, 1135000.0, 799000.0, 3355000.0]


- Convert date to datetime

In [33]:
print([movie.get('Release dates', 'N/A') for movie in movie_raw[0:10]])
print('\n')
print([movie.get('Release date', 'N/A') for movie in movie_raw[0:10]])

['N/A', ['December 21, 1937 ( Carthay Circle Theatre )', 'February 4, 1938 (United States)'], ['February 7, 1940 ( Center Theatre )', 'February 23, 1940 (United States)'], 'N/A', 'N/A', ['October 23, 1941 (New York City)', 'October 31, 1941 (U.S.)'], ['August 9, 1942 (World Premiere – London)', 'August 13, 1942 (Premiere – New York City)', 'August 21, 1942 (U.S.)'], ['August 24, 1942 (World Premiere – Rio de Janeiro)', 'February 6, 1943 (U.S. Premiere – Boston)', 'February 19, 1943 (U.S.)'], 'N/A', ['December 21, 1944 (Mexico City)', 'February 3, 1945 (US)']]


[['May 19, 1937'], 'N/A', 'N/A', ['November 13, 1940'], ['June 27, 1941'], 'N/A', 'N/A', 'N/A', ['July 17, 1943'], 'N/A']


Release date and release dates

In [34]:
from datetime import datetime

In [35]:
# get rid of brackets and leave following format: May 23, 2012
dates_1 = [movie.get('Release date', 'N/A') for movie in movie_raw]
dates_2 = [movie.get('Release dates', 'N/A') for movie in movie_raw]

In [36]:
# June 28, 1950

def clean_date(date):
    return date.split("(")[0].strip()

def date_conversion(date):
    if isinstance(date, list):
        date = date[0]
        
    if date == "N/A":
        return None
        
    date_str = clean_date(date)

    fmts = ["%B %d, %Y", "%d %B %Y"]
    for fmt in fmts:
        try:
            return datetime.strptime(date_str, fmt)
        except:
            pass
    return None

In [37]:
# join release date and release dates to a new dictionary key and then apply datetime function

# start of by making a new dictionary key
for movie in movie_raw:
    movie['Release date temp'] = ''

# add release dates and release date to list
for movie in movie_raw:
    try:
        movie['Release date temp'] = movie['Release date']
    except:
        pass

for movie in movie_raw:
    try:
        movie['Release date temp'] = movie['Release dates']
    except:
        pass

In [38]:
for movie in movie_raw:
    movie['Release date (datetime)'] = date_conversion(movie.get('Release date temp', 'N/A'))

## Save cleaned file

In [39]:
# cannot save datetime to JSON therefore use pickle
import pickle

with open('cleaned_disney_data.pkl', 'wb') as f:
    pickle.dump(movie_raw, f)
        
with open('cleaned_disney_data.pkl', 'rb') as f:
    loaded_dict = pickle.load(f)


In [40]:
loaded_dict[0:1]

[{'title': 'Academy Award Review of',
  'Production company': 'Walt Disney Productions',
  'Distributed by': 'United Artists',
  'Release date': ['May 19, 1937'],
  'Running time': '41 minutes (74 minutes 1966 release)',
  'Country': 'United States',
  'Language': 'English',
  'Box office': '$45.472',
  'Running time (int)': 41,
  'Budget (float)': None,
  'Box office (float)': 45.472,
  'Release date temp': ['May 19, 1937'],
  'Release date (datetime)': datetime.datetime(1937, 5, 19, 0, 0)}]

## Attach IMDB/Rotten Tomato/Meta scores

In [41]:
cleaned_data = loaded_dict

In [42]:
cleaned_data[-40]

{'title': 'Encanto',
 'Directed by': ['Jared Bush', 'Byron Howard'],
 'Screenplay by': ['Charise Castro Smith', 'Jared Bush'],
 'Story by': ['Jared Bush',
  'Byron Howard',
  'Charise Castro Smith',
  'Jason Hand',
  'Nancy Kruse',
  'Lin-Manuel Miranda'],
 'Produced by': ['Yvett Merino', 'Clark Spencer'],
 'Starring': ['Stephanie Beatriz',
  'María Cecilia Botero',
  'John Leguizamo',
  'Mauro Castillo',
  'Jessica Darrow',
  'Angie Cepeda',
  'Carolina Gaitán',
  'Diane Guerrero',
  'Wilmer Valderrama'],
 'Cinematography': ['Nathan Warner (layout)',
  'Alessandro Jacomini',
  'Daniel Rice (lighting)'],
 'Edited by': 'Jeremy Milton',
 'Music by': 'Germaine Franco',
 'Production companies': ['Walt Disney Pictures',
  'Walt Disney Animation Studios'],
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Release dates': ['November 3, 2021 ( El Capitan Theatre )',
  'November 24, 2021 (United States)'],
 'Running time': '102 minutes',
 'Country': 'United States',
 'Languages':

### Using omdb api

In [43]:
import requests

def get_ombd_info(title):
    base_url = "http://www.omdbapi.com/?"
    key_url = f't={title}&apikey=ef085b75'
    full_url = base_url + key_url
    
    return requests.get(full_url).json()

def get_rotten_tomato(omdb_info):
    ratings = omdb_info.get('Ratings', []) # if empty, it will make empty list
    for rating in ratings:
        if rating['Source'] == "Rotten Tomatoes":
            return rating['Value']
    return None # if empty return none

'71%'

In [45]:
for movie in movie_raw:
    title = movie['title']
    omdb_info = get_ombd_info(title)
    movie['imdb'] = omdb_info.get('imdbRating', None)
    movie['metascore'] = omdb_info.get('Metascore', None)
    movie['rottentomato'] = get_rotten_tomato(omdb_info)

In [50]:
movie_raw[-100]

{'title': 'Tomorrowland',
 'Directed by': 'Brad Bird',
 'Screenplay by': ['Damon Lindelof', 'Brad Bird'],
 'Story by': ['Damon Lindelof', 'Brad Bird', 'Jeff Jensen'],
 'Produced by': ['Damon Lindelof', 'Brad Bird', 'Jeffrey Chernov'],
 'Starring': ['George Clooney',
  'Hugh Laurie',
  'Britt Robertson',
  'Raffey Cassidy',
  'Tim McGraw',
  'Kathryn Hahn',
  'Keegan-Michael Key'],
 'Cinematography': 'Claudio Miranda',
 'Edited by': ['Walter Murch', 'Craig Wood'],
 'Music by': 'Michael Giacchino',
 'Production companies': ['Walt Disney Pictures', 'A113 Productions'],
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Release dates': ['May 9, 2015 ( Disneyland )',
  'May 22, 2015 (United States)'],
 'Running time': '130 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$180–190 million',
 'Box office': '$209 million',
 'Running time (int)': 130,
 'Budget (float)': 180000000.0,
 'Box office (float)': 209000000.0,
 'Release date temp': ['May 9, 2015 (

## Save data

### JSON

To save as a JSON, convert datetime to string

In [51]:
movie_raw_copy = [movie.copy() for movie in movie_raw]

In [60]:
for movie in movie_raw_copy:
    current_date = movie['Release date (datetime)']
    if current_date:
        movie['Release date (datetime)'] = current_date.strftime("%B %d, %Y")
    else:
        movie['Release date (datetime)'] = None

In [61]:
movie_raw_copy[-35]

{'title': 'Cheaper by the Dozen',
 'Directed by': 'Gail Lerner',
 'Screenplay by': ['Kenya Barris', 'Jenifer Rice-Genzuk Henry'],
 'Story by': ['Craig Titley', 'Kenya Barris', 'Jenifer Rice-Genzuk Henry'],
 'Based on': ['Cheaper by the Dozen',
  'by',
  'Frank B. Gilbreth Jr.',
  'and',
  'Ernestine Gilbreth Carey'],
 'Produced by': 'Kenya Barris',
 'Starring': ['Gabrielle Union', 'Zach Braff', 'Erika Christensen'],
 'Cinematography': 'Mitchell Amundsen',
 'Edited by': 'Troy Takaki',
 'Music by': 'John Paesano',
 'Production companies': ['Walt Disney Pictures', 'Khalabo Ink Society'],
 'Distributed by': 'Disney+',
 'Release dates': ['March 16, 2022 ( El Capitan Theatre )',
  'March 18, 2022 (United States)'],
 'Running time': '107 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Running time (int)': 107,
 'Budget (float)': None,
 'Box office (float)': None,
 'Release date temp': ['March 16, 2022 ( El Capitan Theatre )',
  'March 18, 2022 (United States)'],
 'Release dat

In [62]:
save_data("disney_data_final.json", movie_raw_copy)

### CSV

In [79]:
import pandas as pd

In [80]:
df = pd.DataFrame(movie_raw)

In [81]:
df.shape

(520, 51)

In [82]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 520 entries, 0 to 519
Data columns (total 51 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   title                    520 non-null    object        
 1   Production company       214 non-null    object        
 2   Distributed by           517 non-null    object        
 3   Release date             340 non-null    object        
 4   Running time             499 non-null    object        
 5   Country                  465 non-null    object        
 6   Language                 497 non-null    object        
 7   Box office               401 non-null    object        
 8   Running time (int)       499 non-null    float64       
 9   Budget (float)           308 non-null    float64       
 10  Box office (float)       389 non-null    float64       
 11  Release date temp        520 non-null    object        
 12  Release date (datetime)  501 non-nul

In [83]:
df.to_csv('disney_data_dataframe.csv')