# Task 3: Clean our Data

In [18]:
import json

def load_data(title):
    with open(title, encoding='utf-8') as f:
        return json.load(f)

In [19]:
movie_info_list = load_data('disney_info_list.json')

movie_info_list[:5]

[{'title': 'Academy Award Review of Walt Disney Cartoons',
  'Production company': 'Walt Disney Productions',
  'Release date': ['May 19, 1937 (1937-05-19)'],
  'Running time': '41 minutes (74 minutes 1966 release)',
  'Country': 'United States',
  'Language': 'English',
  'Box office': '$45.472'},
 {'title': 'Snow White and the Seven Dwarfs',
  'Directed by': ['David Hand (supervising)',
   'William Cottrell',
   'Wilfred Jackson',
   'Larry Morey',
   'Perce Pearce',
   'Ben Sharpsteen'],
  'Produced by': 'Walt Disney',
  'Written by': ['Ted Sears',
   'Richard Creedon',
   'Otto Englander',
   'Dick Rickard',
   'Earl Hurd',
   'Merrill De Maris',
   'Dorothy Ann Blank',
   'Webb Smith'],
  'Based on': 'Snow Whiteby The Brothers Grimm',
  'Starring': ['Adriana Caselotti',
   'Lucille La Verne',
   'Harry Stockwell',
   'Roy Atwell',
   'Pinto Colvig',
   'Otis Harlan',
   'Scotty Mattraw',
   'Billy Gilbert',
   'Eddie Collins',
   'Moroni Olsen',
   'Stuart Buchanan'],
  'Music by'

### Subtasks:
- ~~Clean up scraping error~~
- ~~Clean up reference [1][2]~~
- ~~Spllit up the long strings on 'Directed by', 'Music by', etc~~
- ~~Convert single list item into a string~~
- ~~Convert running time into an integer~~
- ~~Convert budget and box office to numbers~~
- Convert dates into datetime object

#### Subtasks 1-3; Clean up scraping error, cleanup reference, split up long strings

In [1]:
import requests
from bs4 import BeautifulSoup as bs

In [3]:
# Clean up reference tag
def strip_tag(soup):
    for tag in soup.find_all("sup"):
        tag.decompose()

def get_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ").replace("\xa0", " ") for li in row_data.find_all("li")]
    # Split up long strings
    elif row_data.find("a"):
        return [a.get_text(" ").replace("\xa0", " ") for a in row_data.find_all("a")]
    else:
        return row_data.get_text(" ").replace("\xa0", " ")

def get_info_box(url):
    r = requests.get(url)
    soup = bs(r.content)

    info_box = soup.find(class_="infobox vevent")
    info_rows = info_box.find_all("tr")

    strip_tag(soup)

    movie_info = {}
    
    for index, row in enumerate(info_rows):

        if index == 0:
            movie_info['Title'] = row.find('th').get_text(" ")
        else:
            # Clean up some errors from scraping
            if row.find("th"):
                movie_info['Link'] = url
                content_keys = row.find('th').get_text(" ")
                content_values = get_value(row.find('td'))
                movie_info[content_keys] = content_values
        
    return movie_info

r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
soup = bs(r.content)

movies = soup.select(".wikitable.sortable i a")

base_path = "https://en.wikipedia.org/"

movie_list = []

for index, movie in enumerate(movies):
    if index % 10 == 0:
        print(index)
    try:    
        relative_path = movie["href"]
        full_path = base_path + relative_path
        movie_list.append(get_info_box(full_path))
    except Exception as e:
        print(movie.get_text(), full_path)
        print(e)

0
10
20
30
40
Zorro the Avenger https://en.wikipedia.org//wiki/Zorro_(1957_TV_series)#Media
'NoneType' object has no attribute 'find'
The Sign of Zorro https://en.wikipedia.org//wiki/Zorro_(1957_TV_series)#Media
'NoneType' object has no attribute 'find'
50
60
70
80
90
100
110
120
True-Life Adventures https://en.wikipedia.org//wiki/True-Life_Adventures
'NoneType' object has no attribute 'find_all'
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430


In [4]:
len(movie_list)

436

#### Subtask 4: Convert single item in list into string

In [68]:
def convert_value_into_string(dictionary):
    for key, value in dictionary.items():
        if type(value) == list and len(value) == 1:
            dictionary[key] = "".join(dictionary[key])


for item in movie_list:
    convert_value_into_string(item)

In [69]:
movie_list

[{'Title': 'Academy Award Review of ',
  'Link': 'https://en.wikipedia.org//wiki/Academy_Award_Review_of_Walt_Disney_Cartoons',
  'Production company ': 'Walt Disney Productions',
  'Release date': 'May 19, 1937',
  'Running time': '41 minutes (74 minutes 1966 release)',
  'Country': 'United States',
  'Language': 'English',
  'Box office': '$45.472'},
 {'Title': 'Snow White and the Seven Dwarfs',
  'Link': 'https://en.wikipedia.org//wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)',
  'Directed by': ['David Hand  (supervising)',
   'William Cottrell',
   'Wilfred Jackson',
   'Larry Morey',
   'Perce Pearce',
   'Ben Sharpsteen'],
  'Produced by': 'Walt Disney',
  'Written by': ['Ted Sears',
   'Richard Creedon',
   'Otto Englander',
   'Dick Rickard',
   'Earl Hurd',
   'Merrill De Maris',
   'Dorothy Ann Blank',
   'Webb Smith'],
  'Based on': ['Snow White', 'Brothers Grimm'],
  'Starring': ['Adriana Caselotti',
   'Lucille La Verne',
   'Harry Stockwell',
   'Roy Atwell',
   'Pinto

In [2]:
import json

def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
        
save_data("cleaned_disney_info_list.json", movie_list)

NameError: name 'movie_list' is not defined

#### Subtask 5: Convert running time into an integer

In [1]:
import json

def load_data(title):
    with open(title, encoding='utf-8') as f:
        return json.load(f)
    
movie_list = load_data("cleaned_disney_info_list.json")

In [2]:
movie_list[:5]

[{'Title': 'Academy Award Review of ',
  'Link': 'https://en.wikipedia.org//wiki/Academy_Award_Review_of_Walt_Disney_Cartoons',
  'Production company ': 'Walt Disney Productions',
  'Release date': 'May 19, 1937',
  'Running time': '41 minutes (74 minutes 1966 release)',
  'Country': 'United States',
  'Language': 'English',
  'Box office': '$45.472'},
 {'Title': 'Snow White and the Seven Dwarfs',
  'Link': 'https://en.wikipedia.org//wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)',
  'Directed by': ['David Hand  (supervising)',
   'William Cottrell',
   'Wilfred Jackson',
   'Larry Morey',
   'Perce Pearce',
   'Ben Sharpsteen'],
  'Produced by': 'Walt Disney',
  'Written by': ['Ted Sears',
   'Richard Creedon',
   'Otto Englander',
   'Dick Rickard',
   'Earl Hurd',
   'Merrill De Maris',
   'Dorothy Ann Blank',
   'Webb Smith'],
  'Based on': ['Snow White', 'Brothers Grimm'],
  'Starring': ['Adriana Caselotti',
   'Lucille La Verne',
   'Harry Stockwell',
   'Roy Atwell',
   'Pinto

In [4]:
time = [movie.get('Running time', 'N/A') for movie in movie_list]

print(len(time))
print(time)

436
['41 minutes (74 minutes 1966 release)', '83 minutes', '88 minutes', '126 minutes', '74 minutes', '64 minutes', '70 minutes', '42 minutes', '65 min.', '71 minutes', '75 minutes', '94 minutes', '73 minutes', '75 minutes', '82 minutes', '68 minutes', '74 minutes', '96 minutes', '75 minutes', '84 minutes', '77 minutes', '92 minutes', '69 minutes', '81 minutes', '60 minutes (VHS version) 71 minutes (original)', '127 minutes', '92 minutes', '76 minutes', '75 minutes', '73 minutes', '85 minutes', '81 minutes', '70 minutes', '90 min.', '80 minutes', '75 minutes', '83 minutes', '83 minutes', '72 minutes', '97 minutes', '75 minutes', '104 minutes', '93 minutes', '105 minutes', '95 minutes', '97 minutes', '134 minutes', '69 minutes', '92 minutes', '126 minutes', '79 minutes', '97 minutes', '128 minutes', '74 minutes', '91 minutes', '105 minutes', '98 minutes', '130 minutes', '89 min.', '93 minutes', '67 minutes', '98 minutes', '100 minutes', '118 minutes', '103 Minutes', '110 minutes', '80 m

In [3]:
# Find Movie index of specific value in Running Time
for item in movie_list:
    for value in item.values():
        if value == ['Los Angeles', 'New York City', "Director's Cut"]:
            print(item['Title'], item['Link'])
            print(movie_list.index(item))

The Happiest Millionaire https://en.wikipedia.org//wiki/The_Happiest_Millionaire
86


In [4]:
def minute_to_integer(running_time):
    if running_time == "N/A" or running_time == ['Los Angeles', 'New York City', "Director's Cut"]:
        return None
    
    if isinstance(running_time, list):
        return int(running_time[0].split(" ")[0])
    else:
        return int(running_time.split(" ")[0])

In [5]:
for movie in movie_list:
    movie['Running time (int)'] = minute_to_integer(movie.get('Running time', 'N/A'))

In [6]:
int_time = [movie.get('Running time (int)') for movie in movie_list]

print(len(int_time))
print(int_time)

436
[41, 83, 88, 126, 74, 64, 70, 42, 65, 71, 75, 94, 73, 75, 82, 68, 74, 96, 75, 84, 77, 92, 69, 81, 60, 127, 92, 76, 75, 73, 85, 81, 70, 90, 80, 75, 83, 83, 72, 97, 75, 104, 93, 105, 95, 97, 134, 69, 92, 126, 79, 97, 128, 74, 91, 105, 98, 130, 89, 93, 67, 98, 100, 118, 103, 110, 80, 79, 91, 91, 97, 118, 139, 92, 131, 87, 116, 93, 110, 110, 131, 101, 108, 84, 78, 75, None, 106, 110, 99, 113, 108, 112, 93, 91, 93, 100, 100, 79, 96, 113, 89, 118, 92, 88, 92, 87, 93, 93, 93, 90, 83, 96, 88, 89, 91, 93, 92, 97, 100, 100, 89, 91, 112, 115, 95, 91, 95, 104, 74, 48, 77, 104, 128, 101, 94, 104, 90, 100, 88, 93, 98, 100, 112, 84, 98, 97, 114, 96, 100, 109, 83, 90, 107, 96, 103, 91, 95, 105, 113, 80, 101, 89, 74, 90, 89, 110, 74, 93, 84, 83, 69, 77, 107, 93, 88, 108, 84, 121, 89, 104, 90, 86, 84, 108, 107, 96, 98, 105, 108, 94, 106, 102, 88, 102, 102, 97, 111, 100, 96, 98, 78, 81, 108, 89, 99, 89, 81, 92, 100, 89, 79, 91, 101, 104, 103, 86, 105, 93, 92, 98, 95, 93, 87, 93, 87, 128, 86, 95, 114,

In [7]:
for key, value in movie_list[86].items():
    if key == 'Running time':
        print(key, ':', value)
    if key == 'Running time (int)':
        print(key, ':', value)

Running time : ['Los Angeles', 'New York City', "Director's Cut"]
Running time (int) : None


#### Subtask 6:  Convert budget and box office to numbers

In [8]:
box_office = [movie.get('Box office', 'N/A') for movie in movie_list]

print(box_office)

['$45.472', '$418 million', '$164 million', '$76.4–$83.3 million', '$960,000 (worldwide rentals) ', '$1.3 million (est. United States/Canada rentals, 1941)', '$267.4 million', '$1,135,000 (worldwide rentals) ', '$799,000', '$3,355,000 (worldwide rentals) ', '$3.275 million (worldwide rentals)', '$65 million', '$3,165,000 (worldwide rentals)', '$2,560,000 (worldwide rentals) ', '$3.7 million (U.S. rental) $575,000 (foreign rental) ', '$1,625,000 (worldwide rentals)', '$263.6 million', '$4,100,000 (worldwide rentals) ', '$5.6 million (US, 1951)', '$2.1 million (US rentals)', '$87.4 million', '$1 million (US)', '$2.6 million (US)', 'N/A', '$1.75 million (US and Canadian rentals)', '$28.2 million', '$2,150,000 (US)', '$187 million', '$2.1 million (US)', '$1.6 million (US)', '$1.7 million (US)', 'N/A', 'N/A', '$2.75 million (US)', 'N/A', '$1.75 million (US rentals)', '$6,250,000 (US/Canada rentals)', 'N/A', '$1.8 million (est. US/ Canada rentals)', '$2.5 million (est. US/ Canada rentals)', 

In [9]:
import re

amounts = r"thousand|million|billion"
number = r"\d+(,\d{3})*\.*\d*"

word_re = rf"\${number}(-|\sto\s|–)?({number})?\s({amounts})"
value_re = rf"\${number}"
'''
money_conversion("$12.2 million") --> 12200000	## Word syntax
money_conversion("$790,000") --> 790000			## Value syntax
'''
def word_to_value(word):
    value_dict = {"thousand": 1000, "million":1000000, "billion":1000000000}
    return value_dict[word]

def parse_word_syntax(string):
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    word_string = re.search(amounts, string, flags=re.I).group().lower()
    word = float(word_to_value(word_string))
    return value * word

def parse_value_syntax(string):
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    return value

def money_conversion(money):
    if money == "N/A":
        return None

    if isinstance(money, list):
        money = money[0]

    word_syntax = re.search(word_re, money, flags=re.I)
    value_syntax = re.search(value_re, money)

    if word_syntax:
        return parse_word_syntax(word_syntax.group())
    elif value_syntax:
        return parse_value_syntax(value_syntax.group())
    else:
        None

In [10]:
for movie in movie_list:
    movie['Budget (float)'] = money_conversion(movie.get('Budget', "N/A"))
    movie['Box office (float)'] = money_conversion(movie.get('Box office', "N/A"))

In [11]:
print([movie.get('Budget (float)', 'N/A') for movie in movie_list])

[None, 1490000.0, 2600000.0, 2280000.0, 600000.0, 950000.0, 858000.0, None, 788000.0, None, 1350000.0, 2125000.0, None, 1500000.0, 1500000.0, None, 2900000.0, 1800000.0, 3000000.0, None, 4000000.0, 2000000.0, 300000.0, 1800000.0, None, 5000000.0, None, 4000000.0, None, None, None, None, None, None, 700000.0, None, None, None, None, None, 6000000.0, 1000000.0, None, 2000000.0, None, None, 2500000.0, None, None, 4000000.0, 3600000.0, None, None, None, None, 3000000.0, None, 3000000.0, None, None, None, None, None, None, None, None, None, 3000000.0, None, None, None, None, 4400000.0, None, None, None, None, None, None, None, None, None, None, None, 4000000.0, None, 5000000.0, None, None, None, None, 5000000.0, None, None, None, None, None, None, 4000000.0, None, None, None, 6300000.0, None, None, None, None, None, None, None, None, 5000000.0, None, None, None, None, 8000000.0, None, None, None, None, None, 1000000.0, None, None, None, None, 5000000.0, None, None, None, 7500000.0, None, 10

In [12]:
movie_list[-40]

{'Title': 'Beauty and the Beast',
 'Link': 'https://en.wikipedia.org//wiki/Beauty_and_the_Beast_(2017_film)',
 'Directed by': 'Bill Condon',
 'Produced by': ['David Hoberman', 'Todd Lieberman'],
 'Screenplay by': ['Stephen Chbosky', 'Evan Spiliotopoulos'],
 'Based on': ["Disney 's  Beauty and the Beast by  Linda Woolverton",
  'Beauty and the Beast by  Jeanne-Marie Leprince de Beaumont'],
 'Starring': ['Emma Watson',
  'Dan Stevens',
  'Luke Evans',
  'Kevin Kline',
  'Josh Gad',
  'Ewan McGregor',
  'Stanley Tucci',
  'Audra McDonald',
  'Gugu Mbatha-Raw',
  'Ian McKellen',
  'Emma Thompson'],
 'Music by': 'Alan Menken',
 'Cinematography': 'Tobias A. Schliessler',
 'Edited by': 'Virginia Katz',
 'Production company ': ['Walt Disney Pictures', 'Mandeville Films'],
 'Distributed by': 'Walt Disney Studios Motion Pictures',
 'Release date': ['February 23, 2017  ( Spencer House )',
  'March 17, 2017  (United States)'],
 'Running time': '129 minutes',
 'Country': 'United States',
 'Language

#### Subtask 6:  Convert Release Date to datetimes

In [16]:
release_date = [movie.get('Release date', 'N/A') for movie in movie_list]

In [17]:
print(release_date)

['May 19, 1937', ['December 21, 1937  ( Carthay Circle Theatre ,  Los Angeles ,  CA )', 'February 4, 1938  (United States)'], ['February 7, 1940  ( Center Theatre )', 'February 23, 1940  (United States)'], 'November 13, 1940', 'June 20, 1941', ['October 23, 1941  (New York City)', 'October 31, 1941  (U.S.)'], ['August 9, 1942  (World Premiere-London)', 'August 13, 1942  (Premiere-New York City)', 'August 21, 1942  (U.S.)'], ['August 24, 1942  (World Premiere-Rio de Janeiro)', 'February 6, 1943  (U.S. Premiere-Boston)', 'February 19, 1943  (U.S.)'], 'July 17, 1943', ['December 21, 1944  (Mexico City)', 'February 3, 1945  (US)'], ['April 20, 1946  (New York City premiere)', 'August 15, 1946  (U.S.)'], ['November 12, 1946  (Premiere: Atlanta, Georgia)', 'November 20, 1946'], 'September 27, 1947', 'May 27, 1948', 'November 29, 1948 (Chicago, Illinois) January 19, 1949 (Indianapolis, Indiana)', 'October 5, 1949', ['February 15, 1950  (Boston)', 'March 4, 1950  (United States)'], ['June 22, 

In [18]:
from datetime import datetime

def clean_date(date):
    return date.split("(")[0].strip()

def date_conversion(date):
    if isinstance(date, list):
        date = date[0]
    
    if date == "N/A":
        return None
    
    date_str = clean_date(date)
    
    frmts = ["%B %d, %Y", "%d %B %Y"]
    
    for frmt in frmts:
        try:
            return datetime.strptime(date_str, frmt)
        except:
            pass
    return None

In [19]:
for date in release_date:
    print(date_conversion(date))
    print()

1937-05-19 00:00:00

1937-12-21 00:00:00

1940-02-07 00:00:00

1940-11-13 00:00:00

1941-06-20 00:00:00

1941-10-23 00:00:00

1942-08-09 00:00:00

1942-08-24 00:00:00

1943-07-17 00:00:00

1944-12-21 00:00:00

1946-04-20 00:00:00

1946-11-12 00:00:00

1947-09-27 00:00:00

1948-05-27 00:00:00

1948-11-29 00:00:00

1949-10-05 00:00:00

1950-02-15 00:00:00

1950-06-22 00:00:00

1951-07-26 00:00:00

1952-03-13 00:00:00

1953-02-05 00:00:00

1953-08-08 00:00:00

1953-11-10 00:00:00

1953-10-26 00:00:00

1954-08-17 00:00:00

1954-12-23 00:00:00

1955-05-25 00:00:00

1955-06-22 00:00:00

1955-09-14 00:00:00

1955-12-22 00:00:00

1956-06-08 00:00:00

1956-07-18 00:00:00

1956-11-06 00:00:00

1956-12-20 00:00:00

1957-06-19 00:00:00

1957-08-28 00:00:00

1957-12-25 00:00:00

1958-07-09 00:00:00

1958-08-12 00:00:00

1958-12-25 00:00:00

1959-01-29 00:00:00

1959-03-19 00:00:00

1959-06-24 00:00:00

1959-11-10 00:00:00

None

1960-02-24 00:00:00

1960-05-19 00:00:00

1960-08-10 00:00:00

1960-11

In [20]:
for movie in movie_list:
    movie['Release date (datetime)'] = date_conversion(movie.get('Release date', 'N/A'))

In [21]:
print([movie.get('Release date (datetime)') for movie in movie_list])

[datetime.datetime(1937, 5, 19, 0, 0), datetime.datetime(1937, 12, 21, 0, 0), datetime.datetime(1940, 2, 7, 0, 0), datetime.datetime(1940, 11, 13, 0, 0), datetime.datetime(1941, 6, 20, 0, 0), datetime.datetime(1941, 10, 23, 0, 0), datetime.datetime(1942, 8, 9, 0, 0), datetime.datetime(1942, 8, 24, 0, 0), datetime.datetime(1943, 7, 17, 0, 0), datetime.datetime(1944, 12, 21, 0, 0), datetime.datetime(1946, 4, 20, 0, 0), datetime.datetime(1946, 11, 12, 0, 0), datetime.datetime(1947, 9, 27, 0, 0), datetime.datetime(1948, 5, 27, 0, 0), datetime.datetime(1948, 11, 29, 0, 0), datetime.datetime(1949, 10, 5, 0, 0), datetime.datetime(1950, 2, 15, 0, 0), datetime.datetime(1950, 6, 22, 0, 0), datetime.datetime(1951, 7, 26, 0, 0), datetime.datetime(1952, 3, 13, 0, 0), datetime.datetime(1953, 2, 5, 0, 0), datetime.datetime(1953, 8, 8, 0, 0), datetime.datetime(1953, 11, 10, 0, 0), datetime.datetime(1953, 10, 26, 0, 0), datetime.datetime(1954, 8, 17, 0, 0), datetime.datetime(1954, 12, 23, 0, 0), dateti

In [22]:
movie_list[50]

{'Title': 'One Hundred and One Dalmatians',
 'Link': 'https://en.wikipedia.org//wiki/One_Hundred_and_One_Dalmatians',
 'Directed by': ['Clyde Geronimi', 'Hamilton Luske', 'Wolfgang Reitherman'],
 'Produced by': 'Walt Disney',
 'Story by': 'Bill Peet',
 'Based on': ['The Hundred and One Dalmatians', 'Dodie Smith'],
 'Starring': ['Rod Taylor',
  'Cate Bauer',
  'Betty Lou Gerson',
  'Ben Wright',
  'Bill Lee  (singing voice)',
  'Lisa Davis',
  'Martha Wentworth'],
 'Music by': 'George Bruns',
 'Edited by': ['Roy M. Brewer, Jr.', 'Donald Halliday'],
 'Production company ': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': 'January 25, 1961',
 'Running time': '79 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$3.6 million',
 'Box office': '$303 million',
 'Running time (int)': 79,
 'Budget (float)': 3600000.0,
 'Box office (float)': 303000000.0,
 'Release date (datetime)': datetime.datetime(1961, 1, 25, 0, 0)}

In [130]:
import pickle

def save_data_pickle(name, data):
    with open(name, 'wb') as f:
        pickle.dump(data, f)

In [131]:
import pickle

def load_data_pickle(name):
    with open(name, 'rb') as f:
        return pickle.load(f)

In [29]:
save_data_pickle("cleaned_more_disney_info_list.pickle", movie_list)

In [30]:
disney_movie = load_data_pickle("cleaned_more_disney_info_list.pickle")

In [31]:
disney_movie

[{'Title': 'Academy Award Review of ',
  'Link': 'https://en.wikipedia.org//wiki/Academy_Award_Review_of_Walt_Disney_Cartoons',
  'Production company ': 'Walt Disney Productions',
  'Release date': 'May 19, 1937',
  'Running time': '41 minutes (74 minutes 1966 release)',
  'Country': 'United States',
  'Language': 'English',
  'Box office': '$45.472',
  'Running time (int)': 41,
  'Budget (float)': None,
  'Box office (float)': 45.472,
  'Release date (datetime)': datetime.datetime(1937, 5, 19, 0, 0)},
 {'Title': 'Snow White and the Seven Dwarfs',
  'Link': 'https://en.wikipedia.org//wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)',
  'Directed by': ['David Hand  (supervising)',
   'William Cottrell',
   'Wilfred Jackson',
   'Larry Morey',
   'Perce Pearce',
   'Ben Sharpsteen'],
  'Produced by': 'Walt Disney',
  'Written by': ['Ted Sears',
   'Richard Creedon',
   'Otto Englander',
   'Dick Rickard',
   'Earl Hurd',
   'Merrill De Maris',
   'Dorothy Ann Blank',
   'Webb Smith'],
  

# Task 4: Attach IMDB/Rotten Tomatoes/Metascore scores

In [9]:
movie_list = load_data_pickle("cleaned_more_disney_info_list.pickle")

In [10]:
movie_list[5]

{'Title': 'Dumbo',
 'Link': 'https://en.wikipedia.org//wiki/Dumbo',
 'Directed by': ['Ben Sharpsteen',
  'Norman Ferguson',
  'Wilfred Jackson',
  'Jack Kinney'],
 'Produced by': 'Walt Disney',
 'Story by': ['Joe Grant', 'Dick Huemer'],
 'Based on': 'Helen Aberson',
 'Starring': ['Edward Brophy',
  'Herman Bing',
  'Sterling Holloway',
  'Verna Felton',
  'Cliff Edwards',
  'James Baskett',
  'Nick Stewart',
  'Hall Johnson'],
 'Narrated by': 'John McLeish',
 'Music by': ['Frank Churchill', 'Oliver Wallace'],
 'Production company ': 'Walt Disney Productions',
 'Distributed by': 'RKO Radio Pictures',
 'Release date': ['October 23, 1941  (New York City)',
  'October 31, 1941  (U.S.)'],
 'Running time': '64 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$950,000',
 'Box office': '$1.3 million (est. United States/Canada rentals, 1941)',
 'Running time (int)': 64,
 'Budget (float)': 950000.0,
 'Box office (float)': 1300000.0,
 'Release date (datetime)': datetime.

In [34]:
# http://www.omdbapi.com/?apikey=[yourkey]&

In [121]:
import requests
import urllib
import os

def get_omdb_info(title):
    base_url = "http://www.omdbapi.com/?"
    parameters = {'apikey': os.environ['OMDB_KEY'], 't': title}
    params_encoded = urllib.parse.urlencode(parameters)
    full_url = base_url + params_encoded
    return requests.get(full_url).json()

def get_rotten_tomatoes_score(omdb):
    ratings = omdb.get('Ratings', [])
    for rating in ratings:
        if rating['Source'] == 'Rotten Tomatoes':
            value = rating['Value'].split("%")[0]
            return value
    return None

In [126]:
def attach_score(data):
    for index, movie in enumerate(data):
        if index % 10 == 0:
            print(index)
            
        title = movie['Title']    
        omdb = get_omdb_info(title)
        movie['Rotten Tomatoes'] = get_rotten_tomatoes_score(omdb)
        movie['IMDB'] = omdb.get('imdbRating', None)
        movie['Metascore'] = omdb.get('Metascore', None)

In [127]:
attach_score(movie_list)

0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430


In [128]:
movie_list[15:20]

[{'Title': 'The Adventures of Ichabod and Mr. Toad',
  'Link': 'https://en.wikipedia.org//wiki/The_Adventures_of_Ichabod_and_Mr._Toad',
  'Directed by': ['Jack Kinney', 'Clyde Geronimi', 'James Algar'],
  'Produced by': 'Walt Disney',
  'Story by': ['Erdman Penner',
   'Winston Hibler',
   'Joe Rinaldi',
   'Ted Sears',
   'Homer Brightman',
   'Harry Reeves'],
  'Based on': ['The Wind in the Willows',
   'Kenneth Grahame',
   'The Legend of Sleepy Hollow',
   'Washington Irving'],
  'Starring': ['Eric Blore',
   "J. Pat O'Malley",
   'Colin Campbell',
   'John McLeish',
   'Campbell Grant',
   'Claude Allister',
   'Leslie Denison',
   'Edmond Stevens',
   'The Rhythmaires'],
  'Narrated by': ['Basil Rathbone  ( The Wind in the Willows )',
   'Bing Crosby  ( The Legend of Sleepy Hollow )'],
  'Music by': 'Oliver Wallace',
  'Edited by': 'John O. Young',
  'Production company ': 'Walt Disney Productions',
  'Distributed by': 'RKO Radio Pictures',
  'Release date': 'October 5, 1949',
  

In [132]:
save_data_pickle("disney_movie_final", movie_list)

# Task 5: Save into json and csv

### Subtask: Convert datetime into string

In [134]:
movie_list_copy = [movie.copy() for movie in movie_list]

In [138]:
from datetime import datetime

for movie in movie_list_copy:
    current_date = movie['Release date (datetime)']
    if current_date:
        movie['Release date (datetime)'] = current_date.strftime("%B %d, %Y")
    else:
        movie['Release date (datetime)'] = None

In [144]:
print(movie_list_copy[5]['Release date (datetime)'])
print(movie_list[5]['Release date (datetime)'])

October 23, 1941
1941-10-23 00:00:00


In [145]:
import json

def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

In [154]:
save_data("disney_movie_final.json", movie_list_copy)

In [158]:
import pandas as pd

def save_data_csv(title, data):
    df = pd.DataFrame(data)
    df.to_csv(title, index=False)

In [160]:
save_data_csv("disney_movie_final.csv", movie_list)