# Disney Dataset Creation

Learning how to solve data science problems with a bunch of different libraries! `

### Task 1: Use Beautiful Soup to scrape webpage

In [1]:
from bs4 import BeautifulSoup as bs
import requests
import re

In [2]:
r = requests.get("https://en.wikipedia.org/wiki/Toy_Story_3")

soup = bs(r.content)

In [3]:
info_box = soup.find(class_="infobox vevent")
info_rows = info_box.find_all("tr")

for row in info_rows:
    print(row.prettify())

<tr>
 <th class="summary" colspan="2" style="text-align:center;font-size:125%;font-weight:bold;font-size:110%;font-style:italic;">
  Toy Story 3
 </th>
</tr>

<tr>
 <td colspan="2" style="text-align:center">
  <a class="image" href="/wiki/File:Toy_Story_3_poster.jpg" title="All of the toys packed close together, holding up a large numeral '3', with Buzz, who is putting a friendly arm around Woody's shoulder, and Woody holding the top of the 3.">
   <img alt="All of the toys packed close together, holding up a large numeral '3', with Buzz, who is putting a friendly arm around Woody's shoulder, and Woody holding the top of the 3." class="thumbborder" data-file-height="326" data-file-width="220" decoding="async" height="326" src="//upload.wikimedia.org/wikipedia/en/6/69/Toy_Story_3_poster.jpg" width="220"/>
  </a>
  <div style="font-size:95%;padding:0.35em 0.35em 0.25em;line-height:1.25em;">
   Theatrical release poster
  </div>
 </td>
</tr>

<tr>
 <th scope="row" style="white-space:nowra

In [4]:
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    else:
        return row_data.get_text().replace("\xa0", " ")

movie_info = {}

for index, row in enumerate(info_rows):
    if index == 0:
        movie_info['title'] = row.find("th").get_text(" ", strip=True)
    elif index == 1:
        continue
    else:
        content_key = row.find("th").get_text(" ", strip=True)
        content_value = get_content_value(row.find("td"))
        movie_info[content_key] = content_value
        
movie_info

{'title': 'Toy Story 3',
 'Directed by': 'Lee Unkrich',
 'Produced by': 'Darla K. Anderson',
 'Screenplay by': 'Michael Arndt',
 'Story by': ['John Lasseter', 'Andrew Stanton', 'Lee Unkrich'],
 'Starring': ['Tom Hanks',
  'Tim Allen',
  'Joan Cusack',
  'Don Rickles',
  'Wallace Shawn',
  'John Ratzenberger',
  'Estelle Harris',
  'Blake Clark',
  'Jeff Pidgeon',
  'Ned Beatty',
  'Michael Keaton',
  'Jodi Benson',
  'John Morris'],
 'Music by': 'Randy Newman',
 'Cinematography': ['Jeremy Lasky', 'Kim White'],
 'Edited by': 'Ken Schretzmann',
 'Production company': ['Walt Disney Pictures', 'Pixar Animation Studios'],
 'Distributed by': 'Walt Disney StudiosMotion Pictures',
 'Release date': ['June 12, 2010 ( 2010-06-12 ) ( Taormina Film Fest )',
  'June 18, 2010 ( 2010-06-18 ) (United States)'],
 'Running time': '103 minutes[1]',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$200 million[1]',
 'Box office': '$1.067 billion[1]'}

### Task 2: Get info for all movies!

In [24]:
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    elif row_data.find("br"):
        return [text for text in row_data.stripped_strings]
    else:
        return row_data.get_text().replace("\xa0", " ")
        
def clean_tags(soup):
    for tag in soup.find_all(["sup", "span"], ):
        tag.decompose()
        
def get_info_box(url):
    r = requests.get(url)
    soup = bs(r.content)
    
    
    info_box = soup.find(class_="infobox vevent")
    info_rows = info_box.find_all("tr")
    
    clean_tags(soup)
    
    movie_info = {}

    for index, row in enumerate(info_rows):
        if index == 0:
            movie_info['title'] = row.find("th").get_text(" ", strip=True)
        else:
            header = row.find('th')
            if header:
                content_key = row.find("th").get_text(" ", strip=True)
                content_value = get_content_value(row.find("td"))
                movie_info[content_key] = content_value
    
    return movie_info



In [25]:
get_info_box("https://en.wikipedia.org/wiki/The_Great_Locomotive_Chase")

{'title': 'The Great Locomotive Chase',
 'Directed by': 'Francis D. Lyon',
 'Produced by': ['Lawrence Edward Watkin', 'Walt Disney'],
 'Written by': 'Lawrence Edward Watkin',
 'Starring': ['Fess Parker',
  'Jeffrey Hunter',
  'John Lupton',
  'Jeff York',
  'Slim Pickens'],
 'Music by': 'Paul J. Smith',
 'Cinematography': 'Charles Boyle',
 'Edited by': 'Ellsworth Hoagland',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': 'June 8, 1956',
 'Running time': '85 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Box office': '$1.7 million (US)'}

In [26]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
disney = bs(r.content)
movies = disney.select(".wikitable.sortable i a")
print(len(movies))

base_path = "https://en.wikipedia.org/"

movie_info_list = []
for index, movie in enumerate(movies):
    if index % 10 == 0:
        print(index)
    try:
        relative_path = movie['href']
        full_path = base_path + relative_path
        title = movie['title']
        
        movie_info_list.append(get_info_box(full_path))
    except Exception as e:
        print(movie.get_text())
        print(e)
    
movie_info_list

448
0
10
20
30
40
Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find'
50
60
70
80
90
100
110
120
True-Life Adventures
'NoneType' object has no attribute 'find_all'
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
Luca
'NoneType' object has no attribute 'find_all'
440


[{'title': 'Academy Award Review of',
  'Production company': 'Walt Disney Productions',
  'Release date': ['May 19, 1937'],
  'Running time': '41 minutes (74 minutes 1966 release)',
  'Country': 'United States',
  'Language': 'English',
  'Box office': '$45.472'},
 {'title': 'Snow White and the Seven Dwarfs',
  'Directed by': ['David Hand (supervising)',
   'William Cottrell',
   'Wilfred Jackson',
   'Larry Morey',
   'Perce Pearce',
   'Ben Sharpsteen'],
  'Produced by': 'Walt Disney',
  'Written by': ['Ted Sears',
   'Richard Creedon',
   'Otto Englander',
   'Dick Rickard',
   'Earl Hurd',
   'Merrill De Maris',
   'Dorothy Ann Blank',
   'Webb Smith'],
  'Based on': ['Snow White', 'by The', 'Brothers Grimm'],
  'Starring': ['Adriana Caselotti',
   'Lucille La Verne',
   'Harry Stockwell',
   'Roy Atwell',
   'Pinto Colvig',
   'Otis Harlan',
   'Scotty Mattraw',
   'Billy Gilbert',
   'Eddie Collins',
   'Moroni Olsen',
   'Stuart Buchanan'],
  'Music by': ['Frank Churchill', 'Pa

In [27]:
import json

def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
        
def load_data(title):
    with open(title, encoding='utf-8') as f:
        return json.load(f)
        

In [28]:
save_data("disney_data_cleaned.json", movie_info_list)

### Clean the Data!

if you want to load the data, use the load_data function 

In [30]:
[movie.get("Running time (int)", 'N/A') for movie in movie_info_list]

['41 minutes (74 minutes 1966 release)',
 '83 minutes',
 '88 minutes',
 '126 minutes',
 '74 minutes',
 '64 minutes',
 '70 minutes',
 '42 minutes',
 '65 min.',
 '71 minutes',
 '75 minutes',
 '94 minutes',
 '73 minutes',
 '75 minutes',
 '82 minutes',
 '68 minutes',
 '74 minutes',
 '96 minutes',
 '75 minutes',
 '84 minutes',
 '77 minutes',
 '92 minutes',
 '69 minutes',
 '81 minutes',
 ['60 minutes (VHS version)', '71 minutes (original)'],
 '127 minutes',
 '92 minutes',
 '76 minutes',
 '75 minutes',
 '73 minutes',
 '85 minutes',
 '81 minutes',
 '70 minutes',
 '90 min.',
 '80 minutes',
 '75 minutes',
 '83 minutes',
 '83 minutes',
 '72 minutes',
 '97 minutes',
 '75 minutes',
 '104 minutes',
 '93 minutes',
 '105 minutes',
 '95 minutes',
 '97 minutes',
 '134 minutes',
 '69 minutes',
 '92 minutes',
 '126 minutes',
 '79 minutes',
 '97 minutes',
 '128 minutes',
 '74 minutes',
 '91 minutes',
 '105 minutes',
 '98 minutes',
 '130 minutes',
 '89 min.',
 '93 minutes',
 '67 minutes',
 '98 minutes',
 '1

In [42]:
# '132 minutes'
def minute_to_int(running_time):
    if running_time == "N/A":
        return None
    
    if isinstance(running_time, list):
        return int(running_time[0].split(" ")[0])
    else:
        return int(running_time.split(" ")[0])

for movie in movie_info_list:
    movie['Running time (int)'] = minute_to_int(movie.get('Running time', 'N/A'))

In [48]:
print([movie.get("Budget", 'N/A') for movie in movie_info_list])

['N/A', '$1.49 million', '$2.6 million', '$2.28 million', '$600,000', '$950,000', '$858,000', 'N/A', '$788,000', 'N/A', '$1.35 million', '$2.125 million', 'N/A', '$1.5 million ', '$1.5 million ', 'N/A', '$2.9 million', '$1,800,000', '$3 million', 'N/A', '$4 million', '$2 million', '$300,000', '$1.8 million', 'N/A', '$5 million', 'N/A', '$4 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$700,000', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$6 million', 'under $1 million or $1,250,000', 'N/A', '$2 million', 'N/A', 'N/A', '$2.5 million', 'N/A', 'N/A', '$4 million', '$3.6 million', 'N/A', 'N/A', 'N/A', 'N/A', '$3 million', 'N/A', '$3 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$3 million', 'N/A', 'N/A', 'N/A', 'N/A', '$4.4–6 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$4 million', 'N/A', '$5 million', 'N/A', 'N/A', 'N/A', 'N/A', '$5 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$4 million', 'N/A', 'N/A', 'N/A',