### Disney Dataset Creation (w/ Python BeautifulSoup)

Scrape & clean a list of disney wikipedia pages to create a dataset to further analyse

#### Task #1: Get Info Box (store in Python dictionary)

###### Import Neeccessary Libraries

In [1]:
from bs4 import BeautifulSoup as bs
import requests

Load the webpage

In [2]:
r = requests.get("https://en.wikipedia.org/wiki/Toy_Story_3")

## Convert to a beautiful soup object

soup = bs(r.content)

# print out the HTML
contents = soup.prettify()



In [3]:
info_box = soup.find(class_="infobox vevent")
info_rows = info_box.find_all("tr")

for row in info_rows:
    print(row.prettify())

<tr>
 <th class="infobox-above summary" colspan="2" style="font-size: 125%; font-style: italic;">
  Toy Story 3
 </th>
</tr>

<tr>
 <td class="infobox-image" colspan="2">
  <a class="image" href="/wiki/File:Toy_Story_3_poster.jpg" title="All of the toys packed close together, holding up a large numeral '3', with Buzz, who is putting a friendly arm around Woody's shoulder, and Woody holding the top of the 3.">
   <img alt="All of the toys packed close together, holding up a large numeral '3', with Buzz, who is putting a friendly arm around Woody's shoulder, and Woody holding the top of the 3." class="thumbborder" data-file-height="326" data-file-width="220" decoding="async" height="326" src="//upload.wikimedia.org/wikipedia/en/6/69/Toy_Story_3_poster.jpg" width="220"/>
  </a>
  <div class="infobox-caption">
   Theatrical release poster
  </div>
 </td>
</tr>

<tr>
 <th class="infobox-label" scope="row" style="white-space: nowrap; padding-right: 0.65em;">
  Directed by
 </th>
 <td class="

In [4]:
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text().replace("\xa0", " ") for li in row_data.find_all("li")]
    elif row_data.find("br"):
        return [text for text in row_data.stripped_strings]
    else:
        return row_data.get_text().replace("\xa0", " ")
    
movie_info = {}
for index, row in enumerate(info_rows):
    if index == 0:
        movie_info['title'] = row.find("th").get_text(" ", strip=True)
    elif index == 1:
        continue
    else:
        content_key = row.find("th").get_text(" ", strip=True)
        content_value = get_content_value(row.find("td"))
        movie_info[content_key] = content_value

movie_info

{'title': 'Toy Story 3',
 'Directed by': 'Lee Unkrich',
 'Screenplay by': 'Michael Arndt',
 'Story by': ['John Lasseter', 'Andrew Stanton', 'Lee Unkrich'],
 'Produced by': 'Darla K. Anderson',
 'Starring': ['Tom Hanks',
  'Tim Allen',
  'Joan Cusack',
  'Don Rickles',
  'Wallace Shawn',
  'John Ratzenberger',
  'Estelle Harris',
  'Ned Beatty',
  'Michael Keaton',
  'Jodi Benson',
  'John Morris'],
 'Cinematography': ['Jeremy Lasky', 'Kim White'],
 'Edited by': 'Ken Schretzmann',
 'Music by': 'Randy Newman',
 'Production companies': ['Walt Disney Pictures', 'Pixar Animation Studios'],
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Release dates': ['June 12, 2010 (2010-06-12) (Taormina Film Fest)',
  'June 18, 2010 (2010-06-18) (United States)'],
 'Running time': '103 minutes[1]',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$200 million[1]',
 'Box office': '$1.067 billion[1]'}

## Task2: Get info box for all movies

In [5]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")

## Convert to a beautiful soup object

soup = bs(r.content)

# print out the HTML
contents = soup.prettify()



In [6]:
movies = soup.select(".wikitable.sortable i")
movies[0:10]

[<i><a href="/wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)" title="Snow White and the Seven Dwarfs (1937 film)">Snow White and the Seven Dwarfs</a></i>,
 <i><a href="/wiki/Pinocchio_(1940_film)" title="Pinocchio (1940 film)">Pinocchio</a></i>,
 <i><a href="/wiki/Fantasia_(1940_film)" title="Fantasia (1940 film)">Fantasia</a></i>,
 <i><a href="/wiki/The_Reluctant_Dragon_(1941_film)" title="The Reluctant Dragon (1941 film)">The Reluctant Dragon</a></i>,
 <i><a href="/wiki/Dumbo" title="Dumbo">Dumbo</a></i>,
 <i><a href="/wiki/Bambi" title="Bambi">Bambi</a></i>,
 <i><a href="/wiki/Saludos_Amigos" title="Saludos Amigos">Saludos Amigos</a></i>,
 <i><a href="/wiki/Victory_Through_Air_Power_(film)" title="Victory Through Air Power (film)">Victory Through Air Power</a></i>,
 <i><a href="/wiki/The_Three_Caballeros" title="The Three Caballeros">The Three Caballeros</a></i>,
 <i><a href="/wiki/Make_Mine_Music" title="Make Mine Music">Make Mine Music</a></i>]

In [7]:
import re
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text().replace("\xa0", " ") for li in row_data.find_all("li")]
    else:
        return row_data.get_text().replace("\xa0", " ")

def clean_tags(soup):
    for tag in soup.find_all(["sup", "span"]):
        tag.decompose()
    
def get_info_box(url):
    
    r = requests.get(url)

    soup = bs(r.content)
    info_box = soup.find(class_="infobox vevent")
    info_rows = info_box.find_all("tr")
    
    clean_tags(soup)
    
    movie_info = {}
    for index, row in enumerate(info_rows):
        if index == 0:
            movie_info['title'] = row.find("th").get_text(" ", strip=True)
        else:
            header = row.find("th")
            if header:
                content_key = row.find("th").get_text(" ", strip=True)
                
                content_value = get_content_value(row.find("td"))
#                 if any(i[0:4].isdigit() for i in content_value):
#                     content_value = [int(s) for s in re.findall(r'\b\d+\b', content_value)]
                    
                movie_info[content_key] = content_value
    return movie_info



In [8]:
get_info_box("https://en.wikipedia.org/wiki/One_Little_Indian_(film)")


{'title': 'One Little Indian',
 'Directed by': 'Bernard McEveety',
 'Written by': 'Harry Spalding',
 'Produced by': 'Winston Hibler',
 'Starring': ['James Garner',
  'Vera Miles',
  'Pat Hingle',
  "Clay O'Brien",
  'John Doucette',
  'Morgan Woodward',
  'Andrew Prine'],
 'Cinematography': 'Charles F. Wheeler',
 'Edited by': 'Robert Stafford',
 'Music by': 'Jerry Goldsmith',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': ['June 20, 1973'],
 'Running time': '90 Minutes',
 'Country': 'United States',
 'Language': 'English',
 'Box office': '$2 million'}

In [9]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
soup = bs(r.content)
movies = soup.select(".wikitable.sortable i a")

base_path = "https://en.wikipedia.org/"

movie_info_list = []
for index, movie in enumerate(movies):
    if index % 100 == 0:
        print(index)
    try:  
        relative_path = movie['href']
        full_path = base_path + relative_path
        title = movie['title']
        
        movie_info_list.append(get_info_box(full_path))
        
    except Exception as e:
        print(movie.get_text())
        print(e)
    

0
Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find'
100
200
300
400
The Beatles: Get Back – The Rooftop Concert
'NoneType' object has no attribute 'find'
500
Elemental
'NoneType' object has no attribute 'find_all'
Wish
'NoneType' object has no attribute 'find_all'
Elio
'NoneType' object has no attribute 'find_all'
61
'NoneType' object has no attribute 'find_all'
All Night Long
'NoneType' object has no attribute 'find'
Big Thunder Mountain Railroad
'NoneType' object has no attribute 'find_all'
Keeper of the Lost Cities
'NoneType' object has no attribute 'find_all'
Muppet Man
'NoneType' object has no attribute 'find_all'
One Thousand and One Nights
'NoneType' object has no attribute 'find_all'
Shrunk
'NoneType' object has no attribute 'find'
Sister Act 3
'NoneType' object has no attribute 'find'
The Graveyard Book
'NoneType' object has no attribute 'find_all'
The Thief
'NoneType' object has no attribute 'find_all'
Tom 

In [14]:
len(movie_info_list)

523

## Save/Reload Data

In [11]:
import json

def save_data(title, data):
    with open(title, "w", encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
         

In [12]:
import json

def Load_data(title):
    with open(title, encoding='utf-8') as f:
        return json.load(f)

In [13]:
import pickle

def save_data_pickle(name, data):
    with open(name, 'wb') as f:
        pickle.dump(data, f)



In [14]:
import pickle

def load_data_pickle(name):
    with open(name, 'rb') as f:
        return pickle.load(f)

In [15]:
save_data_pickle("Disney_data_cleanedup_final.pickle", movie_info_list)

In [18]:
load_data_pickle("Disney_data_cleanedup_final.pickle")

[{'title': 'Snow White and the Seven Dwarfs',
  'Directed by': ['David Hand',
   'William Cottrell',
   'Wilfred Jackson',
   'Larry Morey',
   'Perce Pearce',
   'Ben Sharpsteen'],
  'Written by': ['Ted Sears',
   'Richard Creedon',
   'Otto Englander',
   'Dick Rickard',
   'Earl Hurd',
   'Merrill De Maris',
   'Dorothy Ann Blank',
   'Webb Smith'],
  'Based on': 'Snow Whiteby The Brothers Grimm',
  'Produced by': 'Walt Disney',
  'Starring': ['Adriana Caselotti',
   'Lucille La Verne',
   'Harry Stockwell',
   'Roy Atwell',
   'Pinto Colvig',
   'Otis Harlan',
   'Scotty Mattraw',
   'Billy Gilbert',
   'Eddie Collins',
   'Moroni Olsen',
   'Stuart Buchanan'],
  'Music by': ['Frank Churchill', 'Paul Smith', 'Leigh Harline'],
  'Production company': 'Walt Disney Productions',
  'Distributed by': 'RKO Radio Pictures',
  'Release dates': ['December 21, 1937 (Carthay Circle Theatre)',
   'February 4, 1938 (United States)'],
  'Running time': '83 minutes',
  'Country': 'United States',

## Task #3: Clean out data!

In [19]:
# movie_info_list = Load_data("Disney_data_cleanedup.json")

### Subtasks
- ~~Clean up references [1], [2], ...~~
- ~~Convert running time into an integer~~
- ~~Convert dates into datetime object~~
- ~~Split up the long string~~
- ~~Convert budget and box ofice into integer numbert~~

In [21]:
# movie_info_list[-40]

In [22]:
def minute_to_integer(running_time):
    if running_time == "N/A":
        return None
    if isinstance(running_time, list):
         return running_time[0].split(" ")[0]
    else:
        return int(running_time.split(" ")[0])

for movie in movie_info_list:
    movie['Running time(int)'] = minute_to_integer(movie.get("Running time", "N/A"))

In [23]:
# movie_info_list[-40]


In [24]:
import re


number = r"\d+(,\d{3})*\.*\d*"
amounts = r"thousand|million|billion"

word_re = rf"\${number}(-|\sto\s|–)?({number})?\s({amounts})"
value_re = rf"\${number}"



'''
TODO

Given either a string or a list of strings as input, return
a number (int or float) which is equal to the monetary value

money_conversion("$12.2 million") --> 12200000
money_conversion("$790,000") --> 790000

use test_money_conversion.py to test your solution
'''
def word_to_value(word):
    value_dict = {"thousand":1000, "million":1000000, "billion": 1000000000}
    return value_dict[word]



def parse_value_syntax(string):
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    return value

def parse_word_syntax(string):
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    word = re.search(amounts, string).group()
    # print(word)
    word_value = word_to_value(word)
    return int(value*word_value)

def money_conversion(money):

    if isinstance(money, list):
        money = money[0]


    word_syntax = re.search(word_re, money) 
    # print(word_syntax.group())
    value_syntax = re.search(value_re, money)


    if word_syntax:
        return parse_word_syntax(word_syntax.group())
    elif value_syntax:
        return parse_value_syntax(value_syntax.group())



In [25]:
for movie in movie_info_list:
    try:
        movie["Budget(float)"] = money_conversion(movie["Budget"])
        movie["Box office(float)"] = money_conversion(movie["Box office"])
    except:
        pass


In [27]:
# movie_info_list[-10]

In [28]:
from datetime import datetime

dates = [movie.get('Release date', 'N/A') for movie in movie_info_list]

def clean_date(date):
    return date.split("(")[0].strip()

def date_conversion(date):
    if isinstance(date, list):
        date = date[0]
        
    if date == "N/A":
        return None
        
    date_str = clean_date(date)
    print(date_str)
    
    fmts=["%B %d, %Y", "%d %B %Y"]
    for fmt in fmts:
        try:
            return datetime.strptime(date_str, fmt)
        except:
            pass
    return None
    

In [29]:
for movie in movie_info_list:
    movie["Release date (datetime)"] = date_conversion(movie.get('Release date', 'N/A'))

November 13, 1940
June 27, 1941
July 17, 1943
September 27, 1947
May 27, 1948
October 5, 1949
February 5, 1953
July 23, 1953
November 10, 1953
August 17, 1954
December 23, 1954
May 25, 1955
June 22, 1955
September 14, 1955
December 22, 1955
June 8, 1956
July 18, 1956
September 4, 1956
December 20, 1956
June 19, 1957
August 28, 1957
December 25, 1957
July 8, 1958
August 12, 1958
December 25, 1958
January 29, 1959
March 19, 1959
November 10, 1959
January 21, 1960
February 24, 1960
May 19, 1960
November 1, 1960
December 21, 1960
January 25, 1961
March 16, 1961
June 21, 1961
July 12, 1961
July 17, 1961
December 14, 1961
April 5, 1962
May 17, 1962
June 6, 1962
September 26, 1962
November 7, 1962
January 16, 1963
March 29, 1963
June 1, 1963
July 7, 1963
November 20, 1963
March 12, 1964
February 11, 1964
July 2, 1964
November 10, 1964
December 18, 1964
August 18, 1965
December 2, 1965
October 1, 1966
December 1, 1966
February 8, 1967
June 15, 1967
July 12, 1967
October 18, 1967
October 19, 19

In [30]:
# movie_info_list[20]

# Task 4: Attach IMDB/Rotten tomatoes scores

In [31]:
movie_info_list = load_data_pickle("Disney_data_cleanedup_final.pickle")


In [32]:
import requests
import urllib
import os

def get_omdb_info(title):
    base_url = "http://www.omdbapi.com/?"
    parameters = {"apikey": "553360bf", 't': title}
    params_encoded = urllib.parse.urlencode(parameters)
    full_url = base_url + params_encoded
    return requests.get(full_url).json()

def get_rotten_tomato_score(omdb_info):
    ratings = omdb_info.get("Ratings", [])
    for rating in ratings:
        if rating["Source"] == 'Rotten tomatoes':
            return ["Value"]
    return None

In [33]:
for movie in movie_info_list:
    title = movie["title"]
    omdb_info = get_omdb_info(title)
    movie["imdb"] = omdb_info.get("imdbRating", None)
    movie['metascore'] = get_rotten_tomato_score(omdb_info)

In [193]:
movie_info_list[-40]

{'title': 'Turning Red',
 'Directed by': 'Domee Shi',
 'Screenplay by': ['Julia Cho', 'Domee Shi'],
 'Story by': ['Domee Shi', 'Julia Cho', 'Sarah Streicher'],
 'Produced by': 'Lindsey Collins',
 'Starring': ['Rosalie Chiang',
  'Sandra Oh',
  'Ava Morse',
  'Hyein Park',
  'Maitreyi Ramakrishnan',
  'Orion Lee',
  'Wai Ching Ho',
  'Tristan Allerick Chen',
  'James Hong'],
 'Cinematography': ['Mahyar Abousaeedi', 'Jonathan Pytko'],
 'Edited by': ['Nicholas C. Smith', 'Steve Bloom'],
 'Music by': 'Ludwig Göransson',
 'Production companies': ['Walt Disney Pictures', 'Pixar Animation Studios'],
 'Distributed by': 'Walt Disney StudiosMotion Pictures',
 'Release dates': ['March 1, 2022 (El Capitan Theatre)',
  'March 11, 2022 (United States)'],
 'Running time': '100 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$175 million',
 'Box office': '$20.1 million',
 'Running time(int)': 100,
 'Release date (datetime)': None,
 'imdb': '7.0',
 'metascore': None,
 'Budget

# Task #5: Save data as JSON & CSV 

In [45]:
# movie_info_list[0]

In [46]:
# movie_info_copy = [movie.copy() for movie in movie_info_list]

In [47]:
# for movie in movie_info_copy:
#     print(movie)
#     current_date = movie["Release date (datetime)"]
#     if current_date:
#         movie["Release date (datetime)"] = current_date.strftime("%B %d, %Y")
#         else:
#         movie["Release date (datetime)"] = None