### TASK 1: Scrape data from a webpage

In [5]:
import requests
from bs4 import BeautifulSoup as bs

URL = 'https://en.wikipedia.org/wiki/Toy_Story_3'
page = requests.get(URL)

soup = bs(page.content, 'html.parser')

In [2]:
movie_data = soup.find('table', class_='infobox vevent')

In [3]:
movie_rows = movie_data.find_all('tr')
for row in movie_rows:
    print(row.prettify())

<tr>
 <th class="summary" colspan="2" style="text-align:center;font-size:125%;font-weight:bold;font-size:110%;font-style:italic;">
  Toy Story 3
 </th>
</tr>

<tr>
 <td colspan="2" style="text-align:center">
  <a class="image" href="/wiki/File:Toy_Story_3_poster.jpg" title="All of the toys packed close together, holding up a large numeral '3', with Buzz, who is putting a friendly arm around Woody's shoulder, and Woody holding the top of the 3.">
   <img alt="All of the toys packed close together, holding up a large numeral '3', with Buzz, who is putting a friendly arm around Woody's shoulder, and Woody holding the top of the 3." class="thumbborder" data-file-height="326" data-file-width="220" decoding="async" height="326" src="//upload.wikimedia.org/wikipedia/en/6/69/Toy_Story_3_poster.jpg" width="220"/>
  </a>
  <div style="font-size:95%;padding:0.35em 0.35em 0.25em;line-height:1.25em;">
   Theatrical release poster
  </div>
 </td>
</tr>

<tr>
 <th scope="row" style="white-space:nowra

In [17]:
def get_content_value(row):
    if row.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row.find_all("li")]
    elif row.find("br"):
        return [text for text in row.stripped_strings]
    else:
        return row.get_text(" ", strip=True).replace("\xa0", " ")

In [18]:
def fill_up_dict(url):
    movie_info = dict()
    page = requests.get(url)
    soup = bs(page.content, 'html.parser')
    
    for s in soup.find_all(['sup', 'span']):
        s.decompose()   # Or s.extract()
        
    movie_rows = soup.find('table', class_='infobox vevent').find_all("tr")
    for idx, row in enumerate(movie_rows):
        if idx == 0:
            movie_info['Title'] = row.find("th").get_text(" ", strip=True)
        else:
            header = row.find('th')
            if header:  # Header found, add to dict
                key = header.get_text(" ", strip=True)
                movie_info[key] = get_content_value(row.find("td"))
    return movie_info

In [16]:
URL = "https://en.wikipedia.org/wiki/Ponyo"
fill_up_dict(URL)

{'Title': 'Ponyo',
 'Japanese': '',
 'Hepburn': 'Gake no Ue no Ponyo',
 'Directed by': 'Hayao Miyazaki',
 'Produced by': 'Toshio Suzuki',
 'Written by': 'Hayao Miyazaki',
 'Based on': ['The Little Mermaid', 'by', 'Hans Christian Andersen'],
 'Starring': ['Tomoko Yamaguchi',
  'Kazushige Nagashima',
  'Yūki Amami',
  'George Tokoro',
  'Yuria Nara',
  'Hiroki Doi',
  'Rumi Hiiragi',
  'Akiko Yano',
  'Kazuko Yoshiyuki',
  'Tomoko Naraoka'],
 'Music by': 'Joe Hisaishi',
 'Cinematography': 'Atsushi Okui',
 'Edited by': 'Takeshi Seyama',
 'Production company': 'Studio Ghibli',
 'Distributed by': 'Toho',
 'Release date': ['July 19, 2008'],
 'Running time': '101 minutes',
 'Country': 'Japan',
 'Language': 'Japanese',
 'Budget': ['¥', '3.4 billion', '(', 'US$', '34 million)'],
 'Box office': 'US$ 203.2 million'}

In [6]:
movie_info = dict()
for idx, row in enumerate(movie_rows):
    if idx == 0:
        movie_info['Title'] = row.find("th").get_text(" ", strip=True)
    else:
        header = row.find("th")
        if header:
            key = header.get_text(" ", strip=True)
            movie_info[key] = get_content_value(row.find("td"))

movie_info

{'Title': 'Toy Story 3',
 'Directed by': 'Lee Unkrich',
 'Produced by': 'Darla K. Anderson',
 'Screenplay by': 'Michael Arndt',
 'Story by': ['John Lasseter', 'Andrew Stanton', 'Lee Unkrich'],
 'Starring': ['Tom Hanks',
  'Tim Allen',
  'Joan Cusack',
  'Don Rickles',
  'Wallace Shawn',
  'John Ratzenberger',
  'Estelle Harris',
  'Blake Clark',
  'Jeff Pidgeon',
  'Ned Beatty',
  'Michael Keaton',
  'Jodi Benson',
  'John Morris'],
 'Music by': 'Randy Newman',
 'Cinematography': ['Jeremy Lasky', 'Kim White'],
 'Edited by': 'Ken Schretzmann',
 'Production company': ['Walt Disney Pictures', 'Pixar Animation Studios'],
 'Distributed by': 'Walt Disney Studios Motion Pictures',
 'Release date': ['June 12, 2010 ( 2010-06-12 ) ( Taormina Film Fest )',
  'June 18, 2010 ( 2010-06-18 ) (United States)'],
 'Running time': '103 minutes [1]',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$200 million [1]',
 'Box office': '$1.067 billion [1]'}

### TASK 2: Scrape data from different links on a page

In [None]:
URL = "https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films"
BASE_URL = "https://en.wikipedia.org"
page_content = requests.get(URL)

soup = bs(page_content.content, 'html.parser')  # Defaults to python inbuilt HTML parser

movie_links = soup.find_all('table', class_='wikitable sortable')

In [None]:
import time
start = time.process_time()

all_data = []
for table in movie_links:
    movies = table.find_all("tr")
    for movie in movies:
        movie_dict = dict()
        if movie.find("td"):
            movie_data = movie.find_all("a")
            try:
                title = a["title"]
                url = BASE_URL + a['href']
                
                movie_dict = fill_up_dict(url)
                all_data.append(movie_dict)
                break
            except Exception as e:
                print(title)
                print(e)
print(time.process_time() - start)
all_data[0]

In [None]:
import time
start = time.process_time()

all_data = []
for table in movie_links:
    movies = table.find_all("tr")
    for movie in movies:
        movie_dict = dict()
        if movie.find("td"):
            movie_data = movie.find_all("td")
            try:
                a = movie_data[1].find("a", href=True)
                title = a["title"]
                url = BASE_URL + a['href']
                
                movie_dict = fill_up_dict(url)
                movie_dict["Movie Type"] = movie_data[0].get_text()
                
                all_data.append(movie_dict)
                break
            except Exception as e:
                print(title)
                print(e)
print(time.process_time() - start)
all_data[0]

In [21]:
URL = "https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films"
BASE_URL = "https://en.wikipedia.org"
page_content = requests.get(URL)

soup = bs(page_content.content, 'html.parser')  # Defaults to python inbuilt HTML parser

movies = soup.select('.wikitable.sortable i a')

In [26]:
movie_info_list = []
for idx, movie in enumerate(movies):
    movie_data = dict()
    if idx % 10 == 0:
        print(idx, end = ' ')
    try:
        relative_path = movie['href']
        title = movie['title']
        full_path = BASE_URL + relative_path
        movie_info_list.append(fill_up_dict(full_path))
        
    except Exception as e:
        print(title)
        print(e)

0 10 20 30 40 Zorro (1957 TV series)
'NoneType' object has no attribute 'find'
Zorro (1957 TV series)
'NoneType' object has no attribute 'find'
50 60 70 80 90 100 110 120 True-Life Adventures
'NoneType' object has no attribute 'find_all'
130 140 150 160 170 180 190 200 210 220 230 240 250 260 270 280 Herbie: Fully Loaded
HTTPSConnectionPool(host='en.wikipedia.org', port=443): Max retries exceeded with url: /wiki/Herbie:_Fully_Loaded (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x000001B6D346F198>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond',))
290 300 310 Enchanted (film)
HTTPSConnectionPool(host='en.wikipedia.org', port=443): Max retries exceeded with url: /wiki/Enchanted_(film) (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection o

In [27]:
len(movie_info_list)

441

In [28]:
movie_info_list[130]

{'Title': 'A Tale of Two Critters',
 'Directed by': 'Jack Speirs',
 'Produced by': 'Jack Speirs',
 'Screenplay by': 'Jack Speirs',
 'Narrated by': 'Mayf Nutter',
 'Music by': 'Buddy Baker',
 'Edited by': 'G. Gregg McLaughlin',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': ['June 22, 1977'],
 'Running time': '48 minutes',
 'Country': 'United States',
 'Language': 'English'}

In [180]:
import json
def myconverter(o):
    if isinstance(o, datetime.date):
        return o.__str__()

def save_json(save_path, data):
    with open(save_path, 'w', encoding='utf-8') as outfile:
        json.dump(data, outfile, indent=2, ensure_ascii=False, default=myconverter)
        
def load_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as infile:
        data = json.load(infile)
    return data

#save_json("clean_movies.json", movie_info_list)

### TASK 3: Save movie info as json + Data clean up
* Clean up references
* Convert to proper units
* Clean up lists
* Date time objects

In [125]:
movie_info_list[30]

{'Title': 'The Great Locomotive Chase',
 'Directed by': 'Francis D. Lyon',
 'Produced by': ['Lawrence Edward Watkin', 'Walt Disney'],
 'Written by': 'Lawrence Edward Watkin',
 'Starring': ['Fess Parker',
  'Jeffrey Hunter',
  'John Lupton',
  'Jeff York',
  'Slim Pickens'],
 'Music by': 'Paul J. Smith',
 'Cinematography': 'Charles Boyle',
 'Edited by': 'Ellsworth Hoagland',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': 'June 8, 1956',
 'Running time': '85 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Box office': '$1.7 million (US)'}

In [133]:
# Load cleaned data
clean_data = load_json('clean_movies.json')
len(clean_data)

441

In [134]:
[data.get('Running time', 'N/A') for data in clean_data]

['41 minutes (74 minutes 1966 release)',
 '83 minutes',
 '88 minutes',
 '126 minutes',
 '74 minutes',
 '64 minutes',
 '70 minutes',
 '42 minutes',
 '65 min.',
 '71 minutes',
 '75 minutes',
 '94 minutes',
 '73 minutes',
 '75 minutes',
 '82 minutes',
 '68 minutes',
 '74 minutes',
 '96 minutes',
 '75 minutes',
 '84 minutes',
 '77 minutes',
 '92 minutes',
 '69 minutes',
 '81 minutes',
 ['60 minutes (VHS version)', '71 minutes (original)'],
 '127 minutes',
 '92 minutes',
 '76 minutes',
 '75 minutes',
 '73 minutes',
 '85 minutes',
 '81 minutes',
 '70 minutes',
 '90 min.',
 '80 minutes',
 '75 minutes',
 '83 minutes',
 '83 minutes',
 '72 minutes',
 '97 minutes',
 '75 minutes',
 '104 minutes',
 '93 minutes',
 '105 minutes',
 '95 minutes',
 '97 minutes',
 '134 minutes',
 '69 minutes',
 '92 minutes',
 '126 minutes',
 '79 minutes',
 '97 minutes',
 '128 minutes',
 '74 minutes',
 '91 minutes',
 '105 minutes',
 '98 minutes',
 '130 minutes',
 '89 min.',
 '93 minutes',
 '67 minutes',
 '98 minutes',
 '1

### Running Time clean up

In [135]:
def min_to_int(running_time):
    if running_time == "N/A":
        return None
    elif isinstance(running_time, list):
        return int(running_time[0].split(' ')[0])
    else:
        return int(running_time.split(' ')[0])

In [136]:
for idx, data in enumerate(clean_data):
    data['Running Time(Minutes)'] = min_to_int(data.get("Running time", "N/A"))
clean_data[360]

{'Title': 'Wings of Life',
 'Directed by': 'Louis Schwartzberg',
 'Produced by': ['Grady Candler', 'Alix Tidmarsh'],
 'Narrated by': ['Mélanie Laurent',
  '(France)',
  'Meryl Streep',
  '(United States)'],
 'Music by': ['Steffen Aaskoven & Marc-George Andersen', 'Bliss'],
 'Edited by': 'Jonathan P. Shaw',
 'Production company': ['Disneynature', 'Blacklight Films'],
 'Distributed by': ['Walt Disney Studios Motion Pictures France',
  '(France)',
  'Walt Disney Studios Home Entertainment',
  '(United States)'],
 'Release date': ['16 March 2011 (France)', '16 April 2013 (United States)'],
 'Running time': '77 minutes',
 'Country': ['France', 'United States'],
 'Language': ['French', 'English'],
 'Running Time(Minutes)': 77}

### Release Date clean up

In [137]:
from datetime import datetime as dt
def generate_datetime_object(date_string):
    if ',' in date_string:
        return dt.strptime(date_string, "%B %d, %Y").date()
    elif ' ' in date_string:
        str_len = len(date_string.split(' '))
        if str_len == 3:
            return dt.strptime(date_string, "%d %B %Y").date()
        else:
            return dt.strptime(date_string, "%B %Y").date()
    else:
         return dt.strptime(date_string, "%Y").date()

def convert_date(release_date):
    if release_date == 'N/A':
        return None
    elif isinstance(release_date, list):
        if '(' in release_date[0]:
            return generate_datetime_object(release_date[0].split(' (')[0])
        else:
            return generate_datetime_object(release_date[0])
    else:
        if '(' in release_date:
            return generate_datetime_object(release_date.split(' (')[0])
        else:
            return generate_datetime_object(release_date)        

In [138]:
for idx, data in enumerate(clean_data):
    data['Date'] = convert_date(data.get("Release date", 'N/A'))
clean_data[156]

{'Title': 'Trenchcoat',
 'Directed by': 'Michael Tuchner',
 'Produced by': 'Jerry Leider',
 'Written by': ['Jeffrey Price', 'Peter S. Seaman'],
 'Starring': ['Margot Kidder',
  'Robert Hays',
  'David Suchet',
  'Gila von Weitershausen',
  'Ronald Lacey'],
 'Music by': 'Charles Fox',
 'Cinematography': 'Tonino Delli Colli',
 'Edited by': 'Frank J. Urioste',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': ['March 11, 1983'],
 'Running time': '91 min',
 'Country': 'United States',
 'Language': 'English',
 'Budget': 'unknown',
 'Box office': '$4,304,286 (US)',
 'Running Time(Minutes)': 91,
 'Date': datetime.date(1983, 3, 11)}

In [121]:
#save_json('clean_movies.json', clean_data)

### Convert Budget to the right type

In [199]:
def extract_budget(budget):
    if budget == 'N/A':
        return None
    elif isinstance(budget, list):
        return convert_budget_to_float(budget[0])
    else:
        return convert_budget_to_float(budget)  

def convert_budget_to_float(budget):
    budget_list = re.findall(r"\$\d*\.\d+|\$\d+", budget.replace(',', ''))
    result = None
    if len(budget_list) == 0:
        return result
    elif 'million' in budget:
        result = float(budget_list[0][1:]) * 1000000
    elif 'billion' in budget:
        result = float(budget_list[0][1:]) * 1000000000
    elif 'crore' in budget:
        result = float(budget_list[0][1:]) * 10000000 / 75.0  # Convert to dollars
    else:
        result = float(budget_list[0][1:])
    
    if 'A$' in budget or 'AU$' in budget:
        result *= 0.73
    return result

In [200]:
for idx, data in enumerate(clean_data):
    budget = data.get("Budget", 'N/A')
    box_office = data.get("Box office", 'N/A')
    data['budget_numeric'] = extract_budget(budget)
    data['box_office_numeric'] = extract_budget(box_office)
clean_data[245]

{'Title': 'Atlantis: The Lost Empire',
 'Directed by': ['Gary Trousdale', 'Kirk Wise'],
 'Produced by': 'Don Hahn',
 'Screenplay by': 'Tab Murphy',
 'Story by': ['Kirk Wise',
  'Gary Trousdale',
  'Joss Whedon',
  'Bryce Zabel',
  'Jackie Zabel',
  'Tab Murphy'],
 'Starring': ['Michael J. Fox',
  'James Garner',
  'Cree Summer',
  'Don Novello',
  'Phil Morris',
  'Claudia Christian',
  'Jacqueline Obradors',
  'Florence Stanley',
  'David Ogden Stiers',
  'John Mahoney',
  'Jim Varney',
  'Corey Burton',
  'Leonard Nimoy'],
 'Music by': 'James Newton Howard',
 'Edited by': 'Ellen Keneshea',
 'Production company': ['Walt Disney Pictures',
  'Walt Disney Feature Animation'],
 'Distributed by': 'Buena Vista Pictures',
 'Release date': ['June 3, 2001 (Premiere)', 'June 15, 2001 (United States)'],
 'Running time': '96 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$90–120 million',
 'Box office': '$186.1 million',
 'Running Time(Minutes)': 96,
 'Date': datetime.

In [201]:
save_json("clean_movies_final.json", clean_data)

In [203]:
import pickle

def save_pickle(save_path, data):
    with open(save_path, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_pickle(path):
    with open(path, 'rb') as handle:
        return pickle.load(handle)
    
save_pickle('clean_data.pkl', clean_data)

### TASK 4: Extract IMDB / other ratings for these movies