### TASK 1: Scrape data from a webpage

In [1]:
import requests
from bs4 import BeautifulSoup as bs

URL = 'https://en.wikipedia.org/wiki/Toy_Story_3'
page = requests.get(URL)

soup = bs(page.content, 'html.parser')

In [2]:
movie_data = soup.find('table', class_='infobox vevent')

In [3]:
movie_rows = movie_data.find_all('tr')
for row in movie_rows:
    print(row.prettify())

<tr>
 <th class="summary" colspan="2" style="text-align:center;font-size:125%;font-weight:bold;font-size:110%;font-style:italic;">
  Toy Story 3
 </th>
</tr>

<tr>
 <td colspan="2" style="text-align:center">
  <a class="image" href="/wiki/File:Toy_Story_3_poster.jpg" title="All of the toys packed close together, holding up a large numeral '3', with Buzz, who is putting a friendly arm around Woody's shoulder, and Woody holding the top of the 3.">
   <img alt="All of the toys packed close together, holding up a large numeral '3', with Buzz, who is putting a friendly arm around Woody's shoulder, and Woody holding the top of the 3." class="thumbborder" data-file-height="326" data-file-width="220" decoding="async" height="326" src="//upload.wikimedia.org/wikipedia/en/6/69/Toy_Story_3_poster.jpg" width="220"/>
  </a>
  <div style="font-size:95%;padding:0.35em 0.35em 0.25em;line-height:1.25em;">
   Theatrical release poster
  </div>
 </td>
</tr>

<tr>
 <th scope="row" style="white-space:nowra

In [53]:
def get_content_value(row):
    if row.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row.find_all("li")]
    elif row.find("br"):
        return [text for text in row.stripped_strings]
    else:
        return row.get_text(" ", strip=True).replace("\xa0", " ")

In [54]:
def fill_up_dict(url):
    movie_info = dict()
    page = requests.get(url)
    soup = bs(page.content, 'html.parser')
    
    for s in soup.find_all(['sup', 'span']):
        s.decompose()   # Or s.extract()
        
    movie_rows = soup.find('table', class_='infobox vevent').find_all("tr")
    for idx, row in enumerate(movie_rows):
        if idx == 0:
            movie_info['Title'] = row.find("th").get_text(" ", strip=True)
        elif idx == 1:
            continue
        else:
            key = row.find("th").get_text(" ", strip=True)
            movie_info[key] = get_content_value(row.find("td"))
    return movie_info

In [55]:
URL = "https://en.wikipedia.org/wiki/Ghosts_of_the_Abyss"
fill_up_dict(URL)

{'Title': 'Ghosts of the Abyss',
 'Directed by': 'James Cameron',
 'Produced by': ['John Bruno',
  'James Cameron',
  'Chuck Comisky',
  'Janace Tashjian',
  'Andrew Wight'],
 'Starring': ['Bill Paxton',
  'James Cameron',
  'Dr. John Broadwater',
  'Dr. Lori Johnston'],
 'Music by': 'Joel McNeely',
 'Cinematography': ['Vince Pace', 'D. J. Roller'],
 'Edited by': ['David C. Cook', 'Ed W. Marsh', 'Sven Pape', 'John Refoua'],
 'Production company': ['Walt Disney Pictures',
  'Walden Media',
  'Earthship Productions',
  'Ascot Elite Entertainment Group',
  'Golden Village',
  'Telepool',
  'UGC PH'],
 'Distributed by': 'Buena Vista Pictures',
 'Release date': ['March 31, 2003 (premiere)', 'April 11, 2003 (limited)'],
 'Running time': '61 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$13 million',
 'Box office': '$28.7 million'}

In [6]:
movie_info = dict()
for idx, row in enumerate(movie_rows):
    if idx == 0:
        movie_info['Title'] = row.find("th").get_text(" ", strip=True)
    elif idx == 1:
        continue
    else:
        key = row.find("th").get_text(" ", strip=True)
        movie_info[key] = get_content_value(row.find("td"))

movie_info

{'Title': 'Toy Story 3',
 'Directed by': 'Lee Unkrich',
 'Produced by': 'Darla K. Anderson',
 'Screenplay by': 'Michael Arndt',
 'Story by': ['John Lasseter', 'Andrew Stanton', 'Lee Unkrich'],
 'Starring': ['Tom Hanks',
  'Tim Allen',
  'Joan Cusack',
  'Don Rickles',
  'Wallace Shawn',
  'John Ratzenberger',
  'Estelle Harris',
  'Blake Clark',
  'Jeff Pidgeon',
  'Ned Beatty',
  'Michael Keaton',
  'Jodi Benson',
  'John Morris'],
 'Music by': 'Randy Newman',
 'Cinematography': ['Jeremy Lasky', 'Kim White'],
 'Edited by': 'Ken Schretzmann',
 'Production company': ['Walt Disney Pictures', 'Pixar Animation Studios'],
 'Distributed by': 'Walt Disney Studios Motion Pictures',
 'Release date': ['June 12, 2010 ( 2010-06-12 ) ( Taormina Film Fest )',
  'June 18, 2010 ( 2010-06-18 ) (United States)'],
 'Running time': '103 minutes [1]',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$200 million [1]',
 'Box office': '$1.067 billion [1]'}

### TASK 2: Scrape data from different links on a page

In [None]:
URL = "https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films"
BASE_URL = "https://en.wikipedia.org"
page_content = requests.get(URL)

soup = bs(page_content.content, 'html.parser')  # Defaults to python inbuilt HTML parser

movie_links = soup.find_all('table', class_='wikitable sortable')

In [None]:
import time
start = time.process_time()

all_data = []
for table in movie_links:
    movies = table.find_all("tr")
    for movie in movies:
        movie_dict = dict()
        if movie.find("td"):
            movie_data = movie.find_all("a")
            try:
                title = a["title"]
                url = BASE_URL + a['href']
                
                movie_dict = fill_up_dict(url)
                all_data.append(movie_dict)
                break
            except Exception as e:
                print(title)
                print(e)
print(time.process_time() - start)
all_data[0]

In [None]:
import time
start = time.process_time()

all_data = []
for table in movie_links:
    movies = table.find_all("tr")
    for movie in movies:
        movie_dict = dict()
        if movie.find("td"):
            movie_data = movie.find_all("td")
            try:
                a = movie_data[1].find("a", href=True)
                title = a["title"]
                url = BASE_URL + a['href']
                
                movie_dict = fill_up_dict(url)
                movie_dict["Movie Type"] = movie_data[0].get_text()
                
                all_data.append(movie_dict)
                break
            except Exception as e:
                print(title)
                print(e)
print(time.process_time() - start)
all_data[0]

In [12]:
URL = "https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films"
BASE_URL = "https://en.wikipedia.org"
page_content = requests.get(URL)

soup = bs(page_content.content, 'html.parser')  # Defaults to python inbuilt HTML parser

movies = soup.select('.wikitable.sortable i a')

In [16]:
movie_info_list = []
for index, movie in enumerate(movies):
    movie_data = dict()
    try:
        relative_path = movie['href']
        title = movie['title']
        full_path = BASE_URL + relative_path
        movie_info_list.append(fill_up_dict(full_path))
        
    except Exception as e:
        print(title)
        print(e)

Zorro (1957 TV series)
'NoneType' object has no attribute 'find'
Zorro (1957 TV series)
'NoneType' object has no attribute 'find'
One Little Indian (film)
'NoneType' object has no attribute 'get_text'
True-Life Adventures
'NoneType' object has no attribute 'find_all'
Spirited Away
'NoneType' object has no attribute 'get_text'
The Haunted Mansion (film)
HTTPSConnectionPool(host='en.wikipedia.org', port=443): Max retries exceeded with url: /wiki/The_Haunted_Mansion_(film) (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x0000016537203080>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond',))
Howl's Moving Castle (film)
'NoneType' object has no attribute 'get_text'
The Nightmare Before Christmas
'NoneType' object has no attribute 'get_text'
Ponyo
'NoneType' object ha

In [17]:
len(movie_info_list)

425

In [19]:
movie_info_list[130]

{'Title': 'The Rescuers',
 'Directed by': ['Wolfgang Reitherman', 'John Lounsbery', 'Art Stevens'],
 'Produced by': ['Wolfgang Reitherman', 'Ron W. Miller'],
 'Story by': ['Larry Clemmons',
  'Vance Gerry',
  'Ken Anderson',
  'Frank Thomas',
  'Burny Mattinson',
  'Fred Lucky',
  'Dick Sebast',
  'David Michener',
  'Ted Berman'],
 'Based on': 'The Rescuers and Miss Bianca by Margery Sharp',
 'Starring': ['Bob Newhart',
  'Eva Gabor',
  'Michelle Stacy',
  'Geraldine Page',
  'Joe Flynn',
  'Jim Jordan',
  'John McIntire',
  'Jeanette Nolan',
  'Pat Buttram',
  'Bernard Fox'],
 'Music by': 'Artie Butler',
 'Edited by': ['Jim Koford', 'James Melton'],
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': ['June 22, 1977 ( 1977-06-22 )'],
 'Running time': '77 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$7.5 million [1]',
 'Box office': '$169 million [2]'}

### TASK 3: Save movie info as json + Data clean up
* Clean up references
* Convert to proper units
* Clean up lists
* Date time objects

In [30]:
import json

def save_json(save_path, data):
    with open(save_path, 'w', encoding='utf-8') as outfile:
        json.dump(data, outfile, indent=2, ensure_ascii=False)
#         json.dump(data, outfile, indent=2)
        
def load_json(file_path):
    with open(file_path, 'r') as infile:
        data = json.load(infile)
    return data

#save_json("movie.json", movie_info_list)

In [41]:
from bs4 import BeautifulSoup
from bs4.element import Comment
import requests


f_url="https://en.wikipedia.org/wiki/The_Lizzie_McGuire_Movie"

def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]', 'reference']:
        return False
    if isinstance(element, Comment):
        return False
    return True


def text_from_html(body):
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)  
    return u" ".join(t.strip() for t in visible_texts)

html = requests.get(f_url)
text= text_from_html(html.text)

print(text)

         The Lizzie McGuire Movie   From Wikipedia, the free encyclopedia     Jump to navigation  Jump to search  The Lizzie McGuire Movie Theatrical release poster Directed by Jim Fall Produced by Stan Rogow Written by  Susan Estelle Jansen  Ed Decter  John J. Strauss  Based on Lizzie McGuire by Terri Minsky Starring  Hilary Duff  Adam Lamberg  Robert Carradine  Hallie Todd  Jake Thomas  Music by Cliff Eidelman Cinematography Jerzy Zieliński Edited by Margie Goodspeed Production company  Walt Disney Pictures  Stan Rogow Productions  Distributed by Buena Vista Pictures Release date  May 2, 2003 ( 2003-05-02 ) (United States)  Running time 94 minutes Country United States Language  English  Italian  Budget $17 million [1] Box office $55.5 million [2]  The Lizzie McGuire Movie is a 2003 American teen  comedy film directed by Jim Fall . The film serves as the finale of the Disney Channel  television series of the same name , and was the first theatrical film based on a Disney Channel seri