### Wikipedia Dataset (List of Disney films)

In [1]:
from bs4 import BeautifulSoup as bs
import requests
from urllib.parse import urljoin

#### 1. Extracting

In [2]:
r = requests.get('https://en.wikipedia.org/wiki/Alice_in_Wonderland_(1951_film)')

soup = bs(r.content)

contents = soup.prettify()
print(contents)

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Alice in Wonderland (1951 film) - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled";(function(){var cookie=document.cook

In [3]:
infobox = soup.find(class_='infobox vevent')
print(infobox.prettify())

<table class="infobox vevent">
 <tbody>
  <tr>
   <th class="infobox-above summary" colspan="2" style="font-size: 125%; font-style: italic;">
    Alice in Wonderland
   </th>
  </tr>
  <tr>
   <td class="infobox-image" colspan="2">
    <span class="mw-default-size mw-image-border" typeof="mw:File/Frameless">
     <a class="mw-file-description" href="/wiki/File:Alice_in_Wonderland_(1951_film)_poster.jpg">
      <img class="mw-file-element" data-file-height="389" data-file-width="256" decoding="async" height="334" src="//upload.wikimedia.org/wikipedia/en/thumb/c/c1/Alice_in_Wonderland_%281951_film%29_poster.jpg/220px-Alice_in_Wonderland_%281951_film%29_poster.jpg" srcset="//upload.wikimedia.org/wikipedia/en/c/c1/Alice_in_Wonderland_%281951_film%29_poster.jpg 1.5x" width="220"/>
     </a>
    </span>
    <div class="infobox-caption">
     Theatrical release poster
    </div>
   </td>
  </tr>
  <tr>
   <th class="infobox-label" scope="row" style="white-space: nowrap; padding-right: 0.65em;

#### Extract relevat information for one movie (followed by <tr</tr>> etc.)

In [4]:
rows = infobox.find_all('tr')
for row in rows:
    print(row.prettify())

<tr>
 <th class="infobox-above summary" colspan="2" style="font-size: 125%; font-style: italic;">
  Alice in Wonderland
 </th>
</tr>

<tr>
 <td class="infobox-image" colspan="2">
  <span class="mw-default-size mw-image-border" typeof="mw:File/Frameless">
   <a class="mw-file-description" href="/wiki/File:Alice_in_Wonderland_(1951_film)_poster.jpg">
    <img class="mw-file-element" data-file-height="389" data-file-width="256" decoding="async" height="334" src="//upload.wikimedia.org/wikipedia/en/thumb/c/c1/Alice_in_Wonderland_%281951_film%29_poster.jpg/220px-Alice_in_Wonderland_%281951_film%29_poster.jpg" srcset="//upload.wikimedia.org/wikipedia/en/c/c1/Alice_in_Wonderland_%281951_film%29_poster.jpg 1.5x" width="220"/>
   </a>
  </span>
  <div class="infobox-caption">
   Theatrical release poster
  </div>
 </td>
</tr>

<tr>
 <th class="infobox-label" scope="row" style="white-space: nowrap; padding-right: 0.65em;">
  Directed by
 </th>
 <td class="infobox-data">
  <style data-mw-deduplic

#### Clean up the data for the movie

In [5]:
movie_info = {}

def get_content_value(row_data):
    if row_data.find('li'):
        return[li.get_text('', strip=True).replace('\xa0','') for li in row_data.find_all('li')]
    else:
        return row_data.get_text('', strip=True).replace('\xa0', '')
    
for i, row in enumerate(rows):
    if i == 0:
        movie_info['Title'] = row.find('th').get_text()
    elif i == 1:
        continue
    else:
        k = row.find('th').get_text('', strip=True)
        kv = get_content_value(row.find('td'))
        #kv = row.find('td').get_text()
        movie_info[k] = kv
print(movie_info)


{'Title': 'Alice in Wonderland', 'Directed by': ['Clyde Geronimi', 'Wilfred Jackson', 'Hamilton Luske'], 'Story by': ['Winston Hibler', 'Ted Sears', 'Bill Peet', 'Erdman Penner', 'Joe Rinaldi', 'Milt Banta', 'Bill Cottrell', 'Dick Kelsey', 'Joe Grant', 'Dick Huemer', 'Del Connell', 'Tom Oreb', 'John Walbridge'], 'Based on': "Alice's Adventures in WonderlandandThrough the Looking-GlassbyLewis Carroll", 'Produced by': 'Walt DisneyBen Sharpsteen', 'Starring': ['Kathryn Beaumont', 'Ed Wynn', 'Richard Haydn', 'Sterling Holloway', 'Jerry Colonna', 'Verna Felton', "J. Pat O'Malley", 'Bill Thompson', 'Heather Angel'], 'Edited by': 'Lloyd Richardson', 'Music by': 'Oliver Wallace', 'Productioncompany': 'Walt Disney Productions', 'Distributed by': 'RKO Radio Pictures', 'Release dates': ['July26,1951(1951-07-26)(London)[1]', 'July28,1951(1951-07-28)(New York City)[1]', 'September14,1951(1951-09-14)(United States)'], 'Running time': '75 minutes[2]', 'Country': 'United States', 'Language': 'English'

#### 2. Collect data for all movies

In [6]:
r = requests.get('https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films')

soup = bs(r.content)

contents = soup.prettify()
print(contents)

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of Walt Disney Pictures films - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled";(function(){var cookie=document.c

In [13]:
movies = soup.select('.wikitable.sortable i')
movies[0:10]

[<i><a href="/wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)" title="Snow White and the Seven Dwarfs (1937 film)">Snow White and the Seven Dwarfs</a></i>,
 <i><a href="/wiki/Pinocchio_(1940_film)" title="Pinocchio (1940 film)">Pinocchio</a></i>,
 <i><a href="/wiki/Fantasia_(1940_film)" title="Fantasia (1940 film)">Fantasia</a></i>,
 <i><a href="/wiki/The_Reluctant_Dragon_(1941_film)" title="The Reluctant Dragon (1941 film)">The Reluctant Dragon</a></i>,
 <i><a href="/wiki/Dumbo" title="Dumbo">Dumbo</a></i>,
 <i><a href="/wiki/Bambi" title="Bambi">Bambi</a></i>,
 <i><a href="/wiki/Saludos_Amigos" title="Saludos Amigos">Saludos Amigos</a></i>,
 <i><a href="/wiki/Victory_Through_Air_Power_(film)" title="Victory Through Air Power (film)">Victory Through Air Power</a></i>,
 <i><a href="/wiki/The_Three_Caballeros" title="The Three Caballeros">The Three Caballeros</a></i>,
 <i><a href="/wiki/Make_Mine_Music" title="Make Mine Music">Make Mine Music</a></i>]

In [37]:
def get_content_value(row_data):
    if row_data.find('li'):
        return[li.get_text(' ', strip=True).replace('\xa0','') for li in row_data.find_all('li')]
    elif row_data.find('br'):
        return [text for text in soup.stripped_strings]
    else:
        return row_data.get_text(' ', strip=True).replace('\xa0', '')
    
def clean_tags(soup):
    for tag in soup.find_all(['sup', 'span']):
        tag.decompose()
        
def get_info_box(url):
    
    r = requests.get(url)
    soup = bs(r.content)
    infobox = soup.find(class_='infobox vevent')
    rows = infobox.find_all('tr')
    
    clean_tags(soup)
    
    movie_info = {}
    for index, row in enumerate(rows):
        if index == 0:
            movie_info['Title'] = row.find('th').get_text(' ', strip=True)
        else:
            header = row.find('th')
            if header:
                k = row.find('th').get_text(' ', strip=True)
                kv = get_content_value(row.find('td'))
                #kv = row.find('td').get_text()
                movie_info[k] = kv
    return(movie_info)

In [None]:
r = requests.get('https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films')
soup = bs(r.content)
movies = soup.select('.wikitable.sortable i a')

base_path = 'https://en.wikipedia.org/'

movies_list = []
for i, movie in enumerate(movies):
    #if i == 5:
       # break
    try:
        pathh = movie['href']
        full_path = urljoin(base_path, pathh)
        title = movie['title']
        
        movies_list.append(get_info_box(full_path))
        
    except Exception as e:
        print(movie.get_text())
        print (e)

Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find'
Mighty Ducks the Movie: The First Face-Off
'NoneType' object has no attribute 'find'
Spirited Away
'NoneType' object has no attribute 'find'
Howl's Moving Castle
'NoneType' object has no attribute 'find'
Ponyo
'NoneType' object has no attribute 'find'
Tales from Earthsea
'NoneType' object has no attribute 'find'


In [None]:
movies_list

In [42]:
len(movies_list)

541

#### Save Data

In [43]:
import json

def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

In [24]:
def load_data(title):
    with open (title, encoding='utf-8') as f:
        return json.load(f)

In [44]:
save_data('disney_data.json', movies_list)