# Scrape information on all Disney films from Wikipedia and create a movie dataset for further analysis
https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films

### 1. Scrape the data for one movie, then reuse that code as a function to iterate over all of the movies
https://en.wikipedia.org/wiki/Toy_Story_3

In [22]:
# import necessary libraries
from bs4 import BeautifulSoup as bs
import requests
import json # for printing dictionaries in pretty format
import numpy as np

In [58]:
def load_movie(url, timeout=30):
    """Download a page and parse it into a BeautifulSoup document.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole scrape.

    Returns:
        The parsed HTML document as a BeautifulSoup object.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    r = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx responses instead of parsing an error page as if it were an article
    r.raise_for_status()
    # Name the parser explicitly: without it BeautifulSoup picks whichever parser
    # happens to be installed and emits a warning
    return bs(r.content, 'html.parser')
    
def get_data(row):
    """Extract the value(s) held by one infobox row.

    Every ``sup`` and ``span`` tag inside the row is removed first
    (presumably footnote markers and hidden helper text - confirm against the
    page markup). Note that this modifies ``row`` in place.

    Args:
        row: The ``tr`` element of an infobox row.

    Returns:
        - a list with the text of each ``li`` element, if the row has any;
        - otherwise ``None`` if the row has no ``.infobox-data`` cell;
        - otherwise a list of the cell's stripped strings, if the row matches
          ``td br`` (values separated by line breaks);
        - otherwise the cell's text as a single string.
    """
    for tag in row(['sup', 'span']):
        tag.decompose()

    list_items = row.find_all('li')
    if list_items:
        return [item.text for item in list_items]

    data_cell = row.find(class_='infobox-data')
    if data_cell is None:
        # Nothing to extract; avoids an AttributeError on None below
        return None

    if row.select('td br'):
        return list(data_cell.stripped_strings)
    return data_cell.text

def get_movie_data(data_rows):
    """Build a movie dictionary from the rows of an infobox.

    The first row supplies the ``'title'`` entry, the second row is skipped,
    and every later row that contains an ``.infobox-label`` element adds one
    entry keyed by the label text, with the value returned by ``get_data``.
    Rows without a label are ignored.

    A failure on one row is printed and that row is skipped; the remaining
    rows are still processed.

    Args:
        data_rows: Iterable of infobox ``tr`` elements, in document order.

    Returns:
        Dictionary mapping ``'title'`` and each label text to its data.
    """
    movie = {}
    for index, row in enumerate(data_rows):
        try:
            if index == 0:
                movie['title'] = row.text
                continue
            if index == 1:
                continue
            # Only rows that carry an .infobox-label hold a label/value pair
            label_cell = row.find(class_='infobox-label')
            if label_cell is not None:
                movie[label_cell.text] = get_data(row)
        except Exception as e:
            # .get(): the title may not be set yet if the very first row failed,
            # and the error report itself must not raise
            print('get_movie_data error', movie.get('title'), index, row)
            print(e)
    return movie
    

def get_movie(url):
    """Scrape the infobox of a single page into a dictionary.

    Args:
        url: Address of the page to scrape.

    Returns:
        The dictionary built by ``get_movie_data`` from the rows of the first
        element with class ``infobox``.

    Raises:
        ValueError: If the page contains no element with class ``infobox``.
    """
    soup = load_movie(url) # HTML Document
    info_box = soup.find(class_='infobox')
    if info_box is None:
        # Report the missing infobox by URL instead of failing on None.find_all
        raise ValueError('no infobox found at {}'.format(url))
    return get_movie_data(info_box.find_all('tr'))

# Try the scraper on a single film page (performs a live HTTP request)
get_movie('https://en.wikipedia.org/wiki/Toy_Story_3')

{'title': 'Toy Story 3',
 'Directed by': 'Lee Unkrich',
 'Screenplay by': 'Michael Arndt',
 'Story by': ['John Lasseter', 'Andrew Stanton', 'Lee Unkrich'],
 'Produced by': 'Darla K. Anderson',
 'Starring': ['Tom Hanks',
  'Tim Allen',
  'Joan Cusack',
  'Don Rickles',
  'Wallace Shawn',
  'John Ratzenberger',
  'Estelle Harris',
  'Ned Beatty',
  'Michael Keaton',
  'Jodi Benson',
  'John Morris'],
 'Cinematography': ['Jeremy Lasky', 'Kim White'],
 'Edited by': 'Ken Schretzmann',
 'Music by': 'Randy Newman',
 'Productioncompanies ': ['Walt Disney Pictures', 'Pixar Animation Studios'],
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Release date': ['June\xa012,\xa02010 (Taormina Film Fest)',
  'June\xa018,\xa02010 (United States)'],
 'Running time': '103 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$200\xa0million',
 'Box office': '$1.067\xa0billion'}

### 2. Use `get_movie()` to scrape the data for all the movies

In [13]:
r = requests.get('https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films', timeout=30)
# stop on an HTTP error rather than scraping an error page
r.raise_for_status()
# convert to a beautiful soup object; the parser is named explicitly so the
# result does not depend on which optional parsers are installed
soup = bs(r.content, 'html.parser') # HTML Document

In [71]:
def get_movies_url_list():
    """Return the film link tags found in the module-level ``soup`` document.

    The selector matches ``a`` tags nested in an ``i`` tag inside the second
    child cell of each row of a ``.wikitable.sortable`` table.
    """
    title_link_selector = '.wikitable.sortable tr td:nth-child(2) i a'
    return soup.select(title_link_selector)

def get_movies():
    """Scrape every film linked from the list page.

    Each link returned by ``get_movies_url_list`` is resolved against the
    Wikipedia host and passed to ``get_movie``. A film that fails to scrape
    is reported with ``print`` and skipped, so one bad page does not abort
    the run.

    Returns:
        One-dimensional NumPy array holding one movie dictionary per film
        scraped successfully (an empty array if none were).
    """
    from urllib.parse import urljoin

    collected = []
    for link in get_movies_url_list():
        try:
            # urljoin avoids the double slash produced by concatenating a
            # base ending in '/' with an href that starts with '/'
            url = urljoin('https://en.wikipedia.org/', link['href'])
            collected.append(get_movie(url))
        except Exception as e:
            # .get(): a link without a title attribute must not make the
            # error report itself raise
            print('get_movies error', link.get('title'))
            print(e)
#     print(json.dumps(movies, indent=4, sort_keys=True))
    if not collected:
        return np.array([])
    # Build the array once; appending with np.append copies it on every iteration
    return np.array(collected, dtype=object)

# Scrape every film linked from the list page (one HTTP request per film)
movies = get_movies()

# print the first five scraped movies as a sample, separated by a divider line
for i in movies[:5]:
    print(i)
    print('====================')

{'title': 'Academy Award Review of Walt Disney Cartoons', 'Productioncompany ': 'Walt Disney Productions', 'Release date': ['May\xa019,\xa01937'], 'Running time': '41 minutes (74 minutes 1966 release)', 'Country': 'United States', 'Language': 'English', 'Box office': '$45.472'}
{'title': 'Snow White and the Seven Dwarfs', 'Directed by': ['David Hand', 'William Cottrell', 'Wilfred Jackson', 'Larry Morey', 'Perce Pearce', 'Ben Sharpsteen'], 'Written by': ['Ted Sears', 'Richard Creedon', 'Otto Englander', 'Dick Rickard', 'Earl Hurd', 'Merrill De Maris', 'Dorothy Ann Blank', 'Webb Smith'], 'Based on': ['Snow White', 'by The', 'Brothers Grimm'], 'Produced by': 'Walt Disney', 'Starring': ['Adriana Caselotti', 'Lucille La Verne', 'Harry Stockwell', 'Roy Atwell', 'Pinto Colvig', 'Otis Harlan', 'Scotty Mattraw', 'Billy Gilbert', 'Eddie Collins', 'Moroni Olsen', 'Stuart Buchanan'], 'Music by': ['Frank Churchill', 'Paul Smith', 'Leigh Harline'], 'Productioncompany ': 'Walt Disney Productions', 'D

### 3. Save Data

In [73]:
def save_data(path, data):
    """Write ``data`` to ``path`` as UTF-8 encoded, indented JSON.

    Missing parent directories are created first, so saving into a folder
    that does not exist yet does not fail. Non-ASCII characters are written
    as-is rather than as ``\\u`` escapes.

    Args:
        path: Destination file path; an existing file is overwritten.
        data: Any JSON-serializable object.

    Raises:
        TypeError: If ``data`` contains an object json cannot serialize.
        OSError: If the directory or file cannot be created or written.
    """
    import json
    import os

    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(path, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, ensure_ascii=False, indent=4)

# .tolist() converts the NumPy array to a plain Python list before it is handed to json.dump
save_data('./dataset/movies.json', movies.tolist())