### Task #1: Scrape the infobox from Toy Story 3 wiki page (save in python dictionary)

In [1]:
import bs4
import requests

#### Load the webpage and get the content

In [2]:
website = requests.get('https://en.wikipedia.org/wiki/Davy_Crockett_and_the_River_Pirates')
website.status_code;

In [3]:
soup = bs4.BeautifulSoup(website.text, 'html.parser')

In [4]:
infobox = soup.find(class_="infobox vevent")

In [5]:
infobox;

#### Scrape labels from the infobox table

In [6]:
labels = infobox.find_all('th', {'class': 'infobox-label'})

labels_list = [] # future keys

for label in labels:

    labels_list.append(label.get_text(' ', strip=True)) 
    # otherwise when we have <br/> in the label, it joins without any character

labels_list.insert(0, 'Title') # adding Title, which is not in the 'infobox-label' class
labels_list

['Title',
 'Directed by',
 'Written by',
 'Produced by',
 'Starring',
 'Edited by',
 'Music by',
 'Production company',
 'Distributed by',
 'Release date',
 'Running time',
 'Country',
 'Language']

#### Scrape values from the infobox table

In [7]:
data = infobox.find_all('td', {'class' : 'infobox-data'})

data_list = []

for value in data:

    if value.find('li'):
        value_list = [li.get_text(' ', strip=True).replace('\xa0', ' ') for li in value.find_all('li')]
        data_list.append(value_list)
        
    elif value.find('br'):
        # some lists have <br> instead of <li> tags
        value_list = [text for text in value.stripped_strings]
        data_list.append(value_list)
        
    else:
        data_list.append(value.get_text(' ', strip=True).replace('\xa0', ' '))  


data_list.insert(0, infobox.find('th').text) # adding title to the list
data_list

['Davy Crockett and the River Pirates',
 'Norman Foster',
 ['Tom Blackburn', 'Norman Foster'],
 'Bill Walsh',
 ['Fess Parker', 'Buddy Ebsen', 'Jeff York'],
 'Stanley Johnson',
 ['Thomas W. Blackburn (lyrics)',
  'George Bruns',
  'Edward H. Plumb (orchestration)'],
 'Walt Disney Productions',
 'Buena Vista Film Distribution Co., Inc.',
 'July 18, 1956 ( 1956-07-18 )',
 '81 minutes',
 'United States',
 'English']

#### Make a label:value dictionary

In [8]:
infobox_dict = dict(zip(labels_list, data_list))

In [9]:
infobox_dict

{'Title': 'Davy Crockett and the River Pirates',
 'Directed by': 'Norman Foster',
 'Written by': ['Tom Blackburn', 'Norman Foster'],
 'Produced by': 'Bill Walsh',
 'Starring': ['Fess Parker', 'Buddy Ebsen', 'Jeff York'],
 'Edited by': 'Stanley Johnson',
 'Music by': ['Thomas W. Blackburn (lyrics)',
  'George Bruns',
  'Edward H. Plumb (orchestration)'],
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Film Distribution Co., Inc.',
 'Release date': 'July 18, 1956 ( 1956-07-18 )',
 'Running time': '81 minutes',
 'Country': 'United States',
 'Language': 'English'}

### Task #2: Scrape infobox for all movies in List of Disney Films (save as list of dictionaries)

#### Create a function that deletes references and additional date formats from data

In [10]:
def delete_references(soup_object):
    for data in soup_object(['sup', 'span']): # sup = tag for references; span = tag for dates
        data.decompose()

#### Create a function which loads the website and makes a dictionary from the film's infobox

In [12]:
def get_info_box(url):
    
    ### loading the website ###
    website = requests.get(url)
    soup = bs4.BeautifulSoup(website.text, 'html.parser')
    infobox = soup.find(class_="infobox vevent")
    
    delete_references(soup) 
    
    ### scraping infobox labels ####
    labels = infobox.find_all('th', {'class': 'infobox-label'})
    labels_list = []
    
    for label in labels:
        labels_list.append(label.get_text(' ', strip=True)) 

    labels_list.insert(0, 'Title')
    
    
    ### scraping infobox values ###
    data = infobox.find_all('td', {'class' : 'infobox-data'})
    data_list = []

    for value in data:
        if value.find('li'):
            value_list = [li.get_text(' ', strip=True).replace('\xa0', ' ') for li in value.find_all('li')]
            data_list.append(value_list)
            
        elif value.find('br'):        
            value_list = [text for text in value.stripped_strings]
            data_list.append(value_list)
        
        else:
            data_list.append(value.get_text(' ', strip=True).replace('\xa0', ' '))    

    data_list.insert(0, infobox.find('th').text)
    
    ### converting to a dictionary ###
    infobox_dict = dict(zip(labels_list, data_list))
    return infobox_dict

In [13]:
get_info_box('https://en.wikipedia.org/wiki/Davy_Crockett_and_the_River_Pirates') # checking if it works for one link

{'Title': 'Davy Crockett and the River Pirates',
 'Directed by': 'Norman Foster',
 'Written by': ['Tom Blackburn', 'Norman Foster'],
 'Produced by': 'Bill Walsh',
 'Starring': ['Fess Parker', 'Buddy Ebsen', 'Jeff York'],
 'Edited by': 'Stanley Johnson',
 'Music by': ['Thomas W. Blackburn (lyrics)',
  'George Bruns',
  'Edward H. Plumb (orchestration)'],
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Film Distribution Co., Inc.',
 'Release date': 'July 18, 1956',
 'Running time': '81 minutes',
 'Country': 'United States',
 'Language': 'English'}

#### Load the webpage and select all the rows with film titles and links

In [14]:
r = requests.get('https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films')
soup = bs4.BeautifulSoup(r.text, 'html.parser')

In [15]:
films = soup.select('.wikitable.sortable i a')
films; # len == 454

#### Apply get_info_box function to all films and store the output as a list of dictionaries

In [16]:
film_info = []
wiki_base = 'https://en.wikipedia.org/'


for index, film in enumerate(films):

#     if index % 10 ==0:
#         print(index)
        
    try:
        title = film['title']
        link = film['href']
        
        film_info.append(get_info_box(wiki_base + link))
        

    except Exception as e:
        print(e)
        print(film.text)
        
# Exceptions: 
# 'True-Life Adventures' (doesn't have an infobox)
# 'Better Nate Than Never' (doesn't have a film website)

'NoneType' object has no attribute 'find_all'
True-Life Adventures
'NoneType' object has no attribute 'find_all'
Better Nate Than Never


In [17]:
len(film_info) # len == 452 --> only 2 have dropped out

452

#### Convert data to json file

In [18]:
import json

In [19]:
def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

In [20]:
def load_data(title):
    with open(title, encoding='utf-8') as f:
        return json.load(f)

In [21]:
save_data('my_data_cleaned.json', film_info)

### Task #3: Data cleaning

#### Convert runnng time values to integer type

In [22]:
# changing value to int
def minute_to_int(running_time):
    try:
        if type(running_time) == str:
            return int(running_time.split()[0])
        else:
            return int(running_time[0].split()[0])
    except: # the 'Zorro' exception with value '22–24'
        return None
    
    
# creating a new key-value pair and apply to every film dictionary
for film in film_info:
    if 'Running time' in film.keys():
            film['Running time (int)'] = minute_to_int(film['Running time'])
    else:
        film['Running time (int)'] = None

In [23]:
print([film['Running time (int)'] for film in film_info])

[41, 83, 88, 126, 74, 64, 70, 42, 70, 71, 75, 94, 73, 75, 82, 68, 74, 96, 75, 84, 77, 92, 69, 81, 60, 127, 92, 76, 75, 73, 85, 81, 70, 90, 80, 75, 83, 83, 72, 97, 75, 104, 93, None, 105, 95, 97, 134, None, 69, 92, 131, 79, 97, 128, 74, 91, 105, 98, 130, 89, 93, 67, 98, 100, 118, 103, 110, 80, 79, 91, 91, 97, 118, 139, 92, 131, 87, 116, 93, 110, 110, 131, 101, 108, 84, 78, 75, 164, 106, 110, 99, 113, 108, 112, 93, 91, 93, 100, 100, 79, 96, 113, 89, 118, 92, 88, 92, 87, 93, 93, 93, 90, 83, 96, 88, 89, 91, 93, 92, 97, 100, 100, 89, 91, 112, 115, 95, 91, 95, 104, 74, 48, 77, 104, 128, 101, 94, 104, 90, 100, 88, 93, 98, 100, 112, 84, 98, 97, 114, 96, 100, 109, 83, 90, 107, 96, 103, 91, 95, 105, 113, 80, 101, 89, 74, 90, 89, 110, 74, 93, 84, 83, 74, 77, 107, 93, 88, 108, 84, 121, 89, 104, 90, 86, 84, 108, 107, 96, 98, 105, 108, 94, 106, 102, 88, 102, 102, 97, 111, 100, 96, 98, 78, 81, 108, 89, 99, 89, 81, 92, 100, 89, 79, 91, 101, 104, 103, 86, 105, 93, 92, 98, 95, 93, 87, 93, 87, 128, 86, 9

#### Convert 'Budget' and 'Box office' values to floats

In [24]:
[film.get('Budget', None) for film in film_info]

[None,
 '$1.49 million',
 '$2.6 million',
 '$2.28 million',
 '$600,000',
 '$950,000',
 '$858,000',
 None,
 '$788,000',
 None,
 '$1.35 million',
 '$2.125 million',
 None,
 '$1.5 million',
 '$1.5 million',
 None,
 '$2.2 million',
 '$1,800,000',
 '$3 million',
 None,
 '$4 million',
 '$2 million',
 '$300,000',
 '$1.8 million',
 None,
 '$5 million',
 None,
 '$4 million',
 None,
 None,
 None,
 None,
 None,
 None,
 '$700,000',
 None,
 None,
 None,
 None,
 None,
 '$6 million',
 'under $1 million or $1,250,000',
 None,
 None,
 '$2 million',
 None,
 None,
 '$2.5 million',
 None,
 None,
 None,
 '$4 million',
 '$3.6 million',
 None,
 None,
 None,
 None,
 '$3 million',
 None,
 '$3 million',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 '$3 million',
 None,
 None,
 None,
 None,
 '$4.4–6 million',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 '$4 million',
 None,
 '$5 million',
 None,
 None,
 None,
 None,
 '$5 million',
 None,
 None,
 None,
 None,
 No

In [25]:
import re

### function checks if numerals are present in the value and returns them as int ###
def multiply(budget):
    numerals = {'million':1000000, 'thousand':1000, 'billion':1000000000}
    multiply_by = [numeral for numeral in numerals.keys() if numeral in budget]
        
    if multiply_by:
        return numerals[multiply_by[0]]

### function returns a float if money pattern is present in a value ###
def float_it(amount):
    money = re.findall(r'\$[0-9]+[.,]?\d*[.,]?\d*[.,]?\d*', amount)
    
    if money:
        numbers = [re.sub(r'\$', '', number) for number in money]
        # getting rid of commas to enable making a float later
        no_commas = [re.sub(',', '', amount) for amount in numbers]

        if multiply(amount):
            final = float(no_commas[0]) * multiply(amount)
        else:
            final = float(no_commas[0])
        return final

### function applies conversion to both strings and lists ###
def money_to_float(budget):
    if type(budget)==str:
        return float_it(budget)
    
    elif type(budget)==list:
        
        for index, instance in enumerate(budget):
            result = float_it(instance)
            if result: # first occurence in a list is a correct value
                return result
            elif result==None and index==len(budget)-1: # there was no correct budget value
                return None

    else:
        return None

In [26]:
### creating new keys with Budget and Box office values converted to floats ###
for film in film_info:
    film['Budget (float)'] = money_to_float(film.get('Budget', None))
    film['Box office (float)'] = money_to_float(film.get('Box office', None))

In [27]:
film_info

[{'Title': 'Academy Award Review of ',
  'Production company': 'Walt Disney Productions',
  'Release date': ['May 19, 1937'],
  'Running time': '41 minutes (74 minutes 1966 release)',
  'Country': 'United States',
  'Language': 'English',
  'Box office': '$45.472',
  'Running time (int)': 41,
  'Budget (float)': None,
  'Box office (float)': 45.472},
 {'Title': 'Snow White and the Seven Dwarfs',
  'Directed by': ['David Hand',
   'William Cottrell',
   'Wilfred Jackson',
   'Larry Morey',
   'Perce Pearce',
   'Ben Sharpsteen'],
  'Written by': ['Ted Sears',
   'Richard Creedon',
   'Otto Englander',
   'Dick Rickard',
   'Earl Hurd',
   'Merrill De Maris',
   'Dorothy Ann Blank',
   'Webb Smith'],
  'Based on': ['Snow White', 'by The', 'Brothers Grimm'],
  'Produced by': 'Walt Disney',
  'Starring': ['Adriana Caselotti',
   'Lucille La Verne',
   'Harry Stockwell',
   'Roy Atwell',
   'Pinto Colvig',
   'Otis Harlan',
   'Scotty Mattraw',
   'Billy Gilbert',
   'Eddie Collins',
   'Mo

#### Convert release dates to date objects

In [43]:
from datetime import datetime

def date_conversion(date_value):
    if type(date_value)==list:
        date_value = date_value[0]
        
    elif date_value == None:
        return None
    
    cleaned = date_value.split('(')[0].strip()
    
    formats = ['%d %B %Y', '%B %d, %Y']
    for pattern in formats:
        try:
            return datetime.strptime(cleaned, pattern)
        except:
            pass # date_value which has a different pattern is a missing data (eg. year only)
    return None

In [44]:
for film in film_info:
    film['Release date (datetime)'] = date_conversion(film.get('Release date', None))

In [46]:
### saving data with pickle since it is suitable for datatime objects ###
import pickle

def save_data_pickle(name, data):
    with open(name, 'wb') as f:
        pickle.dump(data, f)

In [47]:
def load_data_pickle(name):
    with open(name, 'rb') as f:
        return pickle.load(f)

In [48]:
save_data_pickle('disney_data_cleaned_2.pickle', film_info) 
# data with all correct types (budget, box office, release date, running time)

### Task #4: Attach IMDB, Metascore, and Rotten Tomatoes scores to dataset (working with APIs)

In [50]:
film_info = load_data_pickle('disney_data_cleaned_2.pickle')

In [51]:
import os

In [52]:
### function retrieves the information from OMDB base for one film ###
def get_omdb_info(title):
    api_key = os.environ.get('OMDB') # pass your own API key
    full_url = f'http://www.omdbapi.com/?t={title}&apikey={api_key}'
    return requests.get(full_url).json()

In [53]:
def get_rotten_tomato_score(omdb_info):
    if omdb_info.get('Ratings', None):
        for rating in omdb_info['Ratings']:
            if rating['Source'] == 'Rotten Tomatoes':
                return rating['Value']
    return None

In [54]:
### getting rating information from OMDB for every film ####
# be careful - you can use API for free only up to 1000 requests a day

for film in film_info:
        
    full_info = get_omdb_info(film['Title'])
    
    film['rotten_tomato_score'] = get_rotten_tomato_score(full_info)
    film['metascore'] = full_info.get('Metascore', None)
    film['imdb_rating'] = full_info.get('imdbRating', None)

In [55]:
film_info

[{'Title': 'Academy Award Review of ',
  'Production company': 'Walt Disney Productions',
  'Release date': ['May 19, 1937'],
  'Running time': '41 minutes (74 minutes 1966 release)',
  'Country': 'United States',
  'Language': 'English',
  'Box office': '$45.472',
  'Running time (int)': 41,
  'Budget (float)': None,
  'Box office (float)': 45.472,
  'Release date (datetime)': datetime.datetime(1937, 5, 19, 0, 0),
  'rotten_tomato_score': None,
  'metascore': 'N/A',
  'imdb_rating': '7.0'},
 {'Title': 'Snow White and the Seven Dwarfs',
  'Directed by': ['David Hand',
   'William Cottrell',
   'Wilfred Jackson',
   'Larry Morey',
   'Perce Pearce',
   'Ben Sharpsteen'],
  'Written by': ['Ted Sears',
   'Richard Creedon',
   'Otto Englander',
   'Dick Rickard',
   'Earl Hurd',
   'Merrill De Maris',
   'Dorothy Ann Blank',
   'Webb Smith'],
  'Based on': ['Snow White', 'by The', 'Brothers Grimm'],
  'Produced by': 'Walt Disney',
  'Starring': ['Adriana Caselotti',
   'Lucille La Verne',

In [56]:
save_data_pickle('disney_data_final.pickle', film_info)

### Task #5: Save final dataset as a JSON file and as a CSV file

#### Converting to json

In [57]:
film_info_copy = [film.copy() for film in film_info]

In [58]:
### changing datetime objects to strings to enable json format conversion
from datetime import datetime

for film in film_info_copy:
    if film.get('Release date (datetime)', None):
        film['Release date (datetime)'] = film['Release date (datetime)'].strftime('%B %d, %Y')

In [59]:
save_data('disney_data_final.json', film_info_copy)

#### Converting to csv

In [60]:
import pandas as pd

In [61]:
df = pd.DataFrame(film_info)

In [62]:
df.head(100)

Unnamed: 0,Title,Production company,Release date,Running time,Country,Language,Box office,Running time (int),Budget (float),Box office (float),...,Executive producer,Producers,Editors,Distributor,Japanese,Hepburn,Adaptation by,Animation by,Traditional,Simplified
0,Academy Award Review of,Walt Disney Productions,"[May 19, 1937]",41 minutes (74 minutes 1966 release),United States,English,$45.472,41.0,,4.547200e+01,...,,,,,,,,,,
1,Snow White and the Seven Dwarfs,Walt Disney Productions,"[December 21, 1937 ( Carthay Circle Theatre )]",83 minutes,United States,English,$418 million,83.0,1490000.0,4.180000e+08,...,,,,,,,,,,
2,Pinocchio,Walt Disney Productions,"[February 7, 1940 ( Center Theatre ), February...",88 minutes,United States,English,$164 million,88.0,2600000.0,1.640000e+08,...,,,,,,,,,,
3,Fantasia,Walt Disney Productions,"[November 13, 1940]",126 minutes,United States,English,$76.4–$83.3 million (United States and Canada),126.0,2280000.0,7.640000e+07,...,,,,,,,,,,
4,The Reluctant Dragon,Walt Disney Productions,"[June 27, 1941]",74 minutes,United States,English,"$960,000 (worldwide rentals)",74.0,600000.0,9.600000e+05,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Rascal,Walt Disney Productions,"September 4, 1969",93 minutes,United States,English,,93.0,,,...,,,,,,,,,,
96,The Computer Wore Tennis Shoes,Walt Disney Productions,"[November 28, 1969]",91 minutes,United States,English,$5.5 million (US/ Canada rentals),91.0,,5.500000e+06,...,,,,,,,,,,
97,King of the Grizzlies,Walt Disney Productions Robert Lawrence Produc...,"[February 11, 1970]",93 minutes,,English,,93.0,,,...,,,,,,,,,,
98,The Boatniks,Walt Disney Productions,"July 1, 1970",100 minutes,United States,English,"$18,607,492",100.0,,1.860749e+07,...,,,,,,,,,,


In [63]:
column_names = list(df.columns.values)

In [64]:
#### deleting columns if number of NaNs is > 400 ###
for name in column_names:
    if df[name].isna().sum()>400:
        df.drop(name, inplace=True, axis=1)

In [65]:
df.head()

Unnamed: 0,Title,Production company,Release date,Running time,Country,Language,Box office,Running time (int),Budget (float),Box office (float),...,Starring,Music by,Distributed by,Budget,Story by,Narrated by,Cinematography,Edited by,Screenplay by,Production companies
0,Academy Award Review of,Walt Disney Productions,"[May 19, 1937]",41 minutes (74 minutes 1966 release),United States,English,$45.472,41.0,,45.472,...,,,,,,,,,,
1,Snow White and the Seven Dwarfs,Walt Disney Productions,"[December 21, 1937 ( Carthay Circle Theatre )]",83 minutes,United States,English,$418 million,83.0,1490000.0,418000000.0,...,"[Adriana Caselotti, Lucille La Verne, Harry St...","[Frank Churchill, Paul Smith, Leigh Harline]",RKO Radio Pictures,$1.49 million,,,,,,
2,Pinocchio,Walt Disney Productions,"[February 7, 1940 ( Center Theatre ), February...",88 minutes,United States,English,$164 million,88.0,2600000.0,164000000.0,...,"[Cliff Edwards, Dickie Jones, Christian Rub, W...","[Leigh Harline, Paul J. Smith]",RKO Radio Pictures,$2.6 million,"[Ted Sears, Otto Englander, Webb Smith, Willia...",,,,,
3,Fantasia,Walt Disney Productions,"[November 13, 1940]",126 minutes,United States,English,$76.4–$83.3 million (United States and Canada),126.0,2280000.0,76400000.0,...,"[Leopold Stokowski, Deems Taylor]",See program,RKO Radio Pictures,$2.28 million,"[Joe Grant, Dick Huemer]",Deems Taylor,James Wong Howe,,,
4,The Reluctant Dragon,Walt Disney Productions,"[June 27, 1941]",74 minutes,United States,English,"$960,000 (worldwide rentals)",74.0,600000.0,960000.0,...,"[Robert Benchley, Frances Gifford, Buddy Peppe...","[Frank Churchill, Larry Morey]",RKO Radio Pictures,"$600,000",,,Bert Glennon,Paul Weatherwax,,


In [66]:
df.to_csv('disney_data_final_columns_deleted.csv')