# Solving real world data science tasks with Python Beautiful Soup

- This is a sample project that will scrape Disney movie pages on wikipedia for information using Beatiful soup.

# Task 1: Scrape the infobox from ToyS Story 3 from wikipedia page and save it in a dict

### Import necessary libraries

In [500]:
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests
import re

### Load the wiki page

In [9]:
wikilink = "https://en.wikipedia.org/wiki/Toy_Story_3"
r = requests.get(wikilink)

# Convert request object to a beautifulsoup object
soup = bs(r.content)

# Print out the HTML
contents = soup.prettify()

In [15]:
# Use the browser inspect feature to find the desired web object
info_box = soup.find(class_ ="infobox vevent")
print(info_box.prettify())

<table class="infobox vevent">
 <tbody>
  <tr>
   <th class="infobox-above summary" colspan="2" style="font-size: 125%; font-style: italic;">
    Toy Story 3
   </th>
  </tr>
  <tr>
   <td class="infobox-image" colspan="2">
    <a class="image" href="/wiki/File:Toy_Story_3_poster.jpg" title="All of the toys packed close together, holding up a large numeral '3', with Buzz, who is putting a friendly arm around Woody's shoulder, and Woody holding the top of the 3.">
     <img alt="All of the toys packed close together, holding up a large numeral '3', with Buzz, who is putting a friendly arm around Woody's shoulder, and Woody holding the top of the 3." class="thumbborder" data-file-height="326" data-file-width="220" decoding="async" height="326" src="//upload.wikimedia.org/wikipedia/en/6/69/Toy_Story_3_poster.jpg" width="220"/>
    </a>
    <div class="infobox-caption">
     Theatrical release poster
    </div>
   </td>
  </tr>
  <tr>
   <th class="infobox-label" scope="row" style="white-s

In [20]:
# Find all the table rows in the infobox
info_rows = info_box.find_all("tr")
for row in info_brows:
    print(row.prettify())

<tr>
 <th class="infobox-above summary" colspan="2" style="font-size: 125%; font-style: italic;">
  Toy Story 3
 </th>
</tr>

<tr>
 <td class="infobox-image" colspan="2">
  <a class="image" href="/wiki/File:Toy_Story_3_poster.jpg" title="All of the toys packed close together, holding up a large numeral '3', with Buzz, who is putting a friendly arm around Woody's shoulder, and Woody holding the top of the 3.">
   <img alt="All of the toys packed close together, holding up a large numeral '3', with Buzz, who is putting a friendly arm around Woody's shoulder, and Woody holding the top of the 3." class="thumbborder" data-file-height="326" data-file-width="220" decoding="async" height="326" src="//upload.wikimedia.org/wikipedia/en/6/69/Toy_Story_3_poster.jpg" width="220"/>
  </a>
  <div class="infobox-caption">
   Theatrical release poster
  </div>
 </td>
</tr>

<tr>
 <th class="infobox-label" scope="row" style="white-space: nowrap; padding-right: 0.65em;">
  Directed by
 </th>
 <td class="

### Start scraping data off the webpage
- Some table texts had unwanted characters that were replaced by the .replace() string method
- Used beautiful soup to make sure all table header and table data texts were joined by spaces

In [50]:
# This function is going to handle the cases where a row data might store data in lists
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    else:
        return row_data.get_text().replace("\xa0", " ")

In [51]:
# It seems all the table headers (th) are the keys of our dict
# The table data (td) are going to be the value

movie_info = {}

for index, row in enumerate(info_rows):
    if index == 0:
        movie_info['title'] = row.find("th").get_text(" ", strip=True)
    # Skip row 1 because it is the alt-text of the picture
    elif index == 1:
        continue
    else:
        content_key = row.find("th").get_text(" ", strip=True)
        content_value = get_content_value(row.find("td"))
        movie_info[content_key] = content_value
    
movie_info

{'title': 'Toy Story 3',
 'Directed by': 'Lee Unkrich',
 'Screenplay by': 'Michael Arndt',
 'Story by': ['John Lasseter', 'Andrew Stanton', 'Lee Unkrich'],
 'Produced by': 'Darla K. Anderson',
 'Starring': ['Tom Hanks',
  'Tim Allen',
  'Joan Cusack',
  'Don Rickles',
  'Wallace Shawn',
  'John Ratzenberger',
  'Estelle Harris',
  'Ned Beatty',
  'Michael Keaton',
  'Jodi Benson',
  'John Morris'],
 'Cinematography': ['Jeremy Lasky', 'Kim White'],
 'Edited by': 'Ken Schretzmann',
 'Music by': 'Randy Newman',
 'Production companies': ['Walt Disney Pictures', 'Pixar Animation Studios'],
 'Distributed by': 'Walt Disney StudiosMotion Pictures',
 'Release dates': ['June 12, 2010 ( 2010-06-12 ) ( Taormina Film Fest )',
  'June 18, 2010 ( 2010-06-18 ) (United States)'],
 'Running time': '103 minutes[1]',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$200 million[1]',
 'Box office': '$1.067 billion[1]'}

# Task 2: Go through all the movies in the wikipedia page that lists all the Disney movies

In [395]:
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    elif row_data.find("br"):
        return [text for text in row_data.stripped_strings]
    else:
        return row_data.get_text().replace("\xa0", " ")

def clean_tags(soup):
    for tag in soup.find_all(["sup", "span"]):
        tag.decompose()

def get_info_box(url):
    r = requests.get(url)
    soup = bs(r.content)
    clean_tags(soup)
    info_box = soup.find(class_ ="infobox vevent")
    info_rows = info_box.find_all("tr")
    
    movie_info = {}    
    for index, row in enumerate(info_rows):
        if index == 0:
            movie_info['title'] = row.find("th").get_text(" ", strip=True)
        else:
            header = row.find('th')
            if header:
                content_key = header.get_text(" ", strip=True)
                content_value = get_content_value(row.find("td"))
                movie_info[content_key] = content_value

    return movie_info
    

In [421]:
# Request the page and convert to BS object
wiki_disney_movies = "https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films"
r = requests.get(wiki_disney_movies)
soup = bs(r.content)

# After some inspection on the page, we find all the tables share this class
movies = soup.select(".wikitable.sortable i a")

In [428]:
# Create movies list and a paths list
movie_titles = []
movie_paths = []

# Get the titles and paths of each movie and add to our list
for movie in movies:
    title = movie.get('title')
    relative_path = movie.get('href')
    movie_titles.append(title)
    movie_paths.append(relative_path)


In [439]:
# Remove (year) from titles
movie_titles = [re.sub('\([^)]*\)', '', title) for title in movie_titles]

In [443]:
# Another for loop to make the list of dictionaries
# Use a try and except block to catch instances of paths that do not lead to 
# movies since the movie_paths list will have links to other page
base_path = "https://en.wikipedia.org/"
movie_info_list = []
print("WEB SCRAPING MOVIES")
for i, path in enumerate(movie_paths):
    # Limit the requests call to 200
    if i == 200:
        break
    elif i % 20 == 0:
        print("IN PROGRESS")
    full_path = base_path + path
    try:
        movie_info_list.append(get_info_box(full_path))
    except Exception as e:
        print(path)
        print(e)
    
print("COMPLETED")

WEB SCRAPING MOVIES
IN PROGRESS
IN PROGRESS
IN PROGRESS
/wiki/Zorro_(1957_TV_series)#Theatrical
'NoneType' object has no attribute 'find'
/wiki/Zorro_(1957_TV_series)#Theatrical
'NoneType' object has no attribute 'find'
IN PROGRESS
IN PROGRESS
IN PROGRESS
IN PROGRESS
IN PROGRESS
IN PROGRESS
IN PROGRESS


### Saved/Reload Movie data

In [1347]:
import json

def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

In [1348]:
import json

def load_data(title):
    with open(title, 'r', encoding='utf-8') as f:
        return json.load(f)

In [1262]:
save_data("disney_data_v2.json", movie_info_list)

TypeError: Object of type datetime is not JSON serializable

# Task 3 : Clean data

In [971]:
# Load data
movie_info_list = load_data("disney_data_v1.json")

### Subtasks
- ~~Remove citation tags (i.e. [1], [2])~~ (accomplished by the clean_tags() method)
- ~~Convert running times into integers ~~
- Convert dates into datetime objects
- ~~Fix long strings~~(clean_tags() method)
- Conver budget and box office monetary amounts to numbers

In [972]:
movie_info_list[1]

{'title': 'Snow White and the Seven Dwarfs',
 'Directed by': ['David Hand',
  'William Cottrell',
  'Wilfred Jackson',
  'Larry Morey',
  'Perce Pearce',
  'Ben Sharpsteen'],
 'Written by': ['Ted Sears',
  'Richard Creedon',
  'Otto Englander',
  'Dick Rickard',
  'Earl Hurd',
  'Merrill De Maris',
  'Dorothy Ann Blank',
  'Webb Smith'],
 'Based on': ['Snow White', 'by The', 'Brothers Grimm'],
 'Produced by': 'Walt Disney',
 'Starring': ['Adriana Caselotti',
  'Lucille La Verne',
  'Harry Stockwell',
  'Roy Atwell',
  'Pinto Colvig',
  'Otis Harlan',
  'Scotty Mattraw',
  'Billy Gilbert',
  'Eddie Collins',
  'Moroni Olsen',
  'Stuart Buchanan'],
 'Music by': ['Frank Churchill', 'Paul Smith', 'Leigh Harline'],
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'RKO Radio Pictures',
 'Release dates': ['December 21, 1937 ( Carthay Circle Theatre )',
  'February 4, 1938 (United States)'],
 'Running time': '83 minutes',
 'Country': 'United States',
 'Language': 'English',

#### Convert running time field to integers 
- Since some movies have been rereleased with different running times, I will choose to keep only the first running time as the overall running time. 
- The selected runtime is put into a new field

In [973]:
def time_to_int(runtime):
    if runtime == "N/A":
        return None
    elif type(runtime) is list:
        runtime = runtime[0]
        time = re.findall('^[0-9]+',runtime)
        return int(time[0])
    else:
        time = re.findall('^[0-9]+',runtime)
        return int(time[0])

In [974]:
for movie in movie_info_list:
    movie['Running time (int)'] = time_to_int(movie.get("Running time", "N/A"))

#### Convert budget and box office values to floats
- If there is a range for budget or box office values, the lowest number will be grabbed.

In [975]:
def money_to_number(value):
    money_expression = "\$\d+(?:\.\d+)?"
    if value == "N/A" or len(value) == 0 or value == "unknown":
        return None
    elif type(value) is list:
        one_text = ' '.join(value)
        money = re.findall(money_expression, one_text)
        return float(money[0].split("$")[1])

    else:
        money = re.findall(money_expression, value)
        return float(money[0].split("$")[1])

In [976]:
for movie in movie_info_list:
    movie['Box office (numerical)'] = money_to_number(movie.get('Box office', 'N/A'))
    movie['Budget (numerical)'] = money_to_number(movie.get('Budget', 'N/A'))

In [977]:
# Running time (int), Budget (numerical), and Box office (numerical)
# Have been added as new keys to the dictionaries
movie_info_list[1].keys()

dict_keys(['title', 'Directed by', 'Written by', 'Based on', 'Produced by', 'Starring', 'Music by', 'Production company', 'Distributed by', 'Release dates', 'Running time', 'Country', 'Language', 'Budget', 'Box office', 'Running time (int)', 'Box office (numerical)', 'Budget (numerical)'])

#### Convert Dates into datetime objects
- Some entries have "Release dates" while other have "Release date".
- In this case, all "Release dates" will be changes to "Release date"
- Only the first date will be kept as the release date.  


In [978]:
movie_info_list[-10].keys()

dict_keys(['title', 'Directed by', 'Screenplay by', 'Based on', 'Produced by', 'Starring', 'Cinematography', 'Edited by', 'Music by', 'Production companies', 'Distributed by', 'Release date', 'Running time', 'Countries', 'Language', 'Budget', 'Box office', 'Running time (int)', 'Box office (numerical)', 'Budget (numerical)'])

In [979]:
# Update key
movie_info_list = [{"Release date" if k == "Release dates" else k:v for k,v in movie.items()} for movie in movie_info_list]


In [1084]:
import datetime

In [1221]:
[(i, movie.get('Release date') for i, movie in enumerate(movie_info_list)]

[(0, ['May 19, 1937']),
 (1,
  ['December 21, 1937 ( Carthay Circle Theatre )',
   'February 4, 1938 (United States)']),
 (2,
  ['February 7, 1940 ( Center Theatre )',
   'February 23, 1940 (United States)']),
 (3, ['November 13, 1940']),
 (4, ['June 27, 1941']),
 (5, ['October 23, 1941 (New York City)', 'October 31, 1941 (U.S.)']),
 (6,
  ['August 9, 1942 (World Premiere – London)',
   'August 13, 1942 (Premiere – New York City)',
   'August 21, 1942 (U.S.)']),
 (7,
  ['August 24, 1942 (World Premiere – Rio de Janeiro)',
   'February 6, 1943 (U.S. Premiere – Boston)',
   'February 19, 1943 (U.S.)']),
 (8, ['July 17, 1943']),
 (9, ['December 21, 1944 (Mexico City)', 'February 3, 1945 (US)']),
 (10, ['April 20, 1946 (New York City premiere)', 'August 15, 1946 (U.S.)']),
 (11,
  ['November 12, 1946 (Premiere: Atlanta, Georgia)',
   'November 20, 1946',
   'March 30, 1947 (Stanford theater, Palo Alto, California)']),
 (12, ['September 27, 1947']),
 (13, 'May 27, 1948'),
 (14,
  ['November

In [1248]:
def clean_date(date):
    date_expression = '([0-9]+\s)?([a-z|A-z]+\s[0-9]+\,*\s*[0-9]+)'
    try:
        if date == "N/A":
            return None
        elif type(date) is list:
            return re.search(date_expression, date[0]).group()
        else:
            return re.search(date_expression, date).group()
    except:
        return None

    
def date_to_datetime(date):
    date_string = clean_date(date)
    date_format = ["%B %d, %Y", "%d %B %Y"] 
    for formt in date_format:
        try:
            return datetime.datetime.strptime(date_string, formt)
        except:
            pass
    return None


In [1252]:
for movie in movie_info_list:
    movie['Release date (datetime)'] = date_to_datetime(movie.get('Release date', 'N/A'))

In [1259]:
movie_info_list[70]

{'title': 'The Three Lives of Thomasina',
 'Directed by': 'Don Chaffey',
 'Written by': 'Robert Westerby',
 'Based on': ['Thomasina, the Cat Who Thought She Was God',
  'by',
  'Paul Gallico'],
 'Produced by': ['Ron Miller Walt Disney'],
 'Starring': ['Patrick McGoohan',
  'Karen Dotrice',
  'Susan Hampshire',
  'Matthew Garber'],
 'Narrated by': 'Elspeth March',
 'Cinematography': 'Paul Beeson',
 'Edited by': 'Gordon Stone',
 'Music by': 'Paul J. Smith',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': ['11 December 1963 ( New York City )', '4 June 1964 (U.S.)'],
 'Running time': '97 minutes',
 'Countries': ['United Kingdom', 'United States'],
 'Languages': ['English', 'Gaelic'],
 'Box office': '$2,250,000 (US/ Canada)',
 'Running time (int)': 97,
 'Box office (numerical)': 2.0,
 'Budget (numerical)': None,
 'Release date (datetime)': datetime.datetime(1963, 12, 11, 0, 0)}

#### Use the pickle module to serialize the dict.
- JSON does not support serializing datetime objects

In [1280]:
import pickle

def save_pickle_data(title, data):
    with open(title, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)



In [1281]:
import pickle

def load_pickle_data(title):
    with open(title, 'rb') as handle:
        return pickle.load(handle)

In [1282]:
save_pickle_data("disney_data_clean.pickle", movie_info_list)

In [1283]:
movie_info_list = load_pickle_data("disney_data_clean.pickle")

# Task 4: Attach more movie data from OMBD
- http://www.omdbapi.com/?apikey=[yourkey]&

In [1310]:
import requests
import os
import urllib

def get_ombd_info(title):
    base_url = "http://www.omdbapi.com/?"
    my_key = "c9b7a758"
    params = {
        'apikey': my_key,
        't': title,}
    params_encoded = urllib.parse.urlencode(params)
    full_url = base_url + params_encoded
    return requests.get(full_url).json()

def get_rotten_score(ratings):
    for rating in ratings:
        if rating['Source'] == 'Rotten Tomatoes':
            return int(rating['Value'].split("%")[0])
    return None



In [1319]:
for movie in movie_info_list:
    title = movie['title']
    ombd_info = get_ombd_info(title)
    ratings = ombd_info.get('Ratings', [])
    rotten_score = get_rotten_score(ratings)
    movie['Rotten Tomatoes Score'] = rotten_score
    movie['Metascore'] = ombd_info.get('Metascore', 'N/A')
    movie['imdbRating'] = ombd_info.get('imdbRating', 'N/A')

In [1362]:
# One last data type conversion
for movie in movie_info_list:
    try:
        movie['Metascore'] = int(movie['Metascore'])
        movie['imdbRating'] = float(movie['imdbRating'])
    except:
        pass

In [1363]:
save_pickle_data('disney_data_final.pickle', movie_info_list)

# Task 5: Save data as JSON & CSV
- Reconvert all the datetime objects to strings to save them

In [1366]:
movie_info_copy = [movie.copy() for movie in movie_info_list]

In [1367]:
movie_info_copy

[{'title': 'Academy Award Review of',
  'Production company': 'Walt Disney Productions',
  'Distributed by': 'United Artists',
  'Release date': ['May 19, 1937'],
  'Running time': '41 minutes (74 minutes 1966 release)',
  'Country': 'United States',
  'Language': 'English',
  'Box office': '$45.472',
  'Running time (int)': 41,
  'Box office (numerical)': 45.472,
  'Budget (numerical)': None,
  'Release date (datetime)': datetime.datetime(1937, 5, 19, 0, 0),
  'Rotten Tomatoes Score': None,
  'Metascore': 'N/A',
  'imdbRating': 7.1},
 {'title': 'Snow White and the Seven Dwarfs',
  'Directed by': ['David Hand',
   'William Cottrell',
   'Wilfred Jackson',
   'Larry Morey',
   'Perce Pearce',
   'Ben Sharpsteen'],
  'Written by': ['Ted Sears',
   'Richard Creedon',
   'Otto Englander',
   'Dick Rickard',
   'Earl Hurd',
   'Merrill De Maris',
   'Dorothy Ann Blank',
   'Webb Smith'],
  'Based on': ['Snow White', 'by The', 'Brothers Grimm'],
  'Produced by': 'Walt Disney',
  'Starring': 

In [1369]:
for movie in movie_info_copy:
    current_date = movie['Release date (datetime)']
    if current_date:
        movie['Release date (datetime)'] = current_date.strftime("%B %d, %Y")

In [1370]:
movie_info_copy[1:5]

[{'title': 'Snow White and the Seven Dwarfs',
  'Directed by': ['David Hand',
   'William Cottrell',
   'Wilfred Jackson',
   'Larry Morey',
   'Perce Pearce',
   'Ben Sharpsteen'],
  'Written by': ['Ted Sears',
   'Richard Creedon',
   'Otto Englander',
   'Dick Rickard',
   'Earl Hurd',
   'Merrill De Maris',
   'Dorothy Ann Blank',
   'Webb Smith'],
  'Based on': ['Snow White', 'by The', 'Brothers Grimm'],
  'Produced by': 'Walt Disney',
  'Starring': ['Adriana Caselotti',
   'Lucille La Verne',
   'Harry Stockwell',
   'Roy Atwell',
   'Pinto Colvig',
   'Otis Harlan',
   'Scotty Mattraw',
   'Billy Gilbert',
   'Eddie Collins',
   'Moroni Olsen',
   'Stuart Buchanan'],
  'Music by': ['Frank Churchill', 'Paul Smith', 'Leigh Harline'],
  'Production company': 'Walt Disney Productions',
  'Distributed by': 'RKO Radio Pictures',
  'Release date': ['December 21, 1937 ( Carthay Circle Theatre )',
   'February 4, 1938 (United States)'],
  'Running time': '83 minutes',
  'Country': 'Unite

In [1371]:
# Saving as a .json
save_data('disney_data_final.json', movie_info_copy)

In [1431]:
import pandas as pd
pd.set_option('display.max_columns', None)

df = pd.DataFrame(movie_info_list)
df.to_csv("disney_movie_data_final.csv")

#### Some examples of how you could sort and view data

In [1432]:
runtimes = df.sort_values(['Running time (int)'], ascending=True)

In [1433]:
condition = df['Running time (int)'] > 50
high_runtimes = df.loc[condition, ['title', 'Release date', 'Running time (int)']]

In [1434]:
high_runtimes.sort_values(['Running time (int)'])

Unnamed: 0,title,Release date,Running time (int)
24,The Vanishing Prairie,"[August 17, 1954]",60.0
5,Dumbo,"[October 23, 1941 (New York City), October 31,...",64.0
60,The Legend of Lobo,"[November 7, 1962 (Los Angeles)]",67.0
15,The Adventures of Ichabod and Mr. Toad,"[October 5, 1949]",68.0
47,Jungle Cat,"[June 1960 ( Berlin ), August 10, 1960 (US)]",69.0
...,...,...,...
80,"Follow Me, Boys!","[December 1, 1966]",131.0
73,Those Calloways,"November 10, 1964",131.0
46,Pollyanna,"May 19, 1960",134.0
72,Mary Poppins,"[August 27, 1964 (LA), September 24, 1964 (NY)]",139.0
