### Import needed libraries

In [1]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd

### Web scraping

Returns content from specified page

In [2]:
# Download the article; the timeout keeps the cell from hanging forever and
# raise_for_status() fails fast instead of parsing an HTTP error page.
r = requests.get("https://en.wikipedia.org/wiki/Toy_Story_3", timeout=30)
r.raise_for_status()
# Name the parser explicitly so the parse tree does not depend on which
# optional parsers (lxml, html5lib) happen to be installed.
soup = bs(r.content, "html.parser")

Returns specified table:

In [3]:
# First element whose class attribute is "infobox vevent" (None if the page has no such element).
info_box = soup.find(class_="infobox vevent")

Collect all `tr` (row) elements inside the infobox

In [4]:
# List of every <tr> element nested inside the infobox.
info_row = info_box.find_all("tr")

In [5]:
def get_content_value(row):
    """Extract the text held by an infobox cell.

    Returns a list of strings (one per ``<li>``) when the cell contains list
    items; otherwise returns the whole cell text as a single string.
    Text fragments are joined with a space and stripped of surrounding whitespace.
    """
    list_items = row.find_all("li")
    if not list_items:
        # No list markup: the cell holds a single value.
        return row.get_text(" ", strip=True)
    return [item.get_text(" ", strip=True) for item in list_items]

In [6]:
# Build a {label: value} dictionary from the infobox rows.
movie_info = {}
for index, row in enumerate(info_row):
    if index == 0:
        # The first row carries the title cell.
        movie_info["title"] = row.find(class_ = "infobox-above summary").get_text(" ", strip = True)
        continue
    label = row.find(class_ = "infobox-label")
    data = row.find(class_ = "infobox-data")
    # Skip any row that lacks a label or a data cell instead of crashing on None.
    # This covers the row at index 1 that used to be skipped by position
    # (presumably the poster image row) and any similar row elsewhere.
    if label is None or data is None:
        continue
    movie_info[label.get_text(" ", strip = True)] = get_content_value(data)
    

In [7]:
# Display the dictionary scraped from the infobox.
movie_info

{'title': 'Toy Story 3',
 'Directed by': 'Lee Unkrich',
 'Screenplay by': 'Michael Arndt',
 'Story by': ['John Lasseter', 'Andrew Stanton', 'Lee Unkrich'],
 'Produced by': 'Darla K. Anderson',
 'Starring': ['Tom Hanks',
  'Tim Allen',
  'Joan Cusack',
  'Don Rickles',
  'Wallace Shawn',
  'John Ratzenberger',
  'Estelle Harris',
  'Ned Beatty',
  'Michael Keaton',
  'Jodi Benson',
  'John Morris'],
 'Cinematography': ['Jeremy Lasky', 'Kim White'],
 'Edited by': 'Ken Schretzmann',
 'Music by': 'Randy Newman',
 'Production company': 'Pixar Animation Studios',
 'Distributed by': 'Walt Disney Studios Motion Pictures [ a ]',
 'Release dates': ['June\xa012,\xa02010 ( 2010-6-12 ) ( Taormina Film Fest )',
  'June\xa018,\xa02010 ( 2010-6-18 ) (United States)'],
 'Running time': '103 minutes [ 1 ]',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$200\xa0million [ 1 ]',
 'Box office': '$1.067\xa0billion [ 1 ]'}

##### Cleaning part

In [8]:
def _strip_nbsp(value):
    """Replace non-breaking spaces (\\xa0) with plain spaces in a string or in each string of a list."""
    if isinstance(value, list):
        return [item.replace("\xa0", " ") for item in value]
    return value.replace("\xa0", " ")


# Clean every release date, however many were scraped, rather than exactly the
# first two; a single (non-list) release date is handled as well.
for key in ('Release dates', 'Budget', 'Box office'):
    movie_info[key] = _strip_nbsp(movie_info[key])

In [9]:
# Drop the trailing footnote marker (e.g. " [ 1 ]") from each value.
# Cutting at the first " [" instead of slicing a fixed five characters keeps the
# cell idempotent (re-running it changes nothing), leaves no trailing space,
# and leaves values without a footnote untouched.
for key in ('Running time', 'Budget', 'Box office'):
    movie_info[key] = movie_info[key].partition(" [")[0].rstrip()

#### Web scraping part 2. Scrape all tables from Wikipedia. Hard approach

In [10]:
# Download the film list; time out rather than hang, and fail fast on HTTP errors.
response = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films", timeout=30)
response.raise_for_status()

In [11]:
# Name the parser explicitly so the result does not depend on which optional
# parsers are installed. Note: this rebinds `soup` to the film-list page.
soup = bs(response.content, "html.parser")

Each class name has to be passed as a separate list item; passing them as a single string makes `soup.find` return `None`.

Get headers

In [12]:
# First <table> on the page carrying any of the listed CSS classes.
table = soup.find("table", class_=["wikitable", "sortable", "jquery-tablesorter"])

# Header texts, gathered row by row from every <th> cell of that table.
headers = [
    header_cell.get_text(strip=True)
    for table_row in table.find_all("tr")
    for header_cell in table_row.find_all("th")
]
headers

['Release date', 'Title', 'Notes']

Scrape the first table

In [13]:
# Start from an empty frame whose columns are the scraped header names.
df = pd.DataFrame(columns=headers)

# Skip the first <tr> (the header row) and fill the frame one cell at a time:
# the n-th <td> of a row goes into the n-th header column.
for row, table_row in enumerate(table.find_all("tr")[1:]):
    for column_index, cell in enumerate(table_row.find_all("td")):
        df.loc[row, headers[column_index]] = cell.get_text(strip=True)
            


Scrape all tables

In [21]:
# Every <table> on the page carrying any of the listed CSS classes.
tables = soup.find_all("table", class_=["wikitable", "sortable", "jquery-tablesorter"])

# Collect one record per table row, then build the DataFrame once at the end
# instead of growing it cell by cell.
records = []
for wiki_table in tables:
    for tr in wiki_table.find_all("tr"):
        # Read the cells of *this* row only. Iterating over every <td> of the
        # whole table here would re-emit the full table once per row.
        cells = [td.get_text(strip=True) for td in tr.find_all("td")]
        if not cells:
            # Rows without <td> cells (e.g. header rows made of <th>) hold no data.
            continue
        # zip() pairs cells with headers positionally; a short row leaves the
        # remaining columns empty (NaN), and surplus cells are ignored.
        records.append(dict(zip(headers, cells)))

df = pd.DataFrame(records, columns=headers)

#### Web scraping part 2. Scrape all tables from Wikipedia. Easy approach

In [38]:
# Download and parse the page a single time; read_html returns one DataFrame
# per <table> found. Calling it inside the loop re-fetched the page for every table.
page_tables = pd.read_html("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")

# Combine tables 1 through 7 (table 0 is skipped, as before) in one concat call.
# The empty header-only frame is kept as the first element so the column order
# still starts with `headers`.
df = pd.concat([pd.DataFrame(columns = headers), *page_tables[1:8]])

In [39]:
# Display the combined DataFrame.
df

Unnamed: 0,Release date,Title,Notes
0,"December 21, 1937",Snow White and the Seven Dwarfs,first film to be distributed by RKO Radio Pict...
1,"February 7, 1940",Pinocchio,Inducted into the National Film Registry in 1994
2,"November 13, 1940",Fantasia,anthology film Inducted into the National Film...
3,"June 20, 1941",The Reluctant Dragon,fictionalized tour around the Disney studio
4,"October 23, 1941",Dumbo,Inducted into the National Film Registry in 2017
5,"August 21, 1942",Bambi,Inducted into the National Film Registry in 2011
6,"February 6, 1943",Saludos Amigos,anthology film
7,"July 17, 1943",Victory Through Air Power,documentary film with wide use of animation; d...
8,"February 3, 1945",The Three Caballeros,anthology film
9,"April 20, 1946",Make Mine Music,anthology film
