# Introduction

## What is scraping the web?

## Why scrape data?

## Is this Legal/ Ethical?

# Programming:

In [1]:
# Import packages

# Webscraping:
from bs4 import BeautifulSoup

# access the web:
import requests

# data formnatting
import pandas as pd

# text searching:
import re

# Making the Soup
https://beautiful-soup-4.readthedocs.io/en/latest/#making-the-soup

In [2]:
with open("simple_webpage.html") as f:
    _txt_file = f.readlines()
print(_txt_file)
# mess:

['<!DOCTYPE html>\n', '\n', '<html>\n', '<body>\n', '\n', '<h1>My First Heading</h1>\n', '\n', '<p>My first paragraph.</p>\n', '\n', '<h1>My Second Heading</h1>\n', '\n', '<p>My Second paragraph.</p>\n', '\n', '<h1>My Third Heading</h1>\n', '\n', '<p>My Third paragraph.</p>\n', '\n', '</body>\n', '</html>\n']


In [3]:
with open("simple_webpage.html") as f:
    soup = BeautifulSoup(f)
print(soup)
# not mess:

<!DOCTYPE html>
<html>
<body>
<h1>My First Heading</h1>
<p>My first paragraph.</p>
<h1>My Second Heading</h1>
<p>My Second paragraph.</p>
<h1>My Third Heading</h1>
<p>My Third paragraph.</p>
</body>
</html>



In [None]:
print(type(soup))
# a soup object is a?

In [None]:
print(vars(soup))
# what do they have?

In [4]:
print(soup.contents)

['html', <html>
<body>
<h1>My First Heading</h1>
<p>My first paragraph.</p>
<h1>My Second Heading</h1>
<p>My Second paragraph.</p>
<h1>My Third Heading</h1>
<p>My Third paragraph.</p>
</body>
</html>, '\n']


In [5]:
soup.h1

<h1>My First Heading</h1>

In [6]:
soup.p

<p>My first paragraph.</p>

# Kinds of objects
https://beautiful-soup-4.readthedocs.io/en/latest/#kinds-of-objects

## Name

In [7]:
soup.h1.name

'h1'

## Attributes

In [8]:
soup.h1['class']
# Produces Key Error
# So add class to html variable

KeyError: 'class'

In [9]:
soup.h1['id']

KeyError: 'id'

## Switch to actor Example

In [10]:
with open("example_actor_webpage.html") as f:
    soup = BeautifulSoup(f)
print(soup)
# now that our example has more attributes ...

<!DOCTYPE html>
<html>
<body>
<div>
<h1 class="title" id="movie_title">The Irishman</h1>
<p>A paragraph of text about The Irishman, directed by <a href="https://www.imdb.com/name/nm0000217/">Martin Scorsese</a>.</p>
</div>
<div class="table" id="cast">
<h2>The Cast</h2>
<table>
<tr>
<td><a href="">Dave</a></td><td>Lead</td>
</tr>
<tr>
<td>Barry</td><td>Supporting Role</td>
</tr>
<tr>
<td>Alejandro</td><td>Supporting Role</td>
</tr>
</table>
</div>
<div>
<h1>Storyline</h1>
<p>My Third paragraph.</p>
</div>
</body>
</html>


In [11]:
soup.h1['class']

['title']

## Text

In [12]:
soup.h1.text

'The Irishman'

## Links (an attribute)

In [13]:
soup.a.text

'Martin Scorsese'

In [14]:
soup.a['href']

'https://www.imdb.com/name/nm0000217/'

# Navigating the tree

## Going Down

### Children

In [15]:
list(soup.div.children)

['\n',
 <h1 class="title" id="movie_title">The Irishman</h1>,
 '\n',
 <p>A paragraph of text about The Irishman, directed by <a href="https://www.imdb.com/name/nm0000217/">Martin Scorsese</a>.</p>,
 '\n']

In [16]:
for child in soup.div.children:
    print(child)



<h1 class="title" id="movie_title">The Irishman</h1>


<p>A paragraph of text about The Irishman, directed by <a href="https://www.imdb.com/name/nm0000217/">Martin Scorsese</a>.</p>




### Descendants
Descendant are almost the same as children yet there's one major difference:


In [17]:
for child in soup.div.descendants:
    print(child)



<h1 class="title" id="movie_title">The Irishman</h1>
The Irishman


<p>A paragraph of text about The Irishman, directed by <a href="https://www.imdb.com/name/nm0000217/">Martin Scorsese</a>.</p>
A paragraph of text about The Irishman, directed by 
<a href="https://www.imdb.com/name/nm0000217/">Martin Scorsese</a>
Martin Scorsese
.




## Going up

In [18]:
soup.a.parent

<p>A paragraph of text about The Irishman, directed by <a href="https://www.imdb.com/name/nm0000217/">Martin Scorsese</a>.</p>

In [19]:
list(soup.a.parents)

[<p>A paragraph of text about The Irishman, directed by <a href="https://www.imdb.com/name/nm0000217/">Martin Scorsese</a>.</p>,
 <div>
 <h1 class="title" id="movie_title">The Irishman</h1>
 <p>A paragraph of text about The Irishman, directed by <a href="https://www.imdb.com/name/nm0000217/">Martin Scorsese</a>.</p>
 </div>,
 <body>
 <div>
 <h1 class="title" id="movie_title">The Irishman</h1>
 <p>A paragraph of text about The Irishman, directed by <a href="https://www.imdb.com/name/nm0000217/">Martin Scorsese</a>.</p>
 </div>
 <div class="table" id="cast">
 <h2>The Cast</h2>
 <table>
 <tr>
 <td><a href="">Dave</a></td><td>Lead</td>
 </tr>
 <tr>
 <td>Barry</td><td>Supporting Role</td>
 </tr>
 <tr>
 <td>Alejandro</td><td>Supporting Role</td>
 </tr>
 </table>
 </div>
 <div>
 <h1>Storyline</h1>
 <p>My Third paragraph.</p>
 </div>
 </body>,
 <html>
 <body>
 <div>
 <h1 class="title" id="movie_title">The Irishman</h1>
 <p>A paragraph of text about The Irishman, directed by <a href="https://ww

In [20]:
list(soup.a.parents)[0]

<p>A paragraph of text about The Irishman, directed by <a href="https://www.imdb.com/name/nm0000217/">Martin Scorsese</a>.</p>

## Going Sideways

In [21]:
list(soup.table.tr.td.next_siblings)

[<td>Lead</td>, '\n']

In [22]:
for row in soup.table.contents:
    if row.text == 'Lead':
        print(row)
# Produces Attribute Errpr

AttributeError: 'NavigableString' object has no attribute 'text'

BS4 Gives an error because not all the objects it's returning have a 'text' attribute. Namely the empty strings. You could easily shortcut this with an if statement or try/ except, but that would stretch this class.

In [23]:
for row in soup.table.tr.td.next_siblings:
    if 'Lead' in row.string:
        print(row.previous_sibling.text)

Dave


In [24]:
soup.table.tr.next_element

'\n'

# Searching the tree
https://beautiful-soup-4.readthedocs.io/en/latest/#searching-the-tree

## Filters

In [25]:
soup.find_all('a')

[<a href="https://www.imdb.com/name/nm0000217/">Martin Scorsese</a>,
 <a href="">Dave</a>]

In [26]:
soup.find_all(['a', 'td'])

[<a href="https://www.imdb.com/name/nm0000217/">Martin Scorsese</a>,
 <td><a href="">Dave</a></td>,
 <a href="">Dave</a>,
 <td>Lead</td>,
 <td>Barry</td>,
 <td>Supporting Role</td>,
 <td>Alejandro</td>,
 <td>Supporting Role</td>]

### Function Filter

A bit more challenging, though very useful.

https://beautiful-soup-4.readthedocs.io/en/latest/index.html?highlight=navigablestring#a-function

### Attributes

In [27]:
soup.find_all(id='movie_title')

[<h1 class="title" id="movie_title">The Irishman</h1>]

In [28]:
soup.find_all('h1', class_='title')

[<h1 class="title" id="movie_title">The Irishman</h1>]

# Into the wild

In [31]:
webpage = requests.get("https://www.imdb.com/title/tt1302006")

In [None]:
print(webpage.content)

In [33]:
soup = BeautifulSoup(webpage.content)

In [34]:
# links = soup.find_all('a')
links = soup.find_all(href=True)

In [None]:
for link in links:
    print(type(link))
    print(link.text)
    print(link)

In [36]:
links[0]['href']

'https://www.imdb.com/title/tt1302006/'

In [37]:
my_links = []
for link in links:
    wrapped = {'text':link.text,'url':link['href']}
    my_links.append(wrapped)
    

In [None]:
my_links
# we get a lot of empty text fields

# Presenting our Data

In [39]:
df = pd.DataFrame(my_links)

In [40]:
df

Unnamed: 0,text,url
0,,https://www.imdb.com/title/tt1302006/
1,,https://m.imdb.com/title/tt1302006/
2,,https://m.media-amazon.com/images/G/01/imdb/im...
3,,https://m.media-amazon.com/images/G/01/imdb/im...
4,,https://m.media-amazon.com/images/G/01/imdb/im...
...,...,...
482,Advertising,https://advertising.amazon.com/products/displa...
483,Jobs,https://www.amazon.jobs/en/teams/imdb?ref_=ft_jb
484,Conditions of Use,/conditions?ref_=ft_cou
485,Privacy Policy,/privacy?ref_=ft_pvc


## Bonus Pandas Tricks

In [None]:
df.loc[df.text.str.len() > 0]

In [None]:
df.loc[df.url.str.startswith('http')]
# Pandas documentation https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.startswith.html

In [None]:
df.loc[df.text.str.contains('Scorsese')]

# Applying what we know

## What genre of films do actors mainly work in?

In [67]:
actor = requests.get('https://www.imdb.com/name/nm0000134/?ref_=tt_cl_t1')

In [68]:
soup = BeautifulSoup(actor.content)

### Getting into the Filmography

In [69]:
filmography = soup.find(id='filmography')

In [72]:
actor_filmography = filmography.find('div', {'data-category' : 'actor'}).find_next('div', class_='filmo-category-section')

In [73]:
# Break up the above
actor_filmography = filmography.find('div', {'data-category' : 'actor'})
actor_filmography = actor_filmography.find_next('div', class_='filmo-category-section')

In [74]:
film_links = actor_filmography.find_all(href=True)

## Let's see those links

In [None]:
for link in film_links:
    print(link['href'])

In [None]:
baselink = 'https://www.imdb.com'
for link in films:
    if link['href'].startswith('/'):
        print(baselink + link['href'])

## Collecting Genres

In [79]:
baselink = 'https://www.imdb.com'
for link in film_links[:5]:
    if link['href'].startswith('/'):
        
        # CRAWL into another page
        
        url = baselink + link['href']
        page = requests.get(url)
        soup = BeautifulSoup(page.content)
        subtext = soup.find('div', class_='subtext').find_all(href=True)
        print(subtext)

[<a href="/search/title?genres=action&amp;explore=title_type,genres">Action</a>]
[<a href="/search/title?genres=crime&amp;explore=title_type,genres">Crime</a>, <a href="/search/title?genres=drama&amp;explore=title_type,genres">Drama</a>]
[<a href="/search/title?genres=crime&amp;explore=title_type,genres">Crime</a>, <a href="/search/title?genres=drama&amp;explore=title_type,genres">Drama</a>, <a href="/search/title?genres=history&amp;explore=title_type,genres">History</a>, <a href="/title/tt5537002/releaseinfo" title="See more release dates">2021 (USA)
</a>]


In [142]:
baselink = 'https://www.imdb.com'

# Somewhere to store our genres:
genres = []

# to search our for genre:
genre = re.compile('genre')
    
for link in film_links[:5]:
    if link['href'].startswith('/'):
        url = baselink + link['href']
        page = requests.get(url)
        soup = BeautifulSoup(page.content)
        subtext = soup.find('div', class_='subtext').find_all(href=genre, text=True)
        for g in subtext:
            genres.append(g.text)

In [143]:
genres

['Action', 'Crime', 'Drama', 'Crime', 'Drama', 'History']

### Collecting more actors

In [150]:
name = re.compile('name')
actors = []
cache = []

for link in film_links[:5]:
    if link['href'].startswith('/'):
        url = baselink + link['href']
        page = requests.get(url)
        soup = BeautifulSoup(page.content)
        _actors = soup.find('table', class_='cast_list').find_all(href=name, text=True)
        for a in _actors:
            actors.append(a.text.strip('\n')[1:])
            cache.append(baselink+a['href'])

In [151]:
actors

['Robert De Niro',
 'Edgar Ramírez',
 'Robert De Niro',
 'Chloë Grace Moretz',
 'Shailene Woodley',
 'Miles Teller',
 'Robert De Niro',
 'Leonardo DiCaprio']

In [86]:
cache

['https://www.imdb.com/name/nm0000134/',
 'https://www.imdb.com/name/nm0000134/',
 'https://www.imdb.com/name/nm1183149/',
 'https://www.imdb.com/name/nm1183149/',
 'https://www.imdb.com/name/nm0000134/',
 'https://www.imdb.com/name/nm0000134/',
 'https://www.imdb.com/name/nm1631269/',
 'https://www.imdb.com/name/nm1631269/',
 'https://www.imdb.com/name/nm0940362/',
 'https://www.imdb.com/name/nm0940362/',
 'https://www.imdb.com/name/nm1886602/',
 'https://www.imdb.com/name/nm1886602/',
 'https://www.imdb.com/name/nm0000134/',
 'https://www.imdb.com/name/nm0000134/',
 'https://www.imdb.com/name/nm0000138/',
 'https://www.imdb.com/name/nm0000138/']

## Crawling

In [673]:
cache = ['https://www.imdb.com/title/tt1302006']
log  = []
limit = 20

In [674]:
films = []

In [675]:
baselink = 'https://www.imdb.com'

In [676]:
genre = re.compile('genre')
name = re.compile('name')
title = re.compile('title')

In [677]:
while len(films) < limit:
    for film_link in cache[:limit]:
        # Check that the film is new
        if film_link not in log:
            # make the soup           
            webpage = requests.get(film_link)
            soup = BeautifulSoup(webpage.content)
            
            # get the film title
            film_name = soup.find('h1').text.strip('\n')
            
            # get the film genres
            genres = []
            _genres = soup.find('div', class_='subtext').find_all(href=genre, text=True)
            for g in _genres:
                genres.append(g.text)
            
            # get the film actors
            actors = []
            _actors = soup.find('table', class_='cast_list').find_all(href=name, text=True)
            for a in _actors:
                actors.append(a.text)
            
            # get the films duration
            duration = soup.find('div', class_='subtext').find('time', text=True).text
            
            # record where we've been
            log.append(film_link)
            
            # get the next films from this ones 'related'
            more_films = soup.find('div', class_='rec_page').find_all('a')
            _cache = [baselink+x['href'] for x in more_films if x not in log]
            
            # save the film data:
            films.append({'film_name': film_name, 'genres' : genres, 'actors': actors, 'duration':duration})

    cache = _cache

# Present out data in Pandas

In [723]:
df = pd.DataFrame(films)

In [724]:
df

Unnamed: 0,film_name,genres,actors,duration
0,The Irishman (2019),"[Biography, Crime, Drama]","[ Robert De Niro\n, Al Pacino\n, Joe Pesci\n...",\n 3h 29min\n ...
1,Once Upon a Time... in Hollywood (2019),"[Comedy, Drama]","[ Leonardo DiCaprio\n, Brad Pitt\n, Margot R...",\n 2h 41min\n ...
2,Joker (2019),"[Crime, Drama, Thriller]","[ Joaquin Phoenix\n, Robert De Niro\n, Zazie...",\n 2h 2min\n ...
3,Goodfellas (1990),"[Biography, Crime, Drama]","[ Robert De Niro\n, Ray Liotta\n, Joe Pesci\...",\n 2h 26min\n ...
4,Casino (1995),"[Crime, Drama]","[ Robert De Niro\n, Sharon Stone\n, Joe Pesc...",\n 2h 58min\n ...
5,The King (2019),"[Biography, Drama, History]","[ Tom Glynn-Carney\n, Gábor Czap\n, Tom Fish...",\n 2h 20min\n ...
6,Gisaengchung (2019),"[Comedy, Drama, Thriller]","[ Kang-ho Song\n, Sun-kyun Lee\n, Yeo-jeong ...",\n 2h 12min\n ...
7,The Lighthouse (2019),"[Drama, Fantasy, Horror]","[ Willem Dafoe\n, Robert Pattinson\n, Valeri...",\n 1h 49min\n ...
8,Us (2019),"[Horror, Mystery, Thriller]","[ Lupita Nyong'o\n, Winston Duke\n, Elisabet...",\n 1h 56min\n ...
9,Midsommar (2019),"[Drama, Horror, Mystery]","[ Florence Pugh\n, Jack Reynor\n, Vilhelm Bl...",\n 2h 27min\n ...


## Tidy Data

In [725]:
def tidy_date(rough):
    time = rough[-8:-3]
    hours = int(time[0])
    minutes = int(time[2:])
    return (hours*60)+minutes

df['minutes'] = df.duration.str.strip().apply(tidy_date)

# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.html
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.apply.html

## Re format data

In [726]:
genres_df = pd.DataFrame(df.genres.explode())
actors_df = pd.DataFrame(df.actors.explode())

# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.explode.html

### Merge new series

In [727]:
genre_actors_merged = actors_df.merge(genres_df,left_index=True, right_index=True)
#https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html

In [728]:
# more tidying
genre_actors_merged['film_name'] = df['film_name'].str.strip()
genre_actors_merged['minutes'] = df['minutes']
genre_actors_merged['actors'] = df['actors'].str.strip()

### House Keeping

In [729]:
df = genre_actors_merged.reset_index()[['actors','genres','film_name','minutes']]
df.columns = ['actor','genre','film', 'minutes']

# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.reset_index.html

In [730]:
df

Unnamed: 0,actor,genre,film,minutes
0,,Biography,The Irishman (2019),209
1,,Crime,The Irishman (2019),209
2,,Drama,The Irishman (2019),209
3,,Biography,The Irishman (2019),209
4,,Crime,The Irishman (2019),209
...,...,...,...,...
829,,Adventure,Guardians of the Galaxy Vol. 2 (2017),136
830,,Comedy,Guardians of the Galaxy Vol. 2 (2017),136
831,,Action,Guardians of the Galaxy Vol. 2 (2017),136
832,,Adventure,Guardians of the Galaxy Vol. 2 (2017),136


# Show duration by genre

In [685]:
df[['minutes','genre']].groupby('genre').mean()

# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.groupby.html
# https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.mean.html

Unnamed: 0_level_0,minutes
genre,Unnamed: 1_level_1
Action,138.8
Adventure,138.8
Biography,165.0
Comedy,136.0
Crime,163.75
Drama,156.282609
Fantasy,131.272727
History,140.0
Horror,129.454545
Mystery,131.5
