# BeautifulSoup and Requests

In [78]:
from requests import get

In [79]:
url = 'https://www.imdb.com/search/title/?release_date=2017&sort=num_votes,desc&page=1'

In [80]:
response = get(url)

In [81]:
print(response)

<Response [200]>


In [82]:
print(response.text[:500])




<!DOCTYPE html>
<html
    xmlns:og="http://ogp.me/ns#"
    xmlns:fb="http://www.facebook.com/2008/fbml">
    <head>
         
        <meta charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge">

    <meta name="apple-itunes-app" content="app-id=342792525, app-argument=imdb:///?src=mdot">



        <script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:'java'};</script>

<script>
    if (typeof uet == 'function') {
      uet("bb", "LoadTitle"


## Using BeautifulSoup to parse the HTML content

In [83]:
from bs4 import BeautifulSoup

In [84]:
html_soup = BeautifulSoup(response.text, 'html.parser')

In [85]:
type(html_soup)

bs4.BeautifulSoup

In [86]:
movie_containers = html_soup.find_all('div', class_ = 'lister-item mode-advanced')

In [87]:
print(type(movie_containers))

<class 'bs4.element.ResultSet'>


In [88]:
print(len(movie_containers))

50


## Extracting the data for a single movie

### The name of the movie

In [89]:
first_movie = movie_containers[0]

In [90]:
type(first_movie)

bs4.element.Tag

In [91]:
first_movie

<div class="lister-item mode-advanced">
<div class="lister-top-right">
<div class="ribbonize" data-caller="filmosearch" data-tconst="tt3315342"></div>
</div>
<div class="lister-item-image float-left">
<a href="/title/tt3315342/"> <img alt="Logan" class="loadlate" data-tconst="tt3315342" height="98" loadlate="https://m.media-amazon.com/images/M/MV5BYzc5MTU4N2EtYTkyMi00NjdhLTg3NWEtMTY4OTEyMzJhZTAzXkEyXkFqcGdeQXVyNjc1NTYyMjg@._V1_UX67_CR0,0,67,98_AL_.jpg" src="https://m.media-amazon.com/images/G/01/imdb/images/nopicture/large/film-184890147._CB470041630_.png" width="67"/>
</a> </div>
<div class="lister-item-content">
<h3 class="lister-item-header">
<span class="lister-item-index unbold text-primary">1.</span>
<a href="/title/tt3315342/">Logan</a>
<span class="lister-item-year text-muted unbold">(2017)</span>
</h3>
<p class="text-muted">
<span class="certificate">A</span>
<span class="ghost">|</span>
<span class="runtime">137 min</span>
<span class="ghost">|</span>
<span class="genre">
Act

In [92]:
first_movie.div

<div class="lister-top-right">
<div class="ribbonize" data-caller="filmosearch" data-tconst="tt3315342"></div>
</div>

In [93]:
first_movie.a

<a href="/title/tt3315342/"> <img alt="Logan" class="loadlate" data-tconst="tt3315342" height="98" loadlate="https://m.media-amazon.com/images/M/MV5BYzc5MTU4N2EtYTkyMi00NjdhLTg3NWEtMTY4OTEyMzJhZTAzXkEyXkFqcGdeQXVyNjc1NTYyMjg@._V1_UX67_CR0,0,67,98_AL_.jpg" src="https://m.media-amazon.com/images/G/01/imdb/images/nopicture/large/film-184890147._CB470041630_.png" width="67"/>
</a>

In [94]:
first_movie.h3

<h3 class="lister-item-header">
<span class="lister-item-index unbold text-primary">1.</span>
<a href="/title/tt3315342/">Logan</a>
<span class="lister-item-year text-muted unbold">(2017)</span>
</h3>

In [95]:
first_movie.h3.a

<a href="/title/tt3315342/">Logan</a>

In [96]:
first_movie.h3.a.text

'Logan'

In [97]:
first_name = first_movie.h3.a.text

### The year of the movie’s release

In [98]:
first_year = first_movie.h3.find('span', class_ = 'lister-item-year text-muted unbold')

In [99]:
first_year

<span class="lister-item-year text-muted unbold">(2017)</span>

In [100]:
first_text = first_year.text

In [101]:
first_text

'(2017)'

### The IMDB rating

In [102]:
first_movie.strong

<strong>8.1</strong>

In [103]:
first_imdb = float(first_movie.strong.text)

In [104]:
first_imdb

8.1

### The Metascore

In [105]:
first_mscore = first_movie.find('span', class_ = 'metascore favorable')

In [106]:
first_mscore = int(first_mscore.text)

In [107]:
print(first_mscore)

77


In [108]:
first_votes = first_movie.find('span', attrs = {'name':'nv'})

In [109]:
first_votes

<span data-value="567331" name="nv">567,331</span>

In [110]:
first_votes['data-value']

'567331'

In [111]:
first_votes = int(first_votes['data-value'])

In [112]:
eighth_movie_mscore = movie_containers[43].find('div', class_ = 'ratings-metascore')
type(eighth_movie_mscore)

NoneType

In [113]:
# Lists to store the scraped data in
names = []
years = []
imdb_ratings = []
metascores = []
votes = []
# Extract data from individual movie container
for container in movie_containers:
# If the movie has Metascore, then extract:
    if container.find('div', class_ = 'ratings-metascore') is not None:
    # The Metascore
        m_score = container.find('span', class_ = 'metascore').text
        metascores.append(int(m_score))
    else:
        metascores.append('NaN')
# The name
    name = container.h3.a.text
    names.append(name)
# The year
    year = container.h3.find('span', class_ = 'lister-item-year').text
    years.append(year)
# The IMDB rating
    imdb = float(container.strong.text)
    imdb_ratings.append(imdb)
# The number of votes
    vote = container.find('span', attrs = {'name':'nv'})['data-value']
    try:
        votes.append(int(vote))
    except:
        votes.append('NaN')

In [114]:
import pandas as pd
test_df = pd.DataFrame({'movie': names,
'year': years,
'imdb': imdb_ratings,
'votes': votes,
'metascore': metascores
})
print(test_df.info())
test_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
movie        50 non-null object
year         50 non-null object
imdb         50 non-null float64
votes        50 non-null int64
metascore    50 non-null object
dtypes: float64(1), int64(1), object(3)
memory usage: 2.0+ KB
None


Unnamed: 0,movie,year,imdb,votes,metascore
0,Logan,(2017),8.1,567331,77.0
1,Thor: Ragnarok,(2017),7.9,492325,74.0
2,Wonder Woman,(2017),7.5,491974,76.0
3,Guardians of the Galaxy Vol. 2,(2017),7.7,490974,67.0
4,Dunkirk,(2017),7.9,471757,94.0
5,Star Wars: Episode VIII - The Last Jedi,(2017),7.1,466805,85.0
6,Spider-Man: Homecoming,(2017),7.5,436958,73.0
7,Get Out,(I) (2017),7.7,407990,84.0
8,Blade Runner 2049,(2017),8.0,380364,81.0
9,Baby Driver,(2017),7.6,370866,86.0


# The script for multiple pages

## The script for multiple pages

In [115]:
pages = [str(i) for i in range(1,5)]
years_url = [str(i) for i in range(2000,2018)]

In [117]:
from time import sleep
from random import randint

In [118]:
def scrape(movie_containers):
    for container in movie_containers:
# If the movie has Metascore, then extract:
        if container.find('div', class_ = 'ratings-metascore') is not None:
    # The Metascore
            m_score = container.find('span', class_ = 'metascore').text
            metascores.append(int(m_score))
        else:
            metascores.append('NaN')
# The name
        name = container.h3.a.text
        names.append(name)
# The year
        year = container.h3.find('span', class_ = 'lister-item-year').text
        years.append(year)
# The IMDB rating
        imdb = float(container.strong.text)
        imdb_ratings.append(imdb)
# The number of votes
        vote = container.find('span', attrs = {'name':'nv'})['data-value']
        try:
            votes.append(int(vote))
        except:
            votes.append('NaN')

In [120]:
from time import time
start_time = time()
requests = 0
for _ in range(5):
# A request would go here
    requests += 1
    sleep(randint(1,3))
    elapsed_time = time() - start_time
    print('Request: {}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))

Request: 1; Frequency: 0.3329229644792022 requests/s
Request: 2; Frequency: 0.49935757315998763 requests/s
Request: 3; Frequency: 0.5992146485409833 requests/s
Request: 4; Frequency: 0.5706930492610918 requests/s
Request: 5; Frequency: 0.624242933335032 requests/s


In [129]:
from IPython.core.display import clear_output
start_time = time()
requests = 0
for _ in range(5):
# A request would go here
    requests += 1
    sleep(randint(1,3))
    current_time = time()
    elapsed_time = current_time - start_time
    print('Request: {}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
    clear_output(wait = True)

Request: 5; Frequency: 0.6233636589513988 requests/s


In [125]:
from warnings import warn
warn("Warning Simulation")

  


In [131]:
# Redeclaring the lists to store data in
names = []
years = []
imdb_ratings = []
metascores = []
votes = []

# Preparing the monitoring of the loop
start_time = time()
requests = 0
headers = ''
# For every year in the interval 2000-2017
for year_url in years_url:

    # For every page in the interval 1-4
    for page in pages:

        # Make a get request
        response = get('http://www.imdb.com/search/title?release_date=' + year_url +
        '&sort=num_votes,desc&page=' + page, headers = headers)

        # Pause the loop
        sleep(randint(8,15))

        # Monitor the requests
        requests += 1
        elapsed_time = time() - start_time
        print('Request:{}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
        clear_output(wait = True)

        # Throw a warning for non-200 status codes
        if response.status_code != 200:
            warn('Request: {}; Status code: {}'.format(requests, response.status_code))

        # Break the loop if the number of requests is greater than expected
        if requests > 72:
            warn('Number of requests was greater than expected.')
            break

        # Parse the content of the request with BeautifulSoup
        page_html = BeautifulSoup(response.text, 'html.parser')

        # Select all the 50 movie containers from a single page
        mv_containers = page_html.find_all('div', class_ = 'lister-item mode-advanced')

        # For every movie of these 50
        scrape(mv_containers)

Request:72; Frequency: 0.06438293965244717 requests/s


In [135]:
import pandas as pd
test_df = pd.DataFrame({'movie': names,
'year': years,
'imdb': imdb_ratings,
'votes': votes,
'metascore': metascores
})
print(test_df.info())
test_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600 entries, 0 to 3599
Data columns (total 5 columns):
movie        3600 non-null object
year         3600 non-null object
imdb         3600 non-null float64
votes        3600 non-null int64
metascore    3600 non-null object
dtypes: float64(1), int64(1), object(3)
memory usage: 140.7+ KB
None


Unnamed: 0,movie,year,imdb,votes,metascore
0,Gladiator,(2000),8.5,1220065,67
1,Memento,(2000),8.5,1036117,80
2,Snatch,(2000),8.3,722692,55
3,Requiem for a Dream,(2000),8.3,704190,68
4,X-Men,(2000),7.4,539311,64
5,Cast Away,(2000),7.8,475305,73
6,American Psycho,(2000),7.6,433905,64
7,Unbreakable,(2000),7.3,354962,62
8,Meet the Parents,(2000),7.0,293979,73
9,Mission: Impossible II,(2000),6.1,292234,59


In [136]:
test_df.to_csv('scrapped_movie_data.csv')