# Get the table of Top 250 Rated Movies from IMDB.com

## HTTP request using requests package

At this point, the data in the request is unstructured.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
url = "https://www.imdb.com/chart/top"
r = requests.get(url)
html_doc = r.text

In [3]:
# Print the html
print(html_doc[:500])




<!DOCTYPE html>
<html
    xmlns:og="http://ogp.me/ns#"
    xmlns:fb="http://www.facebook.com/2008/fbml">
    <head>
         
        <meta charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge">

    
    
    

    
    
    

    <meta name="apple-itunes-app" content="app-id=342792525, app-argument=imdb:///?src=mdot">
            <style>
                body#styleguide-v2 {
                    background: no-repeat fixed center top #000;
                }
           


## use BeautifulSoup to parse data.

In [4]:
# Parse the HTML as a string
soup = BeautifulSoup(html_doc, 'lxml')

# Prettify the BeautifulSoup object
pretty_soup = soup.prettify()

In [5]:
print(pretty_soup[:500])

<!DOCTYPE html>
<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="app-id=342792525, app-argument=imdb:///?src=mdot" name="apple-itunes-app"/>
  <style>
   body#styleguide-v2 {
                    background: no-repeat fixed center top #000;
                }
  </style>
  <script type="text/javascript">
   var IMDbTimer={starttime: new Date().getTime(),pt:'jav


### Title of webpage

In [6]:
soup.title

<title>IMDb Top 250 - IMDb</title>

### Table of webpage
After observing the table, the info in 

td class="titleColumn"

span data-value="9.156868674854884" name="ir"

is the elements needed.

In [7]:
table = soup.find_all('table')[0]

In [8]:
table

<table class="chart full-width" data-caller-name="chart-top250movie">
<colgroup>
<col class="chartTableColumnPoster"/>
<col class="chartTableColumnTitle"/>
<col class="chartTableColumnIMDbRating"/>
<col class="chartTableColumnYourRating"/>
<col class="chartTableColumnWatchlistRibbon"/>
</colgroup>
<thead>
<tr>
<th></th>
<th>Rank &amp; Title</th>
<th>IMDb Rating</th>
<th>Your Rating</th>
<th></th>
</tr>
</thead>
<tbody class="lister-list">
<tr>
<td class="posterColumn">
<span data-value="1" name="rk"></span>
<span data-value="9.216927638836157" name="ir"></span>
<span data-value="7.791552E11" name="us"></span>
<span data-value="2024249" name="nv"></span>
<span data-value="-1.7830723611638426" name="ur"></span>
<a href="/title/tt0111161/?pf_rd_m=A2FGELUUNOQJNL&amp;pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&amp;pf_rd_r=W4F3WKE5NC5ZWW6FMZKJ&amp;pf_rd_s=center-1&amp;pf_rd_t=15506&amp;pf_rd_i=top&amp;ref_=chttp_tt_1"> <img height="67" src="https://m.media-amazon.com/images/M/MV5BMDFkYTc0MG

### Get info of movie
Use find_all() function to get specific info in class="titleColumn"

Then use text attribute to extract text without tags. 

Reference: https://stackoverflow.com/questions/23380171/using-beautifulsoup-extract-text-without-tags

In [9]:
movieTitle = table.find_all('td', {"class": "titleColumn"})
movieTitle[0:3]

[<td class="titleColumn">
       1.
       <a href="/title/tt0111161/?pf_rd_m=A2FGELUUNOQJNL&amp;pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&amp;pf_rd_r=W4F3WKE5NC5ZWW6FMZKJ&amp;pf_rd_s=center-1&amp;pf_rd_t=15506&amp;pf_rd_i=top&amp;ref_=chttp_tt_1" title="Frank Darabont (dir.), Tim Robbins, Morgan Freeman">The Shawshank Redemption</a>
 <span class="secondaryInfo">(1994)</span>
 </td>, <td class="titleColumn">
       2.
       <a href="/title/tt0068646/?pf_rd_m=A2FGELUUNOQJNL&amp;pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&amp;pf_rd_r=W4F3WKE5NC5ZWW6FMZKJ&amp;pf_rd_s=center-1&amp;pf_rd_t=15506&amp;pf_rd_i=top&amp;ref_=chttp_tt_2" title="Francis Ford Coppola (dir.), Marlon Brando, Al Pacino">The Godfather</a>
 <span class="secondaryInfo">(1972)</span>
 </td>, <td class="titleColumn">
       3.
       <a href="/title/tt0071562/?pf_rd_m=A2FGELUUNOQJNL&amp;pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&amp;pf_rd_r=W4F3WKE5NC5ZWW6FMZKJ&amp;pf_rd_s=center-1&amp;pf_rd_t=15506&amp;pf_rd_i=top&

In [10]:
title_list = []
for tag in movieTitle:
    title_list.append(tag.text.strip().split("\n"))
len(title_list)

250

### Clean movie info
Clean up info the rank, title, and year of movie and create a data frame.

In [11]:
title_list[0:3]

[['1.', '      The Shawshank Redemption', '(1994)'],
 ['2.', '      The Godfather', '(1972)'],
 ['3.', '      The Godfather: Part II', '(1974)']]

In [12]:
df_topMovies = pd.DataFrame(title_list, columns = ['Rank', 'Title', 'Year'])
df_topMovies.head()

Unnamed: 0,Rank,Title,Year
0,1.0,The Shawshank Redemption,(1994)
1,2.0,The Godfather,(1972)
2,3.0,The Godfather: Part II,(1974)
3,4.0,The Dark Knight,(2008)
4,5.0,12 Angry Men,(1957)


In [13]:
df_topMovies = df_topMovies.apply(lambda x: x.str.strip('.() '))

In [14]:
df_topMovies.head()

Unnamed: 0,Rank,Title,Year
0,1,The Shawshank Redemption,1994
1,2,The Godfather,1972
2,3,The Godfather: Part II,1974
3,4,The Dark Knight,2008
4,5,12 Angry Men,1957


### Get info of movie ratings
Use find_all() function to get specific info in name="ir"

Then extrat the attribute value of 'data-value' in each tag.

After data cleaning, add a column of rating into the data frame.

Reference: https://stackoverflow.com/questions/2612548/extracting-an-attribute-value-with-beautifulsoup

In [15]:
movieRating = table.find_all('span', {"name": "ir"})
movieRating[0:5]

[<span data-value="9.216927638836157" name="ir"></span>,
 <span data-value="9.156860217548445" name="ir"></span>,
 <span data-value="8.991573566247151" name="ir"></span>,
 <span data-value="8.958985134683418" name="ir"></span>,
 <span data-value="8.912096044684802" name="ir"></span>]

In [16]:
len(movieRating)

250

In [17]:
rating_list = []
for i in range(len(movieRating)):
    rating_list.append(float(movieRating[i]['data-value']))
len(title_list)

250

In [18]:
rating_list[0:5]

[9.216927638836157,
 9.156860217548445,
 8.991573566247151,
 8.958985134683418,
 8.912096044684802]

## Output: The table of top 250 rated movie

In [19]:
df_topMovies['Rating'] = rating_list
df_topMovies.head()

Unnamed: 0,Rank,Title,Year,Rating
0,1,The Shawshank Redemption,1994,9.216928
1,2,The Godfather,1972,9.15686
2,3,The Godfather: Part II,1974,8.991574
3,4,The Dark Knight,2008,8.958985
4,5,12 Angry Men,1957,8.912096


### Appendix: HTTP request by urllib package

In [20]:
from urllib.request import urlopen, Request

In [21]:
url = "https://www.imdb.com/chart/top"

# This packages the request: request
request = Request(url)

# Sends the request and catches the response: response
response = urlopen(request)

# Extract the response: html
html = response.read()

In [22]:
# Print the html
print(html)

b'\n\n\n<!DOCTYPE html>\n<html\n    xmlns:og="http://ogp.me/ns#"\n    xmlns:fb="http://www.facebook.com/2008/fbml">\n    <head>\n         \n        <meta charset="utf-8">\n        <meta http-equiv="X-UA-Compatible" content="IE=edge">\n\n    \n    \n    \n\n    \n    \n    \n\n    <meta name="apple-itunes-app" content="app-id=342792525, app-argument=imdb:///?src=mdot">\n            <style>\n                body#styleguide-v2 {\n                    background: no-repeat fixed center top #000;\n                }\n            </style>\n\n\n\n        <script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:\'java\'};</script>\n\n<script>\n    if (typeof uet == \'function\') {\n      uet("bb", "LoadTitle", {wb: 1});\n    }\n</script>\n  <script>(function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);</script>\n        <title>IMDb Top 250 - IMDb</title>\n  <script>(function(t){ (t.events = t.events || {})["csm_head_post_tit

In [23]:
print(type(response))

<class 'http.client.HTTPResponse'>


In [24]:
response.close()