In [2]:
# web_scraping helper
import requests
from bs4 import BeautifulSoup
import os
import sys
from google.cloud import bigquery
import datetime
import pandas as pd

def _get_soup(url = 'https://www.imdb.com/chart/top/?ref_=nv_mv_250'):

    '''
    Get the BeautifulSoup object from a url.
    Args:
        - url(str) = url of the website
            Default: 'https://www.imdb.com/chart/top/?ref_=nv_mv_250'
    Returns:
        - soup(BeautifulSoup) = BeautifulSoup object
    '''
    
    # Send a get request and parse using BeautifulSoup
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup


def _scrape_most_popular_titles(soup):

    '''
    Scrape the most popular titles from the IMDB website.
    Args:
        - soup(BeautifulSoup) = BeautifulSoup object
    Returns:
        - movie_names(list) = List of movie names
    '''

    # Find all movie names in the url
    movie_names = []
    titlesRefs = soup.find_all('td', {'class':'titleColumn'})

    # Collect movies into list
    for title in titlesRefs:
        movie_names.append(title.find("a").text)
    
    return movie_names

soup = _get_soup()

In [3]:
movie_names = []
movie_ratings = []

# Collect titles into list
titlesRefs = soup.find_all('td', {'class':'titleColumn'})
for title in titlesRefs:
    movie_names.append(title.find("a").text)

ratingsRefs = soup.find_all('td', {'class':'ratingColumn imdbRating'})
for rating in ratingsRefs:
    movie_ratings.append(float(rating.find("strong").text))

movie_dict = dict(zip(movie_names, movie_ratings))
movie_dict

{'The Shawshank Redemption': 9.2,
 'The Godfather': 9.2,
 'The Dark Knight': 9.0,
 'The Godfather Part II': 9.0,
 '12 Angry Men': 9.0,
 "Schindler's List": 8.9,
 'The Lord of the Rings: The Return of the King': 8.9,
 'Pulp Fiction': 8.8,
 'The Lord of the Rings: The Fellowship of the Ring': 8.8,
 'Il buono, il brutto, il cattivo': 8.8,
 'Forrest Gump': 8.8,
 'Fight Club': 8.7,
 'The Lord of the Rings: The Two Towers': 8.7,
 'Inception': 8.7,
 'Star Wars: Episode V - The Empire Strikes Back': 8.7,
 'The Matrix': 8.7,
 'GoodFellas': 8.7,
 "One Flew Over the Cuckoo's Nest": 8.6,
 'Spider-Man: Across the Spider-Verse': 8.6,
 'Se7en': 8.6,
 "It's a Wonderful Life": 8.6,
 'Shichinin no samurai': 8.6,
 'The Silence of the Lambs': 8.6,
 'Saving Private Ryan': 8.6,
 'Cidade de Deus': 8.6,
 'Interstellar': 8.6,
 'La vita è bella': 8.6,
 'The Green Mile': 8.6,
 'Star Wars': 8.5,
 'Terminator 2: Judgment Day': 8.5,
 'Back to the Future': 8.5,
 'Sen to Chihiro no kamikakushi': 8.5,
 'The Pianist': 

In [12]:
movies_df = pd.DataFrame(movie_dict.items(), columns=['movie_name', 'movie_rating'])

    # set index as movie id
movies_df['date'] = datetime.datetime.today()
movies_df


Unnamed: 0,movie_name,movie_rating,date
0,The Shawshank Redemption,9.2,2023-06-04 11:22:28.719655
1,The Godfather,9.2,2023-06-04 11:22:28.719655
2,The Dark Knight,9.0,2023-06-04 11:22:28.719655
3,The Godfather Part II,9.0,2023-06-04 11:22:28.719655
4,12 Angry Men,9.0,2023-06-04 11:22:28.719655
...,...,...,...
245,Life of Brian,8.0,2023-06-04 11:22:28.719655
246,The Iron Giant,8.0,2023-06-04 11:22:28.719655
247,The Help,8.0,2023-06-04 11:22:28.719655
248,Aladdin,8.0,2023-06-04 11:22:28.719655


In [10]:
movies_df

Unnamed: 0,movie_id,movie_name,movie_rating,date
0,1,The Shawshank Redemption,9.2,2023-06-04 11:11:41.136194
1,2,The Godfather,9.2,2023-06-04 11:11:41.136194
2,3,The Dark Knight,9.0,2023-06-04 11:11:41.136194
3,4,The Godfather Part II,9.0,2023-06-04 11:11:41.136194
4,5,12 Angry Men,9.0,2023-06-04 11:11:41.136194
...,...,...,...,...
245,246,Life of Brian,8.0,2023-06-04 11:11:41.136194
246,247,The Iron Giant,8.0,2023-06-04 11:11:41.136194
247,248,The Help,8.0,2023-06-04 11:11:41.136194
248,249,Aladdin,8.0,2023-06-04 11:11:41.136194


In [22]:
# web_scraping helper
import requests
from bs4 import BeautifulSoup
import os
import sys
from google.cloud import bigquery
import datetime
import pandas as pd

def _get_soup(url = 'https://www.imdb.com/chart/moviemeter?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=470df400-70d9-4f35-bb05-8646a1195842&pf_rd_r=2DV800026N5GD67AASMQ&pf_rd_s=right-4&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_ql_2'):

    '''
    Get the BeautifulSoup object from a url.
    Args:
        - url(str) = url of the website
            Default: 'https://www.imdb.com/chart/moviemeter?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=470df400-70d9-4f35-bb05-8646a1195842&pf_rd_r=2DV800026N5GD67AASMQ&pf_rd_s=right-4&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_ql_2'
    Returns:
        - soup(BeautifulSoup) = BeautifulSoup object
    '''
    
    # Send a get request and parse using BeautifulSoup
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup


soup = _get_soup()

def _scrape_most_popular_movies(soup):

    '''
    Scrape the most popular titles and ratings from the IMDB website.
    Args:
        - soup(BeautifulSoup) = BeautifulSoup object
    Returns:
        - movie_dict(dict) = Dictionary of movie names and ratings
    '''

    # Find all movie names in the url
    movie_names = []
    movie_ratings = []

    # Find all movie in the url
    titlesRefs = soup.find_all('td', {'class':'titleColumn'})
    ratingsRefs = soup.find_all('td', {'class':'ratingColumn imdbRating'})

    # Collect movies into title and rating list
    for title in titlesRefs:
        movie_names.append(title.find("a").text)

    for rating in ratingsRefs:
        movie_ratings.append(float(rating.find("strong").text))

    # Combine title and rating list into a dictionary
    movie_dict = dict(zip(movie_names, movie_ratings))
    
    return movie_dict

movie_names = []
movie_ratings = []

ratingsRefs = soup.find_all('td', {'class':'ratingColumn imdbRating'})


for i in range(250):
    try:
        print(float(ratingsRefs[i].find("strong").text))
    except:
        print('Missing rating. Replacing with -1')
        

7.2
6.3
8.2
9.5
8.0
7.3
7.2
6.1
7.5
Missing rating
7.0
Missing rating
5.6
Missing rating
4.8
7.5
6.7
6.7
6.0
7.5
6.9
Missing rating
9.1
Missing rating
Missing rating
6.5
Missing rating
Missing rating
7.1
5.2
Missing rating
7.1
6.2
Missing rating
5.7
6.6
6.6
6.4
6.3
6.8
5.9
6.7
6.1
8.0
5.8
7.1
6.1
7.7
7.0
8.3
Missing rating
5.0
8.2
7.7
6.8
7.8
8.3
9.2
5.2
4.2
5.4
7.2
7.3
5.4
7.4
8.0
8.7
9.3
7.4
6.2
6.3
5.2
6.0
7.1
Missing rating
7.3
7.6
6.3
7.6
7.6
6.3
Missing rating
7.0
7.3
7.2
8.4
6.5
7.7
7.8
7.3
6.7
6.6
7.7
6.1
6.5
7.6
3.9
7.2
6.8
7.6
Missing rating
Missing rating
Missing rating
Missing rating
Missing rating
Missing rating
Missing rating
Missing rating
Missing rating
Missing rating
Missing rating
Missing rating
Missing rating
Missing rating
Missing rating
Missing rating
Missing rating
Missing rating
Missing rating
Missing rating
Missing rating
Missing rating
Missing rating
Missing rating
Missing rating
Missing rating
Missing rating
Missing rating
Missing rating
Missing rating
Missing

In [3]:
response = requests.get('https://www.imdb.com/chart/moviemeter?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=470df400-70d9-4f35-bb05-8646a1195842&pf_rd_r=EEXWJ129DPX4KJKY6EAB&pf_rd_s=right-4&pf_rd_t=15506&pf_rd_i=topenglish&ref_=chttentp_ql_2')

response.status_code

200

In [4]:
soup = BeautifulSoup(response.text, 'html.parser')

In [5]:
soup


<!DOCTYPE html>

<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
<head>
<meta charset="utf-8"/>
<style>
                body#styleguide-v2 {
                    background: no-repeat fixed center top #000;
                }
            </style>
<script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:'java'};</script>
<script>
    if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
</script>
<script>(function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);</script>
<title>Most Popular Movies - IMDb</title>
<script>(function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);</script>
<script>
    if (typeof uet == 'function') {
      uet("be", "LoadTitle", {wb: 1});
    }
</script>
<script>
    if (typeof uex == 'function') {
      uex("ld", "LoadTitle", {wb: 1});
    }
</script>
<link href="https://www.imdb.com/chart/