In [2]:
# web_scraping helper
import requests
from bs4 import BeautifulSoup
import os
import sys
from google.cloud import bigquery
import datetime
import pandas as pd

def _get_soup(url = 'https://www.imdb.com/chart/top/?ref_=nv_mv_250', timeout = 10):

    '''
    Download a web page and parse it with BeautifulSoup.
    Args:
        - url(str) = url of the website
            Default: 'https://www.imdb.com/chart/top/?ref_=nv_mv_250'
        - timeout(float) = seconds to wait for the server before giving up
            Default: 10
    Returns:
        - soup(BeautifulSoup) = BeautifulSoup object
    Raises:
        - requests.HTTPError = the server answered with a 4xx/5xx status
        - requests.Timeout = the server did not answer within `timeout` seconds
    '''
    
    # Without a timeout requests waits for the server indefinitely
    response = requests.get(url, timeout=timeout)

    # Fail loudly on an error status instead of parsing an error page
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    return soup


def _scrape_most_popular_titles(soup):

    '''
    Scrape the most popular titles from the IMDB website.
    Args:
        - soup(BeautifulSoup) = BeautifulSoup object
    Returns:
        - movie_names(list) = List of movie names
    '''

    # Find all movie names in the url
    movie_names = []
    titlesRefs = soup.find_all('td', {'class':'titleColumn'})

    # Collect movies into list
    for title in titlesRefs:
        movie_names.append(title.find("a").text)
    
    return movie_names

# Fetch and parse the default page of _get_soup (its `url` default) for the cells below
soup = _get_soup()

In [3]:
# Link text of every title cell, in page order
titlesRefs = soup.find_all('td', {'class':'titleColumn'})
movie_names = [cell.find("a").text for cell in titlesRefs]

# <strong> text of every rating cell, converted to float
ratingsRefs = soup.find_all('td', {'class':'ratingColumn imdbRating'})
movie_ratings = [float(cell.find("strong").text) for cell in ratingsRefs]

# Name -> rating lookup. Names and ratings are paired by position; zip stops at
# the shorter list and a repeated name keeps only its last rating.
movie_dict = dict(zip(movie_names, movie_ratings))
movie_dict

{'The Shawshank Redemption': 9.2,
 'The Godfather': 9.2,
 'The Dark Knight': 9.0,
 'The Godfather Part II': 9.0,
 '12 Angry Men': 9.0,
 "Schindler's List": 8.9,
 'The Lord of the Rings: The Return of the King': 8.9,
 'Pulp Fiction': 8.8,
 'The Lord of the Rings: The Fellowship of the Ring': 8.8,
 'Il buono, il brutto, il cattivo': 8.8,
 'Forrest Gump': 8.8,
 'Fight Club': 8.7,
 'The Lord of the Rings: The Two Towers': 8.7,
 'Inception': 8.7,
 'Star Wars: Episode V - The Empire Strikes Back': 8.7,
 'The Matrix': 8.7,
 'GoodFellas': 8.7,
 "One Flew Over the Cuckoo's Nest": 8.6,
 'Spider-Man: Across the Spider-Verse': 8.6,
 'Se7en': 8.6,
 "It's a Wonderful Life": 8.6,
 'Shichinin no samurai': 8.6,
 'The Silence of the Lambs': 8.6,
 'Saving Private Ryan': 8.6,
 'Cidade de Deus': 8.6,
 'Interstellar': 8.6,
 'La vita è bella': 8.6,
 'The Green Mile': 8.6,
 'Star Wars': 8.5,
 'Terminator 2: Judgment Day': 8.5,
 'Back to the Future': 8.5,
 'Sen to Chihiro no kamikakushi': 8.5,
 'The Pianist': 

In [12]:
# One row per (name, rating) pair of movie_dict, stamped with the time this cell ran
movies_df = pd.DataFrame(
    movie_dict.items(), columns=['movie_name', 'movie_rating']
).assign(date=datetime.datetime.today())
movies_df


Unnamed: 0,movie_name,movie_rating,date
0,The Shawshank Redemption,9.2,2023-06-04 11:22:28.719655
1,The Godfather,9.2,2023-06-04 11:22:28.719655
2,The Dark Knight,9.0,2023-06-04 11:22:28.719655
3,The Godfather Part II,9.0,2023-06-04 11:22:28.719655
4,12 Angry Men,9.0,2023-06-04 11:22:28.719655
...,...,...,...
245,Life of Brian,8.0,2023-06-04 11:22:28.719655
246,The Iron Giant,8.0,2023-06-04 11:22:28.719655
247,The Help,8.0,2023-06-04 11:22:28.719655
248,Aladdin,8.0,2023-06-04 11:22:28.719655


In [10]:
# Display the frame. NOTE(review): the recorded output has a movie_id column that
# no visible cell adds to movies_df — it appears to come from an earlier, removed cell.
movies_df

Unnamed: 0,movie_id,movie_name,movie_rating,date
0,1,The Shawshank Redemption,9.2,2023-06-04 11:11:41.136194
1,2,The Godfather,9.2,2023-06-04 11:11:41.136194
2,3,The Dark Knight,9.0,2023-06-04 11:11:41.136194
3,4,The Godfather Part II,9.0,2023-06-04 11:11:41.136194
4,5,12 Angry Men,9.0,2023-06-04 11:11:41.136194
...,...,...,...,...
245,246,Life of Brian,8.0,2023-06-04 11:11:41.136194
246,247,The Iron Giant,8.0,2023-06-04 11:11:41.136194
247,248,The Help,8.0,2023-06-04 11:11:41.136194
248,249,Aladdin,8.0,2023-06-04 11:11:41.136194


In [1]:
# web_scraping helper
import requests
from bs4 import BeautifulSoup
import os
import sys
from google.cloud import bigquery
import datetime
import pandas as pd


def _get_soup(chart):

    '''
    Get the BeautifulSoup object from a url.
    Args:
        - chart(str) = chart to scrape
            Options: 'most_popular_movies', 'top_250_movies', 'top_english_movies', 'top_250_tv'
    Returns:
        - soup(BeautifulSoup) = BeautifulSoup object
    '''
    
    # Send a get request and parse using BeautifulSoup
    if chart == 'most_popular_movies':
        url = 'https://www.imdb.com/chart/moviemeter?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=470df400-70d9-4f35-bb05-8646a1195842&pf_rd_r=5V6VAGPEK222QB9E0SZ8&pf_rd_s=right-4&pf_rd_t=15506&pf_rd_i=toptv&ref_=chttvtp_ql_2'
    
    if chart == 'top_250_movies':
        url = 'https://www.imdb.com/chart/top?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=470df400-70d9-4f35-bb05-8646a1195842&pf_rd_r=5V6VAGPEK222QB9E0SZ8&pf_rd_s=right-4&pf_rd_t=15506&pf_rd_i=toptv&ref_=chttvtp_ql_3'
    
    if chart == 'top_english_movies':
        url = 'https://www.imdb.com/chart/top-english-movies?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=470df400-70d9-4f35-bb05-8646a1195842&pf_rd_r=3YMHR1ECWH2NNG5TPH1C&pf_rd_s=right-4&pf_rd_t=15506&pf_rd_i=boxoffice&ref_=chtbo_ql_4'
    
    if chart == 'top_250_tv':
        url = 'https://www.imdb.com/chart/tvmeter?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=470df400-70d9-4f35-bb05-8646a1195842&pf_rd_r=J9H259QR55SJJ93K51B2&pf_rd_s=right-4&pf_rd_t=15506&pf_rd_i=topenglish&ref_=chttentp_ql_5'

    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup



def _scrape_movies(soup):

    '''
    Scrape the most popular titles and ratings from the IMDB website.
    Args:
        - soup(BeautifulSoup) = BeautifulSoup object
    Returns:
        - movie_dict(dict) = Dictionary of movie names and ratings
    '''
    # Find all movie names in the url
    movie_names = []
    movie_years = []
    movie_ratings = []

    # Find all movie in the url
    titlesRefs = soup.find_all('td', {'class':'titleColumn'})
    ratingsRefs = soup.find_all('td', {'class':'ratingColumn imdbRating'})

    # Collect movies into title and rating list
    for title in titlesRefs:
        try:
            movie_names.append(title.find("a").text)
        except:
            print('Missing title. Replacing with -1')
            movie_names.append(-1)
        
        try:
            movie_years.append(int(title.find("span").text[1:-1]))
        except:
            print('Missing year. Replacing with -1')
            movie_years.append(-1)

    for rating in ratingsRefs:
        movie_ratings.append(float(rating.find("strong").text))    

    # Create a dataframe
    movie_df = pd.DataFrame({'movie_name': movie_names, 'movie_year': movie_years, 'movie_rating': movie_ratings})
    
    # set index as movie id
    movie_df['movie_id'] = movie_df.index + 1

    # set date
    movie_df['update_date'] = datetime.datetime.today().strftime('%Y-%m-%d')

    # reorder columns
    movie_df = movie_df[['movie_id', 'movie_name', 'movie_year', 'movie_rating', 'update_date']]

    return movie_df

# Fetch and parse the 'most_popular_movies' chart
soup = _get_soup(chart='most_popular_movies') 

# Turn the parsed page into the movie table
movie_df = _scrape_movies(soup)

# Display the table (the recorded run of this cell ended in an AttributeError instead)
movie_df
        

AttributeError: 'NoneType' object has no attribute 'text'

In [2]:
# Request the chart/moviemeter page directly to look at the raw HTTP response
response = requests.get( url = 'https://www.imdb.com/chart/moviemeter?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=470df400-70d9-4f35-bb05-8646a1195842&pf_rd_r=5V6VAGPEK222QB9E0SZ8&pf_rd_s=right-4&pf_rd_t=15506&pf_rd_i=toptv&ref_=chttvtp_ql_2'
)

# HTTP status of the response (200 in the recorded output)
response.status_code

200

In [19]:
import requests
from bs4 import BeautifulSoup
import os
import sys
from google.cloud import bigquery
import datetime
import pandas as pd

# Download the chart/moviemeter page; give up after 10 s instead of waiting indefinitely
response = requests.get(
    url = 'https://www.imdb.com/chart/moviemeter?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=470df400-70d9-4f35-bb05-8646a1195842&pf_rd_r=5V6VAGPEK222QB9E0SZ8&pf_rd_s=right-4&pf_rd_t=15506&pf_rd_i=toptv&ref_=chttvtp_ql_2',
    timeout = 10,
)

# Stop on a 4xx/5xx answer rather than parsing an error page. (The bare
# `response.status_code` expression that stood here was never displayed,
# because only the last expression of a cell is shown.)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')

# One entry per chart row; -1 marks a value that could not be read
movie_names = []
movie_years = []
movie_ratings = []
user_votings = []

# Title and rating cells are looked up separately and line up by position
titlesRefs = soup.find_all('td', {'class':'titleColumn'})
ratingsRefs = soup.find_all('td', {'class':'ratingColumn imdbRating'})

for title in titlesRefs:
    # find() returns None when the tag is absent -> AttributeError on .text
    try:
        movie_names.append(title.find("a").text)
    except AttributeError:
        movie_names.append(-1)

    # [1:-1] drops the first and last character of the span text before the int conversion
    try:
        movie_years.append(int(title.find("span").text[1:-1]))
    except (AttributeError, ValueError):
        movie_years.append(-1)

for rating in ratingsRefs:
    # Rows without a rating have no <strong> tag; look it up once for both fields
    strong = rating.find("strong")

    try:
        movie_ratings.append(float(strong.text))
    except (AttributeError, ValueError):
        movie_ratings.append(-1)

    # The vote count is the 4th space-separated token of the title attribute
    # (assumes text shaped like "<rating> based on <votes> user ratings" — TODO confirm)
    try:
        votes_str = strong.attrs['title']
        votes_str = votes_str.split(' ')[3]
        votes_int = int(votes_str.replace(',', ''))
        user_votings.append(votes_int)
    except (AttributeError, KeyError, IndexError, ValueError):
        user_votings.append(-1)



In [12]:
import requests
from bs4 import BeautifulSoup
import os
import sys
from google.cloud import bigquery
import datetime
import pandas as pd

# Download the Box Office Mojo lifetime-gross chart filtered by MPAA rating PG-13;
# give up after 10 s instead of waiting indefinitely
response = requests.get(
    url = 'https://www.boxofficemojo.com/chart/mpaa_title_lifetime_gross/?by_mpaa=PG-13&ref_=bo_cso_ac',
    timeout = 10,
)

# Stop on a 4xx/5xx answer rather than parsing an error page. (The bare
# `response.status_code` expression that stood here was never displayed,
# because only the last expression of a cell is shown.)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')

# scrape movie names, rank, lifetime gross, year; -1 marks a value that could not be read
movie_names = []
movie_ranks = []
movie_lifetime_gross = []
movie_years = []

# The three cell types are looked up separately and line up by position
titlesRefs = soup.find_all('td', {'class':'a-text-left mojo-field-type-title'})
ranksRefs = soup.find_all('td', {'class':'a-text-right mojo-header-column mojo-truncate mojo-field-type-rank'})
lifetime_grossRefs = soup.find_all('td', {'class':'a-text-right mojo-field-type-money mojo-estimatable'})

for title in titlesRefs:
    # find() returns None when the tag is absent -> AttributeError on .text
    try:
        movie_names.append(title.find("a").text)
    except AttributeError:
        movie_names.append(-1)

    # NOTE(review): this looks for a "(YYYY)" span inside the title cell, as the
    # IMDB cells do. The recorded movie_years output is all -1, so the year is
    # presumably not in this cell on this site — confirm where it lives.
    try:
        movie_years.append(int(title.find("span").text[1:-1]))
    except (AttributeError, ValueError):
        movie_years.append(-1)

for rank in ranksRefs:
    # Strip thousands separators, as done for the gross below, so "1,000" parses
    try:
        movie_ranks.append(int(rank.text.replace(',', '')))
    except (AttributeError, ValueError):
        movie_ranks.append(-1)

for lifetime_gross in lifetime_grossRefs:
    # "$1,234,567" -> 1234567
    try:
        movie_lifetime_gross.append(int(lifetime_gross.text.replace('$','').replace(',','')))
    except (AttributeError, ValueError):
        movie_lifetime_gross.append(-1)


# Create a dataframe (pandas raises ValueError if the four lists differ in length)
movie_df = pd.DataFrame({'movie_name': movie_names, 'movie_rank': movie_ranks, 'movie_lifetime_gross': movie_lifetime_gross, 'movie_year': movie_years})

# movie_id is the 1-based position of the row on the page
movie_df['movie_id'] = movie_df.index + 1

# Day of the scrape as a 'YYYY-MM-DD' string
movie_df['update_date'] = datetime.datetime.today().strftime('%Y-%m-%d')

# reorder columns
movie_df = movie_df[['movie_id', 'movie_name', 'movie_rank', 'movie_lifetime_gross', 'movie_year', 'update_date']]

[]

In [3]:
# Inspect the scraped titles
movie_names

['The Little Mermaid',
 'Fast X',
 'Guardians of the Galaxy Vol. 3',
 'The Super Mario Bros. Movie',
 'The Machine',
 'About My Father',
 'Kandahar',
 'You Hurt My Feelings',
 'Evil Dead Rise',
 'Book Club: The Next Chapter']

In [4]:
# Inspect the parsed years; the recorded output is all -1, the fallback value used when a year cannot be read
movie_years

[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]

In [None]:
Interesting analytical questions:

1. What are the most popular movies per year?

2. What are the most popular movies when accounting for the number of user votes?
    
3. What is the trend of ratings and user votes over the last 10 years? Do more user votes lead to higher ratings? Has the number of people voting grown over the years?







