In [1]:
import hashlib
import sqlite3
import time

import requests
from bs4 import BeautifulSoup

Data scraped from: https://www.metacritic.com/browse/movies/score/metascore/all/filtered?page=0

Using the information from: https://www.makeuseof.com/tag/best-movie-ratings-sites/

In [2]:
# Site root; the endpoints scraped from the listing pages are appended to it.
base_url = 'https://www.metacritic.com/'

# Default sqlite file used by the sql helper functions in this notebook.
sqldb_path = 'data/rating_database.db'

# Browser-like request headers sent with every requests.get() call.
# Long values are split over several literals purely for readability; Python
# joins adjacent string literals, so the resulting values are unchanged.
headers = {
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/56.0.2924.87 Safari/537.36'
    ),
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/webp,*/*;q=0.8'
    ),
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
}

def insert_into_sql(statement, db_path = sqldb_path, params=()):
    """
    Execute a data-modifying statement against a sqlite database and commit it

    Parameters
    ----------
    statement : str
        The statement to execute. It may contain ``?`` or ``:name``
        placeholders, in which case the values are taken from ``params``
    db_path : str
        Path to sqlite database that you want to execute the statements
    params : sequence or dict, optional
        Values bound to the placeholders in ``statement``. The default (an
        empty tuple) executes the statement exactly as written

    Returns
    -------
    None
        sqlite errors are not raised to the caller; they are printed
    """

    sqliteConnection = None
    try:
        sqliteConnection = sqlite3.connect(db_path)
        cursor = sqliteConnection.cursor()
        cursor.execute(statement, params)
        sqliteConnection.commit()
        cursor.close()
    except sqlite3.Error as error:
        print("Error while connecting to sqlite", error)
    finally:
        # Close on every path; previously each call left its connection open,
        # and an uncommitted statement is discarded when the connection closes.
        if sqliteConnection is not None:
            sqliteConnection.close()


def fetch_from_sql(statement, db_path = sqldb_path, params=()):
    """
    Fetch from sql database based on the statement parameter

    Parameters
    ----------
    statement : str
        The statement to execute. It may contain ``?`` or ``:name``
        placeholders, in which case the values are taken from ``params``
    db_path : str
        Path to sql database that you want to execute the statements
    params : sequence or dict, optional
        Values bound to the placeholders in ``statement``. The default (an
        empty tuple) executes the statement exactly as written

    Returns
    -------
    fetch_items : list or None
        A list of row tuples that match the statement from sql database.
        None when sqlite raised an error (the error is printed, not raised),
        so callers must not assume the result always has a length
    """

    sqliteConnection = None
    try:
        sqliteConnection = sqlite3.connect(db_path)
        cursor = sqliteConnection.cursor()
        cursor.execute(statement, params)
        fetch_items = cursor.fetchall()
        cursor.close()

        return fetch_items
    except sqlite3.Error as error:
        print("Error while connecting to sqlite", error)
        return None
    finally:
        # Close on every path; previously each call left its connection open.
        if sqliteConnection is not None:
            sqliteConnection.close()

In [4]:
def get_genres(movie_soup):
    """
    Get up to three genres from the given movie_soup

    Parameters
    ----------
    movie_soup : BeautifulSoup obj
        Parsed html for the given movie

    Returns
    -------
    genre1 : str or None
        First listed genre of the movie. None if not listed
    genre2 : str or None
        Second listed genre of the movie. None if not listed
    genre3 : str or None
        Third listed genre of the movie. None if not listed
    """

    genres_div = movie_soup.find('div', {'class': 'genres'})
    if genres_div is None:
        # No genres element on the page: report "not listed" instead of raising.
        return None, None, None

    # The div text is expected to look like "<label>: genre, genre, ...";
    # everything after the first colon is the comma-separated genre list.
    # partition() yields an empty list part when there is no colon at all.
    _, _, listed = genres_div.text.partition(':')
    genres = [name.strip() for name in listed.split(',') if name.strip()]

    # Pad with None so exactly three values come back, then drop any extras.
    genre1, genre2, genre3 = (genres + [None, None, None])[:3]
    return genre1, genre2, genre3


def _stable_movie_id(title):
    """Map a title to a positive integer id that is identical in every Python process."""
    digest = hashlib.sha256(title.encode('utf-8')).digest()
    # Keep 63 bits so the value always fits a signed 64-bit sqlite INTEGER.
    return int.from_bytes(digest[:8], 'big') >> 1


def _as_number(text):
    """Return text as an int or float when it parses as one, otherwise unchanged."""
    for cast in (int, float):
        try:
            return cast(text)
        except (TypeError, ValueError):
            continue
    return text


def _insert_movie_row(row, db_path=sqldb_path):
    """Insert one row into the Movie table using bound parameters; True on success."""
    columns = (
        'movie_id', 'movie_title', 'movie_year', 'movie_summary',
        'movie_critic_rating', 'movie_user_rating', 'movie_runtime',
        'movie_genre', 'movie_subgenre', 'movie_sub2genre', 'movie_rating',
        'num_critic_pos', 'num_critic_mix', 'num_critic_neg',
        'num_user_pos', 'num_user_mix', 'num_user_neg',
    )
    placeholders = ', '.join('?' for _ in columns)
    statement = f'INSERT INTO Movie ({", ".join(columns)}) VALUES ({placeholders})'

    connection = None
    try:
        connection = sqlite3.connect(db_path)
        connection.execute(statement, row)
        connection.commit()
        return True
    except sqlite3.Error as error:
        print("Error while connecting to sqlite", error)
        return False
    finally:
        if connection is not None:
            connection.close()


def get_movie_info(url, headers=headers, timeout=30):
    """
    Get movie information and insert into a sql database. The information is
    movie title, year, critic_score, user_score, summary, rating, runtime,
    genres, and positive, mixed, negative critics & users reviews count.

    Parameters
    ----------
    url : str
        url of the movie from metacritic website
    headers : dict
        Browser information to pass in when calling requests
    timeout : float
        Seconds to wait for the server before requests gives up, so that one
        stalled connection cannot hang the whole scrape

    Returns
    -------
    movie_id : int or None
        Integer id derived from the movie title. None when the movie has no
        user score ('tbd'), when it is already in the Movie table, or when
        the row could not be inserted

    Raises
    ------
    requests.RequestException
        When the page cannot be fetched (including a timeout)
    RuntimeError
        When the "already collected" lookup itself failed
    AttributeError, IndexError
        When the page lacks one of the elements indexed below
    """

    # Creating BeautifulSoup object with the given url
    movie_res = requests.get(url, headers=headers, timeout=timeout).text
    movie_soup = BeautifulSoup(movie_res, 'html.parser')

    # Retrieving critic and user scores
    scores = movie_soup.find_all('div', {'class': 'score fl'})
    critic_score = scores[0].text.strip()
    user_score = scores[1].text.strip()
    # some movies don't have user reviews and have tbd instead of floating point
    if user_score == 'tbd':
        return None

    # Retrieving title, year, and movie_id
    title_year = movie_soup.find('div', {'class': 'product_page_title'}).text.strip().split('\n')
    title = title_year[0]
    year = title_year[1]
    # The builtin hash() of a str is salted per interpreter process, so it
    # yields a different id after every kernel restart and the duplicate check
    # below could never match rows from an earlier session. A digest is stable.
    movie_id = _stable_movie_id(title)

    # Checking whether I already collected the movie info or not
    # If yes, returns None
    check_statement = f'SELECT movie_id FROM Movie WHERE movie_id = {movie_id}'
    already_stored = fetch_from_sql(check_statement)
    if already_stored is None:
        # fetch_from_sql returns None after a sqlite error; fail loudly instead
        # of tripping over len(None).
        raise RuntimeError(f'Could not check whether {title!r} is already stored')
    if len(already_stored) > 0:
        return None

    # Retrieving summary (double quotes are still removed so newly stored
    # summaries keep the same form as before)
    summary = movie_soup.find('div', {'class': 'summary_deck'}).text.replace('"', '')
    if summary.startswith('\nSummary:'):
        summary = summary[9:].strip()
    if summary.endswith('… Expand'):
        summary = summary[:-8]

    # Retrieving rating (PG, PG-13, R, etc)
    try:
        rating = movie_soup.find('div', {'class': 'rating'}).text.split(':')[1].strip()
    except (AttributeError, IndexError):
        # Some films (in the early history or independent films) are not rated
        rating = 'Not Rated'

    # Retrieving runtime
    try:
        runtime = movie_soup.find('div', {'class': 'runtime'}).text.split(':')[1].strip()
        if runtime.endswith(' min'):
            runtime = runtime[:-4]
    except (AttributeError, IndexError):
        # Assigning -1 for some films that don't have runtime specified
        runtime = -1

    # Retrieving genres
    g1, g2, g3 = get_genres(movie_soup)

    # Retrieving critics' and users' positive, mixed, and negative review counts
    pos = [d.text.split(':')[1].strip().replace(',', '')
           for d in movie_soup.find_all('div', {'class': 'chart positive'})]
    mix = [d.text.split(':')[1].strip().replace(',', '')
           for d in movie_soup.find_all('div', {'class': 'chart mixed'})]
    neg = [d.text.split(':')[1].strip().replace(',', '')
           for d in movie_soup.find_all('div', {'class': 'chart negative'})]
    num_critic_pos, num_critic_mix, num_critic_neg = pos[0], mix[0], neg[0]
    num_user_pos, num_user_mix, num_user_neg = pos[1], mix[1], neg[1]

    # Inserting all the information into a sqlite database. Values are bound
    # as parameters so quotes in scraped text cannot break the statement.
    # Fields that used to be written as bare numeric literals are converted to
    # numbers; fields that used to be quoted stay text (a missing genre is
    # therefore still stored as the text 'None', as the old f-string did).
    row = (
        str(movie_id), title, _as_number(year), summary,
        _as_number(critic_score), _as_number(user_score), _as_number(runtime),
        str(g1), str(g2), str(g3), rating,
        _as_number(num_critic_pos), _as_number(num_critic_mix), _as_number(num_critic_neg),
        _as_number(num_user_pos), _as_number(num_user_mix), _as_number(num_user_neg),
    )
    if not _insert_movie_row(row):
        # Without a Movie row the caller must not go on to store its reviews.
        return None

    return movie_id


def _review_rating_value(text):
    """Return a scraped rating as int or float when it parses as one, otherwise unchanged."""
    for cast in (int, float):
        try:
            return cast(text)
        except (TypeError, ValueError):
            continue
    return text


def _insert_review_rows(table, prefix, rows, db_path=sqldb_path):
    """Insert all review rows into `table` with bound parameters, as one transaction."""
    # table/prefix come from the fixed pairs chosen in get_reviews, never from
    # scraped text, so interpolating them as identifiers is safe.
    statement = (
        f'INSERT INTO {table} (movie_id, {prefix}_name, {prefix}_review, '
        f'{prefix}_date, {prefix}_rating) VALUES (?, ?, ?, ?, ?)'
    )
    connection = None
    try:
        connection = sqlite3.connect(db_path)
        connection.executemany(statement, rows)
        connection.commit()
    except sqlite3.Error as error:
        print("Error while connecting to sqlite", error)
    finally:
        # Closing without a commit discards a partially written batch.
        if connection is not None:
            connection.close()


def get_reviews(url, movie_id, critic=True, first=True, headers=headers, timeout=30):
    """
    Get movie reviews and insert into a sql database. It can retrieve reviews
    from both critics and users by setting the critic parameter.

    Parameters
    ----------
    url : str
        url of the movie reviews from metacritic website
    movie_id : int
        id of the movie as returned by get_movie_info()
    critic : bool
        True if given url is for reviews from critics. False if otherwise
    first : bool
        True if it's the first time retrieving reviews from this movie. False if
        this function is called recursively
    headers : dict
        Browser information to pass in when calling requests
    timeout : float
        Seconds to wait for the server before requests gives up

    Returns
    -------
    None
    """

    # Creating BeautifulSoup object with the given url
    res = requests.get(url, headers=headers, timeout=timeout).text
    soup = BeautifulSoup(res, 'html.parser')

    # Retrieve names, dates, ratings of reviews
    names = [s.text.strip() for s in soup.find_all('span', {'class': 'author'})]
    dates = [s.text.strip() for s in soup.find_all('span', {'class': 'date'})]
    ratings = [s.text.strip() for s in soup.find_all('div', {'class': 'left fl'})]

    # Some early film reviews don't have dates --> Fill in with 'None'
    if len(names) != len(dates):
        dates = ['None'] * len(names)

    # Different approaches for retrieving reviews for critics/users
    # (double quotes are still removed so stored text keeps its previous form)
    if critic:
        reviews = [s.text.strip().replace('"', '') for s in soup.find_all('a', {'class': 'no_hover'})]
    else:
        reviews = []
        for body in soup.find_all('div', {'class': 'review_body'}):
            # Prefer the expanded blurb when the review body has one.
            expanded = body.find('span', {'class': 'blurb blurb_expanded'})
            source = expanded if expanded else body
            reviews.append(source.text.strip().replace('"', ''))

    # Different target table/column prefix for Critics/Users tables
    tb, sub = ('Critics', 'critic') if critic else ('Users', 'user')

    # One parameter tuple per review; zip stops at the shortest list.
    # movie_id stays text and the rating stays numeric, matching how the
    # previous string-built statement wrote them.
    rows = [
        (str(movie_id), n, re, d, _review_rating_value(ra))
        for n, d, ra, re in zip(names, dates, ratings, reviews)
    ]
    if rows:
        _insert_review_rows(tb, sub, rows)

    # Calling get_reviews() recursively when there is more than one page of reviews
    #   Only using it when retrieving reviews for users since critics review page is
    #   almost always in a single page
    if first and not critic:
        last_page_item = soup.find('li', {'class': 'page last_page'})
        if last_page_item is None:
            # No pager element: everything was on the page handled above.
            return

        last_page = last_page_item.text.strip().lstrip('…')
        try:
            page_count = int(last_page)
        except ValueError:
            print(f'Could not read the page count for {url}: {last_page!r}')
            return

        # Start at 1: the page fetched above is the first page, and requesting
        # ?page=0 as well stored its reviews a second time.
        # NOTE(review): assumes ?page=0 is the same listing as the bare url
        # (the browse pages this notebook scrapes are 0-indexed) - confirm.
        for i in range(1, page_count):
            next_review_url = url + '?page=' + str(i)
            try:
                get_reviews(next_review_url, movie_id, critic=False, first=False,
                            headers=headers, timeout=timeout)
            except Exception as error:
                # Best effort, as before, but visible: skip this page and go on.
                print(f'Skipping review page {next_review_url}: {error!r}')


In [6]:
# Number of listing pages to walk (pages 0 .. NUM_BROWSE_PAGES - 1).
NUM_BROWSE_PAGES = 156

# base_url ends with '/' and the scraped endpoints start with '/'; strip one
# so the joined url does not contain '//' in its path.
site_root = base_url.rstrip('/')

for i in range(NUM_BROWSE_PAGES):
    url = f'https://www.metacritic.com/browse/movies/score/metascore/all/filtered?sort=desc&page={i}'
    res = requests.get(url, headers=headers).text
    soup = BeautifulSoup(res, 'html.parser')

    # Looping through all movies displayed on a page
    for movie in soup.find_all('td', {'class': 'clamp-summary-wrap'}):
        # Reset per movie so the failure message below never reports the
        # previous movie's endpoint (or an undefined name on the very first one).
        movie_endpoint = None
        try:
            movie_endpoint = movie.find_all('a')[1]['href']
            critic_endpoint = movie_endpoint + '/critic-reviews'
            user_endpoint = movie_endpoint + '/user-reviews'
            movie_id = get_movie_info(site_root + movie_endpoint)

            # Get reviews when get_movie_info() does not return None
            if movie_id:
                get_reviews(site_root + critic_endpoint, movie_id, critic=True)
                try:
                    get_reviews(site_root + user_endpoint, movie_id, critic=False)
                except Exception as error:
                    # User reviews are optional, but say why they were skipped.
                    print(f'User reviews failed: {movie_endpoint} ({error!r})')
            print(movie_endpoint)
        except Exception as error:
            # `except Exception` (not a bare except) lets Ctrl-C / kernel
            # interrupt stop this long-running loop.
            print(f'Didn\'t work: {movie_endpoint} ({error!r})')

        # Pause between movies to avoid hammering the site.
        time.sleep(1)

/movie/tokyo-story
/movie/the-godfather
/movie/citizen-kane
/movie/rear-window
/movie/casablanca
/movie/three-colors-red
/movie/boyhood
/movie/vertigo
/movie/notorious-1946
/movie/singin-in-the-rain
/movie/playtime
/movie/city-lights
/movie/moonlight-2016
/movie/intolerance
/movie/the-rules-of-the-game
/movie/pinocchio-1940
/movie/touch-of-evil
/movie/the-zone-of-interest
/movie/seven-samurai
/movie/the-wild-bunch
/movie/au-hasard-balthazar
/movie/the-lady-vanishes-1938
/movie/the-treasure-of-the-sierra-madre
/movie/pans-labyrinth
/movie/some-like-it-hot
/movie/north-by-northwest
/movie/hoop-dreams
/movie/rashomon
/movie/the-passion-of-joan-of-arc
/movie/all-about-eve
/movie/jules-and-jim
/movie/my-left-foot
/movie/ran
/movie/the-third-man
/movie/dr-strangelove-or-how-i-learned-to-stop-worrying-and-love-the-bomb
/movie/quo-vadis-aida
/movie/psycho-1960
/movie/gone-with-the-wind
/movie/4-months-3-weeks-and-2-days
/movie/battleship-potemkin
/movie/a-streetcar-named-desire
/movie/the-malt