<h1>Metacritic Scrapers</h1>

<h2>Extracting Movie Titles, Ratings, and Slugs + Data Cleaning</h2>

In [None]:
import requests
from bs4 import BeautifulSoup
import time

# Metacritic browse query: ten genres, releases from 2014 through 2023.
# The string deliberately ends with "page=" so a page number can be appended.
base_url = (
    "https://www.metacritic.com/browse/movie/"
    "?genre=action&genre=adventure&genre=animation&genre=comedy&genre=drama"
    "&genre=fantasy&genre=horror&genre=romance&genre=sci---fi&genre=thriller"
    "&releaseYearMin=2014&releaseYearMax=2023"
    "&page="
)

# Request headers shared by the scrapers; presents a desktop Chrome user agent.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    )
}

# Accumulators filled by scrape_page: [title, score] pairs and movie slugs.
movie_list, movie_slugs = [], []

def scrape_page(url):
    """Scrape one Metacritic browse page and record the movies it lists.

    For every title element found, ``[title, score]`` is appended to the
    global ``movie_list``; for every product card that contains a link, the
    slug taken from that link is appended to the global ``movie_slugs``.
    A page that cannot be fetched, or that does not return HTTP 200, is
    reported and skipped.

    Args:
        url: Full URL of the browse page to fetch.

    Returns:
        None. Results are accumulated in the module-level lists.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    try:
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as error:
        print(f"Skipping {url}: {error}")
        return

    if response.status_code != 200:
        print(f"Skipping {url}: HTTP {response.status_code}")
        return

    soup = BeautifulSoup(response.text, "html.parser")

    # Extract movie information based on the data-title attribute
    movies = soup.select('.c-finderProductCard_title[data-title]')

    for movie in movies:
        title = movie['data-title']
        # find_next returns None when no score element follows the title.
        score_element = movie.find_next(class_='c-siteReviewScore_xsmall')
        score = score_element.text.strip() if score_element else None

        movie_list.append([title, score])

    slugs_before = len(movie_slugs)

    # Extract the slug from the first link inside each product card
    for card_element in soup.find_all(class_='c-finderProductCard'):
        link_element = card_element.find('a', href=True)
        if link_element:
            # Last path segment, whether or not the href ends with a slash.
            movie_slug = link_element['href'].rstrip('/').split('/')[-1]
            movie_slugs.append(movie_slug)

    # movie_list and movie_slugs are paired by position later on, so a
    # difference in counts on any page would shift every following pair.
    slugs_added = len(movie_slugs) - slugs_before
    if slugs_added != len(movies):
        print(f"Warning: {url} yielded {len(movies)} titles but {slugs_added} slugs")

# Walk through every results page (196 in total), pausing one second between requests
TOTAL_PAGES = 196
for page_number in range(1, TOTAL_PAGES + 1):
    url = base_url + str(page_number)
    scrape_page(url)

    time.sleep(1)

In [None]:
# Extend each [title, score] entry with its slug; the two lists are paired by position
combined_list = [[*entry, entry_slug] for entry, entry_slug in zip(movie_list, movie_slugs)]

In [None]:
# Keep only the first occurrence of every slug

# Slugs that have already been kept
unique_slugs = set()

# First-seen [title, score, slug] entries, in their original order
unique_occurrences = []

for entry in combined_list:
    key = entry[2]  # the slug is the third field
    if key in unique_slugs:
        continue
    unique_slugs.add(key)
    unique_occurrences.append(entry)

In [None]:
# Save the de-duplicated movies to disk

import csv

csv_filename = "metacritic_data.csv"
header_row = ['Movie', 'Rating', 'Slug']

# newline='' leaves line-ending handling to the csv module
with open(csv_filename, mode='w', newline='', encoding='utf-8') as csv_file:
    # Header first, then one row per unique movie
    csv.writer(csv_file).writerows([header_row, *unique_occurrences])

<h2>Extracting Reviews + Data Cleaning</h2>

In [None]:
# Load the saved Metacritic data back into a DataFrame
import pandas as pd

df = pd.read_csv("metacritic_data.csv")

# Plain Python lists of the title and slug columns, in row order
titles_list, slugs_list = (df[column].tolist() for column in ('Movie', 'Slug'))

In [None]:
# Collects one concatenated review string per call to scrape_critic_reviews.
critic_reviews = []

import requests
from bs4 import BeautifulSoup

import requests
import re

def scrape_critic_reviews(movie):
    """Fetch a movie's Metacritic critic-review page and record its quotes.

    Every quoted string that follows the ``reviewPath:b,quote`` marker in the
    page source is collected, and the quotes are joined with single spaces.
    Exactly one string is appended to the global ``critic_reviews`` per call
    (an empty string when the request fails or no quote is found), so the
    list stays aligned with the slugs being iterated.

    Args:
        movie: Metacritic slug used to build the critic-reviews URL.

    Returns:
        None. The result is stored in ``critic_reviews``.
    """
    print(movie)

    url = f'https://www.metacritic.com/movie/{movie}/critic-reviews/'

    # A timeout keeps one stalled connection from hanging the whole crawl.
    try:
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as error:
        print(f'Request failed for {movie}: {error}')
        critic_reviews.append('')
        return

    if response.status_code != 200:
        critic_reviews.append('')
        return

    # Undecodable bytes are replaced instead of aborting the scrape.
    page_source = response.content.decode('utf-8', errors='replace')

    # The quote must be closed by the same character that opened it, and a
    # backslash-escaped character never ends it. This keeps apostrophes and
    # escaped quotes inside a review from cutting the text short. Escape
    # sequences are returned as they appear in the page source.
    target_phrase = "reviewPath:b,quote"
    pattern = re.compile(
        fr'{re.escape(target_phrase)}\s*:\s*(["\'])((?:\\.|(?!\1)[^\\])+?)\1',
        re.DOTALL,
    )
    matches = [match.group(2) for match in pattern.finditer(page_source)]

    # Concatenate the matches into a single string
    concatenated_quotes = ' '.join(matches)

    critic_reviews.append(concatenated_quotes)

In [None]:
# Start from an empty list so that re-running this cell does not append a
# second copy of every review and break the length match with df below.
critic_reviews.clear()

for movie in slugs_list:
    scrape_critic_reviews(movie)

# The list is assigned to df by position, so its length must equal len(df).
df['Critic Reviews'] = critic_reviews
df.to_csv("movies_and_critic_reviews.csv", index=False)

<h1>Letterboxd Scrapers</h1>

<h2>Extracting Movie Information + Data Cleaning</h2>

In [None]:
import requests
from bs4 import BeautifulSoup
import re

# Collects one six-element row per call to extract_letterboxd_movie_info.
letterboxd_data = []

def extract_letterboxd_movie_info(movie_slug):
    """Scrape one Letterboxd film page and record its metadata.

    Exactly one row is appended to the global ``letterboxd_data`` per call, in
    the order ``[rating, genres, start_date, director, actors, studios]``. A
    row of six ``None`` values is appended when the request fails or does not
    return HTTP 200, so the list stays aligned with the slugs being iterated.

    Args:
        movie_slug: Slug used to build the ``https://letterboxd.com/film/<slug>/`` URL.

    Returns:
        None. The result is stored in ``letterboxd_data``.
    """
    print(movie_slug)

    # Construct URL
    url = f'https://letterboxd.com/film/{movie_slug}/'

    # A timeout keeps one stalled connection from hanging the whole crawl.
    try:
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as error:
        print(f'Request failed for {movie_slug}: {error}')
        response = None

    if response is None or response.status_code != 200:
        letterboxd_data.append([None, None, None, None, None, None])
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    # Serialise the parsed page once; every regex below searches this text.
    page_source = str(soup)

    # Extract rating (the fractional part is optional, so "4" matches as well as "3.9")
    rating_match = re.search(r'"ratingValue":\s*(\d+(?:\.\d+)?)', page_source)
    rating = float(rating_match.group(1)) if rating_match else None

    # Extract genres; blank entries are dropped so an empty array gives []
    genres_match = re.search(r'"genre":\s*\[([^\]]*)\]', page_source)
    if genres_match:
        stripped_genres = [genre.strip('"\', ') for genre in genres_match.group(1).split(',')]
        genres = [genre for genre in stripped_genres if genre]
    else:
        genres = None

    # Extract start date (four-digit year)
    start_date_match = re.search(r'"startDate":\s*"(\d{4})"', page_source)
    start_date = start_date_match.group(1) if start_date_match else None

    # Extract director (first entry of the "director" array)
    director_match = re.search(r'"director":\s*\[{"@type":"Person","name":"([^"]+)"', page_source)
    director = director_match.group(1) if director_match else None

    # Extract first two actors. Names are rebuilt from the /actor/<slug>/ links,
    # so they reflect the slug text rather than the name shown on the page.
    cast_list = soup.find('div', class_='cast-list text-sluglist')
    actors_match = re.findall(r'href="/actor/([^/]+)/"', str(cast_list))
    actors = [actor.replace('-', ' ').title() for actor in actors_match[:2]]

    # Extract studios: link texts from the first sluglist after the "Studios" heading
    studios = None
    studios_section = soup.find('h3', string='Studios')
    if studios_section:
        studio_container = studios_section.find_next('div', class_='text-sluglist')
        if studio_container:
            studio_links = studio_container.find_all('a', class_='text-slug')
            studios = [link.get_text(strip=True) for link in studio_links]

    # Append movie information to global list
    letterboxd_data.append([rating, genres, start_date, director, actors, studios])

In [None]:
# Start from an empty list so that re-running this cell does not append a
# second copy of every row; the rows are later joined to df by position.
letterboxd_data.clear()

for movie in slugs_list:
    extract_letterboxd_movie_info(movie)
print('done')

In [None]:
# Build a DataFrame holding the scraped Letterboxd rows

# Column labels, in the same order as the values inside each letterboxd_data row
column_names = [
    'lb_rating',
    'genre',
    'year',
    'director',
    'actors',
    'production_companies',
]

# Each inner list of letterboxd_data becomes one row
df2 = pd.DataFrame(data=letterboxd_data, columns=column_names)

In [None]:
# Place the Metacritic and Letterboxd frames side by side, matching rows on their index
combined_df = pd.concat([df, df2], axis="columns")

<h2>Extracting Reviews + Data Cleaning</h2>

In [4]:
import requests
from bs4 import BeautifulSoup

# Global list to store the extracted review text (one string per call to
# scrape_reviews_for_movie).
# NOTE(review): the name `all` shadows Python's built-in all() from here on;
# a later cell reads this list under the same name, so renaming it means
# updating every use together.
all = []

def scrape_reviews_for_movie(slug, num_pages=3):
    """Collect review text for one film from its Letterboxd review pages.

    Up to ``num_pages`` pages of ``/reviews/by/activity`` are fetched. For
    each ``li.film-detail`` entry, the text of the first ``<p>`` inside its
    ``div.body-text`` is kept. The collected texts are joined with newlines
    and exactly one string is appended to the global ``all`` list per call
    (an empty string when nothing was collected).

    Args:
        slug: Film slug used in the Letterboxd URL.
        num_pages: Maximum number of review pages to request.

    Returns:
        None. The result is stored in the global ``all`` list.
    """
    print(slug)
    base_url = 'https://letterboxd.com/film'
    movie_url = f'{base_url}/{slug}/reviews/by/activity'

    # List to store reviews for the current movie
    movie_reviews = []

    for page in range(1, num_pages + 1):
        page_url = f"{movie_url}/page/{page}/"

        # Send the same headers as the other scrapers, and use a timeout so a
        # stalled connection cannot hang the loop.
        try:
            response = requests.get(page_url, headers=headers, timeout=30)
        except requests.RequestException as error:
            print(f'Request failed for {page_url}: {error}')
            continue

        if response.status_code != 200:
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        reviews = soup.find_all('li', class_='film-detail')

        # A page with no review entries means the listing has ended, so the
        # remaining pages are not requested.
        if not reviews:
            break

        for review in reviews:
            body_text = review.find('div', class_='body-text')
            if body_text:
                # Only the first paragraph of each review is kept.
                review_text = body_text.find('p')
                if review_text:
                    movie_reviews.append(review_text.get_text(strip=True))

    # Concatenate reviews for the current movie into a single string
    movie_reviews_concatenated = '\n'.join(movie_reviews)

    # Add the concatenated reviews to the global list
    all.append(movie_reviews_concatenated)

# Collect up to three pages of reviews for every slug
REVIEW_PAGES_PER_MOVIE = 3
for slug in slugs_list:
    scrape_reviews_for_movie(slug, num_pages=REVIEW_PAGES_PER_MOVIE)

NameError: name 'slugs_list' is not defined

In [None]:
# Add the collected Letterboxd review text to combined_df as the 'lb_reviews' column.
# The list `all` is assigned by position, so it needs one entry per row of combined_df.
combined_df['lb_reviews'] = all