# Metacritic Scraper for PlayStation Posts 🎮

**https://www.metacritic.com/robots.txt**

In [33]:
import requests as re
import json as json 
import os as os
from bs4 import BeautifulSoup as bs 
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
import pandas as pd
import time

**Note:** to go back roughly one year (to around 15 December 2022), you need to scrape up to about page 90.

In [9]:
def article_scraper(page_number, timeout=30):
    """Scrape one page of Metacritic's PS4/PS5 "new releases" browse listing.

    Args:
        page_number: 1-based page index inserted into the browse URL.
        timeout: Seconds to wait for the HTTP response before giving up.
            Without a timeout a stalled connection would hang the notebook.

    Returns:
        A list of dicts, one per game card found on the page, with the keys
        'Title', 'Release Date', 'Description' and 'Game Link'. Any field
        whose element is missing from the card is set to None.

    Raises:
        Exception: If the server answers with a status code other than 200.
    """
    url = f'https://www.metacritic.com/browse/game/all/all/all-time/new/?releaseYearMin=1958&releaseYearMax=2023&platform=ps5&platform=ps4&page={page_number}'
    # Metacritic rejects the default requests User-Agent, so send a browser-like one.
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"}
    response = re.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        # Fail loudly: a blocked request would otherwise look like an empty page.
        raise Exception(f"Status code: {response.status_code}")
    soup = bs(response.content, 'html.parser')

    def first_span(card, container_class):
        # Return the first <span> inside the given container div, or None when
        # either the container or the span is absent. Chaining .find() directly
        # on a missing container would raise AttributeError.
        container = card.find('div', class_=container_class)
        return container.find('span') if container is not None else None

    def text_or_none(element):
        return element.get_text(strip=True) if element is not None else None

    data = []
    for game in soup.find_all('div', class_='c-finderProductCard'):
        link_element = game.find('a', class_='c-finderProductCard_container')
        title_element = game.find('h3', class_='c-finderProductCard_titleHeading')
        date_element = first_span(game, 'c-finderProductCard_meta')
        description_element = first_span(game, 'c-finderProductCard_description')

        # .get() instead of ['href'] so an anchor without href yields None, not KeyError.
        href = link_element.get('href') if link_element is not None else None
        link = f"https://www.metacritic.com{href}" if href else None

        data.append({
            'Title': text_or_none(title_element),
            'Release Date': text_or_none(date_element),
            'Description': text_or_none(description_element),
            'Game Link': link,
        })
    return data


# How many listing pages to pull; edit this value by hand before running the cell.
N = 1

# Collect the per-page records into one flat list.
all_data = []
for page in range(1, N + 1):
    print(f"Scraping page {page}...")
    all_data += article_scraper(page)

# One row per game card, columns taken from the record keys.
df = pd.DataFrame(data=all_data)

print(df)


Scraping page 1...
                                                Title  Release Date  \
0                                           Railbreak  Dec 22, 2023   
1                                     Synthetic Lover  Dec 22, 2023   
2                              Three Minutes To Eight  Dec 21, 2023   
3                           Mortal Kombat 1: Quan Chi  Dec 21, 2023   
4                                         Tiger Blade  Dec 20, 2023   
5                                  3D Color Labyrinth  Dec 20, 2023   
6                                   Bigfoot's Journey  Dec 20, 2023   
7          Rhapsody II: Ballad of the Little Princess  Dec 20, 2023   
8                                         Hotel R'n'R  Dec 20, 2023   
9      Aokana: Four Rhythms Across the Blue - EXTRA1P  Dec 20, 2023   
10                                      Vetrix Worlds  Dec 20, 2023   
11                           Poppy Playtime Chapter 1  Dec 20, 2023   
12             Rhapsody III: Memories of Marl Kingdom  Dec

In [41]:
# Column order of the frame returned by comments_spider (also used for an empty result).
_COMMENT_COLUMNS = [
    'Game Title', 'Console', 'Commenter Name', 'Reviewer Type',
    'Comment', 'Date Commented', 'Score Number',
]


def _scroll_to_bottom(driver, pause, max_scrolls):
    """Scroll the loaded page down until its height stops growing or the cap is reached."""
    last_height = driver.execute_script("return document.body.scrollHeight")
    scrolls = 0
    while max_scrolls is None or scrolls < max_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)  # give lazily loaded content time to render
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
        scrolls += 1


def _text_or_null(element):
    """Return the stripped text of a parsed element, or the string 'null' if it is missing."""
    return element.get_text(strip=True) if element is not None else 'null'


def _first_span_text_or_null(element):
    """Return the stripped text of the first <span> inside element, or 'null' if either is missing."""
    if element is None or element.span is None:
        return 'null'
    return element.span.get_text(strip=True)


def _extract_review(review, game_title, reviewer_type):
    """Build one output record from a parsed review element."""
    # User reviews expose a username link; critic reviews a publication name link.
    commenter_element = review.find('a', class_='c-siteReviewHeader_username')
    if commenter_element is None:
        commenter_element = review.find('a', class_='c-siteReviewHeader_publicationName')

    return {
        'Game Title': game_title,
        'Console': _text_or_null(review.find('a', class_='c-siteReview_platform')),
        'Commenter Name': _text_or_null(commenter_element),
        'Reviewer Type': reviewer_type,
        'Comment': _first_span_text_or_null(review.find('div', class_='c-siteReview_quote')),
        'Date Commented': _text_or_null(review.find('div', class_='c-siteReviewHeader_reviewDate')),
        'Score Number': _first_span_text_or_null(review.find('div', class_='c-siteReviewScore')),
    }


def comments_spider(df, scroll_pause=3, max_scrolls=100):
    """Visit every game page listed in df and collect its critic and user reviews.

    Args:
        df: DataFrame with a 'Game Link' column holding the game page URLs.
            Rows whose link is missing or not a non-empty string are skipped.
        scroll_pause: Seconds to wait after each scroll so lazy content can load.
        max_scrolls: Upper bound on scroll steps per page, so a page that keeps
            growing cannot loop forever. Pass None for no limit.

    Returns:
        A DataFrame with the columns 'Game Title', 'Console', 'Commenter Name',
        'Reviewer Type', 'Comment', 'Date Commented' and 'Score Number'.
        Fields that cannot be found on the page hold the string 'null'.
        An empty input returns an empty frame with these columns and does not
        start a browser.
    """
    total_games = len(df)
    if total_games == 0:
        return pd.DataFrame(columns=_COMMENT_COLUMNS)

    comments_data = []
    driver = webdriver.Chrome()
    try:
        for index, (_, game_row) in enumerate(df.iterrows(), start=1):
            game_link = game_row['Game Link']
            if not isinstance(game_link, str) or not game_link:
                print(f"{index}/{total_games}: Skipping row without a game link")
                continue

            # Last path segment is the slug, with or without a trailing slash.
            game_title = game_link.rstrip('/').rsplit('/', 1)[-1].replace('-', ' ')

            print(f"{index}/{total_games}: Processing {game_title} at {game_link}")
            driver.get(game_link)
            _scroll_to_bottom(driver, scroll_pause, max_scrolls)

            soup = bs(driver.page_source, 'html.parser')
            critic_reviews = soup.find_all('div', class_='c-reviewsSection_criticReviews')
            user_reviews = soup.find_all('div', class_='c-reviewsSection_userReviews')

            # Tag each element with its origin up front instead of re-testing
            # list membership (a deep element comparison) for every review.
            tagged_reviews = (
                [('Critic', review) for review in critic_reviews]
                + [('User', review) for review in user_reviews]
            )

            num_comments = sum(
                1 for _, review in tagged_reviews
                if review.find('div', class_='c-siteReview_quote') is not None
            )
            print(f"🔎 -- Found {num_comments} reviews for {game_title} -- 🔍")

            # NOTE(review): each matched section element yields exactly one row,
            # built from the first header/quote/score found inside it. If a
            # section contains several review cards only the first is captured;
            # confirm against the page markup.
            for reviewer_type, review in tagged_reviews:
                comments_data.append(_extract_review(review, game_title, reviewer_type))
    finally:
        # Always release the browser, even if a page fails mid-run.
        driver.quit()

    return pd.DataFrame(comments_data, columns=_COMMENT_COLUMNS)

# Crawl every game page listed in `df` and gather the reviews into a single frame.
raw_comments_df = comments_spider(df=df)
print(raw_comments_df)

1/24: Processing railbreak at https://www.metacritic.com/game/railbreak/
🔎 -- Found 0 reviews for railbreak -- 🔍
2/24: Processing synthetic lover at https://www.metacritic.com/game/synthetic-lover/
🔎 -- Found 0 reviews for synthetic lover -- 🔍
3/24: Processing three minutes to eight at https://www.metacritic.com/game/three-minutes-to-eight/
🔎 -- Found 1 reviews for three minutes to eight -- 🔍
4/24: Processing mortal kombat 1 quan chi at https://www.metacritic.com/game/mortal-kombat-1-quan-chi/
🔎 -- Found 0 reviews for mortal kombat 1 quan chi -- 🔍
5/24: Processing tiger blade at https://www.metacritic.com/game/tiger-blade/
🔎 -- Found 1 reviews for tiger blade -- 🔍
6/24: Processing 3d color labyrinth at https://www.metacritic.com/game/3d-color-labyrinth/
🔎 -- Found 0 reviews for 3d color labyrinth -- 🔍
7/24: Processing bigfoots journey at https://www.metacritic.com/game/bigfoots-journey/
🔎 -- Found 0 reviews for bigfoots journey -- 🔍
8/24: Processing rhapsody ii ballad of the little pri

---
---
---

### Lab Testing Area 🔍👨‍⚕️

In [45]:
from itables import show

In [48]:
# Sanity check: (rows, columns) of the scraped reviews frame.
print(raw_comments_df.shape)

(50, 7)


In [46]:
# Display the reviews frame using `show` imported from itables in an earlier cell.
show(raw_comments_df)

Game Title,Console,Commenter Name,Reviewer Type,Comment,Date Commented,Score Number
Loading... (need help?),,,,,,
