In [16]:
import os
import re
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [7]:
# Request headers sent with every page fetch so the request looks like it
# comes from a desktop browser.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/58.0.3029.110 Safari/537.3"
    ),
}

In [4]:
def fetch_and_parse(url, timeout=30):
    """Download ``url`` and return it parsed as a BeautifulSoup tree.

    The request is sent with the notebook-level ``headers`` dict.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` can block indefinitely on a stalled connection.

    Returns
    -------
    BeautifulSoup or None
        The parsed document when the server answers with HTTP 200; ``None``
        when the request fails (connection error, timeout, ...) or any other
        status code is returned.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Report network-level failures the same way as a bad status code so
        # callers only ever have to check for None.
        print(f"Failed to retrieve the page: {exc}")
        return None

    if response.status_code != 200:
        print("Failed to retrieve the page")
        return None

    return BeautifulSoup(response.content, 'html.parser')

In [95]:
def _section_text(card, css_class, default):
    """Return the stripped text of the card's ``div`` with ``css_class``, or ``default``."""
    section = card.find('div', class_=css_class)
    return section.get_text(strip=True) if section else default


def _parse_release_date(meta_parts):
    """Return the first metadata piece that parses as ``Mon D, YYYY``, as ``YYYY-MM-DD``."""
    for part in meta_parts:
        candidate = part.strip()
        if ',' not in candidate:
            continue
        try:
            return datetime.strptime(candidate, "%b %d, %Y").strftime("%Y-%m-%d")
        except ValueError:
            # Not a date (some other comma-containing text); keep looking
            # instead of giving up on the first candidate.
            continue
    return "Unknown"


def _parse_rating(meta_text):
    """Return the rating token that follows the word ``Rated`` in ``meta_text``."""
    # ``\w+`` alone cannot match "+", which truncates ratings such as "E10+".
    # Accept an optional "+" (optionally followed by digits, e.g. "E +10").
    match = re.search(r'Rated\s+(\w+(?:\s*\+(?:\s*\d+)?)?)', meta_text)
    if not match:
        return "Rating not found"
    return " ".join(match.group(1).split())


def extract_game_data(soup):
    """Collect one record per product card found in a parsed listing page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed listing page; every ``<a class="c-finderProductCard_container">``
        element is treated as one game.

    Returns
    -------
    list of dict
        One dict per card with the keys ``title``, ``release_date``,
        ``rating``, ``description`` and ``metascore``. All values are
        strings. A missing section yields a placeholder: ``'Title not found'``,
        ``'Unknown'``, ``'Rating not found'``, ``'Description not found'`` or
        ``'Metascore not available'``.
    """
    games_data = []

    for game in soup.find_all('a', class_='c-finderProductCard_container'):
        title_section = game.find('div', class_='c-finderProductCard_title')
        if title_section:
            # Drop a leading rank prefix such as "12." or "1,234.".
            title = re.sub(
                r'^\d{1,3}(,\d{3})*\.', '', title_section.get_text(strip=True)
            ).strip()
        else:
            title = 'Title not found'

        release_date = "Unknown"
        rating = "Rating not found"
        meta_info_section = game.find('div', class_='c-finderProductCard_meta')
        if meta_info_section:
            meta_parts = meta_info_section.get_text(strip=True, separator='|').split('|')
            release_date = _parse_release_date(meta_parts)
            rating = _parse_rating(meta_info_section.get_text())

        games_data.append({
            'title': title,
            'release_date': release_date,
            'rating': rating,
            'description': _section_text(
                game, 'c-finderProductCard_description', 'Description not found'
            ),
            'metascore': _section_text(
                game, 'c-siteReviewScore', 'Metascore not available'
            ),
        })

    return games_data


In [96]:
# Crawl the all-platform listing one page at a time, starting at page 1,
# until a page cannot be fetched or comes back without any product cards.
base_url = "https://www.metacritic.com/browse/game/?releaseYearMin=1958&releaseYearMax=2024&page={}"
all_games_data = []
page = 1

while True:
    url = base_url.format(page)
    print(f"Scraping {url}")
    soup = fetch_and_parse(url)

    if not soup:
        # No parsed page came back, so stop crawling.
        print(f"Failed to retrieve page {page}. Exiting loop.")
        break

    game_listings = soup.find_all("a", class_="c-finderProductCard_container")
    if game_listings:
        # Normal case: harvest this page and move on to the next one.
        page_data = extract_game_data(soup)
        all_games_data += page_data
        page += 1
        continue

    # A page without product cards is taken to mean we ran past the last page.
    print(
        f"No game listings found on page {page}. Assuming end of pages and exiting loop."
    )
    break

# Build the frame once from the accumulated records, preview it, and persist it.
df = pd.DataFrame(all_games_data)
print(df.head())
df.to_csv("metacritic_games_data.csv", index=False)


Scraping https://www.metacritic.com/browse/game/?releaseYearMin=1958&releaseYearMax=2024&page=1
Scraping https://www.metacritic.com/browse/game/?releaseYearMin=1958&releaseYearMax=2024&page=2
Scraping https://www.metacritic.com/browse/game/?releaseYearMin=1958&releaseYearMax=2024&page=3
Scraping https://www.metacritic.com/browse/game/?releaseYearMin=1958&releaseYearMax=2024&page=4
Scraping https://www.metacritic.com/browse/game/?releaseYearMin=1958&releaseYearMax=2024&page=5
Scraping https://www.metacritic.com/browse/game/?releaseYearMin=1958&releaseYearMax=2024&page=6
Scraping https://www.metacritic.com/browse/game/?releaseYearMin=1958&releaseYearMax=2024&page=7
Scraping https://www.metacritic.com/browse/game/?releaseYearMin=1958&releaseYearMax=2024&page=8
Scraping https://www.metacritic.com/browse/game/?releaseYearMin=1958&releaseYearMax=2024&page=9
Scraping https://www.metacritic.com/browse/game/?releaseYearMin=1958&releaseYearMax=2024&page=10
Scraping https://www.metacritic.com/bro

In [98]:
# Platform identifiers, in the order they are processed.
platforms = """
    ps5 xbox-series-x nintendo-switch pc mobile
    3ds dreamcast ds gba gamecube meta-quest nintendo-64
    ps1 ps2 ps3 ps4 psp ps-vita
    wii wii-u
    xbox xbox-360 xbox-one
""".split()

In [99]:
def create_dfs(names):
    """Return a dict mapping every entry of ``names`` to its own empty DataFrame."""
    return {name: pd.DataFrame() for name in names}
    
# One empty placeholder DataFrame per platform, keyed by platform name.
platforms_dfs = create_dfs(platforms)

In [100]:
# Per-platform crawl: for every platform, walk its listing pages until a page
# cannot be fetched or contains no product cards, then store the records both
# in ``platforms_dfs`` and as one CSV file per platform.
base_url = "https://www.metacritic.com/browse/game/{}/all/all-time/metascore/?releaseYearMin=1958&releaseYearMax=2024&platform={}&page={}"

# ``to_csv`` does not create missing directories; make sure the output folder
# exists up front so a full platform crawl is not lost to an OSError on save.
output_dir = "platforms"
os.makedirs(output_dir, exist_ok=True)

for platform in platforms:
    all_games_data = []
    page = 1  # Initialize page counter
    while True:  # Iterate through pages until one of the stop conditions hits
        # The platform name appears twice in the URL: in the path and in the query.
        url = base_url.format(platform, platform, page)
        print(f"Scraping {url}")
        soup = fetch_and_parse(url)
        if not soup:  # If the page couldn't be fetched, break the loop
            print(f"Failed to retrieve page {page}. Exiting loop.")
            break

        # Check if there are game listings on the page
        game_listings = soup.find_all("a", class_="c-finderProductCard_container")
        if (
            not game_listings
        ):  # If there are no game listings, we've likely hit a non-existent page
            print(
                f"No game listings found on page {page}. Assuming end of pages and exiting loop."
            )
            break

        page_data = extract_game_data(soup)
        all_games_data.extend(page_data)

        # Increment the page counter for the next iteration
        page += 1

    # Once all pages of this platform have been scraped, create its DataFrame
    platforms_dfs[platform] = pd.DataFrame(all_games_data)
    # Save the platform's DataFrame to its own CSV file
    platforms_dfs[platform].to_csv(
        os.path.join(output_dir, "metacritic_games_data_" + platform + ".csv"),
        index=False,
    )

Scraping https://www.metacritic.com/browse/game/ps5/all/all-time/metascore/?releaseYearMin=1958&releaseYearMax=2024&platform=ps5&page=1
Scraping https://www.metacritic.com/browse/game/ps5/all/all-time/metascore/?releaseYearMin=1958&releaseYearMax=2024&platform=ps5&page=2
Scraping https://www.metacritic.com/browse/game/ps5/all/all-time/metascore/?releaseYearMin=1958&releaseYearMax=2024&platform=ps5&page=3
Scraping https://www.metacritic.com/browse/game/ps5/all/all-time/metascore/?releaseYearMin=1958&releaseYearMax=2024&platform=ps5&page=4
Scraping https://www.metacritic.com/browse/game/ps5/all/all-time/metascore/?releaseYearMin=1958&releaseYearMax=2024&platform=ps5&page=5
Scraping https://www.metacritic.com/browse/game/ps5/all/all-time/metascore/?releaseYearMin=1958&releaseYearMax=2024&platform=ps5&page=6
Scraping https://www.metacritic.com/browse/game/ps5/all/all-time/metascore/?releaseYearMin=1958&releaseYearMax=2024&platform=ps5&page=7
Scraping https://www.metacritic.com/browse/game/

In [108]:
# Function to merge platform dataframes into the main dataframe
def merge_platform_dfs(main_df, platform_dfs):
    """Annotate ``main_df`` with the platforms each title appears on.

    Games are matched on an exact ``title`` match only. For every row of
    ``main_df`` two list-valued columns are written:

    * ``platforms`` - names of the platforms whose frame contains the title,
      in the iteration order of ``platform_dfs``;
    * ``platform_scores`` - the ``metascore`` taken from the first matching
      row of each of those platforms, aligned with ``platforms``.

    Parameters
    ----------
    main_df : pandas.DataFrame
        Frame with a ``title`` column. It is modified in place. If it has no
        ``title`` column nothing can match and every list stays empty.
    platform_dfs : dict
        Maps a platform name to a DataFrame with ``title`` and ``metascore``
        columns. Empty frames are skipped.

    Returns
    -------
    pandas.DataFrame
        The same ``main_df`` object, with the two columns added or replaced.
    """
    row_count = len(main_df)
    platforms_per_row = [[] for _ in range(row_count)]
    scores_per_row = [[] for _ in range(row_count)]

    # Index the main frame once (title -> row positions) instead of scanning
    # the whole frame again for every single platform row.
    rows_by_title = {}
    if 'title' in main_df.columns:
        for position, title in enumerate(main_df['title']):
            if pd.isna(title):
                continue  # missing titles never compare equal to anything
            rows_by_title.setdefault(title, []).append(position)

    for platform, platform_df in platform_dfs.items():
        if platform_df.empty:
            continue
        for title, score in zip(platform_df['title'], platform_df['metascore']):
            if pd.isna(title):
                continue
            for position in rows_by_title.get(title, ()):
                # Record each platform at most once per game; the first
                # matching platform row supplies the score.
                if platform not in platforms_per_row[position]:
                    platforms_per_row[position].append(platform)
                    scores_per_row[position].append(score)

    main_df['platforms'] = platforms_per_row
    main_df['platform_scores'] = scores_per_row
    return main_df

In [109]:
# merge_platform_dfs adds the platform columns to the frame it is given and
# returns that same frame, so all_games_df and df refer to one object.
all_games_df = merge_platform_dfs(df, platforms_dfs)
all_games_df.head()

Unnamed: 0,title,release_date,rating,description,metascore,platforms,platform_scores
0,The Legend of Zelda: Ocarina of Time,1998-11-23,E,"As a young boy, Link is tricked by Ganondorf, ...",99,[nintendo-64],[99]
1,SoulCalibur,1999-09-08,T,"This is a tale of souls and swords, transcendi...",98,"[mobile, dreamcast, xbox-360]","[73, 98, 79]"
2,Grand Theft Auto IV,2008-04-29,M,[Metacritic's 2008 Xbox 360 Game of the Year; ...,98,"[pc, ps3, xbox-360]","[90, 98, 98]"
3,Super Mario Galaxy,2007-11-12,E,[Metacritic's 2007 Wii Game of the Year] The u...,97,[wii],[97]
4,Super Mario Galaxy 2,2010-05-23,E,"Super Mario Galaxy 2, the sequel to the galaxy...",97,[wii],[97]


In [110]:
# Genre identifiers. NOTE(review): presumably these are substituted into
# Metacritic browse URLs the same way the platform names are - the code that
# consumes this list is not visible here, so confirm the exact format there.
genres = [
'action',
'action-adventure',
'action-puzzle',
'action-rpg',
'adventure',
'application',
'arcade',
"beat---'em---up",
'board-or-card-game',
'card-battle',
'compilation',
'edutainment',
'exercise-or-fitness',
'fighting',
'first---person-shooter',
'gambling',
'general',
'mmorpg',
'open---world',
'party-or-minigame',
'pinball',  # was misspelled 'piball'
'platformer',
'puzzle',
'rpg',
'racing',
'real---time-strategy',
'rhythm',
'roguelike',
'sandbox',
'shooter',
'simulation',
'sports',
'strategy',
'survival',
'tactics',
'third---person-shooter',
'trivia-or-game-show',
'turn---based-strategy',
'virtual',
'visual-novel'
]