# Scraping Game Reviews

For this project, we'll scrape review data from Metacritic to see which games may be worth playing during quarantine.

## Import libraries

In [17]:
#For data wrangling
import pandas as pd
import numpy as np

#For webscraping
from bs4 import BeautifulSoup
import requests
import re
import datetime
import time

#For data visualisation
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from PythonFunctions.cb91visuals import *

In [None]:
def _listing_url(platform, letter, page_number):
    """Build the browse-by-title URL for one page of a platform/letter listing."""
    # The '#' listing has no letter segment in its URL
    if letter == '#':
        return f'https://www.metacritic.com/browse/games/title/{platform}?page={page_number}'
    return f'https://www.metacritic.com/browse/games/title/{platform}/{letter}?page={page_number}'


def _fetch_soup(url, headers):
    """Download one page and return it parsed with the built-in HTML parser."""
    # A timeout keeps a stalled connection from hanging the whole scrape
    response = requests.get(url, headers=headers, timeout=30)
    # Surface HTTP errors instead of parsing an error page as if it were a listing
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')


def _stat_text(parent, tag_name, css_class):
    """Return the stripped text of the first matching descendant, or None if absent."""
    if parent is None:
        return None
    node = parent.find(tag_name, class_=css_class)
    return node.get_text(strip=True) if node is not None else None


def _parse_game_entry(entry, platform):
    """Turn one <li class="product"> element into a row dict (None if it has no title link)."""
    title_box = entry.find('div', class_='product_title')
    link = title_box.find('a') if title_box is not None else None
    if link is None or not link.get('href'):
        return None

    # The link text holds the title followed, on its own line, by a platform
    # tag such as "(XONE)"; keep only the title part
    pieces = [piece.strip() for piece in link.get_text().splitlines() if piece.strip()]
    if len(pieces) > 1 and re.fullmatch(r'\([^()]*\)', pieces[-1]):
        pieces = pieces[:-1]

    game_url = f"https://www.metacritic.com{link['href']}"

    return {'GameTitle': ' '.join(pieces),
            'Platform': platform,
            'MetaRating': _stat_text(entry, 'div', 'metascore_w'),
            'UserRating': _stat_text(entry.find('li', class_='product_avguserscore'),
                                     'span', 'data'),
            'ReleaseDate': _stat_text(entry.find('li', class_='release_date'),
                                      'span', 'data'),
            'GameURL': game_url,
            'ReviewsURL': f"{game_url}/critic-reviews"}


def _parse_listing(soup, platform):
    """Extract a row dict for every game in a parsed listing page."""
    games = soup.find('ol', class_="list_products")

    # A page without a product list simply contributes no rows
    if games is None:
        return []

    rows = [_parse_game_entry(entry, platform)
            for entry in games.find_all('li', class_="product")]
    return [row for row in rows if row is not None]


def get_games_by_platform_and_letter(platform, letter):
    """Scrape Metacritic's browse-by-title listing for one platform and letter.

    Parameters
    ----------
    platform : str
        Platform segment of the browse URL, e.g. 'xboxone'.
    letter : str
        Letter segment of the browse URL, e.g. 'c'. Pass '#' for the listing
        whose URL has no letter segment.

    Returns
    -------
    pandas.DataFrame
        One row per game with the columns GameTitle, Platform, MetaRating,
        UserRating, ReleaseDate, GameURL and ReviewsURL. Ratings and dates are
        the raw strings shown on the page (e.g. 'tbd'); a field missing from
        the markup is None. The frame is empty if no games are listed.

    Raises
    ------
    requests.HTTPError
        If any listing page responds with an HTTP error status.
    """
    headers = {'User-Agent':
           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36'}

    columns = ['GameTitle', 'Platform',
               'MetaRating', 'UserRating',
               'ReleaseDate', 'GameURL',
               'ReviewsURL']

    #Request the first page; it supplies both the page count and the first rows
    url = _listing_url(platform, letter, 0)
    print(url)
    soup = _fetch_soup(url, headers)

    #Isolate the page links, which will tell us how many pages we have of this letter
    pagelinks = soup.find('div', class_="pages")

    #Get the number of the last page; a listing with no pager is a single page
    try:
        last_page = int(pagelinks.find('li', class_="page last_page").get_text().replace('…', ''))
    except (AttributeError, ValueError):
        last_page = 1

    rows = _parse_listing(soup, platform)

    #Page 0 is already parsed, so continue from page 1
    for page_number in range(1, max(1, last_page)):

        #Pause for a random 0-3 seconds between requests
        time.sleep(round(3*np.random.random(), 2))

        url = _listing_url(platform, letter, page_number)
        print(url)

        rows.extend(_parse_listing(_fetch_soup(url, headers), platform))

    #Build the frame once, in the declared column order
    return pd.DataFrame(rows, columns=columns)

In [13]:
# Download the first page of the Xbox One "c" listing so its markup can be inspected
url = "https://www.metacritic.com/browse/games/title/xboxone/c?page=0"
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36"
    ),
}

# Send the request with a browser-like User-Agent, then parse the returned HTML
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.content, "html.parser")

# Keep the pagination container; later cells read the last page number from it
pagelinks = soup.find("div", class_="pages")

In [14]:
# Display the pagination container (<div class="pages">) to inspect its markup
soup.find('div', class_="pages")

<div class="pages">
<div class="label">Page:</div>
<ul class="pages"><li class="page first_page active_page"><span class="page_num">1</span></li><li class="page last_page"><a class="page_num" href="/browse/games/title/xboxone/c?page=1">2</a></li></ul>
</div>

In [15]:
# Read the text of the "last_page" item, drop any '…' character, and convert it to an int
int(pagelinks.find('li',class_="page last_page").get_text().replace('…',''))

2

In [18]:
# Locate the <ol class="list_products"> element on the parsed page
games = soup.find('ol', class_="list_products")

In [19]:
# Display the product list markup to see how each entry is structured
games

<ol class="list_products list_product_condensed">
<li class="product game_product first_product">
<div class="product_wrap">
<div class="basic_stat product_title">
<a href="/game/xbox-one/cabelas-african-adventures">
                            Cabela's African Adventures
                                                            (XONE)
                                                    </a>
</div>
<div class="basic_stat product_score brief_metascore">
<div class="metascore_w small game tbd">tbd</div>
</div>
<div class="more_stats condensed_stats">
<ul class="more_stats">
<li class="stat product_avguserscore">
<span class="label">User:</span>
<span class="data textscore textscore_tbd">tbd</span>
</li>
<li class="stat release_date full_release_date">
<span class="label">Release Date:</span>
<span class="data">Mar 17, 2015</span>
</li>
</ul>
</div>
</div>
</li>
<li class="product game_product">
<div class="product_wrap">
<div class="basic_stat product_title">
<a href="/game/xbox-one/ca