In [299]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from pprint import pprint
from collections import defaultdict
import datetime as dt
from dateutil.parser import parse
import pickle
import re
import time
import os

# Individual game page scraper

## Helper functions

In [419]:
def grab_soup(url, timeout=30):
    """Downloads a url and returns its contents as a BeautifulSoup object.

    Parameters:
        url: address of the page to download.
        timeout: seconds to wait for the server before giving up (default 30).
            Without a timeout a stalled connection would block the scrape forever.

    Returns:
        BeautifulSoup object built from the response text with the "lxml" parser.

    Raises:
        AssertionError: if the response status code is not 200.
    """

    response = requests.get(url, timeout=timeout)

    # Raised explicitly instead of using an `assert` statement so the check still runs
    # when Python is started with -O (which strips asserts). Exception type and message
    # are the same as before, so existing callers are unaffected.
    if response.status_code != 200:
        raise AssertionError("Problem with url request! %s throws %s" % (url, response.status_code))

    return BeautifulSoup(response.text, "lxml")


def get_score(soup):
    """Returns the aggregate critic score from an individual game page.

    `soup` is the BeautifulSoup object of the page. The score is read from the sixth
    'div' inside the element with id 'coreGameRank'. When that cell holds "..." (no
    score listed) None is returned; otherwise the text is cast to an int."""

    rank_divs = soup.find(id='coreGameRank')('div')
    raw_score = rank_divs[5].text

    # "..." is the placeholder for a missing score and must not be cast to int
    return None if raw_score == "..." else int(raw_score)
    

def get_title_console(soup):
    """Reads the game's title and console from an individual game page.

    `soup` is the BeautifulSoup object of the page. The text of the first two 'a' tags
    inside the element with class "niceHeaderTitle" is returned as (title, platform)."""
    header_links = soup.find(class_="niceHeaderTitle")("a")

    # first link holds the title, second link holds the console
    return header_links[0].text, header_links[1].text
    
    
def parse_game_data(soup):
    """Builds the record for one game from the BeautifulSoup object of its info page.

    The 'div' cells of the 'coreGameRelease' and 'coreGameGenre' sections are read as
    alternating field / value pairs. Field names are lower-cased and non-breaking
    spaces ('\\xa0') are replaced by regular spaces in both fields and values. The
    critic score ('score'), 'title' and 'console' are added from the helper functions.
    Returns the record as a defaultdict (no default factory)."""

    nbsp = '\xa0'
    record = defaultdict()     #holds every field collected for this game

    #Release section: pairs start at the very first cell
    release_cells = soup.find(id='coreGameRelease')('div')
    for idx in range(0, len(release_cells) - 1, 2):
        label = release_cells[idx].text.replace(nbsp, ' ').lower()
        record[label] = release_cells[idx + 1].text.replace(nbsp, ' ')

    #Genre section: the leading cell is either garbage (skip 1 cell) or blank and
    #followed by a garbage cell (skip 2 cells) before the field / value pairs begin
    genre_cells = soup.find(id='coreGameGenre')('div')
    first_pair = 2 if genre_cells[0].text == "" else 1
    for idx in range(first_pair, len(genre_cells) - 1, 2):
        label = genre_cells[idx].text.replace(nbsp, ' ').lower()
        record[label] = genre_cells[idx + 1].text.replace(nbsp, ' ')

    #critic score (our target) if available, else None
    record['score'] = get_score(soup)

    #title and console of the game
    game_title, game_console = get_title_console(soup)
    record['title'] = game_title
    record['console'] = game_console

    return record

## All together now - single page scraper function

In [366]:
def single_page_scrape(url):
    """Scrapes one MobyGames game page: downloads the html at `url`, parses it, and
    returns the dict of characteristics and target value produced by parse_game_data"""
    return parse_game_data(grab_soup(url))

**Testing the function:**

In [363]:
# Smoke test: scrape one game page (this performs a live HTTP request) and
# pretty-print the resulting dict so the parsed fields can be checked by eye
test = single_page_scrape('http://www.mobygames.com/game/switch/fire-emblem-warriors')
pprint(test)

78
defaultdict(None,
            {'also for': 'New Nintendo 3DS | Combined View',
             'art': 'Anime / Manga',
             'console': 'Nintendo Switch',
             'developed by': 'Intelligent Systems Co., Ltd., Omega Force, Team '
                             'Ninja',
             'esrb rating': 'Teen',
             'gameplay': 'Hack and Slash',
             'genre': 'Action',
             'interface': 'Direct Control',
             'official site': 'Fire Emblem: Warriors',
             'perspective': 'Behind view',
             'published by': 'Nintendo of America Inc.',
             'released': 'Oct 20, 2017',
             'score': 78,
             'setting': 'Fantasy',
             'title': 'Fire Emblem: Warriors'})


# Scrape urls for every game on a console

## Helper functions

In [124]:
def get_game_quantity(url):
    """Returns how many total games are in a MobyGames game list.

    Parameters:
        url: address of a game list page on MobyGames.

    Returns:
        int, the total number of games in that category.

    Raises:
        ValueError: if no total can be found in the page's item-count header.
    """
    soup = grab_soup(url)

    header = soup.find(class_="mobHeaderItems").text  #string like '(items 1-25 of N)' where N is total games

    # Read the number that follows "of" instead of slicing at a fixed position: the
    # width of the "1-25" part changes (e.g. "1-7" or "26-50"), which shifts where N starts.
    match = re.search(r'of\s+(\d[\d,]*)', header)
    if match is None:
        raise ValueError("Could not find a total game count in header %r" % header)

    # tolerate a thousands separator in the count before casting
    return int(match.group(1).replace(',', ''))


def get_urls(url):
    """Collects the url of every individual game shown on one MobyGames list page.

    `url` is the address of the list page. The returned list holds one entry per row
    of the games table, each being "www.mobygames.com" (no scheme) followed by the
    href of the first link in that row."""

    listing = grab_soup(url)

    #rows of the "tbody" inside the main games table
    rows = listing.find(id="mof_object_list").find("tbody")("tr")

    #the first link in each row carries the relative url of the game page
    return ["www.mobygames.com" + row.find("a")['href'] for row in rows]

## All together now - function to collect game urls for any console

In [125]:
def get_game_urls_console(base_url='http://www.mobygames.com/browse/games/switch'):
    """Collects the url of every game listed for one console on MobyGames.

    Parameters:
        base_url: address of the console's game list (defaults to Nintendo Switch).

    Returns:
        list of game page urls, in the order the list pages are visited.
    """

    page_size = 25     #each list page is requested 25 games apart

    url_head = base_url + '/offset,'
    url_tail = '/so,0a/list-games/'

    #the first list page tells us how many games are in the category
    total_games = get_game_quantity(url_head + '0' + url_tail)

    game_urls = []     #this will store the url list for return

    #Offsets run 0, 25, 50, ... and stop BEFORE total_games. The last game sits at
    #offset total_games - 1, so a page starting at offset == total_games holds nothing;
    #the previous `offset <= total_games` test requested that extra page whenever the
    #total was a multiple of 25.
    for offset in range(0, total_games, page_size):
        game_urls += get_urls(url_head + str(offset) + url_tail)

    return game_urls

## Generate URL lists

**Generating lists for Nintendo Switch, PS4, XBox One**

In [130]:
# Nintendo Switch game urls (walks every page of the console's game list over the network)
ns_list = get_game_urls_console()     #nintendo switch is the default for that function

In [None]:
# PlayStation 4 game urls
ps_list = get_game_urls_console(base_url='http://www.mobygames.com/browse/games/playstation-4')

In [131]:
# Xbox One game urls
xb_list = get_game_urls_console(base_url='http://www.mobygames.com/browse/games/xbox-one')

**Saving each list as pickle to avoid rerunning the scrapes above**

In [134]:
# Write each console's url list to its own pickle file in the working directory.
# NOTE(review): the Switch file ends in '.pk1' (digit one) while the other two end in
# '.pkl' - looks like a typo; confirm nothing loads 'ns_urls.pk1' before renaming it.
with open('ns_urls.pk1', 'wb') as f:
    pickle.dump(ns_list, f)

with open('ps_urls.pkl', 'wb') as f:
    pickle.dump(ps_list, f) 

with open('xb_urls.pkl', 'wb') as f:
    pickle.dump(xb_list, f)

# Scrape and aggregate data from each game url

## Create empty dataframe with desired columns

In [None]:
# Columns kept in the final dataset; the scrape loop only stores scraped keys
# that appear in this list
fields = ['title','console','released','genre','gameplay','perspective',
              'setting','developed by','published by','esrb rating','score']

# Empty frame with one column per field, filled in by the scrape below
df = pd.DataFrame(columns=fields)

## Scrape all game pages and populate dataframe

In [422]:
master_list = ns_list + ps_list + xb_list

# Collect one plain dict per game and build the frame once at the end. Writing cell by
# cell with df.loc makes pandas enlarge the frame for every new row, and the column list
# was being rebuilt for every key of every game.
wanted = list(df.columns)
records = []

try:
    for url in master_list:
        # the stored urls carry no scheme, so it is added here
        gameinfo = single_page_scrape('http://' + url)

        # keep only the fields that are columns of df
        records.append({key: value for key, value in gameinfo.items() if key in wanted})
finally:
    # Built in `finally` so that games scraped before an error are still kept in df.
    # dtype=object matches the columns of the empty frame this replaces.
    df = pd.DataFrame(records, columns=wanted, dtype=object)

In [427]:
df   #display the populated frame to check that the scrape worked

Unnamed: 0,title,console,released,genre,gameplay,perspective,setting,developed by,published by,esrb rating,score
0,10 Second Run,Nintendo Switch,"Dec 21, 2017",Action,Platform,Side view,,"G-mode Co., Ltd.","Blue Print Inc, KaeruPanda Inc.",,
1,1-2-Switch,Nintendo Switch,"Mar 03, 2017",Action,"Mini-Games, Party Game","1st-person, Audio game",,Nintendo EPD,Nintendo of America Inc.,Everyone 10+,48
2,36 Fragments of Midnight,Nintendo Switch,"Sep 14, 2017",Action,Platform,Side view,,Petite Games,Ratalaika Games S.L.,Everyone,
3,60 Seconds!,Nintendo Switch,"Dec 18, 2017","Action, Adventure",,Bird's-eye view,Post-Apocalyptic,Robot Gentleman,Robot Gentleman,,
4,88 Heroes,Nintendo Switch,"Oct 10, 2017",Action,Platform,Side view,,Bitmap Bureau Ltd.,Rising Star Games Inc.,Everyone 10+,
5,Aces of the Luftwaffe: Squadron,Nintendo Switch,"Nov 17, 2017",Action,"Arcade, Shooter",Top-down,,Handy-Games GmbH,Handy-Games GmbH,Everyone 10+,
6,Acorn Tactics,Nintendo Switch,"Dec 07, 2017",Strategy/Tactics,Managerial / Business Simulation,Bird's-eye view,Sci-Fi / Futuristic,TACS Games,TACS Games,,
7,Aero Fighters 2,Nintendo Switch,"Aug 03, 2017",Action,"Arcade, Shooter",Top-down,,"Video System Co., Ltd.",HAMSTER Corporation,,
8,Alpha Mission II,Nintendo Switch,"Apr 06, 2017",Action,Shooter,Top-down,Sci-Fi / Futuristic,SNK Corporation,HAMSTER Corporation,Everyone,
9,Ambition of the Slimes,Nintendo Switch,"Dec 28, 2017",Role-Playing (RPG),Tactical RPG,,Fantasy,"Altair Works Corporation, Flyhigh Works Co., Ltd.",CIRCLE Entertainment Ltd.,,


## Save full raw dataframe as pickle for safekeeping

In [425]:
# Pickle the full frame, including games without a score.
# NOTE(review): extension is '.pk1' (digit one), unlike the '.pkl' url files - confirm intended.
with open('allgames.pk1', 'wb') as f:     #whole dataset pickled!
    pickle.dump(df, f)

# Isolate rows with target value and export for analysis

In [469]:
# Keep only the rows whose 'score' is not null - these are the games usable as
# examples with a target value
mask = df['score'].notnull()
df_scores = df[mask]

# Pickle the scored subset for the analysis step
with open('score_df.pk1', 'wb') as f:
    pickle.dump(df_scores, f)

# Appendix: Unused Code

## Alternate method for getting game title

In [None]:
def get_title(soup):
    """Returns the game title taken from the <title> text of an individual game page.

    `soup` is the BeautifulSoup object of the page. Everything from the first ' for '
    through the last 'MobyGames' in the title text is removed and the remaining
    string is returned; text without that pattern is returned unchanged."""
    return re.sub(r'\ for .*MobyGames', '', soup.title.text)