In [29]:
import pandas as pd
import wikipedia as w
from bs4 import BeautifulSoup
import requests
import re
import csv 
def retrieve_page(game):
    """Return the text content of the Wikipedia article looked up for ``game``.

    The title is printed before the lookup so progress is visible in the
    cell output.

    Args:
        game: Article title handed to ``w.page``.

    Returns:
        The ``content`` attribute of the page returned by ``w.page(game)``,
        or the sentinel string ``"Page Not Found"`` when the lookup raises.
        Callers filter rows on that sentinel.
    """
    try:
        print(game)
        return w.page(game).content
    except Exception:
        # Best-effort lookup: any ordinary error means "skip this game".
        # Exception (rather than a bare except) is caught so that
        # KeyboardInterrupt / SystemExit can still stop a long scraping run.
        print(f"Game {game} was not found")
        return "Page Not Found"


In [30]:
def grab_table(url, table_index=1, timeout=30):
    """Download ``url`` and return one of the ``<table>`` elements on the page.

    Args:
        url: Address of the page to fetch.
        table_index: Position of the wanted table among all ``<table>`` tags
            found on the page. Defaults to 1 (the second table), which is
            what this notebook's list pages use.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The BeautifulSoup element for the selected table.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        IndexError: If the page has no table at ``table_index``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as if it were data.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('table')
    try:
        return tables[table_index]
    except IndexError:
        raise IndexError(
            f"No table at index {table_index}: found {len(tables)} table(s) at {url}"
        ) from None


def write_csv(filename, headers, rows, encoding="utf-8"):
    """Write a header line followed by data rows to a CSV file.

    Args:
        filename: Path of the file to create (overwritten if it exists).
        headers: Sequence of column names written as the first line.
        rows: Iterable of row sequences written after the header.
        encoding: Text encoding of the output file. Defaults to UTF-8 so the
            file does not depend on the platform's locale encoding and can be
            read back by ``pd.read_csv`` with its default settings even when
            titles contain non-ASCII characters.
    """
    # newline="" lets the csv module control line endings itself.
    with open(filename, 'w', newline="", encoding=encoding) as output:
        writer = csv.writer(output)
        writer.writerow(headers)
        writer.writerows(rows)
    


In [31]:
def create_csv(t, num_headers=8):
    """Turn a parsed HTML table into a header list and a list of text rows.

    Args:
        t: Table element exposing ``find_all`` (as returned by ``grab_table``).
        num_headers: How many of the table's ``<th>`` texts to keep as
            column names, counted from the first one found.

    Returns:
        A ``(headers, rows)`` tuple. ``headers`` holds the stripped text of
        the first ``num_headers`` ``<th>`` cells. ``rows`` holds one list of
        stripped cell texts per non-empty ``<tr>``; because ``<th>`` cells
        are collected as well, the table's own header row appears in ``rows``.
    """
    headers = [th.text.strip() for th in t.find_all('th')][:num_headers]
    rows = []
    # Modified from: https://www.pylenin.com/blogs/python-beautiful-soup/#parsing-html-table-with-beautiful-soup
    for tr in t.find_all('tr'):
        cells = [cell.text.strip() for cell in tr.find_all(['td', 'th'])]
        # Skip rows that contain no cells at all.
        if not cells:
            continue
        # Rows with fewer cells than headers are assumed to be missing their
        # leading entries (e.g. a shared rank number), so pad on the left.
        # Pad by the full shortfall so every row ends up at least as wide as
        # the header; padding a single None left shorter rows ragged.
        missing = len(headers) - len(cells)
        if missing > 0:
            cells = [None] * missing + cells
        rows.append(cells)
    return headers, rows


## Collect PS4 Data

In [32]:

# Scrape the best-selling PS4 games table and save it (without article text).
playstation_table = 'https://en.wikipedia.org/wiki/List_of_best-selling_PlayStation_4_video_games'
# The result was previously bound to `p4_games` while `ps4_games` was used
# below, which raises NameError on a fresh kernel.
ps4_games = grab_table(playstation_table)

ps4_headers, ps_table_cleaned = create_csv(ps4_games, num_headers=6)

write_csv("ps4_bestsellers_notext.csv", ps4_headers, ps_table_cleaned)

# Reload the table as a DataFrame. create_csv also emits the table's own
# header row as data, so drop that repeated first row. Take an explicit copy
# so adding a column below does not write onto a slice of another frame.
ps4_bestsellers = pd.read_csv("ps4_bestsellers_notext.csv").iloc[1:].copy()

# Try to grab the Wikipedia article text for each game.
game_text = ps4_bestsellers["Game"].apply(retrieve_page)

ps4_bestsellers["wiki_page"] = game_text
# Drop games whose article lookup failed (retrieve_page's sentinel value).
ps4_bestsellers = ps4_bestsellers[ps4_bestsellers.wiki_page != "Page Not Found"].reset_index(drop=True)

#ps4_bestsellers.to_csv("ps4_bestsellers.csv")

Marvel's Spider-Man
God of War
Game God of War was not found
Uncharted 4: A Thief's End
The Witcher 3: Wild Hunt
The Last of Us Part II
Game The Last of Us Part II was not found
Horizon Zero Dawn
Game Horizon Zero Dawn was not found
The Last of Us Remastered
Star Wars Battlefront
Ghost of Tsushima
Infamous Second Son
Final Fantasy VII Remake
Monster Hunter: World
Cyberpunk 2077
Game Cyberpunk 2077 was not found
Detroit: Become Human
Game Detroit: Become Human was not found
FIFA 17
Game FIFA 17 was not found
Crash Bandicoot N. Sane Trilogy
Final Fantasy XV
Persona 5 Royal
Killzone: Shadow Fall
Grand Theft Auto V
Bloodborne
Driveclub
Game Driveclub was not found
Knack
Minecraft: PlayStation 4 Edition
Nier: Automata
Nioh 2
Game Nioh 2 was not found
Battlefield 1
Dragon Quest XI: Echoes of an Elusive Age
Gran Turismo Sport
Metal Gear Solid V: The Phantom Pain
Final Fantasy XII: The Zodiac Age
Kingdom Hearts III
Nioh
Game Nioh was not found




  lis = BeautifulSoup(html).find_all('li')


## Collect Nintendo Data

In [36]:
# Scrape the best-selling Nintendo Switch games table and save it (without article text).
nintendo_table = "https://en.wikipedia.org/wiki/List_of_best-selling_Nintendo_Switch_video_games"
nintendo_games = grab_table(nintendo_table)

nin_headers, nin_cleaned = create_csv(nintendo_games, num_headers=8)

write_csv("nintendo_bestsellers_notext.csv", nin_headers, nin_cleaned)

# Reload the table as a DataFrame. create_csv also emits the table's own
# header row as data, so drop that repeated first row. Take an explicit copy
# so adding a column below does not write onto a slice of another frame.
nin_bestsellers = pd.read_csv("nintendo_bestsellers_notext.csv").iloc[1:].copy()

# Try to grab the Wikipedia article text for each Switch game.
game_text = nin_bestsellers["Title"].apply(retrieve_page)

nin_bestsellers["wiki_page"] = game_text
# Drop games whose article lookup failed (retrieve_page's sentinel value).
nin_bestsellers = nin_bestsellers[nin_bestsellers.wiki_page != "Page Not Found"].reset_index(drop=True)

#nin_bestsellers.to_csv("nintendo_bestsellers.csv")


Mario Kart 8 Deluxe
Animal Crossing: New Horizons
Super Smash Bros. Ultimate
The Legend of Zelda: Breath of the Wild
Pokémon Sword and Shield
Super Mario Odyssey
Super Mario Party
Game Super Mario Party was not found
Pokémon Brilliant Diamond and Shining Pearl
Ring Fit Adventure
Pokémon: Let's Go, Pikachu! and Let's Go, Eevee!
Pokémon Legends: Arceus
New Super Mario Bros. U Deluxe
Splatoon 2
Luigi's Mansion 3
Pokémon Scarlet and Violet
Super Mario 3D World + Bowser's Fury
Super Mario 3D All-Stars
Game Super Mario 3D All-Stars was not found
Mario Party Superstars
Splatoon 3
Super Mario Maker 2
Monster Hunter Rise
Game Monster Hunter Rise was not found
Nintendo Switch Sports
The Legend of Zelda: Link's Awakening
Kirby and the Forgotten Land
Mario Tennis Aces
Clubhouse Games: 51 Worldwide Classics
Donkey Kong Country: Tropical Freeze
Hyrule Warriors: Age of Calamity
Kirby Star Allies
The Legend of Zelda: Skyward Sword HD
Fire Emblem: Three Houses
1-2-Switch
Momotaro Dentetsu: Showa, Heise

## Clean and Combine Data

In [37]:
# Preview the first rows of the PS4 frame, including the attached wiki_page text.
ps4_bestsellers.head()

Unnamed: 0,Game,Copies sold,Release date[a],Genre(s),Developer(s),Publisher(s),wiki_page
0,Marvel's Spider-Man,20 million[1][b],"September 7, 2018",Action-adventure,Insomniac Games,Sony Interactive Entertainment,Marvel's Spider-Man is a 2018 action-adventure...
1,Uncharted 4: A Thief's End,16 million[4][5][6][d],"May 10, 2016",Action-adventure,Naughty Dog,Sony Interactive Entertainment,Uncharted 4: A Thief's End is a 2016 action-ad...
2,The Witcher 3: Wild Hunt,10.8 million[10],"May 18, 2015",Action role-playing,CD Projekt Red,CD Projekt,The Witcher 3: Wild Hunt is a 2015 action role...
3,The Last of Us Remastered,10 million[20][g],"July 29, 2014",Action-adventuresurvival horror,Naughty Dog,Sony Computer Entertainment,The Last of Us is a 2013 action-adventure game...
4,Star Wars Battlefront,8.42 million[21],"November 17, 2015",First-person shooterthird-person shooter,EA DICE,Electronic Arts,Star Wars: Battlefront is a series of first- a...


In [38]:
# Preview the first rows of the Nintendo Switch frame, including the attached wiki_page text.
nin_bestsellers.head()


Unnamed: 0,No.,Title,Copies sold,As of,Release date[a],Genre(s),Developer(s),Publisher(s),wiki_page
0,1,Mario Kart 8 Deluxe,48.41 million[4],"September 30, 2022","April 28, 2017",Kart racing,Nintendo EPD,Nintendo,Mario Kart 8 Deluxe is a 2017 kart racing game...
1,2,Animal Crossing: New Horizons,40.17 million[4],"September 30, 2022","March 20, 2020",Social simulation,Nintendo EPD,Nintendo,Animal Crossing: New Horizons is a 2020 social...
2,3,Super Smash Bros. Ultimate,29.53 million[4],"September 30, 2022","December 7, 2018",Fighting,Bandai Namco StudiosSora Ltd.,Nintendo,Super Smash Bros. Ultimate is a 2018 crossover...
3,4,The Legend of Zelda: Breath of the Wild,27.79 million[4],"September 30, 2022","March 3, 2017",Action-adventure,Nintendo EPD,Nintendo,The Legend of Zelda: Breath of the Wild is a 2...
4,5,Pokémon Sword and Shield,25.37 million[4],"September 30, 2022","November 15, 2019",Role-playing,Game Freak,The Pokémon CompanyNintendo,Pokémon Sword and Pokémon Shield are 2019 role...


In [45]:
# Display the PS4 frame.
# NOTE(review): the saved output below already has a "Console" column and only
# Game/Genre(s)/wiki_page, which are produced by the following cell, so this
# cell appears to have been run out of order; re-run top to bottom to refresh it.
ps4_bestsellers

Unnamed: 0,Game,Genre(s),wiki_page,Console
0,Marvel's Spider-Man,Action-adventure,Marvel's Spider-Man is a 2018 action-adventure...,PS4
1,Uncharted 4: A Thief's End,Action-adventure,Uncharted 4: A Thief's End is a 2016 action-ad...,PS4
2,The Witcher 3: Wild Hunt,Action role-playing,The Witcher 3: Wild Hunt is a 2015 action role...,PS4
3,The Last of Us Remastered,Action-adventuresurvival horror,The Last of Us is a 2013 action-adventure game...,PS4
4,Star Wars Battlefront,First-person shooterthird-person shooter,Star Wars: Battlefront is a series of first- a...,PS4
5,Ghost of Tsushima,Action-adventurestealth,Ghost of Tsushima is a 2020 action-adventure g...,PS4
6,Infamous Second Son,Action-adventure,Infamous Second Son (stylized as inFAMOUS Seco...,PS4
7,Final Fantasy VII Remake,Action role-playing,Final Fantasy VII Remake is a 2020 action role...,PS4
8,Monster Hunter: World,Action role-playing,"Monster Hunter (モンスターハンター, Monsutā Hantā) is a...",PS4
9,Crash Bandicoot N. Sane Trilogy,Platform,Crash Bandicoot N. Sane Trilogy is a 2017 plat...,PS4


In [46]:
# Combine datasets and make columns the same.
# Written so the cell can be re-run safely: rename() ignores a missing "Game"
# column on a second run, the column selection discards a previously added
# "Console", and assign() returns a new frame instead of writing a column
# onto a column subset of another frame.
shared_columns = ["Title", "Genre(s)", "wiki_page"]
ps4_bestsellers = (
    ps4_bestsellers
    .rename(columns={"Game": "Title"})[shared_columns]
    .assign(Console="PS4")
)
nin_bestsellers = nin_bestsellers[shared_columns].assign(Console="Nintendo Switch")

# ignore_index gives the combined frame one continuous 0..n-1 index; without
# it both inputs' 0-based labels are kept and the index contains duplicates.
vg_bestsellers = pd.concat([ps4_bestsellers, nin_bestsellers], ignore_index=True)


In [48]:
# Summarise the combined frame (count / unique / top / freq per column).
# NOTE(review): the saved output reports fewer unique wiki_page values than
# rows, so some titles resolved to the same article text — worth checking.
vg_bestsellers.describe()

Unnamed: 0,Title,Genre(s),wiki_page,Console
count,89,89,89,89
unique,89,42,87,2
top,Marvel's Spider-Man,Action role-playing,Fitness Boxing is an exergaming video game dev...,Nintendo Switch
freq,1,12,2,65


In [49]:
# Write the combined PS4 + Nintendo Switch frame to disk.
# The frame's index is written as the first, unnamed column.
vg_bestsellers.to_csv("video_game_bestsellers.csv")
