In [8]:
import urllib
import urllib.request

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup, element

# Scraping VGChartz

The site often responds with `HTTP error 503 (Service Unavailable)`, so we cannot download everything in one go. Instead, we define a function that scrapes a small range of pages, and we call it several times, little by little.

We request 1000 results per page and scrape 5 pages per call, so the function has to be run 12 times to cover all the pages.

In [9]:


# Everything in the listing URL up to and including 'page='; the page number is appended right after it.
urlhead = 'https://www.vgchartz.com/games/games.php?page='

# Query string that follows the page number: no filters, 1000 results per page,
# ordered by sales, with every optional column switched on.
urltail = (
    '&console=&region=All&developer=&publisher=&genre=&boxart=Both&ownership=Both'
    '&results=1000&order=Sales&showtotalsales=1&shownasales=1&showpalsales=1&showjapansales=1'
    '&showothersales=1&showpublisher=1&showdeveloper=1&showreleasedate=1&showlastupdate=1'
    '&showvgchartzscore=1&showcriticscore=1&showuserscore=1&showshipped=1'
)

In [None]:
def _parse_row(row):
    '''
    Extract the 15 fields of one results-table row, in output-column order.
    row : A table row exposing `find_all('td')`.
    Raises AttributeError, IndexError, KeyError or TypeError when the row does not have
    the expected cells (too few cells, missing link/image, or a cell without a single text node).
    '''
    col = row.find_all('td')

    title = col[2].find('a').string.strip()       # Title is the text of the link
    platform = col[3].find('img')['alt'].strip()  # Platform is the alt text of the icon

    # Cells 4..16: Publisher, Developer, VGChartz Score, Critic Score, User Score,
    # Total Shipped, Total Sales, NA Sales, PAL Sales (EU), Japan Sales, Other Sales,
    # Release Date, Last Update.
    remaining = [col[i].string.strip() for i in range(4, 17)]

    return [title, platform] + remaining


def scrape(urlhead, urltail, p_start, p_finish, file_n = 1):
    
    '''
    Given a URL head, a URL tail and the range of pages we want to scrape, it returns a dataframe with the scraped data.
    The dataframe is also printed and written to "vgsales_<file_n>.csv" in the current working directory.
    urlhead : The url of the web we want to scrape (VGChartz in this case) until 'page=?' included.
    urltail : Same as above, but everything after the page number.
    p_start : The first page we want to scrape.
    p_finish : The (last page + 1) we want to scrape, i.e. the range is [p_start, p_finish).
    file_n : The number we want to give our .csv output file (single digits are zero-padded).

    Any error raised while downloading or locating the results table (e.g. the HTTP 503 the site
    often returns) propagates to the caller; in that case nothing is written.
    Rows that cannot be parsed completely are skipped and reported with 'Got Exception'.
    '''
    
    column_names = ['Title', 'Platform', 'Publisher', 'Developer', 'VGC_Score', 'Critic_Score', 'User_Score',
                    'Total_Shipped', 'Total_Sales', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales',
                    'Release', 'Last_Update']
    
    # We collect the rows inside the function and not outside due to the high likelihood of getting
    # the 503 error, so that we don't get duplicates every time that happens.
    records = []
    
    for page in range(p_start, p_finish):
        
        surl = urlhead + str(page) + urltail
        
        # The context manager closes the HTTP response as soon as the body has been read.
        with urllib.request.urlopen(surl) as response:
            r = response.read()
        
        soup = BeautifulSoup(r, features="lxml")
        print(f"Page: {page}")

        chart = soup.find('div', id='generalBody').find('table')

        # The first three rows of the table are skipped; they are not treated as data rows.
        for row in chart.find_all('tr')[3:]:
            try:
                # Adding data only if able to read all of the columns
                records.append(_parse_row(row))

            except (AttributeError, IndexError, KeyError, TypeError):
                # Only malformed rows are skipped; anything else (e.g. KeyboardInterrupt) propagates.
                print('Got Exception')

    # To check for how many values we scraped.
    print (len(records))
    
    df = pd.DataFrame(records, columns=column_names)
    
    print(df)
    
    # Numbering the output
    
    if file_n < 10:
        file_n = '0' + str(file_n)
        
    else:
        file_n = str(file_n)
    
    filename = "vgsales_" + file_n + ".csv"
    df.to_csv(filename, sep=",", encoding='utf-8')

    print('done!')
    
    return df

---
Let's proceed with the scraping.

In [11]:
# Batch 1: result pages 1-5, written to vgsales_01.csv.
df1 = scrape(urlhead, urltail, p_start=1, p_finish=6, file_n=1)

Page: 1
Page: 2
Page: 3
Page: 4
Page: 5
5000
                               Title Platform                    Publisher  \
0                             Tetris   Series                      Unknown   
1                        Super Mario   Series                     Nintendo   
2                            Pokemon   Series                     Nintendo   
3                   Grand Theft Auto   Series               Rockstar Games   
4                               FIFA   Series                    EA Sports   
...                              ...      ...                          ...   
4995             Jampack Summer 2002      PS2  Sony Computer Entertainment   
4996                 Trivial Pursuit      Wii              Electronic Arts   
4997     FIFA World Cup Germany 2006      PS2                    EA Sports   
4998  Wreckless: ThE YaKuza MisSiOns       XB                   Activision   
4999          Anomaly: Warzone Earth       PC                      Unknown   

                  

In [4]:
# Batch 2: result pages 6-10, written to vgsales_02.csv.
df2 = scrape(urlhead, urltail, p_start=6, p_finish=11, file_n=2)

Page: 6
Page: 7
Page: 8
Page: 9
Page: 10
5000
                                                  Title Platform  \
0                            Starlink: Battle for Atlas       NS   
1                                       Super Bomberman     SNES   
2                                            Top Spin 3      PS3   
3       SpongeBob SquarePants: Battle for Bikini Bottom       XB   
4                                                    Up      Wii   
...                                                 ...      ...   
4995                                    Among The Sleep       PC   
4996  Kouchuu Ouja Mushi King: Greatest Champion e n...       DS   
4997                                     Parlor! Mini 4     SNES   
4998                                  Breath of Fire II      GBA   
4999                                          BARRIER X       PC   

             Publisher           Developer VGC_Score Critic_Score User_Score  \
0              Ubisoft     Ubisoft Toronto       N/A     

In [11]:
# Batch 3: result pages 11-15, written to vgsales_03.csv.
df3 = scrape(urlhead, urltail, p_start=11, p_finish=16, file_n=3)

Page: 11
Page: 12
Page: 13
Page: 14
Page: 15
5000
                               Title Platform                    Publisher  \
0     Danganronpa 2: Goodbye Despair      PSP                        Spike   
1                              Conan     X360                          THQ   
2            Ice Age 2: The Meltdown       GC                Vivendi Games   
3          Petz Pony: Beauty Pageant       DS                      Ubisoft   
4                        Steins;Gate      PSP              Kadokawa Shoten   
...                              ...      ...                          ...   
4995     Discovery Kids: Parrot Pals       DS                    505 Games   
4996        Styx: Shards of Darkness      PS4       Focus Home Interactive   
4997                      Torchlight       PC  Perfect World Entertainment   
4998                     Chaos;Child      PSV                        PQube   
4999        Archer Maclean's 3D Pool      GBA          Crave Entertainment   

             

In [8]:
# Batch 4: result pages 16-20, written to vgsales_04.csv.
df4 = scrape(urlhead, urltail, p_start=16, p_finish=21, file_n=4)

Page: 16
Page: 17
Page: 18
Page: 19
Page: 20
5000
                                                  Title Platform  \
0                         Tak: The Great Juju Challenge       XB   
1                                    TMNT: Mutant Melee       GC   
2                                Combat: Task Force 121       XB   
3                                             Enchanter       PC   
4                                            Speed Zone      Wii   
...                                                 ...      ...   
4995  Super Robot Monkey Team: Game Boy Advance Vide...      GBA   
4996                                         Freekstyle      GBA   
4997                                    Aa Megami-samaa      PS2   
4998                              Battle Worlds: Kronos       PC   
4999                 Bunmei Kaika: Aoiza Ibunroku Saien      PSP   

                  Publisher                             Developer VGC_Score  \
0                       THQ                    Avalanc

In [4]:
# Batch 5: result pages 21-25, written to vgsales_05.csv.
df5 = scrape(urlhead, urltail, p_start=21, p_finish=26, file_n=5)

Page: 21
Page: 22
Page: 23
Page: 24
Page: 25
5000
                                                  Title Platform  \
0                                       Groovin' Blocks      Wii   
1                             Marvel: Ultimate Alliance       PC   
2        Tomoyo After: It's a Wonderful Life CS Edition      PS2   
3                                             Flashback      PS4   
4                                        Bakumatsu Rock      PSP   
...                                                 ...      ...   
4995  Bakumatsu Rouman Dai Ni Maku: Gekka no Kenshi ...       NG   
4996                  Bakumatsu Rouman: Gekka no Kenshi       PS   
4997              Bakumatsu Rouman: Gekka no Kenshi 1-2      PS2   
4998                                   Bakuretsu Akindo       PS   
4999                                   Bakuretsu Hunter      SAT   

                    Publisher                 Developer VGC_Score  \
0                   Zoo Games        Empty Clip Studios       N/

In [4]:
# Batch 6: result pages 26-30, written to vgsales_06.csv.
df6 = scrape(urlhead, urltail, p_start=26, p_finish=31, file_n=6)

Page: 26
Page: 27
Page: 28
Page: 29
Page: 30
5000
                                                  Title Platform  \
0                                    Bakuretsu Hunter R      SAT   
1                     Bakuretsu Hunter: Mahjong Special       PS   
2     Bakuretsu Hunter: Sorezore no Omoi...Nowaan Ch...       PS   
3                             Bakuretsu Muteki Bangai-O      N64   
4                                      Bakuretsu Soccer       PS   
...                                                 ...      ...   
4995                               Donkey Kong Jr. Math       VC   
4996                               Donkey Kong Jr. Math     WiiU   
4997                                 Donkey Kong Junior     7800   
4998                              Donkey Konga 1+2 Pack       GC   
4999  Donkey Konga 3: Tabe-houdai! Haru Mogitate 50 ...       GC   

         Publisher           Developer VGC_Score Critic_Score User_Score  \
0     King Records        King Records       N/A         

In [4]:
# Batch 7: result pages 31-35, written to vgsales_07.csv.
df7 = scrape(urlhead, urltail, p_start=31, p_finish=36, file_n=7)

Page: 31
Page: 32
Page: 33
Page: 34
Page: 35
5000
                                                  Title Platform  \
0                                           Donkey Xote      PSN   
1                                           Donkey Xote      PSP   
2                                              DonPachi      SAT   
3                                              DonPachi       PS   
4                                              DonPachi      PSN   
...                                                 ...      ...   
4995                           HTR+ Slot Car Simulation       PC   
4996                Hubert the Teddy Bear: Winter Games       WW   
4997  Hudson Best Collection Vol. 1: Bomberman Colle...      GBA   
4998  Hudson Best Collection Vol. 2: Lode Runner Col...      GBA   
4999   Hudson Best Collection Vol. 3: Action Collection      GBA   

                   Publisher                 Developer VGC_Score Critic_Score  \
0                Virgin Play               Virgin Pl

In [4]:
# Batch 8: result pages 36-40, written to vgsales_08.csv.
df8 = scrape(urlhead, urltail, p_start=36, p_finish=41, file_n=8)

Page: 36
Page: 37
Page: 38
Page: 39
Page: 40
5000
                                                  Title Platform  \
0     Hudson Best Collection Vol. 4: Nazotoki Collec...      GBA   
1     Hudson Best Collection Vol. 5: Shooting Collec...      GBA   
2     Hudson Best Collection Vol. 6: Bouken Jima Col...      GBA   
3                                           Hudson Hawk       GB   
4                                           Hudson Hawk      NES   
...                                                 ...      ...   
4995                               Nainai no Kiyotantei       PS   
4996                 Nakadashi Haramase Shinyaku Chousa       PC   
4997                                       Nakadasi Oni       PC   
4998                          Nakajima Miyuki: Namiromu       PS   
4999  Nakamura Sumiko Tettei Shidou: Shin TOEIC Test...       DS   

           Publisher       Developer VGC_Score Critic_Score User_Score  \
0        Hudson Soft          Hudson       N/A          N/A

In [4]:
# Batch 9: result pages 41-45, written to vgsales_09.csv.
df9 = scrape(urlhead, urltail, p_start=41, p_finish=46, file_n=9)

Page: 41
Page: 42
Page: 43
Page: 44
Page: 45
5000
                                                  Title Platform   Publisher  \
0     Nakamura Tooru Kanshuu: Indo Shiki Keisan Dril...       DS      GungHo   
1                Nakashima Tetsunari no Othello Seminar       XB     Success   
2        Nakayoshi Cooking Series 1: Oishii Cake Okusan       GB         MTO   
3         Nakayoshi Cooking Series 2: Oishii Pan Okusan       GB         MTO   
4                           Nakayoshi Mahjong Kapurichi      GBA      Konami   
...                                                 ...      ...         ...   
4995  Sea Animal Games for Toddlers and Kids with Ji...     XOne     Ubisoft   
4996  Sea Animal Games for Toddlers and Kids with Ji...      And     Ubisoft   
4997  Sea Animal Games for Toddlers and Kids with Ji...       DS     Ubisoft   
4998  Sea Animal Games for Toddlers and Kids with Ji...    Linux     Ubisoft   
4999                                         Sea Battle       GB  Info

In [4]:
# Batch 10: result pages 46-50, written to vgsales_10.csv.
df10 = scrape(urlhead, urltail, p_start=46, p_finish=51, file_n=10)

Page: 46
Page: 47
Page: 48
Page: 49
Page: 50
5000
                                                 Title Platform  \
0                                           Sea Battle     2600   
1                                            Sea Blast      XBL   
2                                           Sea Dragon     2600   
3                                           Sea Dragon       PC   
4                                             Sea Hawk     2600   
...                                                ...      ...   
4995                                 The Sword of Hope       GB   
4996                                 The Sword of Hope      3DS   
4997                            The Tale of Despereaux       PC   
4998  The Tales of Bearsworth Manor: Chaotic Conflicts       WW   
4999     The Tales of Bearsworth Manor: Puzzling Pages       WW   

                Publisher                Developer VGC_Score Critic_Score  \
0                 Unknown                     INTV       N/A        

In [4]:
# Batch 11: result pages 51-55, written to vgsales_11.csv.
df11 = scrape(urlhead, urltail, p_start=51, p_finish=56, file_n=11)

Page: 51
Page: 52
Page: 53
Page: 54
Page: 55
5000
                                      Title Platform  \
0                       The Talos Principle    Linux   
1                       The Talos Principle      OSX   
2                       The Talos Principle      And   
3     The Tekkyu Fight! Great Battle Gaiden       GB   
4                   The TEMPURA of the DEAD      XBL   
...                                     ...      ...   
4995                        Death Stranding      PS4   
4996                        Death Stranding       PC   
4997                         Death's Gambit      PS4   
4998                              Deathloop       PC   
4999                              Deathloop      PS5   

                           Publisher           Developer VGC_Score  \
0                   Devolver Digital             Croteam       N/A   
1                   Devolver Digital             Croteam       N/A   
2                   Devolver Digital             Croteam       N/A 

In [4]:
# Batch 12: result pages 56-60, written to vgsales_12.csv.
df12 = scrape(urlhead, urltail, p_start=56, p_finish=61, file_n=12)

Page: 56
Page: 57
Page: 58
Page: 59
Page: 60
4398
                                        Title Platform  \
0                           Deathtrap Dungeon       PC   
1                              Decay of Logos     XOne   
2                              Decay of Logos      PS4   
3                              Decay of Logos       PC   
4                              Decay of Logos       NS   
...                                       ...      ...   
4393       Zombieland: Double Tap - Road Trip       PC   
4394                                Zombillie       NS   
4395  Zone of the Enders: The 2nd Runner MARS       PC   
4396   Zoo Tycoon: Ultimate Animal Collection     XOne   
4397   Zoo Tycoon: Ultimate Animal Collection       PC   

                       Publisher                   Developer VGC_Score  \
0                        Unknown        Asylum Entertainment       N/A   
1              Rising Star Games           Amplify Creations       N/A   
2              Rising Star Game

---
# Joining the datasets into a single dataset

In [2]:
# The twelve scraped batches, in scrape order.
dfs = [df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11, df12]

# Stack them with a single concat call and a fresh 0..N-1 index. Concatenating inside a
# loop would re-copy the accumulated frame on every iteration and leave a stray `df`
# variable behind in the notebook namespace.
data = pd.concat(dfs, ignore_index=True)

We know there were 59398 data points in total (5000 x 11 + 4398). Let's make sure the merged dataframe has this many rows.

In [5]:
# Number of rows in the merged dataframe.
len(data.index)

59398

Save the dataset into a `.csv` file

In [4]:
# Preview the first five rows of the merged dataframe.
data.head(n=5)

Unnamed: 0,Title,Platform,Publisher,Developer,VGC_Score,Critic_Score,User_Score,Total_Shipped,Total_Sales,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Release,Last_Update
0,Tetris,Series,Unknown,Alexey Pajitnov,,,,496.00m,,,,,,06th Jun 84,27th Feb 20
1,Super Mario,Series,Nintendo,Nintendo,,,,372.86m,,,,,,20th Jul 83,20th Feb 20
2,Pokemon,Series,Nintendo,Game Freak,,,,369.88m,,,,,,28th Sep 98,03rd Feb 20
3,Grand Theft Auto,Series,Rockstar Games,Rockstar North,,,,335.00m,,,,,,27th Mar 98,03rd Feb 20
4,FIFA,Series,EA Sports,Extended Play Productions (1991-1997),,,,325.00m,,,,,,15th Dec 93,03rd Feb 20


In [5]:
# Write the merged dataframe to disk as comma-separated UTF-8 text.
# NOTE(review): the index is written as well, so it comes back as an 'Unnamed: 0'
# column when the file is re-read with default options — confirm this is intended.
data.to_csv(
    "vgsales.csv",
    sep=",",
    encoding='utf-8',
)