In [1]:
import pandas as pd
import numpy as np
import datetime as dt

# Turn off warnings to ease reading
import warnings
warnings.filterwarnings('ignore')

# Loading datasets

In [2]:
# Load the VGChartz sales data; the first CSV column is used as the DataFrame index.
vgc_data = pd.read_csv('Scraping/vgsales.csv', index_col = 0)

In [3]:
# Load the RAWG games data; `released` and `updated` are parsed as datetimes on load.
rawg_data = pd.read_csv('Scraping/rawg_games_smaller.csv', parse_dates = ['released','updated'])

In [4]:
# Preview the first rows of the raw VGChartz data.
vgc_data.head()

Unnamed: 0,Title,Platform,Publisher,Developer,VGC_Score,Critic_Score,User_Score,Total_Shipped,Total_Sales,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Release,Last_Update
0,Tetris,Series,Unknown,Alexey Pajitnov,,,,496.00m,,,,,,06th Jun 84,27th Feb 20
1,Super Mario,Series,Nintendo,Nintendo,,,,372.86m,,,,,,20th Jul 83,20th Feb 20
2,Pokemon,Series,Nintendo,Game Freak,,,,369.88m,,,,,,28th Sep 98,03rd Feb 20
3,Grand Theft Auto,Series,Rockstar Games,Rockstar North,,,,335.00m,,,,,,27th Mar 98,03rd Feb 20
4,FIFA,Series,EA Sports,Extended Play Productions (1991-1997),,,,325.00m,,,,,,15th Dec 93,03rd Feb 20


In [5]:
# Preview the first rows of the raw RAWG data.
rawg_data.head()

Unnamed: 0,id,slug,name,released,tba,metacritic,suggestions_count,updated,platforms,genres,stores,tags,esrb_rating
0,3498,grand-theft-auto-v,Grand Theft Auto V,2013-09-17,False,97.0,416,2021-03-03,"['PC', 'Xbox Series S/X', 'PlayStation 5', 'Pl...","['Action', 'Adventure']","['Epic Games', 'PlayStation Store', 'Xbox Stor...","['Singleplayer', 'Steam Achievements', 'Multip...",Mature
1,4200,portal-2,Portal 2,2011-04-18,False,95.0,582,2020-08-03,"['Xbox One', 'PlayStation 3', 'PC', 'Xbox 360'...","['Shooter', 'Puzzle']","['Xbox Store', 'Xbox 360 Store', 'PlayStation ...","['Singleplayer', 'Steam Achievements', 'Multip...",Everyone 10+
2,3328,the-witcher-3-wild-hunt,The Witcher 3: Wild Hunt,2015-05-18,False,92.0,678,2020-10-02,"['PC', 'Xbox One', 'Nintendo Switch', 'PlaySta...","['Action', 'Adventure', 'RPG']","['GOG', 'Xbox Store', 'Steam', 'PlayStation St...","['Singleplayer', 'Atmospheric', 'Full controll...",Mature
3,5286,tomb-raider,Tomb Raider (2013),2013-03-05,False,86.0,664,2020-08-03,"['PC', 'PlayStation 4', 'PlayStation 3', 'Xbox...","['Action', 'Adventure']","['App Store', 'Google Play', 'PlayStation Stor...","['Singleplayer', 'Multiplayer', 'Atmospheric',...",Mature
4,5679,the-elder-scrolls-v-skyrim,The Elder Scrolls V: Skyrim,2011-11-11,False,94.0,621,2020-07-06,"['PC', 'PlayStation 3', 'Xbox 360', 'Nintendo ...","['Action', 'RPG']","['Xbox 360 Store', 'Nintendo Store', 'Steam', ...","['Singleplayer', 'Steam Achievements', 'steam-...",Mature


# Cleaning vgc_data

There are columns we will not use:

- Sales  columns except `Total_Shipped` and `Total_Sales`.

- `Last_Update`.

- `VGC_Score`, `Critic_Score` and `User_Score`, as we are going to get scores from Metacritic.



In [6]:
# Discard the score, regional-sales and last-update columns, which are not used.
unused_columns = [
    'VGC_Score', 'Critic_Score', 'User_Score',
    'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales',
    'Last_Update',
]
vgc_data = vgc_data.drop(columns = unused_columns)
vgc_data.head()

Unnamed: 0,Title,Platform,Publisher,Developer,Total_Shipped,Total_Sales,Release
0,Tetris,Series,Unknown,Alexey Pajitnov,496.00m,,06th Jun 84
1,Super Mario,Series,Nintendo,Nintendo,372.86m,,20th Jul 83
2,Pokemon,Series,Nintendo,Game Freak,369.88m,,28th Sep 98
3,Grand Theft Auto,Series,Rockstar Games,Rockstar North,335.00m,,27th Mar 98
4,FIFA,Series,EA Sports,Extended Play Productions (1991-1997),325.00m,,15th Dec 93


We are going to transform `Total_Shipped` and `Total_Sales` to numeric.

In [7]:
# Both columns hold strings such as '496.00m': remove the 'm' and cast to float.
for sales_col in ('Total_Shipped', 'Total_Sales'):
    vgc_data[sales_col] = vgc_data[sales_col].str.strip('m').astype('float')

---
Now we proceed to clean the `Release` column.

We can observe the following:

- There are NaNs
- Days are in the ~st ~nd ~rd ~th format
- Months are in 3 letter format
- Years have the last 2 digits format

In [8]:
# Distinct two-digit year suffixes found in `Release`, listed in descending order.
year_suffixes = vgc_data['Release'].str[-2:].dropna().unique()
sorted(year_suffixes, reverse = True)

['99',
 '98',
 '97',
 '96',
 '95',
 '94',
 '93',
 '92',
 '91',
 '90',
 '89',
 '88',
 '87',
 '86',
 '85',
 '84',
 '83',
 '82',
 '81',
 '80',
 '79',
 '78',
 '77',
 '75',
 '73',
 '71',
 '70',
 '21',
 '20',
 '19',
 '18',
 '17',
 '16',
 '15',
 '14',
 '13',
 '12',
 '11',
 '10',
 '09',
 '08',
 '07',
 '06',
 '05',
 '04',
 '03',
 '02',
 '01',
 '00']

We can see that:
- The years belonging to the 21st century range between 2000 and 2021
- The years belonging to the 20th century range between 1970 and 1999

Now we are going to split the string into the different elements to transform them into a more intelligible format and merge them later.

In [9]:
# Three-letter month abbreviation -> zero-padded month number, in calendar order.
months = {
    abbreviation: f'{number:02d}'
    for number, abbreviation in enumerate(
        ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
         'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'),
        start = 1,
    )
}

In [10]:
def date_reformat(date, month_map = None, pivot_year = None):
    """
    Convert a 'DDxx Mon YY' date string (e.g. '02nd Jul 98') into a 'dd-mm-YYYY' string ('02-07-1998').

    Parameters
    ----------
    date : str or any
        Date whose first two characters are the day, characters 5-7 the
        three-letter month abbreviation and last two characters the year.
        Anything that is not a string (NaN, None, ...) is treated as missing.
    month_map : dict, optional
        Mapping from month abbreviation to two-digit month number.
        Defaults to the notebook-level `months` dictionary.
    pivot_year : int, optional
        Largest two-digit year that is still assigned to the 21st century;
        larger values are assigned to the 20th century. Defaults to the last
        two digits of the current year. Pass it explicitly (e.g. the year the
        data was scraped) to make the conversion independent of the run date.

    Returns
    -------
    str or float
        The reformatted date, or `np.nan` when `date` is not a string.

    Raises
    ------
    KeyError
        If the month abbreviation is not a key of `month_map`.
    ValueError
        If the last two characters of `date` are not digits.
    """
    if not isinstance(date, str):
        return np.nan

    if month_map is None:
        month_map = months
    if pivot_year is None:
        pivot_year = dt.datetime.today().year % 100

    day = date[:2]
    month = month_map[date[5:8]]
    two_digit_year = date[-2:]

    # `<=` (not `<`): a release dated in the pivot year itself belongs to the
    # 21st century; with `<` a '21' suffix processed in 2021 became 1921.
    century = '20' if int(two_digit_year) <= pivot_year else '19'

    # Merge into a single string in the dd-mm-YYYY format
    return '-'.join([day, month, century + two_digit_year])

In [11]:
# Rewrite every release date as a 'dd-mm-YYYY' string (missing values stay NaN).
# NOTE: not idempotent - on a second run the already-converted strings no longer
# contain a month abbreviation at positions 5-7, so the month lookup raises KeyError.
vgc_data['Release'] = vgc_data['Release'].apply(date_reformat)  

In [12]:
# Spot-check ten random rows; the fixed seed makes the displayed sample reproducible.
vgc_data.sample(10, random_state = 42)

Unnamed: 0,Title,Platform,Publisher,Developer,Total_Shipped,Total_Sales,Release
3672,NHL 2000,PS,EA Sports,EA Canada,,0.87,30-09-1999
53273,You Have to Win the Game,PC,Minor Key Games,Unknown,,,06-05-2012
23786,American Conquest: Divided Nation,PC,CDV Software Entertainment,Unknown,,,27-02-2006
22230,4 in 1 Action Pack,PS3,SouthPeak Games,Unknown,,0.0,08-05-2012
17462,Caesar IV,PC,Sierra Entertainment,Tilted Mill Entertainment,,0.04,26-09-2006
2401,LEGO Harry Potter: Years 1-4,PS3,Warner Bros. Interactive,Traveller's Tales,,1.37,29-06-2010
12473,Venetica,PS3,Atari,DECK13 Interactive,,0.12,11-01-2011
35450,International Karate +,PS,Ignition Entertainment,Ignition Entertainment,,,01-01-2003
17608,The Raven Remastered,PS4,THQ Nordic,KING Art Games,,0.03,13-03-2018
55795,GRIS,PS4,Devolver Digital,Nomada Studio,,,26-11-2019


Now that there's nothing left to clean (in the formatting sense), let's check for NaN values to see what to do with them (as we have already seen many).

In [13]:
# Number of missing values per column.
vgc_data.isna().sum()

Title                0
Platform             0
Publisher            0
Developer           17
Total_Shipped    55967
Total_Sales      40350
Release           4156
dtype: int64

There are so many NaNs in the dataframe we have to decide what to do with them!

- We are interested in the sales, so taking non-NaN `Total_Sales` values seems logical (other Sales columns have more NaNs, so it won't affect as much).
- Also, there are titles having NaN `Total_Sales` values but non-NaN `Total_Shipped` values.

Thus, we want to combine these two columns and fill the remaining NaN values.

However, to do so, first we have to check if there is any case in which both `Total_Sales` and `Total_Shipped` are non-NaN values.

We are going to fill the NaN values with zeros (we will consider 0 as not having a value in this case) and save them to a variable.

Then, we are going to perform the comparison `!= 0` to see how many non-zero values we have in each column.

Also, we cast the result to int so that we can sum them to check what we wanted to discover:

- If **sum = 0** : Both columns have either 0 or NaN.
- If **sum = 1** : Only one column has a non-NaN, non-zero value.
- If **sum = 2** : Both columns have non-NaN, non-zero values.

In [14]:
# Ad-hoc inspection: rows whose Total_Sales equals exactly 0.34; in the output
# below Total_Shipped is NaN for every displayed row.
# NOTE(review): the result is not assigned, so no later cell depends on it - consider removing this cell.
vgc_data[vgc_data['Total_Sales'] == 0.34]

Unnamed: 0,Title,Platform,Publisher,Developer,Total_Shipped,Total_Sales,Release
7099,Kurushi Final: Mental Blocks,PS,Sony Computer Entertainment,SCEI,,0.34,01-08-1999
7100,Risk / Battleship / Clue,GBA,DSI Games,Gravity-I,,0.34,21-08-2005
7101,Mini Ninjas,PS3,Eidos Interactive,IO Interactive,,0.34,08-09-2009
7102,Pirates of the Caribbean: At World's End,DS,Disney Interactive Studios,Amaze Entertainment,,0.34,22-05-2007
7103,SD Gundam G Generation World,PSP,Namco Bandai,Tom Create,,0.34,24-02-2011
...,...,...,...,...,...,...,...
7213,Pirates of the Caribbean: At World's End,X360,Disney Interactive Studios,Eurocom Entertainment Software,,0.34,22-05-2007
7214,Delta Force: Black Hawk Down,XB,NovaLogic,Climax Group,,0.34,26-07-2005
7216,NHL 09,PS3,Electronic Arts,EA Canada,,0.34,09-09-2008
7217,Shaun White Snowboarding,PSP,Ubisoft,Ubisoft Montreal,,0.34,16-11-2008


In [15]:
# Zero-filled copies of both columns (0 stands for "no value" here); vgc_data itself is untouched.
total_sales, total_shipped = (
    vgc_data[column].fillna(0) for column in ('Total_Sales', 'Total_Shipped')
)

In [16]:
# Per row, count how many of the two columns hold a non-zero value (0, 1 or 2).
has_sales = total_sales.ne(0)
has_shipped = total_shipped.ne(0)
zero_test = has_sales.astype(int) + has_shipped.astype(int)
zero_test.value_counts()

0    38279
1    21119
dtype: int64

This means that only one of the columns will hold a non-NaN or a non-zero value for the same data point.

In [17]:
# Count rows by label instead of unpacking value_counts() positionally: value_counts()
# is ordered by frequency, so the first entry is only "the zeros" while zeros are the
# majority, and the unpacking fails outright if any row has both columns populated.
zeros = int((zero_test == 0).sum())
ones = int((zero_test == 1).sum())

# Fraction of rows with no usable value in either column.
zeros / len(zero_test)

0.6444493080575103

And this means that 64.45% of the dataset would have zeros or NaNs in these two columns combined. 

Thus, in order to combine both columns and fill the NaNs, we will have to do the following:

- Save both columns to separate variables filling the NaNs with 0 (we can use the variables we used above).

- Perform a column-to-column addition.

- Remove the zeros.

- Get the median value.

- Sum `Total_Sales` and `Total_Shipped`, fill NaNs with the median value we calculated and save to a new column: `Sales`.

In [18]:
# Per-title figure: sales plus shipments (at most one of the two is non-zero per row).
combined = total_sales + total_shipped

# Labels of the rows without a positive figure; they are left out of the median.
idx_zeros = combined.index[combined <= 0].tolist()
combined = combined.loc[~combined.le(0)]
med = combined.median()

# Median value we will use to fill the NaNs.
med

0.17

In [19]:
# Row-wise sum of the two columns; min_count = 1 keeps NaN where both are missing,
# and those rows then receive the median computed above.
sales_sum = vgc_data[['Total_Shipped','Total_Sales']].sum(axis = 1, min_count = 1)
vgc_data['Sales'] = sales_sum.fillna(med)

# The two source columns are now redundant.
vgc_data = vgc_data.drop(columns = ['Total_Shipped','Total_Sales'])

In [20]:
# Spot-check ten random rows; the fixed seed makes the displayed sample reproducible.
vgc_data.sample(10, random_state = 42)

Unnamed: 0,Title,Platform,Publisher,Developer,Release,Sales
596,Final Fantasy X-2,PS2,Square Enix,Square Enix,18-11-2003,5.5
40750,No Fear Downhill Mountain Biking,GB,THQ,Codemasters,02-11-2001,0.17
33665,Grand Theft Auto IV: The Ballad of Gay Tony,XBL,Rockstar Games,Rockstar North,29-10-2009,0.17
23554,Alien,ACPC,Sega AM7,Concept Software,01-01-1985,0.17
39300,Mobile Suit Gundam Seed: Tomo to Kimi to Koko de,GBA,Bandai,Bandai,13-05-2004,0.17
53911,ACA NEOGEO SAMURAI SHODOWN III,PS4,Hamster Corporation,SNK Corporation,19-04-2018,0.17
59069,Virginia,OSX,Unknown,Variable State,,0.17
35969,Joe Montana Football,PC,Sega,MindSpan,01-01-1990,0.17
53529,Zoids Saga Fuzors,GBA,Tomy Corporation,Amedio,16-12-2004,0.17
28095,Command & Conquer: Covert Operations,PC,Virgin Interactive,Westwood Studios,30-04-1996,0.17


---

We will strip `vgc_data` from the IPs (`Platform` = `Series`) and save them into a separate dataframe in case we happen to use it in the future.

In [21]:
# Empty placeholder DataFrame.
# NOTE(review): `vgc_ips` is not referenced by any other cell shown here (the IP rows
# end up in `ips`) - confirm whether it is still needed.
vgc_ips = pd.DataFrame()

In [22]:
# Rows whose Platform is 'Series', i.e. the IP (franchise-level) entries.
is_series = vgc_data['Platform'].eq('Series')
ips = vgc_data.loc[is_series]

In [23]:
# Platform is 'Series' for every row of `ips`, so the column carries no information.
ips = ips.drop(columns = ['Platform'])
ips

Unnamed: 0,Title,Publisher,Developer,Release,Sales
0,Tetris,Unknown,Alexey Pajitnov,06-06-1984,496.00
1,Super Mario,Nintendo,Nintendo,20-07-1983,372.86
2,Pokemon,Nintendo,Game Freak,28-09-1998,369.88
3,Grand Theft Auto,Rockstar Games,Rockstar North,27-03-1998,335.00
4,FIFA,EA Sports,Extended Play Productions (1991-1997),15-12-1993,325.00
...,...,...,...,...,...
56413,Lumines,Unknown,Q Entertainment,,0.17
57160,Perfect Dark,Unknown,Rare,,0.17
57454,Red Steel,Unknown,Ubisoft Paris,,0.17
57939,Sly Cooper,Unknown,Sucker Punch,,0.17


In [24]:
# Persist the IP rows; index=False means the original row labels are not written.
ips.to_csv("01_vgc_ips.csv", encoding='utf-8', index=False)

In [25]:
# Index labels of the 'Series' (IP) rows.
ip_idx = vgc_data.index[vgc_data['Platform'] == 'Series'].tolist()

In [26]:
# Remove the 'Series' (IP) rows. Filtering with a mask keeps the cell idempotent:
# dropping by stored labels raises KeyError when the cell is run a second time.
vgc_data = vgc_data[vgc_data['Platform'] != 'Series']

We will save the result into a csv.

In [27]:
# Persist the cleaned VGChartz data; index=False means the original row labels are not written.
vgc_data.to_csv("01_vgc_clean.csv", encoding='utf-8', index=False)

## Some observations on vgc_data

### No. of Platforms

In [28]:
# Number of titles per platform; the reported Length is the number of distinct platforms.
vgc_data['Platform'].value_counts()

PC      11653
PS2      3566
DS       3288
PS       2706
PS4      2255
        ...  
CD32        3
S32X        3
BBCM        1
Aco         1
C128        1
Name: Platform, Length: 78, dtype: int64

There are **77 platforms**, not counting `All`, which refers to 'multiplatform' (the `Series` rows, which refer to IPs, were already removed above).

---

### No. of Publishers

In [29]:
# Number of titles per publisher; the reported Length is the number of distinct publishers.
vgc_data['Publisher'].value_counts()

Unknown                5948
Sega                   2130
Ubisoft                1583
Electronic Arts        1532
Activision             1530
                       ... 
Culture Publishers        1
Simpy Entertainment       1
Dan Da Rocha              1
Rogue Play, Inc.          1
Logic Artists             1
Name: Publisher, Length: 3295, dtype: int64

There are **3294 publishers** (the remaining value is `Unknown`)

---

### No. of Developers

In [30]:
# Number of titles per developer; the reported Length is the number of distinct
# developers (value_counts leaves NaN entries out by default).
vgc_data['Developer'].value_counts()

Unknown                  4433
Konami                    932
Sega                      865
Capcom                    763
Namco                     427
                         ... 
Trans-Pegasus Limited       1
Mark Healey                 1
Case in Point Studios       1
JDPCreations                1
Punchline                   1
Name: Developer, Length: 8420, dtype: int64

There are **8420 distinct developers** in total (including `Unknown`; NaN entries are not counted by `value_counts`).

---

### **Decisions yet to make**
---

- **What do we do with the empty 38279 `Sales` values?**

    - Discard them.
    - Fill them with values -> which values?
    
    
- **What do we do with the 4156 NaN values from the `Release` column?**
    
    - If the release date is unknown it is highly likely that the title is very old (we should check them).
    
    - If they are very old games, we should discard them, as they will not hold much information.
    
    - If it is an IP, it will depend on what we decide to do with `All` platform.
    
    
- **What do we do with the 17 NaN values from the `Developer` column?**

    - Probably same as `Release` column.


- **What do we do with the games classified as `All` and `Series` in the `Platform` column?**

    - The sum of sales in the different platforms does not usually match the `All` and `Series` total sales.
    
    - We have to take into account that the RAWG dataset has platforms grouped from the beginning and that there is no `All` or `Series` category.
        - Should we unpack those rows? -> it already has >500k data points so it does not seem to be the most intelligent thing to do.
        

- **How are we going to transform these categorical variables into features (numerical/boolean)?**

    - Platform: There are 77+1 platforms. The dimensionality is too high for the usual One-Hot Encoding.
    
    - Publisher and Developer: Their number of distinct values is about 2 orders of magnitude above the number of platforms, so we have to decide how to handle them to avoid the *Curse of Dimensionality*.

---

# Cleaning rawg_data

Let's see how many games are in the TBA state (`tba` column).

In [31]:
# Titles currently flagged as "to be announced".
is_tba = rawg_data['tba'].eq(True)
rawg_data[is_tba]

Unnamed: 0,id,slug,name,released,tba,metacritic,suggestions_count,updated,platforms,genres,stores,tags,esrb_rating
1581,303576,vampire-the-masquerade-bloodlines-2,Vampire: The Masquerade - Bloodlines 2,2022-12-31,True,,563,2021-02-23,"['PC', 'PlayStation 4', 'PlayStation 5', 'Xbox...","['Action', 'RPG']","['Steam', 'Epic Games', 'GOG']","['Singleplayer', 'Steam Achievements', 'Atmosp...",
1999,25829,gauntlet,Gauntlet,2009-02-28,True,,541,2019-09-17,"['Android', 'PlayStation 4', 'PC', 'Nintendo DS']","['Action', 'RPG']","['Google Play', 'Nintendo Store']","['Multiplayer', 'combat', 'online', 'death', '...",Everyone 10+
2687,272483,whispers-of-a-machine,Whispers of a Machine,2019-04-16,True,77.0,431,2019-09-10,"['iOS', 'macOS', 'PC']","['Adventure', 'Indie', 'Puzzle']","['App Store', 'Steam', 'GOG']","['Singleplayer', 'Steam Achievements', 'Atmosp...",
3034,58751,halo-infinite,Halo Infinite,2021-12-31,True,,539,2020-11-29,"['PC', 'Xbox Series S/X', 'Xbox One']","['Action', 'Shooter', 'Adventure']",Steam,"['Singleplayer', 'Multiplayer', 'Atmospheric',...",
3295,11025,totally-accurate-battle-simulator,Totally Accurate Battle Simulator,NaT,True,,208,2020-08-11,PC,"['Strategy', 'Simulation', 'Indie']","['Epic Games', 'Steam']","['Singleplayer', 'Multiplayer', 'Open World', ...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...
513054,43660,anne,A.N.N.E.,NaT,True,,206,2019-01-09,"['PC', 'macOS', 'Nintendo Switch', 'Linux']","['Action', 'Adventure', 'Platformer']",,,
514307,41141,jack-claw,Jack Claw,NaT,True,,385,2020-02-16,PC,Action,,,
514898,39758,bonk-brink-of-extinction,Bonk: Brink of Extinction,NaT,True,,462,2020-06-19,"['Xbox 360', 'Wii', 'PlayStation 3']",Arcade,,,
514999,39514,pirates-of-the-caribbean-armada-of-the-damned,Pirates of the Caribbean: Armada of the Damned,NaT,True,,665,2021-01-09,"['Xbox 360', 'PlayStation 3', 'PC']","['Action', 'RPG']",,,


There are 2435 titles in the TBA state. However, we can see that there are games in that state whose `released` date is **in the past**.

Let's check how many future releases are in this dataset, using Feb 28th as the delimiter for 'future', as our VGChartz dataset has been retrieved that day.

If we get to improve the scraper in the future, we should save the date with something like the following:

```
today = dt.datetime.today().strftime("%Y-%m-%d")
```

In [32]:
# Titles released on or after the day the VGChartz data was retrieved (2021-02-28).
retrieval_date = pd.Timestamp('2021-02-28')
rawg_data[rawg_data['released'].ge(retrieval_date)]

Unnamed: 0,id,slug,name,released,tba,metacritic,suggestions_count,updated,platforms,genres,stores,tags,esrb_rating
1581,303576,vampire-the-masquerade-bloodlines-2,Vampire: The Masquerade - Bloodlines 2,2022-12-31,True,,563,2021-02-23,"['PC', 'PlayStation 4', 'PlayStation 5', 'Xbox...","['Action', 'RPG']","['Steam', 'Epic Games', 'GOG']","['Singleplayer', 'Steam Achievements', 'Atmosp...",
3010,45775,biomutant,Biomutant,2021-05-25,False,,476,2021-01-26,"['PlayStation 4', 'Xbox One', 'PC']","['Action', 'RPG']","['GOG', 'Steam']","['Singleplayer', 'Steam Achievements', 'steam-...",
3034,58751,halo-infinite,Halo Infinite,2021-12-31,True,,539,2020-11-29,"['PC', 'Xbox Series S/X', 'Xbox One']","['Action', 'Shooter', 'Adventure']",Steam,"['Singleplayer', 'Multiplayer', 'Atmospheric',...",
3255,287342,subnautica-below-zero,Subnautica: Below Zero,2021-05-14,False,,362,2021-02-25,"['macOS', 'PC', 'Nintendo Switch']","['Adventure', 'Indie']","['Nintendo Store', 'Epic Games', 'Steam']","['Singleplayer', 'Open World', 'First-Person',...",Everyone 10+
3585,463723,far-cry-6,Far Cry 6,2021-12-31,False,,628,2021-02-15,"['PlayStation 5', 'Xbox Series S/X', 'PlayStat...","['Action', 'Shooter']","['Epic Games', 'Xbox Store', 'PlayStation Store']","['Multiplayer', 'Atmospheric', 'Story Rich', '...",Rating Pending
...,...,...,...,...,...,...,...,...,...,...,...,...,...
516096,35712,game-110,Исход,2030-01-01,False,,378,2019-01-09,PC,RPG,,,
516244,35133,atriarch,Atriarch,2030-01-01,False,,497,2019-01-09,PC,"['RPG', 'Strategy', 'Massively Multiplayer']",,,
516325,34593,french-street-racing,French Street Racing,2030-01-01,False,,377,2019-01-09,PC,"['Racing', 'Arcade']",,,
516344,32973,adellion,Adellion,2030-01-01,False,,340,2019-01-09,PC,"['RPG', 'Massively Multiplayer']",,,


We can see that there are far more games that should have the `tba` flag set but don't. Thus, we can drop the `tba` column, as it gives inaccurate information.

Furthermore:

- We will also drop `id` as it's only useful if we are going to do requests to the API, but we are not allowed to do more than 20.000 requests a month, so nothing we can do with a dataset with >500k ids. 

- `updated` is a useless column, as our goal is not to evaluate the effect of updates on a game's success (if we want to predict whether a game will be a success, it shouldn't even have an update yet).

- We will keep `slug` even though it seems useless, as the Metacritic page for games follows the `https://www.metacritic.com/game/PLATFORM/SLUG` format.

In [33]:
# `id`, `tba` and `updated` are not needed for the analysis.
rawg_data = rawg_data.drop(columns = ['id', 'tba', 'updated'])
rawg_data.head()

Unnamed: 0,slug,name,released,metacritic,suggestions_count,platforms,genres,stores,tags,esrb_rating
0,grand-theft-auto-v,Grand Theft Auto V,2013-09-17,97.0,416,"['PC', 'Xbox Series S/X', 'PlayStation 5', 'Pl...","['Action', 'Adventure']","['Epic Games', 'PlayStation Store', 'Xbox Stor...","['Singleplayer', 'Steam Achievements', 'Multip...",Mature
1,portal-2,Portal 2,2011-04-18,95.0,582,"['Xbox One', 'PlayStation 3', 'PC', 'Xbox 360'...","['Shooter', 'Puzzle']","['Xbox Store', 'Xbox 360 Store', 'PlayStation ...","['Singleplayer', 'Steam Achievements', 'Multip...",Everyone 10+
2,the-witcher-3-wild-hunt,The Witcher 3: Wild Hunt,2015-05-18,92.0,678,"['PC', 'Xbox One', 'Nintendo Switch', 'PlaySta...","['Action', 'Adventure', 'RPG']","['GOG', 'Xbox Store', 'Steam', 'PlayStation St...","['Singleplayer', 'Atmospheric', 'Full controll...",Mature
3,tomb-raider,Tomb Raider (2013),2013-03-05,86.0,664,"['PC', 'PlayStation 4', 'PlayStation 3', 'Xbox...","['Action', 'Adventure']","['App Store', 'Google Play', 'PlayStation Stor...","['Singleplayer', 'Multiplayer', 'Atmospheric',...",Mature
4,the-elder-scrolls-v-skyrim,The Elder Scrolls V: Skyrim,2011-11-11,94.0,621,"['PC', 'PlayStation 3', 'Xbox 360', 'Nintendo ...","['Action', 'RPG']","['Xbox 360 Store', 'Nintendo Store', 'Steam', ...","['Singleplayer', 'Steam Achievements', 'steam-...",Mature


In [34]:
# Row count before removing the unidentifiable titles.
len(rawg_data)

521613

## Some observations on rawg_data

### No. of non-NaN values

In [35]:
# Number of populated (non-NaN) values per column.
rawg_data.notna().sum()

slug                 521611
name                 521611
released             497005
metacritic             4926
suggestions_count    521613
platforms            517536
genres               405457
stores               491543
tags                 459545
esrb_rating           57830
dtype: int64

We can see that there are very few `metacritic` values available, as well as kind of few values on `esrb_rating`.

We may opt to drop this column (`metacritic`) after retrieving the scores from the Metacritic web using a scraper.

---

### No. of NaN values

In [36]:
# Number of missing values per column (complement of the counts above).
rawg_data.isna().sum()

slug                      2
name                      2
released              24608
metacritic           516687
suggestions_count         0
platforms              4077
genres               116156
stores                30070
tags                  62068
esrb_rating          463783
dtype: int64

---

We are dropping the 2 rows with `slug` and `name` missing, as we won't know which games they are.

In [37]:
# Rows without a slug; in the output below they also lack a name.
rawg_data[rawg_data['slug'].isna()]

Unnamed: 0,slug,name,released,metacritic,suggestions_count,platforms,genres,stores,tags,esrb_rating
34946,,,2018-01-09,,72,"['PC', 'macOS', 'Linux']","['Adventure', 'Puzzle']",itch.io,"['2D', 'Pixel Graphics', 'Short', 'Multiple En...",
467110,,,2016-09-25,,67,PC,Adventure,itch.io,"['Pixel Graphics', 'Point & Click']",


In [38]:
# Remove the titles that cannot be identified (missing `slug` or `name`).
# Selecting them by content rather than by hard-coded row labels keeps the cell
# valid if the source file changes and lets it be re-run without a KeyError.
rawg_data = rawg_data.dropna(subset = ['slug', 'name'])

In [39]:
# Row count after removing the unidentifiable titles.
len(rawg_data)

521611

Saving the dataset into a csv.

In [40]:
# Persist the cleaned RAWG data; index=False means the original row labels are not written.
rawg_data.to_csv("01_rawg_clean.csv", encoding='utf-8', index=False)

---
### **Decisions yet to make**
---

- How are we going to merge this dataset with vgc_data?

    - We are going to lowercase `name` and `title`, and if they match, they are added to the same row.
    
        - There are problems with the `platforms`, `genres`, `stores`, and `tags` columns.
        
        - Especially `platforms`, as vgc_data is separated into different platforms.
        

- How are we going to unpack the list-like columns? Are we going to use One-Hot Encoding?

    - We still **DON'T KNOW**


- What are we going to do with NaN values in some columns after the merge?

    - We still **DON'T KNOW**

In [41]:
# Titles without a release date, kept in `a` for the exploratory cells that follow.
# NOTE(review): `a` is an uninformative name; a rename would have to cover every cell that uses it.
a = vgc_data[vgc_data['Release'].isna()]

In [42]:
# Display the titles without a release date.
a

Unnamed: 0,Title,Platform,Publisher,Developer,Release,Sales
96,Call of Duty: Modern Warfare,All,Unknown,Infinity Ward,,30.00
117,Human: Fall Flat,All,Unknown,No Brakes Games,,25.00
134,Borderlands 2,All,Unknown,Gearbox Software,,22.00
135,Call of Duty: Modern Warfare 2,All,Unknown,Infinity Ward,,22.00
146,Fallout 4,All,Unknown,Bethesda Game Studios,,20.00
...,...,...,...,...,...,...
59375,Yuppie Psycho,PS4,Unknown,Baroque Decay,,0.17
59377,Zarvot,NS,Unknown,Snowhydra Games,,0.17
59383,Zombeer,WiiU,Unknown,Padaone Games,,0.17
59387,Zombie Army Trilogy,NS,Unknown,Rebellion Developments,,0.17


In [43]:
# Titles without a release date whose Platform is 'All'.
a[a['Platform']=='All']

Unnamed: 0,Title,Platform,Publisher,Developer,Release,Sales
96,Call of Duty: Modern Warfare,All,Unknown,Infinity Ward,,30.00
117,Human: Fall Flat,All,Unknown,No Brakes Games,,25.00
134,Borderlands 2,All,Unknown,Gearbox Software,,22.00
135,Call of Duty: Modern Warfare 2,All,Unknown,Infinity Ward,,22.00
146,Fallout 4,All,Unknown,Bethesda Game Studios,,20.00
...,...,...,...,...,...,...
58487,The Elder Scrolls IV: Oblivion,All,Unknown,Bethesda Game Studios,,0.17
58810,Torchlight II,All,Unknown,Runic Games,,0.17
59119,Warriors Orochi 2,All,Unknown,Omega Force,,0.17
59121,Warriors Orochi Z,All,Unknown,Omega Force,,0.17


In [44]:
# Titles without a release date that have a positive Sales figure
# (this includes rows that were filled with the median).
a[a['Sales']>0]

Unnamed: 0,Title,Platform,Publisher,Developer,Release,Sales
96,Call of Duty: Modern Warfare,All,Unknown,Infinity Ward,,30.00
117,Human: Fall Flat,All,Unknown,No Brakes Games,,25.00
134,Borderlands 2,All,Unknown,Gearbox Software,,22.00
135,Call of Duty: Modern Warfare 2,All,Unknown,Infinity Ward,,22.00
146,Fallout 4,All,Unknown,Bethesda Game Studios,,20.00
...,...,...,...,...,...,...
59375,Yuppie Psycho,PS4,Unknown,Baroque Decay,,0.17
59377,Zarvot,NS,Unknown,Snowhydra Games,,0.17
59383,Zombeer,WiiU,Unknown,Padaone Games,,0.17
59387,Zombie Army Trilogy,NS,Unknown,Rebellion Developments,,0.17
