In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt

from ast import literal_eval
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
data = pd.read_csv('04_merged_data.csv', parse_dates = ['Release'])

data['Genres'] = data['Genres'].apply(lambda x: literal_eval(x) if pd.notnull(x) and ('[' in x) else x)
data['Stores'] = data['Stores'].apply(lambda x: literal_eval(x) if pd.notnull(x) and ('[' in x) else x)
data['Tags'] = data['Tags'].apply(lambda x: literal_eval(x) if pd.notnull(x) and ('[' in x) else x)

## One-Hot Encoding *Genres*, *Stores* and *ESRB* column

As `Genres` and `Stores` contain list-like values, we cannot just one-hot encode it with `pd.get_dummy()`.

On the contrary, `ESRB` doesn't have this problem as it doesn't contain lists, thus we can use `pd.get_dummy()`.

We are going to one-hot encode it using `MultiLabelBinarizer` from sklearn.

But first, we have to convert the string values to lists, as it's needed for MLB to work properly.

We are going to create an auxiliar dataframe copying the columns we want to make the operations easier.

In [3]:
def unnest(a_col):
    
    col = a_col
    result = list()
        
    if type(col) == list: 
        
        # if a list we return the list itself
        result = (col)

    else:
        
        # if not a list we return the value converted to list (so that we can use MLB)
        result.append(col) 
        
    return result

In [4]:
data[['Genres','Stores']] = data[['Genres','Stores']].applymap(unnest, na_action = 'ignore')
data

Unnamed: 0,Title,Platform,Publisher,Developer,Release,Sales,Suggest_count,Genres,Stores,Tags,...,C_Score,C_Reviews,C_Positive,C_Mixed,C_Negative,U_Score,U_Reviews,U_Positive,U_Mixed,U_Negative
0,"""Nuke It""",PC,CrystalVision,CrystalVision,1998-01-01,0.17,,,,,...,,,,,,,,,,
1,#IDARB,XOne,Other Ocean Interactive,Other Ocean Interactive,2015-01-02,0.17,182.0,,[Xbox Store],"[Music, Story, battle, future, strange, ball, ...",...,77.0,31.0,23.0,8.0,0.0,69.0,88.0,47.0,26.0,15.0
2,#killallzombies,PS4,Beatshapers,Beatshapers,2015-01-01,0.17,,,,,...,,,,,,,,,,
3,'70s Robot Anime: Geppy-X,PS,Aroma,Aroma,1999-05-27,0.17,,,,,...,,,,,,,,,,
4,'98 Koshien,PS,Magical Company,Magical Company,1998-06-18,0.41,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57846,XCOM: Enemy Unknown,,Feral Interactive,Firaxis Games,2014-06-19,0.17,,,,,...,,,,,,,,,,
57847,Yakuza: Like A Dragon,,Sega,Sega,2020-10-11,0.17,,,,,...,,,,,,,,,,
57848,Yakuza: Like A Dragon,,Sega,Sega,2020-01-12,0.17,,,,,...,,,,,,,,,,
57849,Zaxxon,,Coleco,Coleco,1983-01-01,0.17,,,,,...,,,,,,,,,,


Now we are going to separate the values.

In [5]:
mlb = MultiLabelBinarizer()


data[['Genres','Stores']] = data[['Genres','Stores']].fillna('')

# Genre #############################################################################################

X = mlb.fit_transform(data.pop('Genres'))

data = data.join(pd.DataFrame(X, index=data.index, columns=['G_' + col for col in mlb.classes_]))

    # We add another column for when there is no genre associated.

data['G_Other'] = data.filter(like ='G_', axis=1)\
                      .apply(lambda x: 1 if sum(x) <= 0 else 0, axis =1)

# Stores #############################################################################################

X = mlb.fit_transform(data.pop('Stores'))

data = data.join(pd.DataFrame(X, index=data.index, columns=['S_' + col for col in mlb.classes_]))

In [6]:
data

Unnamed: 0,Title,Platform,Publisher,Developer,Release,Sales,Suggest_count,Tags,ESRB,C_Score,...,S_App Store,S_Epic Games,S_GOG,S_Google Play,S_Nintendo Store,S_PlayStation Store,S_Steam,S_Xbox 360 Store,S_Xbox Store,S_itch.io
0,"""Nuke It""",PC,CrystalVision,CrystalVision,1998-01-01,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
1,#IDARB,XOne,Other Ocean Interactive,Other Ocean Interactive,2015-01-02,0.17,182.0,"[Music, Story, battle, future, strange, ball, ...",Everyone,77.0,...,0,0,0,0,0,0,0,0,1,0
2,#killallzombies,PS4,Beatshapers,Beatshapers,2015-01-01,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
3,'70s Robot Anime: Geppy-X,PS,Aroma,Aroma,1999-05-27,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
4,'98 Koshien,PS,Magical Company,Magical Company,1998-06-18,0.41,,,,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57846,XCOM: Enemy Unknown,,Feral Interactive,Firaxis Games,2014-06-19,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
57847,Yakuza: Like A Dragon,,Sega,Sega,2020-10-11,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
57848,Yakuza: Like A Dragon,,Sega,Sega,2020-01-12,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
57849,Zaxxon,,Coleco,Coleco,1983-01-01,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0


---

Now we proceed to One-hot encode the `ESRB` column with `pd.get_dummies` directly.

We will put `Rating Pending` to `NaN` because it gives the same information: None.

In [7]:
data['ESRB'] = data['ESRB'].map(lambda x: np.nan if x == 'Rating Pending' else x)
esrb = pd.get_dummies(data['ESRB'], prefix = 'ESRB')
data = data.join(esrb)

We also drop `Genres`, `Stores` and `ESRB` as they are no longer of use for us.

In [8]:
data.drop('ESRB', axis = 1, inplace = True)
data

Unnamed: 0,Title,Platform,Publisher,Developer,Release,Sales,Suggest_count,Tags,C_Score,C_Reviews,...,S_PlayStation Store,S_Steam,S_Xbox 360 Store,S_Xbox Store,S_itch.io,ESRB_Adults Only,ESRB_Everyone,ESRB_Everyone 10+,ESRB_Mature,ESRB_Teen
0,"""Nuke It""",PC,CrystalVision,CrystalVision,1998-01-01,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
1,#IDARB,XOne,Other Ocean Interactive,Other Ocean Interactive,2015-01-02,0.17,182.0,"[Music, Story, battle, future, strange, ball, ...",77.0,31.0,...,0,0,0,1,0,0,1,0,0,0
2,#killallzombies,PS4,Beatshapers,Beatshapers,2015-01-01,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
3,'70s Robot Anime: Geppy-X,PS,Aroma,Aroma,1999-05-27,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
4,'98 Koshien,PS,Magical Company,Magical Company,1998-06-18,0.41,,,,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57846,XCOM: Enemy Unknown,,Feral Interactive,Firaxis Games,2014-06-19,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
57847,Yakuza: Like A Dragon,,Sega,Sega,2020-10-11,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
57848,Yakuza: Like A Dragon,,Sega,Sega,2020-01-12,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
57849,Zaxxon,,Coleco,Coleco,1983-01-01,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0


We drop `S_Google Play` and `S_App Store` as we don't want them.

In [9]:
data = data.drop(['S_Google Play', 'S_App Store'], axis = 1)

We also drop rows where `Platform` = `All`

In [10]:
all_plat_idx = list(data[data['Platform']=='All'].index)

data = data.drop(index = all_plat_idx).reset_index(drop=True)
data

Unnamed: 0,Title,Platform,Publisher,Developer,Release,Sales,Suggest_count,Tags,C_Score,C_Reviews,...,S_PlayStation Store,S_Steam,S_Xbox 360 Store,S_Xbox Store,S_itch.io,ESRB_Adults Only,ESRB_Everyone,ESRB_Everyone 10+,ESRB_Mature,ESRB_Teen
0,"""Nuke It""",PC,CrystalVision,CrystalVision,1998-01-01,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
1,#IDARB,XOne,Other Ocean Interactive,Other Ocean Interactive,2015-01-02,0.17,182.0,"[Music, Story, battle, future, strange, ball, ...",77.0,31.0,...,0,0,0,1,0,0,1,0,0,0
2,#killallzombies,PS4,Beatshapers,Beatshapers,2015-01-01,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
3,'70s Robot Anime: Geppy-X,PS,Aroma,Aroma,1999-05-27,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
4,'98 Koshien,PS,Magical Company,Magical Company,1998-06-18,0.41,,,,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57209,XCOM: Enemy Unknown,,Feral Interactive,Firaxis Games,2014-06-19,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
57210,Yakuza: Like A Dragon,,Sega,Sega,2020-10-11,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
57211,Yakuza: Like A Dragon,,Sega,Sega,2020-01-12,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
57212,Zaxxon,,Coleco,Coleco,1983-01-01,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0


---
## Encoding: **Tags**

We first need to locate the top 20 values for tags and decide if any of them should be removed.

In [11]:
def tag_count(x):
    
    result = list()
    
    for row in x:

        if type(row) == list:
            result = result + row
            
    result = pd.Series(result)
                
    return result

We also want to convert tags to lowercase in case there are same tags differing in capitals.

In [12]:
def tag_lower(x):
    
    result = list()
    
    if type(x) == list:
        for element in x:

            result.append(element.lower())
    else:
        result = x.lower()
    
    return result

In [13]:
data['Tags'] = data['Tags'].map(tag_lower, na_action = 'ignore')

In [14]:
tags = tag_count(data.Tags)
tags.value_counts()[:20]

singleplayer                  9844
multiplayer                   5422
steam achievements            5170
full controller support       3568
steam cloud                   3297
steam-trading-cards           3079
rpg                           2096
exclusive                     2092
great soundtrack              2008
atmospheric                   2003
co-op                         1851
true exclusive                1850
2d                            1816
cooperative                   1661
partial controller support    1593
story rich                    1364
open world                    1308
steam leaderboards            1301
horror                        1231
split screen                  1200
dtype: int64

Filtering out the steam-exclusive and other strange top values.

In [15]:
out = ['steam', 'exclusive', 'controller']

for word in out:
    
    tags = tags.map(lambda x: x if word not in x.lower() else np.nan).dropna()

In [16]:
tags.value_counts()[:20]

singleplayer          9844
multiplayer           5422
rpg                   2096
great soundtrack      2008
atmospheric           2003
co-op                 1851
2d                    1816
cooperative           1661
story rich            1364
open world            1308
horror                1231
split screen          1200
first-person          1131
sci-fi                1084
third person          1042
fantasy               1041
funny                  998
female protagonist     937
local co-op            924
difficult              921
dtype: int64

What we can see here is that there are several tags that are redundant such as `co-op` and `cooperative`.

Let's check the top 50 values to see if we can replace some of them into the same category:

In [17]:
tags.value_counts()[:50]

singleplayer          9844
multiplayer           5422
rpg                   2096
great soundtrack      2008
atmospheric           2003
co-op                 1851
2d                    1816
cooperative           1661
story rich            1364
open world            1308
horror                1231
split screen          1200
first-person          1131
sci-fi                1084
third person          1042
fantasy               1041
funny                  998
female protagonist     937
local co-op            924
difficult              921
local multiplayer      862
online co-op           854
role-playing           842
fps                    821
retro                  813
comedy                 809
gore                   805
classic                799
exploration            752
online multiplayer     747
pixel graphics         705
survival               699
online                 684
anime                  683
violent                664
battle                 663
sandbox                612
p

We can see other redundant tags or tags that could be interpreted as another:

- `rpg` > drop : We already have a genre for that. We will remove it when doing the `value_counts()`.

- `cooperative` > `co-op` : They are exactly the same thing.

- `local co-op` & `local multiplayer` & `split screen` > `multiplayer` : Because the nature of the game is multiplayer.

- `online co-op` & `online multiplayer` > `online` : Because the most important aspect here is the online part over the multiplyer nature.

- `comedy` > `funny` : They are the same thing.

- `space` > `sci-fi` : We can consider them the same thing.

- `role-playing` > `rpg` : They are the same thing.

- `classic` > `retro` : We can consider them the same thing.

- `story` > `story rich` : We can consider them the same thing.

- `zombies` > `horror` : Horror includes zombies.

- `gore` & `war` > `violent` : They are inherently the same.

In [18]:
## We replace the tags we discussed above

tags = tags.replace('cooperative','co-op')\
           .replace('local co-op', 'multiplayer')\
           .replace('local multiplayer', 'multiplayer')\
           .replace('split screen', 'multiplayer')\
           .replace('online co-op', 'online')\
           .replace('online multiplayer', 'online')\
           .replace('comedy', 'funny')\
           .replace('space', 'sci-fi')\
           .replace('role-playing', 'rpg')\
           .replace('classic', 'retro')\
           .replace('story', 'story rich')\
           .replace('zombies', 'horror')\
           .replace('gore', 'violent')\
           .replace('war', 'violent')\

Let's see the top 20 values now (not forgetting to remove rpg):

In [19]:
tags.value_counts().drop('rpg')[:20]

singleplayer          9844
multiplayer           8408
co-op                 3512
online                2285
great soundtrack      2008
atmospheric           2003
violent               1892
story rich            1831
2d                    1816
funny                 1807
horror                1636
retro                 1612
sci-fi                1554
open world            1308
first-person          1131
third person          1042
fantasy               1041
female protagonist     937
difficult              921
fps                    821
dtype: int64

This seems legit, and thus we will continue with the encoding of `Tags`.

In [20]:
## We keep this for OHE, top 20 tags

tags = tags.value_counts().drop('rpg')[:20].index.tolist()

In [21]:
def unnest_tags(col):
    
    result = list()
    
    for element in col:
            if element in tags:
                result.append(element)
        
    if len(result) == 0: # If no tags are in tags, we won't save an empty list but a NaN so that we can fill them with ease
        result = np.nan
    
    return result

In [22]:
data['Tags'] = data['Tags'].map(unnest_tags, na_action = 'ignore')
data

Unnamed: 0,Title,Platform,Publisher,Developer,Release,Sales,Suggest_count,Tags,C_Score,C_Reviews,...,S_PlayStation Store,S_Steam,S_Xbox 360 Store,S_Xbox Store,S_itch.io,ESRB_Adults Only,ESRB_Everyone,ESRB_Everyone 10+,ESRB_Mature,ESRB_Teen
0,"""Nuke It""",PC,CrystalVision,CrystalVision,1998-01-01,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
1,#IDARB,XOne,Other Ocean Interactive,Other Ocean Interactive,2015-01-02,0.17,182.0,,77.0,31.0,...,0,0,0,1,0,0,1,0,0,0
2,#killallzombies,PS4,Beatshapers,Beatshapers,2015-01-01,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
3,'70s Robot Anime: Geppy-X,PS,Aroma,Aroma,1999-05-27,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
4,'98 Koshien,PS,Magical Company,Magical Company,1998-06-18,0.41,,,,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57209,XCOM: Enemy Unknown,,Feral Interactive,Firaxis Games,2014-06-19,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
57210,Yakuza: Like A Dragon,,Sega,Sega,2020-10-11,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
57211,Yakuza: Like A Dragon,,Sega,Sega,2020-01-12,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
57212,Zaxxon,,Coleco,Coleco,1983-01-01,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0


### One Hot Encoding Tags

In [23]:
mlb = MultiLabelBinarizer()

data['Tags'] = data['Tags'].apply(lambda x: x if type(x) == list else [])

X = mlb.fit_transform(data.pop('Tags'))

data = data.join(pd.DataFrame(X, index=data.index, columns=['T_' + col for col in mlb.classes_]))

In [24]:
data

Unnamed: 0,Title,Platform,Publisher,Developer,Release,Sales,Suggest_count,C_Score,C_Reviews,C_Positive,...,T_horror,T_multiplayer,T_online,T_open world,T_retro,T_sci-fi,T_singleplayer,T_story rich,T_third person,T_violent
0,"""Nuke It""",PC,CrystalVision,CrystalVision,1998-01-01,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
1,#IDARB,XOne,Other Ocean Interactive,Other Ocean Interactive,2015-01-02,0.17,182.0,77.0,31.0,23.0,...,0,0,0,0,0,0,0,0,0,0
2,#killallzombies,PS4,Beatshapers,Beatshapers,2015-01-01,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
3,'70s Robot Anime: Geppy-X,PS,Aroma,Aroma,1999-05-27,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
4,'98 Koshien,PS,Magical Company,Magical Company,1998-06-18,0.41,,,,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57209,XCOM: Enemy Unknown,,Feral Interactive,Firaxis Games,2014-06-19,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
57210,Yakuza: Like A Dragon,,Sega,Sega,2020-10-11,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
57211,Yakuza: Like A Dragon,,Sega,Sega,2020-01-12,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
57212,Zaxxon,,Coleco,Coleco,1983-01-01,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0


---
## Encoding: **Release**

We'll convert `Release` to numeric by separating the year and month to separate columns, storing them as **integers**.

Moreover, we are going to store the **quarters** instead of the months (1st~4th quarter), as developers won't know which quarter

`NaN`s will be stored as `0`.

In [25]:
years = data['Release'].dt.year
months = data['Release'].dt.month

In [26]:
years = years.fillna(0).astype(int)
months = months.fillna(0).astype(int)

In [27]:
data['Release_Y'] = years
data['Release_M'] = months

# We drop the original Release column

data = data.drop('Release', axis = 1)

---
## Encoding: **Platforms**

We are going to create a group of platforms by the hardware's owner:

- All PlayStation consoles > `Sony`.

- All Nintendo consoles > `Nintendo`.

- All XBox consoles > `Microsoft`.

- `PC` will remain the same.

- We will also group minority platforms as `Other` (we will also include `Sega` consoles here as they are not currently relevant).

In [28]:
plat_remap = {'PS': 'Sony', 'PS2': 'Sony', 'PS3': 'Sony', 'PS4': 'Sony', 'PSN': 'Sony', 'PSP': 'Sony', 'Vita': 'Sony',
              'NES': 'Nintendo', 'SNES': 'Nintendo', 'N64': 'Nintendo', 'GCube': 'Nintendo', 'Wii': 'Nintendo', 'Wii U': 'Nintendo', 'Switch': 'Nintendo', 'VC': 'Nintendo',
              'GB': 'Nintendo', 'GBC': 'Nintendo', 'GBA': 'Nintendo', 'DS': 'Nintendo', '3DS': 'Nintendo', 
              'Xbox': 'Microsoft', 'X360': 'Microsoft', 'XOne': 'Microsoft', 'Xbox Live': 'Microsoft',
              'Saturn': 'Other', 'Dreamcast': 'Other', 'Sega CD': 'Other', 'Genesis': 'Other',
              '3DO': 'Other', 'Amiga': 'Other', 'Atari 2600': 'Other', 'Game Gear': 'Other', 'Mini Mobile': 'Other', 'NeoGeo': 'Other', 'PC Engine': 'Other', 'PCFX': 'Other', 'WSwan': 'Other', 
              'PC': 'PC'}

In [29]:
data['Platform Group'] = data['Platform'].map(plat_remap)
data

Unnamed: 0,Title,Platform,Publisher,Developer,Sales,Suggest_count,C_Score,C_Reviews,C_Positive,C_Mixed,...,T_open world,T_retro,T_sci-fi,T_singleplayer,T_story rich,T_third person,T_violent,Release_Y,Release_M,Platform Group
0,"""Nuke It""",PC,CrystalVision,CrystalVision,0.17,,,,,,...,0,0,0,0,0,0,0,1998,1,PC
1,#IDARB,XOne,Other Ocean Interactive,Other Ocean Interactive,0.17,182.0,77.0,31.0,23.0,8.0,...,0,0,0,0,0,0,0,2015,1,Microsoft
2,#killallzombies,PS4,Beatshapers,Beatshapers,0.17,,,,,,...,0,0,0,0,0,0,0,2015,1,Sony
3,'70s Robot Anime: Geppy-X,PS,Aroma,Aroma,0.17,,,,,,...,0,0,0,0,0,0,0,1999,5,Sony
4,'98 Koshien,PS,Magical Company,Magical Company,0.41,,,,,,...,0,0,0,0,0,0,0,1998,6,Sony
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57209,XCOM: Enemy Unknown,,Feral Interactive,Firaxis Games,0.17,,,,,,...,0,0,0,0,0,0,0,2014,6,
57210,Yakuza: Like A Dragon,,Sega,Sega,0.17,,,,,,...,0,0,0,0,0,0,0,2020,10,
57211,Yakuza: Like A Dragon,,Sega,Sega,0.17,,,,,,...,0,0,0,0,0,0,0,2020,1,
57212,Zaxxon,,Coleco,Coleco,0.17,,,,,,...,0,0,0,0,0,0,0,1983,1,


## Estimating **sales in the first year**

Sales in video games have approximatedly this shape:

<img src = https://a.storyblok.com/f/106061/932x539/1bdac7c86e/sales-curve.png width = 400>

Thus, we are going to create a function to estimate the sales in the first year, as what we have is the accumulative sales over the years.

The sales in the first year have a certain area below the curve, which means that at some point, the "tail" will accumulate enough sales to be comparable to the first year (and may happen more than once!).

So, taking this into account, the estimator function could look like something like this:

$$ {\Large
Sales_{1st year}= \left\{
\begin{array}{ll}
      \frac {Sales_{total}} {1 + \frac {(\Delta Release - 1)} {\gamma}} & \Delta Release < 10 \\
      \frac {Sales_{total}} {\frac {\gamma + 9} {\gamma}} & \Delta Release >= 10
\end{array} 
\right.
}$$


- $Sales_{total}$ : It's the current `Sales` column.

- $\Delta Release$ : It's the difference between **current year** and `Release_Y`.

    - The $-1$ is to filter out the first year, which is already taken into account in the $1+$ side of the denominator.
    
    - If the delta is greater than 10 years (we will call this `limit` in the code to ease changing its value later if needed), we will fix delta's value to 10. This is because it's highly unlikely a product accumulates enough sales after 10 years to be relevant.
    
    - The scraping process needs to be improved further as there are still problems to automate the scraping process due to issues such as error 503, taking too long to gather some data (4 days to scrape Metacritic on 2021).
    
        - We should consider 2021 as the upper limit for `Release` until this issue is tackeld.
       

- $\gamma$ : It's the number of years we estimate for accumulative sales to be comparable to the first year (i.e: if $\gamma = 5$, it would mean that every 5 years we consider that the product reached another $Sales_{1st year}$).

In [30]:
def sales_1st_y(data, gamma = 5, limit = 10):
    
    sales_t = data['Sales']
    release_y = data['Release_Y']
    
    current_y = dt.datetime.now().year
    
    delta = current_y - release_y
    
    if delta >= limit:
        delta = limit
    
    result = round(sales_t / (1 + (delta - 1)/ gamma))
    
    return result

In [31]:
### We do this so that we don't lose information after applying the transformation.

data['Sales'] = data['Sales'] * 1000000

In [32]:
data['Sales'] = data.apply(sales_1st_y, axis = 1)

In [33]:
# We revert it back to millions scale

data['Sales'] = data['Sales'] / 1000000

---

## Grouping

We want same titles launched in different platforms to merge into a single row in the dataframe.



##### From [Group by: split-apply-combine](https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#aggregation)
```
(...)

If your desired output column names are not valid Python keywords, construct a dictionary and unpack the keyword arguments.

Additional keyword arguments are not passed through to the aggregation functions. Only pairs of `(column, aggfunc)` should be passed as `**kwargs`.

If your aggregation functions requires additional arguments, partially apply them with `functools.partial()`.
```
---

By grouping, some columns may contain multiple values, which we do not want:

For `Publisher` and `Developer`: We are going to apply `pd.Series.mode()` to get the most frequent element. Furthermore, if there is a tie we will get the first element of them.

In [34]:
grouped = data.groupby(['Title'])

pub_and_dev = grouped.agg(
                **{'Publisher' : pd.NamedAgg(column = 'Publisher', aggfunc = lambda x: pd.Series.mode(x) if pd.Series.mode(x).shape[0] < 1 else pd.Series.mode(x)[0]),
                   'Developer' : pd.NamedAgg(column = 'Developer', aggfunc = lambda x: pd.Series.mode(x) if pd.Series.mode(x).shape[0] < 1 else pd.Series.mode(x)[0])
                   }).\
                reset_index()

pub_and_dev

Unnamed: 0,Title,Publisher,Developer
0,"""Nuke It""",CrystalVision,CrystalVision
1,#IDARB,Other Ocean Interactive,Other Ocean Interactive
2,#killallzombies,Beatshapers,Beatshapers
3,'70s Robot Anime: Geppy-X,Aroma,Aroma
4,'98 Koshien,Magical Company,Magical Company
...,...,...,...
37362,yOm,Microsoft,jojito
37363,yOm_fury,Microsoft,jojito
37364,¡Shin Chan Flipa en colores!,505 Games,Inti Creates
37365,じんるいのみなさまへ,Unknown,Nippon Ichi Software


For `Suggest_count`: We are going to use `max()` in the `aggfunc` as suggestion counts is shared between the same title.

In [35]:
grouped = data.groupby(['Title','Platform Group'])

sales_by_group = grouped.agg(
                **{'Sales/ Group' : pd.NamedAgg(column = 'Sales', aggfunc = sum),
                   'Suggest_count' : pd.NamedAgg(column = 'Suggest_count', aggfunc = max)}).\
                reset_index()

sales_by_group

Unnamed: 0,Title,Platform Group,Sales/ Group,Suggest_count
0,"""Nuke It""",PC,0.060714,
1,#IDARB,Microsoft,0.085000,182.0
2,#killallzombies,Sony,0.085000,
3,'70s Robot Anime: Geppy-X,Sony,0.060714,
4,'98 Koshien,Sony,0.146429,
...,...,...,...,...
48167,yOm,Microsoft,0.060714,
48168,yOm_fury,Microsoft,0.060714,
48169,¡Shin Chan Flipa en colores!,Nintendo,0.050000,
48170,じんるいのみなさまへ,Nintendo,0.060714,


We will one hot encode `sales_by_group` to convert `Platform Group` to numeric.

In [36]:
sales_ohe = pd.get_dummies(sales_by_group['Platform Group'])

sales_by_group = sales_by_group.join(sales_ohe)

# We pick Minecraft as an example, as it is present in 4 platform groups
sales_by_group[sales_by_group['Title']=='Minecraft']

Unnamed: 0,Title,Platform Group,Sales/ Group,Suggest_count,Microsoft,Nintendo,Other,PC,Sony
25068,Minecraft,Microsoft,7.884849,411.0,1,0,0,0,0
25069,Minecraft,Nintendo,2.397917,411.0,0,1,0,0,0
25070,Minecraft,PC,11.839286,411.0,0,0,0,1,0
25071,Minecraft,Sony,6.75,411.0,0,0,0,0,1


We will create a dataframe using the one-hot-encoded columns and the `Sales/ Group` so that one-hot-encoded columns include the sales instead of 1's: **we will perform a multiplication between `Sales/ Group` and the OHE.**

Now we are going to group by `Title` and sum `Sales/ Group` to get the total sales for each title, apart from the sales in the different platform groups.

We will also add the `Hit` column, checking if total sales >= 1M or not.

In [37]:
grouped = sales_by_group.groupby(['Title'])

sales_by_title = grouped.agg(
                **{'Hit': pd.NamedAgg(column = 'Sales/ Group', aggfunc = lambda x: 1 if sum(x) >= 1 else 0),
                   'Sales_total' : pd.NamedAgg(column = 'Sales/ Group', aggfunc = sum),
                   'Suggest_count' : pd.NamedAgg(column = 'Suggest_count', aggfunc = max),
                   
                   'P_MicroSoft' : pd.NamedAgg(column = 'Microsoft', aggfunc = max),
                   'P_Nintendo' : pd.NamedAgg(column = 'Nintendo', aggfunc = max),
                   'P_Other' : pd.NamedAgg(column = 'Other', aggfunc = max),
                   'P_PC' : pd.NamedAgg(column = 'PC', aggfunc = max),
                   'P_Sony' : pd.NamedAgg(column = 'Sony', aggfunc = max)
                   }).\
                reset_index()
sales_by_title

#                    'P_Sega' : pd.NamedAgg(column = 'Sega', aggfunc = max),

Unnamed: 0,Title,Hit,Sales_total,Suggest_count,P_MicroSoft,P_Nintendo,P_Other,P_PC,P_Sony
0,"""Nuke It""",0.0,0.060714,,0,0,0,1,0
1,#IDARB,0.0,0.085000,182.0,1,0,0,0,0
2,#killallzombies,0.0,0.085000,,0,0,0,0,1
3,'70s Robot Anime: Geppy-X,0.0,0.060714,,0,0,0,0,1
4,'98 Koshien,0.0,0.146429,,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
36171,yOm,0.0,0.060714,,1,0,0,0,0
36172,yOm_fury,0.0,0.060714,,1,0,0,0,0
36173,¡Shin Chan Flipa en colores!,0.0,0.050000,,0,1,0,0,0
36174,じんるいのみなさまへ,0.0,0.060714,,0,1,0,0,0


Now, we group by title to get the rest of columns of the original dataframe.

In [38]:
grouped = data.groupby('Title')

rest_of_cols = grouped.agg(
                **{
                    'C_Score' : pd.NamedAgg(column = 'C_Score', aggfunc = max), 
                    'C_Positive' : pd.NamedAgg(column = 'C_Positive', aggfunc = sum),
                    'C_Mixed' : pd.NamedAgg(column = 'C_Mixed', aggfunc = sum),
                    'C_Negative' : pd.NamedAgg(column = 'C_Negative', aggfunc = sum),

                    'U_Score' : pd.NamedAgg(column = 'U_Score', aggfunc = max),
                    'U_Positive' : pd.NamedAgg(column = 'U_Positive', aggfunc = sum),
                    'U_Mixed' : pd.NamedAgg(column = 'U_Mixed', aggfunc = sum),
                    'U_Negative' : pd.NamedAgg(column = 'U_Negative', aggfunc = sum),

                    'G_Action' : pd.NamedAgg(column = 'G_Action', aggfunc = max), 
                    'G_Adventure' : pd.NamedAgg(column = 'G_Adventure', aggfunc = max), 
                    'G_Arcade' : pd.NamedAgg(column = 'G_Arcade', aggfunc = max), 
                    'G_Board_Games' : pd.NamedAgg(column = 'G_Board Games', aggfunc = max), 
                    'G_Card' : pd.NamedAgg(column = 'G_Card', aggfunc = max),
                    'G_Casual' : pd.NamedAgg(column = 'G_Casual', aggfunc = max),
                    'G_Educational' : pd.NamedAgg(column = 'G_Educational', aggfunc = max),
                    'G_Family' : pd.NamedAgg(column = 'G_Family', aggfunc = max),
                    'G_Fighting' : pd.NamedAgg(column = 'G_Fighting', aggfunc = max),
                    'G_Indie' : pd.NamedAgg(column = 'G_Indie', aggfunc = max),
                    'G_Massively_Multiplayer' : pd.NamedAgg(column = 'G_Massively Multiplayer', aggfunc = max),
                    'G_Platformer' : pd.NamedAgg(column = 'G_Platformer', aggfunc = max),
                    'G_Puzzle' : pd.NamedAgg(column = 'G_Puzzle', aggfunc = max),
                    'G_RPG' : pd.NamedAgg(column = 'G_RPG', aggfunc = max),
                    'G_Racing' : pd.NamedAgg(column = 'G_Racing', aggfunc = max),
                    'G_Shooter' : pd.NamedAgg(column = 'G_Shooter', aggfunc = max),
                    'G_Simulation' : pd.NamedAgg(column = 'G_Simulation', aggfunc = max),
                    'G_Sports' : pd.NamedAgg(column = 'G_Sports', aggfunc = max),
                    'G_Strategy' : pd.NamedAgg(column = 'G_Strategy', aggfunc = max),

                    'S_Epic_Games' : pd.NamedAgg(column = 'S_Epic Games', aggfunc = max),
                    'S_GOG' : pd.NamedAgg(column = 'S_GOG', aggfunc = max),
                    'S_Nintendo_Store' : pd.NamedAgg(column = 'S_Nintendo Store', aggfunc = max),
                    'S_PlayStation_Store' : pd.NamedAgg(column = 'S_PlayStation Store', aggfunc = max),
                    'S_Steam' : pd.NamedAgg(column = 'S_Steam', aggfunc = max),
                    'S_Xbox_360_Store' : pd.NamedAgg(column = 'S_Xbox 360 Store', aggfunc = max),
                    'S_Xbox_Store' : pd.NamedAgg(column = 'S_Xbox Store', aggfunc = max),
                    'S_itch.io' : pd.NamedAgg(column = 'S_itch.io', aggfunc = max),

                    
                    'ESRB_All' : pd.NamedAgg(column = 'ESRB_Everyone', aggfunc = max),
                    'ESRB_10+' : pd.NamedAgg(column = 'ESRB_Everyone 10+', aggfunc = max),
                    'ESRB_Teen' : pd.NamedAgg(column = 'ESRB_Teen', aggfunc = max),
                    'ESRB_17+' : pd.NamedAgg(column = 'ESRB_Mature', aggfunc = max),
                    'ESRB_18+' : pd.NamedAgg(column = 'ESRB_Adults Only', aggfunc = max),

                    'T_Singleplayer' : pd.NamedAgg(column = 'T_singleplayer', aggfunc = max),
                    'T_Multiplayer' : pd.NamedAgg(column = 'T_multiplayer', aggfunc = max),
                    'T_Co_Op' : pd.NamedAgg(column = 'T_co-op', aggfunc = max),
                    'T_Online' : pd.NamedAgg(column = 'T_online', aggfunc = max),
                    'T_Great_OST' : pd.NamedAgg(column = 'T_great soundtrack', aggfunc = max),
                    'T_Atmospheric' : pd.NamedAgg(column = 'T_atmospheric', aggfunc = max),
                    'T_Violent' : pd.NamedAgg(column = 'T_violent', aggfunc = max),
                    'T_Story_Rich' : pd.NamedAgg(column = 'T_story rich', aggfunc = max), 
                    'T_2D' : pd.NamedAgg(column = 'T_2d', aggfunc = max),
                    'T_Funny' : pd.NamedAgg(column = 'T_funny', aggfunc = max),
                    'T_Horror' : pd.NamedAgg(column = 'T_horror', aggfunc = max),
                    'T_Retro' : pd.NamedAgg(column = 'T_retro', aggfunc = max),
                    'T_Sci_fi' : pd.NamedAgg(column = 'T_sci-fi', aggfunc = max),
                    'T_Open_World' : pd.NamedAgg(column = 'T_open world', aggfunc = max),
                    'T_1st_Person' : pd.NamedAgg(column = 'T_first-person', aggfunc = max),     
                    'T_3rd_Person' : pd.NamedAgg(column = 'T_third person', aggfunc = max),
                    'T_Fantasy' : pd.NamedAgg(column = 'T_fantasy', aggfunc = max),
                    'T_Fem_Protag' : pd.NamedAgg(column = 'T_female protagonist', aggfunc = max),
                    'T_Hard' : pd.NamedAgg(column = 'T_difficult', aggfunc = max),
                    'T_FPS' : pd.NamedAgg(column = 'T_fps', aggfunc = max),
                   
                    'Release_Y' : pd.NamedAgg(column = 'Release_Y', aggfunc = min),
                    'Release_M' : pd.NamedAgg(column = 'Release_M', aggfunc = min)
                   }).\
                reset_index()
               
rest_of_cols                 

Unnamed: 0,Title,C_Score,C_Positive,C_Mixed,C_Negative,U_Score,U_Positive,U_Mixed,U_Negative,G_Action,...,T_Sci_fi,T_Open_World,T_1st_Person,T_3rd_Person,T_Fantasy,T_Fem_Protag,T_Hard,T_FPS,Release_Y,Release_M
0,"""Nuke It""",,0.0,0.0,0.0,,0.0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,1998,1
1,#IDARB,77.0,23.0,8.0,0.0,69.0,47.0,26.0,15.0,0,...,0,0,0,0,0,0,0,0,2015,1
2,#killallzombies,,0.0,0.0,0.0,,0.0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,2015,1
3,'70s Robot Anime: Geppy-X,,0.0,0.0,0.0,,0.0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,1999,5
4,'98 Koshien,,0.0,0.0,0.0,,0.0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,1998,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37362,yOm,,0.0,0.0,0.0,,0.0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,2009,10
37363,yOm_fury,,0.0,0.0,0.0,,0.0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,2009,12
37364,¡Shin Chan Flipa en colores!,,0.0,0.0,0.0,,0.0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,2007,11
37365,じんるいのみなさまへ,,0.0,0.0,0.0,,0.0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
result = pub_and_dev.merge(sales_by_title).merge(rest_of_cols)
result

Unnamed: 0,Title,Publisher,Developer,Hit,Sales_total,Suggest_count,P_MicroSoft,P_Nintendo,P_Other,P_PC,...,T_Sci_fi,T_Open_World,T_1st_Person,T_3rd_Person,T_Fantasy,T_Fem_Protag,T_Hard,T_FPS,Release_Y,Release_M
0,"""Nuke It""",CrystalVision,CrystalVision,0.0,0.060714,,0,0,0,1,...,0,0,0,0,0,0,0,0,1998,1
1,#IDARB,Other Ocean Interactive,Other Ocean Interactive,0.0,0.085000,182.0,1,0,0,0,...,0,0,0,0,0,0,0,0,2015,1
2,#killallzombies,Beatshapers,Beatshapers,0.0,0.085000,,0,0,0,0,...,0,0,0,0,0,0,0,0,2015,1
3,'70s Robot Anime: Geppy-X,Aroma,Aroma,0.0,0.060714,,0,0,0,0,...,0,0,0,0,0,0,0,0,1999,5
4,'98 Koshien,Magical Company,Magical Company,0.0,0.146429,,0,0,0,0,...,0,0,0,0,0,0,0,0,1998,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36171,yOm,Microsoft,jojito,0.0,0.060714,,1,0,0,0,...,0,0,0,0,0,0,0,0,2009,10
36172,yOm_fury,Microsoft,jojito,0.0,0.060714,,1,0,0,0,...,0,0,0,0,0,0,0,0,2009,12
36173,¡Shin Chan Flipa en colores!,505 Games,Inti Creates,0.0,0.050000,,0,1,0,0,...,0,0,0,0,0,0,0,0,2007,11
36174,じんるいのみなさまへ,Unknown,Nippon Ichi Software,0.0,0.060714,,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
result.to_csv("05_encoded.csv", encoding='utf-8', index=False)