In [1]:
import pandas as pd
import numpy as np
import datetime as dt

from ast import literal_eval
from sklearn.preprocessing import MultiLabelBinarizer

# 0. Loading Dataset

In [2]:
data = pd.read_csv('Files/04_merged_data.csv', parse_dates = ['Release'])

data['Genres'] = data['Genres'].apply(lambda x: literal_eval(x) if pd.notnull(x) and ('[' in x) else x)
data['Stores'] = data['Stores'].apply(lambda x: literal_eval(x) if pd.notnull(x) and ('[' in x) else x)
data['Tags'] = data['Tags'].apply(lambda x: literal_eval(x) if pd.notnull(x) and ('[' in x) else x)

# 1. One-Hot Encoding *Genres*, *Stores* and *ESRB* Columns

As `Genres` and `Stores` contain list-like values, we cannot just one-hot encode it with `pd.get_dummy()`.

On the contrary, `ESRB` doesn't have this problem as it doesn't contain lists, thus we can use `pd.get_dummy()`.

We are going to one-hot encode it using `MultiLabelBinarizer` from sklearn.

But first, we have to convert the string values to lists, as it's needed for MLB to work properly.

We are going to create an auxiliar dataframe copying the columns we want to make the operations easier.

## 1.1. Preparing Genres and Stores for MLB

In [3]:
def conv_list(x):
    
    '''
    Given an input, it returns a list of it. If the input was a list, it returns itself.
    
    x: The input value.
    '''
    
    result = list()
        
    if type(x) == list: 
        
        # if a list we return the list itself
        result = x

    else:
        
        # if not a list we return the value converted to list (so that we can use MLB)
        result.append(x) 
        
    return result

In [4]:
data[['Genres','Stores']] = data[['Genres','Stores']].applymap(conv_list, na_action = 'ignore')
data

Unnamed: 0,Title,Platform,Publisher,Developer,Release,Sales,Suggest_count,Genres,Stores,Tags,...,C_Score,C_Reviews,C_Positive,C_Mixed,C_Negative,U_Score,U_Reviews,U_Positive,U_Mixed,U_Negative
0,"""Nuke It""",PC,CrystalVision,CrystalVision,1998-01-01,0.17,,,,,...,,,,,,,,,,
1,#IDARB,XOne,Other Ocean Interactive,Other Ocean Interactive,2015-01-02,0.17,182.0,,[Xbox Store],"[Music, Story, battle, future, strange, ball, ...",...,77.0,31.0,23.0,8.0,0.0,69.0,88.0,47.0,26.0,15.0
2,#killallzombies,PS4,Beatshapers,Beatshapers,2015-01-01,0.17,,,,,...,,,,,,,,,,
3,'70s Robot Anime: Geppy-X,PS,Aroma,Aroma,1999-05-27,0.17,,,,,...,,,,,,,,,,
4,'98 Koshien,PS,Magical Company,Magical Company,1998-06-18,0.41,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57846,XCOM: Enemy Unknown,,Feral Interactive,Firaxis Games,2014-06-19,0.17,,,,,...,,,,,,,,,,
57847,Yakuza: Like A Dragon,,Sega,Sega,2020-10-11,0.17,,,,,...,,,,,,,,,,
57848,Yakuza: Like A Dragon,,Sega,Sega,2020-01-12,0.17,,,,,...,,,,,,,,,,
57849,Zaxxon,,Coleco,Coleco,1983-01-01,0.17,,,,,...,,,,,,,,,,


## 1.2. Applying MLB

After applying MLB we are also going to add a new column called `G_Other`, whose value will be 1 by default if no other genres are put to 1.

In [5]:
mlb = MultiLabelBinarizer()

data[['Genres','Stores']] = data[['Genres','Stores']].fillna('')

# Genre #############################################################################################

X = mlb.fit_transform(data.pop('Genres'))

data = data.join(pd.DataFrame(X, index=data.index, columns=['G_' + col for col in mlb.classes_]))

    # We add another column for when there is no genre associated.

data['G_Other'] = data.filter(like ='G_', axis=1)\
                      .apply(lambda x: 1 if sum(x) <= 0 else 0, axis =1)

# Stores #############################################################################################

X = mlb.fit_transform(data.pop('Stores'))

data = data.join(pd.DataFrame(X, index=data.index, columns=['S_' + col for col in mlb.classes_]))

In [6]:
data

Unnamed: 0,Title,Platform,Publisher,Developer,Release,Sales,Suggest_count,Tags,ESRB,C_Score,...,S_App Store,S_Epic Games,S_GOG,S_Google Play,S_Nintendo Store,S_PlayStation Store,S_Steam,S_Xbox 360 Store,S_Xbox Store,S_itch.io
0,"""Nuke It""",PC,CrystalVision,CrystalVision,1998-01-01,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
1,#IDARB,XOne,Other Ocean Interactive,Other Ocean Interactive,2015-01-02,0.17,182.0,"[Music, Story, battle, future, strange, ball, ...",Everyone,77.0,...,0,0,0,0,0,0,0,0,1,0
2,#killallzombies,PS4,Beatshapers,Beatshapers,2015-01-01,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
3,'70s Robot Anime: Geppy-X,PS,Aroma,Aroma,1999-05-27,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
4,'98 Koshien,PS,Magical Company,Magical Company,1998-06-18,0.41,,,,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57846,XCOM: Enemy Unknown,,Feral Interactive,Firaxis Games,2014-06-19,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
57847,Yakuza: Like A Dragon,,Sega,Sega,2020-10-11,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
57848,Yakuza: Like A Dragon,,Sega,Sega,2020-01-12,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
57849,Zaxxon,,Coleco,Coleco,1983-01-01,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0


## 1.3. One-Hot Encoding ESRB

Now we proceed to One-hot encode the `ESRB` column with `pd.get_dummies` directly.

We will put `Rating Pending` to NaN because it gives the same information: None.

In [7]:
data['ESRB'] = data['ESRB'].map(lambda x: np.nan if x == 'Rating Pending' else x)
esrb = pd.get_dummies(data['ESRB'], prefix = 'ESRB')
data = data.join(esrb)

## 1.4. Dropping Columns

We will drop `ESRB` as it's no longer of use for us. `Genres` and `Stores` got dropped in the process of MLB due to `pop()` method.

We are also going to drop `S_Google Play` and `S_App Store` because mobile gaming is not considered in this project.

In [8]:
data = data.drop('ESRB', axis = 1)
data = data.drop(['S_Google Play', 'S_App Store'], axis = 1)
data.columns

Index(['Title', 'Platform', 'Publisher', 'Developer', 'Release', 'Sales',
       'Suggest_count', 'Tags', 'C_Score', 'C_Reviews', 'C_Positive',
       'C_Mixed', 'C_Negative', 'U_Score', 'U_Reviews', 'U_Positive',
       'U_Mixed', 'U_Negative', 'G_Action', 'G_Adventure', 'G_Arcade',
       'G_Board Games', 'G_Card', 'G_Casual', 'G_Educational', 'G_Family',
       'G_Fighting', 'G_Indie', 'G_Massively Multiplayer', 'G_Platformer',
       'G_Puzzle', 'G_RPG', 'G_Racing', 'G_Shooter', 'G_Simulation',
       'G_Sports', 'G_Strategy', 'G_Other', 'S_Epic Games', 'S_GOG',
       'S_Nintendo Store', 'S_PlayStation Store', 'S_Steam',
       'S_Xbox 360 Store', 'S_Xbox Store', 'S_itch.io', 'ESRB_Adults Only',
       'ESRB_Everyone', 'ESRB_Everyone 10+', 'ESRB_Mature', 'ESRB_Teen'],
      dtype='object')

---
# 2. Encoding 

## 2.1. Encoding - Platforms

We are going to create a group of platforms by the hardware's owner (we are not one-hot encoding `Platforms` yet):

- All PlayStation consoles > `Sony`.

- All Nintendo consoles > `Nintendo`.

- All XBox consoles > `Microsoft`.

- `PC` will remain the same.

- `All` will also remain the same to check what proportion of them can be removed without losing information.

- We will also group minority platforms as `Other` (we will also include `Sega` consoles here as they are not currently relevant).

In [9]:
plat_remap = {'PS': 'Sony', 'PS2': 'Sony', 'PS3': 'Sony', 'PS4': 'Sony', 'PSN': 'Sony', 'PSP': 'Sony', 'Vita': 'Sony',
              'NES': 'Nintendo', 'SNES': 'Nintendo', 'N64': 'Nintendo', 'GCube': 'Nintendo', 'Wii': 'Nintendo', 'Wii U': 'Nintendo', 'Switch': 'Nintendo', 'VC': 'Nintendo',
              'GB': 'Nintendo', 'GBC': 'Nintendo', 'GBA': 'Nintendo', 'DS': 'Nintendo', '3DS': 'Nintendo', 
              'Xbox': 'Microsoft', 'X360': 'Microsoft', 'XOne': 'Microsoft', 'Xbox Live': 'Microsoft',
              'Saturn': 'Other', 'Dreamcast': 'Other', 'Sega CD': 'Other', 'Genesis': 'Other',
              '3DO': 'Other', 'Amiga': 'Other', 'Atari 2600': 'Other', 'Game Gear': 'Other', 'Mini Mobile': 'Other', 'NeoGeo': 'Other', 'PC Engine': 'Other', 'PCFX': 'Other', 'WSwan': 'Other', 
              'PC': 'PC', 'All': 'All'}

### 2.1.1. Creating New Column for Platform Group
Let's apply the dictionary on the `Platform` column and save it to a new column called `Platform_Group`.

In [10]:
data['Platform_Group'] = data['Platform'].map(plat_remap)
data

Unnamed: 0,Title,Platform,Publisher,Developer,Release,Sales,Suggest_count,Tags,C_Score,C_Reviews,...,S_Steam,S_Xbox 360 Store,S_Xbox Store,S_itch.io,ESRB_Adults Only,ESRB_Everyone,ESRB_Everyone 10+,ESRB_Mature,ESRB_Teen,Platform_Group
0,"""Nuke It""",PC,CrystalVision,CrystalVision,1998-01-01,0.17,,,,,...,0,0,0,0,0,0,0,0,0,PC
1,#IDARB,XOne,Other Ocean Interactive,Other Ocean Interactive,2015-01-02,0.17,182.0,"[Music, Story, battle, future, strange, ball, ...",77.0,31.0,...,0,0,1,0,0,1,0,0,0,Microsoft
2,#killallzombies,PS4,Beatshapers,Beatshapers,2015-01-01,0.17,,,,,...,0,0,0,0,0,0,0,0,0,Sony
3,'70s Robot Anime: Geppy-X,PS,Aroma,Aroma,1999-05-27,0.17,,,,,...,0,0,0,0,0,0,0,0,0,Sony
4,'98 Koshien,PS,Magical Company,Magical Company,1998-06-18,0.41,,,,,...,0,0,0,0,0,0,0,0,0,Sony
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57846,XCOM: Enemy Unknown,,Feral Interactive,Firaxis Games,2014-06-19,0.17,,,,,...,0,0,0,0,0,0,0,0,0,
57847,Yakuza: Like A Dragon,,Sega,Sega,2020-10-11,0.17,,,,,...,0,0,0,0,0,0,0,0,0,
57848,Yakuza: Like A Dragon,,Sega,Sega,2020-01-12,0.17,,,,,...,0,0,0,0,0,0,0,0,0,
57849,Zaxxon,,Coleco,Coleco,1983-01-01,0.17,,,,,...,0,0,0,0,0,0,0,0,0,


### 2.1.2. Games with Platform as All

Games having `Platform` as `All`, usually consider the rest of the platforms. Thus, we could say we simply drop them.

However, we have to check first if there are games only having `All`, as it would mean that we lose data.

In [11]:
titles = list(data[data['Platform'] == 'All']['Title']) # Games having Platform = All

drop = [] # These have more platforms than All
keep = [] # These only have All as platform, thus the one we want to keep

for title in titles:
    
    shape = data[data['Title'] == title].shape[0]
    
    if shape > 1:
        drop.append(title)
    else:
        keep.append(title)
        
len(keep)

78

So we would have missed 78 data points. Let's first drop the rows of games with `All` present in the list `drop`.

To do so, we create an auxiliar dataframe containing only games whose `Title` are in the list `drop`. Then, we are only keeping the indexes of those whose `Platform` is `All` and convert them into a list.

In [12]:
drop_candidate_df = data[data['Title'].apply(lambda x: True if x in drop else False)]

drop_idx = drop_candidate_df.index[drop_candidate_df['Platform'] == 'All']\
                            .tolist()

data = data.drop(index = drop_idx).reset_index(drop=True)

data

Unnamed: 0,Title,Platform,Publisher,Developer,Release,Sales,Suggest_count,Tags,C_Score,C_Reviews,...,S_Steam,S_Xbox 360 Store,S_Xbox Store,S_itch.io,ESRB_Adults Only,ESRB_Everyone,ESRB_Everyone 10+,ESRB_Mature,ESRB_Teen,Platform_Group
0,"""Nuke It""",PC,CrystalVision,CrystalVision,1998-01-01,0.17,,,,,...,0,0,0,0,0,0,0,0,0,PC
1,#IDARB,XOne,Other Ocean Interactive,Other Ocean Interactive,2015-01-02,0.17,182.0,"[Music, Story, battle, future, strange, ball, ...",77.0,31.0,...,0,0,1,0,0,1,0,0,0,Microsoft
2,#killallzombies,PS4,Beatshapers,Beatshapers,2015-01-01,0.17,,,,,...,0,0,0,0,0,0,0,0,0,Sony
3,'70s Robot Anime: Geppy-X,PS,Aroma,Aroma,1999-05-27,0.17,,,,,...,0,0,0,0,0,0,0,0,0,Sony
4,'98 Koshien,PS,Magical Company,Magical Company,1998-06-18,0.41,,,,,...,0,0,0,0,0,0,0,0,0,Sony
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57287,XCOM: Enemy Unknown,,Feral Interactive,Firaxis Games,2014-06-19,0.17,,,,,...,0,0,0,0,0,0,0,0,0,
57288,Yakuza: Like A Dragon,,Sega,Sega,2020-10-11,0.17,,,,,...,0,0,0,0,0,0,0,0,0,
57289,Yakuza: Like A Dragon,,Sega,Sega,2020-01-12,0.17,,,,,...,0,0,0,0,0,0,0,0,0,
57290,Zaxxon,,Coleco,Coleco,1983-01-01,0.17,,,,,...,0,0,0,0,0,0,0,0,0,


#### **2.1.2.1. Separating Releases by Periods in the Video Games History**

Now, we wonder what should we do with the `All` platforms. Furthermore, there are also NaNs here. We could replace all these values with the **mode**, but the video games industry is a quite fast-evolving sector, so giving old games the same treatment as the newer ones could not always work.

We are going to separate them into sections representative of the video games historic mailstones (releases of new gen consoles). As many of these intervals overlapped in time with others, we will adjust them:

- **Section 0**: Games prior to 1972.
- **Section 1**: Games released between 1972-1983. Corresponds to the 1st and 2nd generations of consoles.
- **Section 2**: Games released between 1983-1993. Corresponds to the 3rd and 4th generations of consoles.
- **Section 3**: Games released between 1993-1999. Corresponds to the 5th generation of consoles.
- **Section 4**: Games released between 1999-2005. Corresponds to the 6th generation of consoles.
- **Section 5**: Games released between 2005-2012. Corresponds to the 7th generation of consoles.
- **Section 6**: Games released between 2012-2020. Corresponds to the 8th generation of consoles.
- **Section 7**: Games released between 2020-2021. Corresponds to the 9th generation of consoles.

And check the top 2 values and and if the top value is more than twice as big than the second value (if it is true, we only keep the top value). .

In [13]:
def release_binner(df):
    
    '''
    Given a Dataframe object return a list of lists, each one of them containing the top 2 platform groups for each video games period.
    If the top value is more than twice as big the second value, the list will only contain the top value.
    
    df: Input Dataframe.
    '''
    
    aux  = df[['Platform','Release']].copy()
    aux['Release'] = aux['Release'].map(lambda x: x.year)
    
    section_0 = aux[aux['Release'] < 1972]['Platform'].map(plat_remap).value_counts()
    section_1 = aux[aux['Release'].between(1972, 1983)]['Platform'].map(plat_remap).value_counts()
    section_2 = aux[aux['Release'].between(1983, 1993)]['Platform'].map(plat_remap).value_counts()
    section_3 = aux[aux['Release'].between(1993, 1999)]['Platform'].map(plat_remap).value_counts()
    section_4 = aux[aux['Release'].between(1999, 2005)]['Platform'].map(plat_remap).value_counts()
    section_5 = aux[aux['Release'].between(2005, 2012)]['Platform'].map(plat_remap).value_counts()
    section_6 = aux[aux['Release'].between(2012, 2020)]['Platform'].map(plat_remap).value_counts()
    section_7 = aux[aux['Release'].between(2020, 2021)]['Platform'].map(plat_remap).value_counts()
    
    sections = [section_0, section_1, section_2, section_3, section_4, section_5, section_6, section_7]
    
    output = list()
    
    for section in sections:
        
        if section[0] // section[1] < 2:
            output.append(section[:2].index.tolist())
        else:
            output.append(section[:1].index.tolist())
        
    
    print('~1972: ')
    print(section_0)
    print('\n1972-1983: ')
    print(section_1)
    print('\n1983-1993: ')
    print(section_2)    
    print('\n1993-1999: ')
    print(section_3)
    print('\n1999-2005: ')
    print(section_4)    
    print('\n2005-2012: ')
    print(section_5)    
    print('\n2012-2020: ')
    print(section_6)    
    print('\n2020~: ')
    print(section_7)    
    
    return output

In [14]:
sect_0, sect_1, sect_2, sect_3, sect_4, sect_5, sect_6, sect_7 = release_binner(data)

~1972: 
PC           68
Sony          7
Nintendo      5
Microsoft     2
Other         1
Name: Platform, dtype: int64

1972-1983: 
Other       410
PC           40
Nintendo      3
Name: Platform, dtype: int64

1983-1993: 
Nintendo    2071
Other       1500
PC           586
Sony           1
Name: Platform, dtype: int64

1993-1999: 
Other       2392
Nintendo    2085
Sony        1633
PC          1424
Name: Platform, dtype: int64

1999-2005: 
Sony         3716
Nintendo     3052
PC           2292
Microsoft     887
Other         875
Name: Platform, dtype: int64

2005-2012: 
Nintendo     6783
Sony         6636
PC           3793
Microsoft    3539
Other          32
Name: Platform, dtype: int64

2012-2020: 
Sony         4121
PC           3888
Nintendo     3026
Microsoft    1841
Other          17
Name: Platform, dtype: int64

2020~: 
PC           444
Sony         359
Nintendo     337
Microsoft    269
Other          7
Name: Platform, dtype: int64


#### **2.1.2.2. Separating, Transforming and Recombining Dataset**

Now we've got the values we will use to replace `All` and NaN with.

To apply these changes, we have to add new rows due to some of the sections having 2 console groups in them.

Section 0 and section 1 have only one value to replace them with, so we are going to split the dataframe to apply the changes and merge after.

\*For the years in the border between sections, we will consider the lower limit a closed interval, and the upper one, an open interval.

In [15]:
sect_0_idx = data.index[data['Release'].map(lambda x: x.year) < 1972].tolist()
sect_1_idx = data.index[data['Release'].map(lambda x: x.year).between(1972,1982)].tolist()
rest_idx = data.index[data['Release'].map(lambda x: x.year) > 1982].tolist()

sect_0_df = data.drop(index = (rest_idx + sect_1_idx))
sect_1_df = data.drop(index = (sect_0_idx + rest_idx))
rest_df = data.drop(index = (sect_0_idx + sect_1_idx))

In [16]:
sect_0_df['Platform_Group'] = sect_0_df['Platform_Group'].apply(lambda x: sect_0[0] if x == 'All' or type(x) == float else x)
sect_1_df['Platform_Group'] = sect_1_df['Platform_Group'].apply(lambda x: sect_1[0] if x == 'All' or type(x) == float else x)

In [17]:
def section_rows(row):
    
    '''
    Given a row returns a list of the same row copied twice, with the 'Plarform Group' column modified depending on what section its release year fell on.
    
    row: A dataframe row.
    '''
    
    release_y = row['Release'].year
    
    out_1 = row.copy()
    out_2 = row.copy()
    
    out_plat = out_1['Platform_Group']
    
    if release_y in range(1983, 1993):
        
        out_1['Platform_Group'] = sect_2[0] if out_plat == 'All' or type(out_plat) == float else out_plat
        out_2['Platform_Group'] = sect_2[1] if out_plat == 'All' or type(out_plat) == float else out_plat
        
        rows = [out_1, out_2]
    
    elif release_y in range(1993,1999):
        
        out_1['Platform_Group'] = sect_3[0] if out_plat == 'All' or type(out_plat) == float else out_plat
        out_2['Platform_Group'] = sect_3[1] if out_plat == 'All' or type(out_plat) == float else out_plat
        
        rows = [out_1, out_2]
        
    elif release_y in range(1999,2005):

        out_1['Platform_Group'] = sect_4[0] if out_plat == 'All' or type(out_plat) == float else out_plat
        out_2['Platform_Group'] = sect_4[1] if out_plat == 'All' or type(out_plat) == float else out_plat
        
        rows = [out_1, out_2]
        
    elif release_y in range(2005,2012):
        
        out_1['Platform_Group'] = sect_5[0] if out_plat == 'All' or type(out_plat) == float else out_plat
        out_2['Platform_Group'] = sect_5[1] if out_plat == 'All' or type(out_plat) == float else out_plat
        
        rows = [out_1, out_2]
        
    elif release_y in range(2012,2020):
        
        out_1['Platform_Group'] = sect_6[0] if out_plat == 'All' or type(out_plat) == float else out_plat
        out_2['Platform_Group'] = sect_6[1] if out_plat == 'All' or type(out_plat) == float else out_plat
        
        rows = [out_1, out_2]
        
    else:

        out_1['Platform_Group'] = sect_7[0] if out_plat == 'All' or type(out_plat) == float else out_plat
        out_2['Platform_Group'] = sect_7[1] if out_plat == 'All' or type(out_plat) == float else out_plat
        
        rows = [out_1, out_2]
    
    return rows

#### **This cell below takes a while to execute**

In [18]:
new_rest_df = pd.DataFrame(columns = rest_df.columns)

for idx, row in rest_df.iterrows():
    
    plat_group = row['Platform_Group']
    rel_y = row['Release'].year
    
    if plat_group == 'All' or type(plat_group) == float:
        
        new_1, new_2 = section_rows(row)
        
        new_rest_df = new_rest_df.append(new_1).append(new_2)
        
    else:
        
        new_rest_df = new_rest_df.append(row)
    
print('Finished!')

Finished!


Let's join the split dataframes together again

In [19]:
data = pd.concat([sect_0_df, sect_1_df, new_rest_df])
data

Unnamed: 0,Title,Platform,Publisher,Developer,Release,Sales,Suggest_count,Tags,C_Score,C_Reviews,...,S_Steam,S_Xbox 360 Store,S_Xbox Store,S_itch.io,ESRB_Adults Only,ESRB_Everyone,ESRB_Everyone 10+,ESRB_Mature,ESRB_Teen,Platform_Group
67,100 Bullets,PSP,Unknown,Unknown,NaT,0.17,,,,,...,0,0,0,0,0,0,0,0,0,Sony
72,100 Years' War,PC,Unknown,Internet Gaming Gate,NaT,0.17,422.0,"[War, Medieval, combat, fight, unique]",,,...,0,0,0,0,0,0,0,0,1,PC
74,100-Yen Gomibako,PSN,Unknown,SCEI,NaT,0.17,,,,,...,0,0,0,0,0,0,0,0,0,Sony
79,1001 Nights: The Adventures of Sindbad,PC,Unknown,Unknown,NaT,0.17,,,,,...,0,0,0,0,0,0,0,0,0,PC
87,101-in-1 Explosive Megamix,Wii,Unknown,Nordcurrent,NaT,0.17,166.0,fast,16.0,5.0,...,0,0,0,0,0,0,0,0,0,Nintendo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57289,Yakuza: Like A Dragon,,Sega,Sega,2020-01-12,0.17,,,,,...,0,0,0,0,0,0,0,0,0,Sony
57290,Zaxxon,,Coleco,Coleco,1983-01-01,0.17,,,,,...,0,0,0,0,0,0,0,0,0,Nintendo
57290,Zaxxon,,Coleco,Coleco,1983-01-01,0.17,,,,,...,0,0,0,0,0,0,0,0,0,Other
57291,Zelda's Adventure,,Philips Interactive Media,Viridis Corporation,1994-05-06,0.17,,,,,...,0,0,0,0,0,0,0,0,0,Other


---
## 2.2. Encoding - Tags

### 2.2.1. Top 20 Tags

We first need to locate the top 20 values for tags and decide if any of them should be removed.

In [20]:
def tag_count(a_series):
    
    '''
    Given an input Series object, returns a list of tags in it.
    
    a_series: Input Series object.
    '''
    
    result = list()
    
    for row in a_series:

        if type(row) == list:
            result = result + row
            
    result = pd.Series(result)
                
    return result

We also want to convert tags to lowercase in case there are same tags differing in capitals.

In [21]:
def tag_lower(x):
    
    '''
    Given an input string or list of strings, returns a list of the same input lowecased.
    
    x: Input string or list of strings.
    '''
    
    result = list()
    
    if type(x) == list:
        for element in x:

            result.append(element.lower())
    else:
        result = x.lower()
    
    return result

In [22]:
data['Tags'] = data['Tags'].map(tag_lower, na_action = 'ignore')

In [23]:
tags = tag_count(data.Tags)
tags.value_counts()[:20]

singleplayer                  11871
steam achievements             6815
multiplayer                    6129
full controller support        4594
steam cloud                    4245
steam-trading-cards            4088
rpg                            2582
great soundtrack               2495
2d                             2449
atmospheric                    2347
co-op                          2250
exclusive                      2137
cooperative                    1987
partial controller support     1960
true exclusive                 1891
steam leaderboards             1689
story rich                     1651
horror                         1550
split screen                   1517
open world                     1498
dtype: int64

### 2.2.2. Removing Strange and Useless Values
Let's filter out the steam-exclusive and other strange top values.

In [24]:
out = ['steam', 'exclusive', 'controller']

for word in out:
    
    tags = tags.map(lambda x: x if word not in x.lower() else np.nan).dropna()

In [25]:
tags.value_counts()[:20]

singleplayer         11871
multiplayer           6129
rpg                   2582
great soundtrack      2495
2d                    2449
atmospheric           2347
co-op                 2250
cooperative           1987
story rich            1651
horror                1550
split screen          1517
open world            1498
first-person          1342
funny                 1330
fantasy               1258
sci-fi                1242
local co-op           1206
difficult             1195
local multiplayer     1166
third person          1161
dtype: int64

What we can see here is that there are **several tags** that **are redundant** such as `co-op` and `cooperative`.

Let's check the top 50 values to see if we can replace some of them into the same category:

In [26]:
tags.value_counts()[:50]

singleplayer          11871
multiplayer            6129
rpg                    2582
great soundtrack       2495
2d                     2449
atmospheric            2347
co-op                  2250
cooperative            1987
story rich             1651
horror                 1550
split screen           1517
open world             1498
first-person           1342
funny                  1330
fantasy                1258
sci-fi                 1242
local co-op            1206
difficult              1195
local multiplayer      1166
third person           1161
female protagonist     1139
role-playing           1034
online co-op           1032
comedy                 1021
retro                  1009
pixel graphics          994
gore                    988
online multiplayer      940
fps                     937
exploration             910
survival                888
classic                 872
anime                   858
violent                 797
online                  744
sandbox             

We can see other redundant tags or tags that could be interpreted as another:

- `rpg` > drop : We already have a genre for that. We will remove it when doing the `value_counts()`.

- `cooperative` > `co-op` : They are exactly the same thing.

- `local co-op` & `local multiplayer` & `split screen` > `multiplayer` : Because the nature of the game is multiplayer.

- `online co-op` & `online multiplayer` > `online` : Because the most important aspect here is the online part over the multiplyer nature.

- `comedy` > `funny` : They are the same thing.

- `space` > `sci-fi` : We can consider them the same thing.

- `role-playing` > `rpg` : They are the same thing.

- `classic` > `retro` : We can consider them the same thing.

- `story` > `story rich` : We can consider them the same thing.

- `zombies` > `horror` : Horror includes zombies.

- `gore` & `war` > `violent` : They are inherently the same.

In [27]:
## We replace the tags we discussed above

tags = tags.replace('cooperative','co-op')\
           .replace('local co-op', 'multiplayer')\
           .replace('local multiplayer', 'multiplayer')\
           .replace('split screen', 'multiplayer')\
           .replace('online co-op', 'online')\
           .replace('online multiplayer', 'online')\
           .replace('comedy', 'funny')\
           .replace('space', 'sci-fi')\
           .replace('role-playing', 'rpg')\
           .replace('classic', 'retro')\
           .replace('story', 'story rich')\
           .replace('zombies', 'horror')\
           .replace('gore', 'violent')\
           .replace('war', 'violent')\

Let's see the top 20 values now (not forgetting to remove rpg):

In [28]:
tags.value_counts().drop('rpg')[:20]

singleplayer          11871
multiplayer           10018
co-op                  4237
online                 2716
great soundtrack       2495
2d                     2449
funny                  2351
atmospheric            2347
violent                2242
story rich             2180
horror                 2034
retro                  1881
sci-fi                 1805
open world             1498
first-person           1342
fantasy                1258
difficult              1195
third person           1161
female protagonist     1139
pixel graphics          994
dtype: int64

This seems legit, and thus we will continue with the encoding of `Tags`.

### 2.2.3. Preparing Tags for One-Hot Encoding

In [29]:
## We keep this for OHE, top 20 tags

tags = tags.value_counts().drop('rpg')[:20].index.tolist()

In [30]:
def unnest_tags(col):
    
    '''
    Given an input row from a Series object, it returns a list of the same values after filtering out the values not found in the top 20 tags.
    If there is no tag in the input, it returns a list with NaN.
    
    col: Input data.
    '''
    
    result = list()
    
    for element in col:
            if element in tags:
                result.append(element)
        
    if len(result) == 0: # If no tags are in tags, we won't save an empty list but a NaN so that we can fill them with ease
        result = np.nan
    
    return result

In [31]:
data['Tags'] = data['Tags'].map(unnest_tags, na_action = 'ignore')
data

Unnamed: 0,Title,Platform,Publisher,Developer,Release,Sales,Suggest_count,Tags,C_Score,C_Reviews,...,S_Steam,S_Xbox 360 Store,S_Xbox Store,S_itch.io,ESRB_Adults Only,ESRB_Everyone,ESRB_Everyone 10+,ESRB_Mature,ESRB_Teen,Platform_Group
67,100 Bullets,PSP,Unknown,Unknown,NaT,0.17,,,,,...,0,0,0,0,0,0,0,0,0,Sony
72,100 Years' War,PC,Unknown,Internet Gaming Gate,NaT,0.17,422.0,,,,...,0,0,0,0,0,0,0,0,1,PC
74,100-Yen Gomibako,PSN,Unknown,SCEI,NaT,0.17,,,,,...,0,0,0,0,0,0,0,0,0,Sony
79,1001 Nights: The Adventures of Sindbad,PC,Unknown,Unknown,NaT,0.17,,,,,...,0,0,0,0,0,0,0,0,0,PC
87,101-in-1 Explosive Megamix,Wii,Unknown,Nordcurrent,NaT,0.17,166.0,,16.0,5.0,...,0,0,0,0,0,0,0,0,0,Nintendo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57289,Yakuza: Like A Dragon,,Sega,Sega,2020-01-12,0.17,,,,,...,0,0,0,0,0,0,0,0,0,Sony
57290,Zaxxon,,Coleco,Coleco,1983-01-01,0.17,,,,,...,0,0,0,0,0,0,0,0,0,Nintendo
57290,Zaxxon,,Coleco,Coleco,1983-01-01,0.17,,,,,...,0,0,0,0,0,0,0,0,0,Other
57291,Zelda's Adventure,,Philips Interactive Media,Viridis Corporation,1994-05-06,0.17,,,,,...,0,0,0,0,0,0,0,0,0,Other


### 2.2.4. One-Hot Encoding Tags

In [32]:
mlb = MultiLabelBinarizer()

data['Tags'] = data['Tags'].apply(lambda x: x if type(x) == list else [])

X = mlb.fit_transform(data.pop('Tags'))

data = data.join(pd.DataFrame(X, index=data.index, columns=['T_' + col for col in mlb.classes_]))

In [33]:
data

Unnamed: 0,Title,Platform,Publisher,Developer,Release,Sales,Suggest_count,C_Score,C_Reviews,C_Positive,...,T_multiplayer,T_online,T_open world,T_pixel graphics,T_retro,T_sci-fi,T_singleplayer,T_story rich,T_third person,T_violent
0,"""Nuke It""",PC,CrystalVision,CrystalVision,1998-01-01,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
1,#IDARB,XOne,Other Ocean Interactive,Other Ocean Interactive,2015-01-02,0.17,182.0,77.0,31.0,23.0,...,0,0,0,0,0,0,0,0,0,0
2,#killallzombies,PS4,Beatshapers,Beatshapers,2015-01-01,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
3,'70s Robot Anime: Geppy-X,PS,Aroma,Aroma,1999-05-27,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
4,'98 Koshien,PS,Magical Company,Magical Company,1998-06-18,0.41,,,,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57290,Zaxxon,,Coleco,Coleco,1983-01-01,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
57291,Zelda's Adventure,,Philips Interactive Media,Viridis Corporation,1994-05-06,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
57291,Zelda's Adventure,,Philips Interactive Media,Viridis Corporation,1994-05-06,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0
57291,Zelda's Adventure,,Philips Interactive Media,Viridis Corporation,1994-05-06,0.17,,,,,...,0,0,0,0,0,0,0,0,0,0


---
## 2.3. Encoding - Release

We'll convert `Release` to numeric by separating the year and month to separate columns, storing them as **integers**.

`NaN`s will be stored as `0`.

In [34]:
years = data['Release'].dt.year
months = data['Release'].dt.month

In [35]:
years = years.fillna(0).astype(int)
months = months.fillna(0).astype(int)

In [36]:
data['Release_Y'] = years
data['Release_M'] = months

# We drop the original Release column

data = data.drop('Release', axis = 1)

# 3. Estimating Sales in 1st Year

Sales in video games have approximately this shape:

<img src = https://a.storyblok.com/f/106061/932x539/1bdac7c86e/sales-curve.png width = 400>

Thus, we are going to create a function to estimate the sales in the first year, as what we have is the accumulative sales over the years.

The sales in the first year have a certain area below the curve, which means that at some point, the "tail" will accumulate enough sales to be comparable to the first year (and may happen more than once!).

So, taking this into account, the estimator function could look like something like this:

$$ {\Large
Sales_{1st year}= \left\{
\begin{array}{ll}
      \frac {Sales_{total}} {1 + \frac {(\Delta Release - 1)} {\gamma}} & \Delta Release < 10 \\
      \frac {Sales_{total}} {\frac {\gamma + 9} {\gamma}} & \Delta Release >= 10
\end{array} 
\right.
}$$


- $Sales_{total}$ : It's the current `Sales` column.

- $\Delta Release$ : It's the difference between **current year** and `Release_Y`.

    - The $-1$ is to filter out the first year, which is already considered in the $1+$ side of the denominator.
    
    - If the delta is greater than 10 years (we will call this `limit` in the code to ease changing its value later if needed), we will fix delta's value to 10. This is because it's highly unlikely a product accumulates enough sales after 10 years to be relevant.
    
    - The scraping process needs to be improved further as there are still problems to automate the scraping process due to issues such as error 503, taking too long to gather some data (4 days to scrape Metacritic on 2021).
    
        - We should consider 2021 as the upper limit for `Release` until this issue is tackled.
       

- $\gamma$ : It's the number of years we estimate for accumulative sales to be comparable to the first year (i.e: if $\gamma = 5$, it would mean that every 5 years we consider that the product reached another $Sales_{1st year}$). **It's totally arbitrary as we don't know what the correct value is but we assume that 5 is the value**.

In [37]:
def sales_1st_y(data, gamma = 5, limit = 10):
    
    '''
    Given an input, it returns its approximation of sales in the first year after release.
    
    data: Input data.
    gamma: Our estimate of how many years are needed to reach the sales in the first year again. Default = 5.
    limit: How many years we consider games stop selling new units. Default = 10.
    '''
    
    sales_t = data['Sales']
    release_y = data['Release_Y']
    
    current_y = dt.datetime.now().year
    
    delta = current_y - release_y
    
    if delta >= limit:
        delta = limit
    
    result = round(sales_t / (1 + (delta - 1)/ gamma))
    
    return result

In [38]:
### We do this so that we don't lose information after applying the transformation.

data['Sales'] = data['Sales'] * 1000000

In [39]:
data['Sales'] = data.apply(sales_1st_y, axis = 1)

In [40]:
# We revert it back to millions scale

data['Sales'] = data['Sales'] / 1000000

---

# 4. Grouping

We want same titles launched in different platforms to merge into a single row in the dataframe.

We are going to use `groupby()` and `agg()` to that purpose.

By grouping, some columns may contain multiple values, which we do not want:

## 4.1. Grouping Publisher and Developer

For `Publisher` and `Developer`: We are going to apply `pd.Series.mode()` to get the most frequent element. Furthermore, if there is a tie we will get the first element of them.

In [41]:
grouped = data.groupby(['Title'])

pub_and_dev = grouped.agg(
                **{'Publisher' : pd.NamedAgg(column = 'Publisher', aggfunc = lambda x: pd.Series.mode(x) if pd.Series.mode(x).shape[0] < 1 else pd.Series.mode(x)[0]),
                   'Developer' : pd.NamedAgg(column = 'Developer', aggfunc = lambda x: pd.Series.mode(x) if pd.Series.mode(x).shape[0] < 1 else pd.Series.mode(x)[0])
                   }).\
                reset_index()

pub_and_dev

Unnamed: 0,Title,Publisher,Developer
0,"""Nuke It""",CrystalVision,CrystalVision
1,#IDARB,Other Ocean Interactive,Other Ocean Interactive
2,#killallzombies,Beatshapers,Beatshapers
3,'70s Robot Anime: Geppy-X,Aroma,Aroma
4,'98 Koshien,Magical Company,Magical Company
...,...,...,...
37440,yOm,Microsoft,jojito
37441,yOm_fury,Microsoft,jojito
37442,¡Shin Chan Flipa en colores!,505 Games,Inti Creates
37443,じんるいのみなさまへ,Unknown,Nippon Ichi Software


## 4.2. Grouping Suggest_count and Sales

We are going to group by `Platform_Group` to do this, as both values should take them into account.

For `Suggest_count`: We are going to use `max()` in the `aggfunc` as suggestion counts is shared between the same title.

For `Sales/Group`: We are going to use `sum()` in the `aggfunc` as sales in the same group should be considered as one.

In [42]:
grouped = data.groupby(['Title','Platform_Group'])

sales_by_group = grouped.agg(
                **{'Sales/ Group' : pd.NamedAgg(column = 'Sales', aggfunc = sum),
                   'Suggest_count' : pd.NamedAgg(column = 'Suggest_count', aggfunc = max)}).\
                reset_index()

sales_by_group

Unnamed: 0,Title,Platform_Group,Sales/ Group,Suggest_count
0,"""Nuke It""",PC,0.060714,
1,#IDARB,Microsoft,0.085000,182.0
2,#killallzombies,Sony,0.085000,
3,'70s Robot Anime: Geppy-X,Sony,0.060714,
4,'98 Koshien,Sony,0.146429,
...,...,...,...,...
51799,yOm,Microsoft,0.060714,
51800,yOm_fury,Microsoft,0.060714,
51801,¡Shin Chan Flipa en colores!,Nintendo,0.050000,
51802,じんるいのみなさまへ,Nintendo,0.546426,


## 4.3. OneHot Encoding Platform_Group

We will one hot encode `sales_by_group` to convert `Platform_Group` to numeric.

In [43]:
sales_ohe = pd.get_dummies(sales_by_group['Platform_Group'])

sales_by_group = sales_by_group.join(sales_ohe)

# We pick Minecraft as an example, as it is present in 4 platform groups
sales_by_group[sales_by_group['Title']=='Minecraft']

Unnamed: 0,Title,Platform_Group,Sales/ Group,Suggest_count,Microsoft,Nintendo,Other,PC,Sony
27122,Minecraft,Microsoft,7.884849,411.0,1,0,0,0,0
27123,Minecraft,Nintendo,2.397917,411.0,0,1,0,0,0
27124,Minecraft,PC,11.839286,411.0,0,0,0,1,0
27125,Minecraft,Sony,6.75,411.0,0,0,0,0,1


## 4.4. Arranging New Dataframe

Now we are going to group by `Title` and sum `Sales/ Group` to get the total sales for each title, apart from the sales in the different platform groups.

We will also add the `Hit` column, checking if total sales >= 1M or not.

In [44]:
grouped = sales_by_group.groupby(['Title'])

sales_by_title = grouped.agg(
                **{'Hit': pd.NamedAgg(column = 'Sales/ Group', aggfunc = lambda x: 1 if sum(x) >= 1 else 0),
                   'Sales_total' : pd.NamedAgg(column = 'Sales/ Group', aggfunc = sum),
                   'Suggest_count' : pd.NamedAgg(column = 'Suggest_count', aggfunc = max),
                   
                   'P_Microsoft' : pd.NamedAgg(column = 'Microsoft', aggfunc = max),
                   'P_Nintendo' : pd.NamedAgg(column = 'Nintendo', aggfunc = max),
                   'P_Other' : pd.NamedAgg(column = 'Other', aggfunc = max),
                   'P_PC' : pd.NamedAgg(column = 'PC', aggfunc = max),
                   'P_Sony' : pd.NamedAgg(column = 'Sony', aggfunc = max)
                   }).\
                reset_index()
sales_by_title

Unnamed: 0,Title,Hit,Sales_total,Suggest_count,P_Microsoft,P_Nintendo,P_Other,P_PC,P_Sony
0,"""Nuke It""",0.0,0.060714,,0,0,0,1,0
1,#IDARB,0.0,0.085000,182.0,1,0,0,0,0
2,#killallzombies,0.0,0.085000,,0,0,0,0,1
3,'70s Robot Anime: Geppy-X,0.0,0.060714,,0,0,0,0,1
4,'98 Koshien,0.0,0.146429,,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
37440,yOm,0.0,0.060714,,1,0,0,0,0
37441,yOm_fury,0.0,0.060714,,1,0,0,0,0
37442,¡Shin Chan Flipa en colores!,0.0,0.050000,,0,1,0,0,0
37443,じんるいのみなさまへ,0.0,0.546426,,0,1,0,0,0


Now, we group by `Title` to get the rest of columns of the original dataframe.

In [45]:
grouped = data.groupby('Title')

rest_of_cols = grouped.agg(
                **{
                    'C_Score' : pd.NamedAgg(column = 'C_Score', aggfunc = max), 
                    'C_Positive' : pd.NamedAgg(column = 'C_Positive', aggfunc = sum),
                    'C_Mixed' : pd.NamedAgg(column = 'C_Mixed', aggfunc = sum),
                    'C_Negative' : pd.NamedAgg(column = 'C_Negative', aggfunc = sum),

                    'U_Score' : pd.NamedAgg(column = 'U_Score', aggfunc = max),
                    'U_Positive' : pd.NamedAgg(column = 'U_Positive', aggfunc = sum),
                    'U_Mixed' : pd.NamedAgg(column = 'U_Mixed', aggfunc = sum),
                    'U_Negative' : pd.NamedAgg(column = 'U_Negative', aggfunc = sum),

                    'G_Action' : pd.NamedAgg(column = 'G_Action', aggfunc = max), 
                    'G_Adventure' : pd.NamedAgg(column = 'G_Adventure', aggfunc = max), 
                    'G_Arcade' : pd.NamedAgg(column = 'G_Arcade', aggfunc = max), 
                    'G_Board_Games' : pd.NamedAgg(column = 'G_Board Games', aggfunc = max), 
                    'G_Card' : pd.NamedAgg(column = 'G_Card', aggfunc = max),
                    'G_Casual' : pd.NamedAgg(column = 'G_Casual', aggfunc = max),
                    'G_Educational' : pd.NamedAgg(column = 'G_Educational', aggfunc = max),
                    'G_Family' : pd.NamedAgg(column = 'G_Family', aggfunc = max),
                    'G_Fighting' : pd.NamedAgg(column = 'G_Fighting', aggfunc = max),
                    'G_Indie' : pd.NamedAgg(column = 'G_Indie', aggfunc = max),
                    'G_Massively_Multiplayer' : pd.NamedAgg(column = 'G_Massively Multiplayer', aggfunc = max),
                    'G_Platformer' : pd.NamedAgg(column = 'G_Platformer', aggfunc = max),
                    'G_Puzzle' : pd.NamedAgg(column = 'G_Puzzle', aggfunc = max),
                    'G_RPG' : pd.NamedAgg(column = 'G_RPG', aggfunc = max),
                    'G_Racing' : pd.NamedAgg(column = 'G_Racing', aggfunc = max),
                    'G_Shooter' : pd.NamedAgg(column = 'G_Shooter', aggfunc = max),
                    'G_Simulation' : pd.NamedAgg(column = 'G_Simulation', aggfunc = max),
                    'G_Sports' : pd.NamedAgg(column = 'G_Sports', aggfunc = max),
                    'G_Strategy' : pd.NamedAgg(column = 'G_Strategy', aggfunc = max),
                    'G_Other' : pd.NamedAgg(column = 'G_Strategy', aggfunc = max),

                    'S_Epic_Games' : pd.NamedAgg(column = 'S_Epic Games', aggfunc = max),
                    'S_GOG' : pd.NamedAgg(column = 'S_GOG', aggfunc = max),
                    'S_Nintendo_Store' : pd.NamedAgg(column = 'S_Nintendo Store', aggfunc = max),
                    'S_PlayStation_Store' : pd.NamedAgg(column = 'S_PlayStation Store', aggfunc = max),
                    'S_Steam' : pd.NamedAgg(column = 'S_Steam', aggfunc = max),
                    'S_Xbox_360_Store' : pd.NamedAgg(column = 'S_Xbox 360 Store', aggfunc = max),
                    'S_Xbox_Store' : pd.NamedAgg(column = 'S_Xbox Store', aggfunc = max),
                    'S_itch.io' : pd.NamedAgg(column = 'S_itch.io', aggfunc = max),

                    
                    'ESRB_All' : pd.NamedAgg(column = 'ESRB_Everyone', aggfunc = max),
                    'ESRB_10+' : pd.NamedAgg(column = 'ESRB_Everyone 10+', aggfunc = max),
                    'ESRB_Teen' : pd.NamedAgg(column = 'ESRB_Teen', aggfunc = max),
                    'ESRB_17+' : pd.NamedAgg(column = 'ESRB_Mature', aggfunc = max),
                    'ESRB_18+' : pd.NamedAgg(column = 'ESRB_Adults Only', aggfunc = max),

                    'T_Singleplayer' : pd.NamedAgg(column = 'T_singleplayer', aggfunc = max),
                    'T_Multiplayer' : pd.NamedAgg(column = 'T_multiplayer', aggfunc = max),
                    'T_Co_Op' : pd.NamedAgg(column = 'T_co-op', aggfunc = max),
                    'T_Online' : pd.NamedAgg(column = 'T_online', aggfunc = max),
                    'T_Great_OST' : pd.NamedAgg(column = 'T_great soundtrack', aggfunc = max),
                    'T_Atmospheric' : pd.NamedAgg(column = 'T_atmospheric', aggfunc = max),
                    'T_Violent' : pd.NamedAgg(column = 'T_violent', aggfunc = max),
                    'T_Story_Rich' : pd.NamedAgg(column = 'T_story rich', aggfunc = max), 
                    'T_2D' : pd.NamedAgg(column = 'T_2d', aggfunc = max),
                    'T_Funny' : pd.NamedAgg(column = 'T_funny', aggfunc = max),
                    'T_Horror' : pd.NamedAgg(column = 'T_horror', aggfunc = max),
                    'T_Retro' : pd.NamedAgg(column = 'T_retro', aggfunc = max),
                    'T_Sci_fi' : pd.NamedAgg(column = 'T_sci-fi', aggfunc = max),
                    'T_Open_World' : pd.NamedAgg(column = 'T_open world', aggfunc = max),
                    'T_1st_Person' : pd.NamedAgg(column = 'T_first-person', aggfunc = max),     
                    'T_3rd_Person' : pd.NamedAgg(column = 'T_third person', aggfunc = max),
                    'T_Fantasy' : pd.NamedAgg(column = 'T_fantasy', aggfunc = max),
                    'T_Female_Protagonist' : pd.NamedAgg(column = 'T_female protagonist', aggfunc = max),
                    'T_Hard' : pd.NamedAgg(column = 'T_difficult', aggfunc = max),
                    'T_Pixel_Graphics' : pd.NamedAgg(column = 'T_pixel graphics', aggfunc = max),
                   
                    'Release_Y' : pd.NamedAgg(column = 'Release_Y', aggfunc = min),
                    'Release_M' : pd.NamedAgg(column = 'Release_M', aggfunc = min)
                   }).\
                reset_index()              

In [46]:
result = pub_and_dev.merge(sales_by_title).merge(rest_of_cols)
result

Unnamed: 0,Title,Publisher,Developer,Hit,Sales_total,Suggest_count,P_Microsoft,P_Nintendo,P_Other,P_PC,...,T_Sci_fi,T_Open_World,T_1st_Person,T_3rd_Person,T_Fantasy,T_Female_Protagonist,T_Hard,T_Pixel_Graphics,Release_Y,Release_M
0,"""Nuke It""",CrystalVision,CrystalVision,0.0,0.060714,,0,0,0,1,...,0,0,0,0,0,0,0,0,1998,1
1,#IDARB,Other Ocean Interactive,Other Ocean Interactive,0.0,0.085000,182.0,1,0,0,0,...,0,0,0,0,0,0,0,0,2015,1
2,#killallzombies,Beatshapers,Beatshapers,0.0,0.085000,,0,0,0,0,...,0,0,0,0,0,0,0,0,2015,1
3,'70s Robot Anime: Geppy-X,Aroma,Aroma,0.0,0.060714,,0,0,0,0,...,0,0,0,0,0,0,0,0,1999,5
4,'98 Koshien,Magical Company,Magical Company,0.0,0.146429,,0,0,0,0,...,0,0,0,0,0,0,0,0,1998,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37440,yOm,Microsoft,jojito,0.0,0.060714,,1,0,0,0,...,0,0,0,0,0,0,0,0,2009,10
37441,yOm_fury,Microsoft,jojito,0.0,0.060714,,1,0,0,0,...,0,0,0,0,0,0,0,0,2009,12
37442,¡Shin Chan Flipa en colores!,505 Games,Inti Creates,0.0,0.050000,,0,1,0,0,...,0,0,0,0,0,0,0,0,2007,11
37443,じんるいのみなさまへ,Unknown,Nippon Ichi Software,0.0,0.546426,,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


# 5. Saving Dataset

In [47]:
result.to_csv("Files/05_encoded.csv", encoding='utf-8', index=False)