In [1]:
import pandas as pd
import numpy as np
# import copy
from ast import literal_eval ### to convert dicts stored as str to str

In [2]:
def col_series(column, series, long = False, list_like = True):
    '''
    Flatten one nested column of the RAWG games dataset into a pd.Series of names.

    Each element of the returned Series is:
      * np.nan -- when the row is missing (None/NaN) or holds no entries,
      * a str  -- when the row holds exactly one entry (that entry's 'name'),
      * a list -- of 'name' values when the row holds several entries.

    Keyword arguments:
    column -- The name of the column in the dataset; also used as the name of the returned Series.
    series -- The (already parsed) values of the column, iterated row by row.
    long -- Whether the column has the highest amount of nesting, i.e. each item keeps its
            payload under the singular form of the column name
            ('platforms' -> item['platform']['name']). (default=False)
    list_like -- Whether each row is a list of dicts (True) or a single dict (False).
                 Ignored when long is True. (default=True)

    Returns:
    pd.Series named `column`, built from a plain list so it carries a default integer index.
    '''

    if long:
        key = column[:-1] # Strip the s so that the key works

        def extract(item):
            return item[key]['name']

    elif list_like:

        def extract(item):
            return item['name']

    else:
        extract = None # Rows are single dicts, handled separately below

    values = list()

    for row in series:

        if extract is None:
            # Single-dict column: a missing row stays NaN, otherwise take its name.
            values.append(np.nan if np.any(pd.isna(row)) else row['name'])
            continue

        # Null rows are never parsed into lists upstream, so they arrive as a
        # scalar None/NaN; iterating over them would raise a TypeError.
        if pd.api.types.is_scalar(row) and pd.isna(row):
            values.append(np.nan)
            continue

        names = [extract(item) for item in row]

        if not names:
            values.append(np.nan)
        elif len(names) == 1:
            # Collapse single-entry rows to the bare name. The name itself is
            # never length-checked, so both nesting layouts behave the same.
            values.append(names[0])
        else:
            values.append(names)

    return pd.Series(values, name = column)

In [5]:
# Read the raw RAWG dump in chunks, keep only the columns of interest, flatten the
# nested columns with col_series() and assemble everything into `output`.
#
# Per-chunk results are collected in plain lists and concatenated once after the
# loop: Series.append / DataFrame.iteritems no longer exist in pandas >= 2.0, and
# growing a Series/DataFrame inside the loop re-copies all previous rows each time.

chunk_size = 10000
count = 1

# Columns copied through unchanged, in the order they appear in the output.
scalar_cols = ['id','slug','name','released','tba','metacritic','suggestions_count','updated']

# Nested columns mapped to the col_series() flags each of them needs.
nested_cols = {
    'platforms': {'long': True},
    'genres': {},
    'stores': {'long': True},
    'tags': {},
    'esrb_rating': {'list_like': False},
}

others_aux = pd.DataFrame() # Not used in this cell; kept so the name stays defined

others_parts = []
nested_parts = {col: [] for col in nested_cols}


for chunk in pd.read_csv('rawg_games.csv', sep = '^', chunksize = chunk_size, index_col = None):

    ####################################
    ##  Dropping unnecessary columns  ##
    ####################################

    data = chunk[scalar_cols + list(nested_cols)].copy()

    ###########################
    ##  Str to list of dict  ##
    ###########################

    # Null cells are left untouched; only real strings are parsed.
    for col in nested_cols:
        data[col] = data[col].apply(lambda x: literal_eval(x) if pd.notnull(x) else x)

    #############################################
    ##  Slice column to get YYYY-MM-DD format  ##
    #############################################

    data['updated'] = data['updated'].str.slice(stop=-9) # We slice the 'updated' col so that it only shows the YYYY-MM-DD format

    ##################################
    ##  Getting the values we want  ##
    ##################################

    # Scalar columns keep the row index assigned by read_csv, which keeps counting
    # across chunks; the flattened columns are re-indexed after the loop instead.
    others_parts.append(data[scalar_cols])

    for col, flags in nested_cols.items():
        nested_parts[col].append(col_series(col, data[col], **flags))

    print('Chunk #'+ str(count))
    count += 1

##################################################
##  Output saved by concatenation to DataFrame  ##
##################################################

others = pd.concat(others_parts) if others_parts else pd.DataFrame(columns = scalar_cols)

# One 0..n-1 indexed Series per nested column (empty object Series if nothing was read).
flattened = {
    col: pd.concat(parts, ignore_index = True) if parts else pd.Series(dtype=object, name = col)
    for col, parts in nested_parts.items()
}

platforms = flattened['platforms']
genres = flattened['genres']
stores = flattened['stores']
tags = flattened['tags']
esrb = flattened['esrb_rating']

series_df = pd.concat([platforms, genres, stores, tags, esrb], axis = 1)

# reset_index() exposes the row position as an 'index' column on both halves;
# it is the only column they share, so it is the join key.
first_half = others.reset_index()
second_half = series_df.reset_index()

output = first_half.merge(second_half, on = 'index')

print('Finished!')

Chunk #1
Chunk #2
Chunk #3
Chunk #4
Chunk #5
Chunk #6
Chunk #7
Chunk #8
Chunk #9
Chunk #10
Chunk #11
Chunk #12
Chunk #13
Chunk #14
Chunk #15
Chunk #16
Chunk #17
Chunk #18
Chunk #19
Chunk #20
Chunk #21
Chunk #22
Chunk #23
Chunk #24
Chunk #25
Chunk #26
Chunk #27
Chunk #28
Chunk #29
Chunk #30
Chunk #31
Chunk #32
Chunk #33
Chunk #34
Chunk #35
Chunk #36
Chunk #37
Chunk #38
Chunk #39
Chunk #40
Chunk #41
Chunk #42
Chunk #43
Chunk #44
Chunk #45
Chunk #46
Chunk #47
Chunk #48
Chunk #49
Chunk #50
Chunk #51
Chunk #52
Chunk #53
Finished!


In [6]:
# Display the merged frame; it still carries the helper 'index' column used as the join key.
output

Unnamed: 0,index,id,slug,name,released,tba,metacritic,suggestions_count,updated,platforms,genres,stores,tags,esrb_rating
0,0,3498,grand-theft-auto-v,Grand Theft Auto V,2013-09-17,False,97.0,416,2021-03-03,"[PC, Xbox Series S/X, PlayStation 5, PlayStati...","[Action, Adventure]","[Epic Games, PlayStation Store, Xbox Store, Xb...","[Singleplayer, Steam Achievements, Multiplayer...",Mature
1,1,4200,portal-2,Portal 2,2011-04-18,False,95.0,582,2020-08-03,"[Xbox One, PlayStation 3, PC, Xbox 360, Linux,...","[Shooter, Puzzle]","[Xbox Store, Xbox 360 Store, PlayStation Store...","[Singleplayer, Steam Achievements, Multiplayer...",Everyone 10+
2,2,3328,the-witcher-3-wild-hunt,The Witcher 3: Wild Hunt,2015-05-18,False,92.0,678,2020-10-02,"[PC, Xbox One, Nintendo Switch, PlayStation 4]","[Action, Adventure, RPG]","[GOG, Xbox Store, Steam, PlayStation Store]","[Singleplayer, Atmospheric, Full controller su...",Mature
3,3,5286,tomb-raider,Tomb Raider (2013),2013-03-05,False,86.0,664,2020-08-03,"[PC, PlayStation 4, PlayStation 3, Xbox 360, X...","[Action, Adventure]","[App Store, Google Play, PlayStation Store, St...","[Singleplayer, Multiplayer, Atmospheric, Full ...",Mature
4,4,5679,the-elder-scrolls-v-skyrim,The Elder Scrolls V: Skyrim,2011-11-11,False,94.0,621,2020-07-06,"[PC, PlayStation 3, Xbox 360, Nintendo Switch]","[Action, RPG]","[Xbox 360 Store, Nintendo Store, Steam, PlaySt...","[Singleplayer, Steam Achievements, steam-tradi...",Mature
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
521608,521608,1973,no-human,"No, Human",2010-08-24,False,,78,2019-10-22,iOS,"[Action, Puzzle]",App Store,"[Physics, Music, Story, Relaxing, Destruction]",Everyone 10+
521609,521609,1971,descend-rpg,Descend RPG,2011-10-13,False,,140,2019-10-22,iOS,RPG,App Store,"[RPG, Mystery, combat, battle, party, fight, s...",Everyone 10+
521610,521610,1957,hodappybird,HodappyBird,2014-04-17,False,,67,2019-10-22,iOS,"[Action, Arcade]",App Store,"[color, bird]",Everyone 10+
521611,521611,1688,brigands-and-barbarians-hd,Brigands and Barbarians HD,2012-11-14,False,,225,2019-10-22,iOS,"[Adventure, RPG]",App Store,"[RPG, Fantasy, War, JRPG, combat, Dragons, pri...",Everyone 10+


In [7]:
# The 'index' column only served as the join key for the merge, so leave it out.
games = output.drop(columns = 'index')
games

Unnamed: 0,id,slug,name,released,tba,metacritic,suggestions_count,updated,platforms,genres,stores,tags,esrb_rating
0,3498,grand-theft-auto-v,Grand Theft Auto V,2013-09-17,False,97.0,416,2021-03-03,"[PC, Xbox Series S/X, PlayStation 5, PlayStati...","[Action, Adventure]","[Epic Games, PlayStation Store, Xbox Store, Xb...","[Singleplayer, Steam Achievements, Multiplayer...",Mature
1,4200,portal-2,Portal 2,2011-04-18,False,95.0,582,2020-08-03,"[Xbox One, PlayStation 3, PC, Xbox 360, Linux,...","[Shooter, Puzzle]","[Xbox Store, Xbox 360 Store, PlayStation Store...","[Singleplayer, Steam Achievements, Multiplayer...",Everyone 10+
2,3328,the-witcher-3-wild-hunt,The Witcher 3: Wild Hunt,2015-05-18,False,92.0,678,2020-10-02,"[PC, Xbox One, Nintendo Switch, PlayStation 4]","[Action, Adventure, RPG]","[GOG, Xbox Store, Steam, PlayStation Store]","[Singleplayer, Atmospheric, Full controller su...",Mature
3,5286,tomb-raider,Tomb Raider (2013),2013-03-05,False,86.0,664,2020-08-03,"[PC, PlayStation 4, PlayStation 3, Xbox 360, X...","[Action, Adventure]","[App Store, Google Play, PlayStation Store, St...","[Singleplayer, Multiplayer, Atmospheric, Full ...",Mature
4,5679,the-elder-scrolls-v-skyrim,The Elder Scrolls V: Skyrim,2011-11-11,False,94.0,621,2020-07-06,"[PC, PlayStation 3, Xbox 360, Nintendo Switch]","[Action, RPG]","[Xbox 360 Store, Nintendo Store, Steam, PlaySt...","[Singleplayer, Steam Achievements, steam-tradi...",Mature
...,...,...,...,...,...,...,...,...,...,...,...,...,...
521608,1973,no-human,"No, Human",2010-08-24,False,,78,2019-10-22,iOS,"[Action, Puzzle]",App Store,"[Physics, Music, Story, Relaxing, Destruction]",Everyone 10+
521609,1971,descend-rpg,Descend RPG,2011-10-13,False,,140,2019-10-22,iOS,RPG,App Store,"[RPG, Mystery, combat, battle, party, fight, s...",Everyone 10+
521610,1957,hodappybird,HodappyBird,2014-04-17,False,,67,2019-10-22,iOS,"[Action, Arcade]",App Store,"[color, bird]",Everyone 10+
521611,1688,brigands-and-barbarians-hd,Brigands and Barbarians HD,2012-11-14,False,,225,2019-10-22,iOS,"[Adventure, RPG]",App Store,"[RPG, Fantasy, War, JRPG, combat, Dragons, pri...",Everyone 10+


In [5]:
# Collect every distinct genre in order of first appearance.
# A cell of games['genres'] is either a list of names or a single bare value,
# so bare values are wrapped to let both shapes go through the same loop.
# The column is read directly (DataFrame.iteritems no longer exists in pandas >= 2.0)
# and dict.fromkeys de-duplicates while keeping insertion order.
genres = list(dict.fromkeys(
    item
    for row in games['genres']
    for item in (row if type(row) is list else [row])
))

pd.Series(genres).reset_index()

Unnamed: 0,index,0
0,0,Action
1,1,Adventure
2,2,Shooter
3,3,Puzzle
4,4,RPG
5,5,Indie
6,6,Platformer
7,7,Sports
8,8,Racing
9,9,Massively Multiplayer


In [5]:
# Collect every distinct platform in order of first appearance.
# A cell of games['platforms'] is either a list of names or a single bare value,
# so bare values are wrapped to let both shapes go through the same loop.
# The column is read directly (DataFrame.iteritems no longer exists in pandas >= 2.0)
# and dict.fromkeys de-duplicates while keeping insertion order.
uniques = list(dict.fromkeys(
    item
    for row in games['platforms']
    for item in (row if type(row) is list else [row])
))

pd.Series(uniques).reset_index()

Unnamed: 0,index,0
0,0,PC
1,1,Xbox Series S/X
2,2,PlayStation 5
3,3,PlayStation 4
4,4,PlayStation 3
5,5,Xbox 360
6,6,Xbox One
7,7,Linux
8,8,macOS
9,9,Nintendo Switch


These are the unique platforms in the data set.

- iOS, Android and Web should be dropped.
- We have to decide whether older consoles should be kept here or not (i.e. from which release year onward a platform will be considered valid).

In [8]:
# Write the reduced dataset to disk as UTF-8 CSV, without the row index.
games.to_csv("rawg_games_smaller.csv", encoding='utf-8', index=False)