In [89]:
import pandas as pd

from datetime import datetime as dt
#from copy import deepcopy
import pickle

%matplotlib inline

In [90]:
# Load the pickled dataframe this notebook works on.
# NOTE: unpickling can execute arbitrary code - only load pickle files you trust.
SCORE_PICKLE_PATH = 'score_df.pk1'
df = pd.read_pickle(SCORE_PICKLE_PATH)

# Generate 'game type' characteristic (genre + gameplay)

Combining the 'genre' and 'gameplay' columns, since they carry similar/interchangeable information. The combined column is called 'game_type'.

In [91]:
# Replace NaNs with empty strings so the string concatenation below does not
# produce NaN for rows that are missing either column.
# The result is assigned back rather than calling fillna(inplace=True) on the
# attribute accessor: that chained in-place form is deprecated and leaves df
# unchanged once pandas copy-on-write is active.
df['genre'] = df['genre'].fillna('')
df['gameplay'] = df['gameplay'].fillna('')

# Combined descriptor, e.g. "<genre> & <gameplay>"; the " & " separator is what
# the later catch-all re-classification rule keys on.
df['game_type'] = df['genre'] + " & " + df['gameplay']

df = df.drop(columns=['genre', 'gameplay'])  # now redundant

Pulling out defining gameplay trait keywords to re-classify 'game type'

In [92]:
# Ordered (regex, label) rules used to collapse the free-form 'game_type' text
# into a handful of categories. Order matters: the first rule that matches a
# value rewrites it to its label, and labels do not match any later rule
# (except 'Puzzle/Strategy', which maps onto itself).
GAME_TYPE_RULES = [
    (r'.*RPG.*', 'RPG'),                  # RPGs
    (r'.*Driving.*', 'Racing'),           # racing
    (r'.*Shoot.*', 'Shooter'),            # shooters
    (r'.*Platform.*', 'Platformer'),      # platformers
    (r'.*Fight.*', 'Fighting'),           # fighting
    (r'.*Sport.*', 'Sports'),             # sports
    # Open world, adventure, metroidvania, etc. form a general adventure category
    (r'.*Adventure.*', 'Adventure'),
    (r'.*Metroid.*', 'Adventure'),
    (r'.*Open\ World.*', 'Adventure'),
    # Puzzle/Strategy grouped together as more cerebral games
    (r'.*Puzzle.*', 'Puzzle/Strategy'),
    (r'.*Strat.*', 'Puzzle/Strategy'),
    # Stragglers/unique genres: anything still containing the "&" separator
    (r'.*\&.*', 'Misc Action/Other'),
]

# Apply the rules to the 'game_type' column only. Running the regex replace on
# the whole DataFrame would also rewrite any other string column (title,
# developer, publisher, ...) whose value happens to contain a keyword or "&".
for pattern, label in GAME_TYPE_RULES:
    df['game_type'] = df['game_type'].replace(pattern, label, regex=True)

In [93]:
# Sanity check: every row should now fall into one of the consolidated categories
df['game_type'].value_counts()

Shooter              67
RPG                  48
Misc Action/Other    36
Platformer           36
Racing               24
Adventure            17
Puzzle/Strategy      14
Fighting             12
Sports               11
Name: game_type, dtype: int64

# Generate target_age characteristic (from esrb)

In [94]:
# Minimum recommended age for each ESRB rating, used to turn the categorical
# rating into a continuous value.
ESRB_MIN_AGE = {
    'Everyone': 6,
    'Everyone 10+': 10,
    'Teen': 13,
    'Mature': 17,
}

# Series.map returns a new Series, so the original 'esrb rating' column is not
# overwritten and no deepcopy is needed (deepcopy is not imported in this
# notebook, so calling it fails on a fresh kernel).
# Missing ratings, and any rating not listed in ESRB_MIN_AGE, become NaN.
esrb_column = df['esrb rating'].map(ESRB_MIN_AGE)

df['target_age'] = esrb_column

In [95]:
# Distribution of the derived minimum-age values (NaNs are excluded by value_counts)
df.loc[:, 'target_age'].value_counts()

17.0    62
10.0    34
13.0    28
6.0     13
Name: target_age, dtype: int64

# Create dummy columns (console, game type)

In [96]:
# One-hot encode the categorical columns. drop_first=True omits the first level
# of each column, so that level is represented by all of its dummies being 0.
categorical_columns = ['game_type', 'console']
df_mod = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Peek at a single row to confirm the new dummy columns
df_mod.head(1)

Unnamed: 0,title,released,perspective,setting,developed by,published by,esrb rating,score,target_age,game_type_Fighting,game_type_Misc Action/Other,game_type_Platformer,game_type_Puzzle/Strategy,game_type_RPG,game_type_Racing,game_type_Shooter,game_type_Sports,console_PlayStation 4,console_Xbox One
1,1-2-Switch,"Mar 03, 2017","1st-person, Audio game",,Nintendo EPD,Nintendo of America Inc.,Everyone 10+,48,10.0,0,1,0,0,0,0,0,0,0,0


# Clean up

In [97]:
# Drop rows with NaN target_age (i.e. no usable esrb rating).
# The explicit .copy() makes df_mod an independent frame, so the column
# assignment below does not raise SettingWithCopyWarning.
df_mod = df_mod[df_mod['target_age'].notna()].copy()

# Turn release date into proper datetime
df_mod['released'] = pd.to_datetime(df_mod['released'])

# Reorder columns to put the target ('score') at the end
cols = [col for col in df_mod.columns if col != 'score']
df_mod = df_mod[cols + ['score']]

# Create pickle for next steps

In [99]:
# Persist the cleaned-up dataframe to disk for the next steps
with open('df_mod.pk1', 'wb') as pickle_file:
    pickle.dump(df_mod, pickle_file)