In [1]:
import sys
import warnings
warnings.filterwarnings('ignore')

sys.path.append("../utils/")
from myFunctions import jsonGzipToDataframe, toDommyColumns

Extraemos los datos del archivo origina y lo cargamos en un dataframe

In [2]:
df = jsonGzipToDataframe('../datasource/steam_games.json.gz')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120445 entries, 0 to 120444
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   publisher     24083 non-null  object
 1   genres        28852 non-null  object
 2   app_name      32133 non-null  object
 3   title         30085 non-null  object
 4   url           32135 non-null  object
 5   release_date  30068 non-null  object
 6   tags          31972 non-null  object
 7   reviews_url   32133 non-null  object
 8   specs         31465 non-null  object
 9   price         30758 non-null  object
 10  early_access  32135 non-null  object
 11  id            32133 non-null  object
 12  developer     28836 non-null  object
dtypes: object(13)
memory usage: 11.9+ MB


Nos aseguramos de trabajar en otro espacio de memoria para no tener que volver a cargar el JSON

In [4]:
# Nos aseguramos de crear un nuevo objeto en memoria usando 'copy()'
dfSteamGames = df.copy()

#### ðŸ”Ž ID

Si hay 'id' duplicados nos quedamos con el primero y los demas los eliminamos

In [5]:
# Eliminar filas con 'user_id' duplicados
dfSteamGames = dfSteamGames.drop_duplicates(['id'], keep = 'first').reset_index(drop = True)

In [6]:
dfSteamGames.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32133 entries, 0 to 32132
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   publisher     24081 non-null  object
 1   genres        28850 non-null  object
 2   app_name      32131 non-null  object
 3   title         30083 non-null  object
 4   url           32132 non-null  object
 5   release_date  30066 non-null  object
 6   tags          31970 non-null  object
 7   reviews_url   32132 non-null  object
 8   specs         31463 non-null  object
 9   price         30755 non-null  object
 10  early_access  32132 non-null  object
 11  id            32132 non-null  object
 12  developer     28834 non-null  object
dtypes: object(13)
memory usage: 3.2+ MB


In [7]:
# Aplicamos una mascara para marcar los valores NaN y los sumamos por columna
print(f'NameColumn       NaN\n{dfSteamGames.isna().sum()}')

NameColumn       NaN
publisher       8052
genres          3283
app_name           2
title           2050
url                1
release_date    2067
tags             163
reviews_url        1
specs            670
price           1378
early_access       1
id                 1
developer       3299
dtype: int64


Eliminemos los juegos que no tienen 'id'

In [8]:
# Elimina las filas que tienen valor NaN en la columna 'id'
dfSteamGames = dfSteamGames.dropna(subset = ['id']).reset_index(drop = True)

#### ðŸ”Ž PRICE

Busquemos en la columna 'price' valores que no sean numericos

In [9]:
dfSteamGames.loc[
  ~dfSteamGames['price']. astype(str).
  str.isnumeric() & (dfSteamGames['price'].astype(str).str.len() > 3), 'price'
].unique()

array([4.99, 'Free To Play', 'Free to Play', 0.99, 2.99, 3.99, 9.99,
       18.99, 29.99, 'Free', 10.99, 1.59, 14.99, 1.99, 59.99, 8.99, 6.99,
       7.99, 39.99, 19.99, 7.49, 12.99, 5.99, 2.49, 15.99, 1.25, 24.99,
       17.99, 61.99, 3.49, 11.99, 13.99, 'Free Demo', 'Play for Free!',
       34.99, 74.76, 1.49, 32.99, 99.99, 14.95, 69.99, 16.99, 79.99,
       49.99, 44.99, 13.98, 29.96, 119.99, 109.99, 149.99, 771.71,
       'Install Now', 21.99, 89.99, 'Play WARMACHINE: Tactics Demo', 0.98,
       139.92, 4.29, 64.99, 'Free Mod', 54.99, 74.99, 'Install Theme',
       0.89, 'Third-party', 'Play Now', 299.99, 1.29, 15.0, 5.49, 23.99,
       49.0, 20.99, 10.93, 1.39, 'Free HITMANâ„¢ Holiday Pack', 36.99, 4.49,
       234.99, 1.95, 199.0, 189.0, 6.66, 27.99, 10.49, 129.99, 179.0,
       26.99, 399.99, 31.99, 399.0, 20.0, 40.0, 3.33, 199.99, 22.99,
       320.0, 38.85, 71.7, 59.95, 995.0, 27.49, 3.39, 19.95, 499.99,
       16.06, 4.68, 131.4, 44.98, 202.76, 0.95, 172.24, 249.99, 2.97,
   

In [10]:
# Crear una lista con valores invalidos a borrar en la columna 'price'
invalidValues = ['Free Movie', 'Install Now', 'Install Theme', 'Third-party', 'Play Now']

In [11]:
# Crear una lista con valores invalidos a reemplazar en la columna price
replaceValues = ['Free', 'Free Demo', 'Free Mod', 'Free to Use', 'Free To Play', 'Free to Play',
   'Free to Try', 'Play the Demo', 'Play for Free!', 'Free HITMANâ„¢ Holiday Pack',
   'Play WARMACHINE: Tactics Demo']

Â¿Son mucho o pocos los valores invalidos en el precio?

In [12]:
dfSteamGames['price'].value_counts()[invalidValues]

price
Free Movie       1
Install Now      1
Install Theme    1
Third-party      2
Play Now         2
Name: count, dtype: int64

In [13]:
# Eliminar las filas que no tengan valores validos en la columna 'price'
dfSteamGames = dfSteamGames.loc[~dfSteamGames['price'].isin(invalidValues)].reset_index(drop = True)

In [14]:
# Sustituir el precio que contenga 'Free' por 0.00 en la columna 'price'
dfSteamGames['price'].replace(replaceValues, 0.00, inplace = True)

dfSteamGames['price'].replace(['Starting at $499.00'], 499.00, inplace = True)
dfSteamGames['price'].replace(['Starting at $449.00'], 449.00, inplace = True)

In [15]:
# Cambiar la columna 'price' a tipo float en el DataFrame 'df'
dfSteamGames['price'] = dfSteamGames['price'].astype('float16')

In [16]:
dfSteamGames['price'].unique().round(2)

array([4.9883e+00, 0.0000e+00, 9.9023e-01, 2.9902e+00, 3.9902e+00,
       9.9922e+00, 1.8984e+01, 2.9984e+01,        nan, 1.0992e+01,
       1.5898e+00, 1.4992e+01, 1.9902e+00, 6.0000e+01, 8.9922e+00,
       6.9883e+00, 7.9883e+00, 4.0000e+01, 1.9984e+01, 7.4883e+00,
       1.2992e+01, 5.9883e+00, 2.4902e+00, 1.5992e+01, 1.2500e+00,
       2.4984e+01, 1.7984e+01, 6.2000e+01, 3.4902e+00, 1.1992e+01,
       1.3992e+01, 3.5000e+01, 7.4750e+01, 1.4902e+00, 3.3000e+01,
       1.0000e+02, 1.4953e+01, 7.0000e+01, 1.6984e+01, 8.0000e+01,
       5.0000e+01, 5.0000e+00, 4.5000e+01, 1.3977e+01, 2.9953e+01,
       1.2000e+02, 1.1000e+02, 1.5000e+02,        inf, 2.1984e+01,
       9.0000e+01, 9.7998e-01, 1.3988e+02, 4.2891e+00, 6.5000e+01,
       5.5000e+01, 7.5000e+01, 8.9014e-01, 5.0000e-01, 3.0000e+02,
       1.2900e+00, 3.0000e+00, 1.5000e+01, 5.4883e+00, 2.3984e+01,
       4.9000e+01, 2.0984e+01, 1.0930e+01, 1.3896e+00, 3.7000e+01,
       4.4883e+00, 2.0000e+00, 4.0000e+00, 9.0000e+00, 2.3500e

In [17]:
dfSteamGames.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32125 entries, 0 to 32124
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   publisher     24078 non-null  object 
 1   genres        28844 non-null  object 
 2   app_name      32124 non-null  object 
 3   title         30076 non-null  object 
 4   url           32125 non-null  object 
 5   release_date  30059 non-null  object 
 6   tags          31963 non-null  object 
 7   reviews_url   32125 non-null  object 
 8   specs         31456 non-null  object 
 9   price         30748 non-null  float16
 10  early_access  32125 non-null  object 
 11  id            32125 non-null  object 
 12  developer     28828 non-null  object 
dtypes: float16(1), object(12)
memory usage: 3.0+ MB


Listo los datos 'masticados' para que los cinetificos de datos hagan lo suyo

In [18]:
# Guardar el DataFrame en un archivo .csv comprimido con gzip
dfSteamGames.to_csv('../datasource/steam_games_chewed.csv.gz', compression='gzip')