# Proceso [ETL] de el archivo output_steam_games.json



pandas
numpy
uvicorn
pyarrow
fastapi
scikit-learn
ipykernel
matplotlib

In [33]:
#importamos las librerias necesarias para nuestra ETL

import numpy as np
import pandas as pd 
import json 
import ast
from textblob import TextBlob
import parquet
import pyarrow
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import pandas as pd
import json



***Abrimos nuestro archivo***

In [34]:
df = pd.read_parquet('D:\HENRY\ProyectoPIMLops\PIMLops-STEAM\RawFiles/steamgamesraw.parquet')

In [35]:
df_steam_games = df

***Verificamos los nulos en nuestro dataset***

In [36]:
df_steam_games.isna().sum()

publisher       96362
genres          91593
app_name        88312
title           90360
url             88310
release_date    90377
tags            88473
reviews_url     88312
specs           88980
price               0
early_access    88310
id              88312
developer       91609
dtype: int64

Eliminamos los nulos completamente vacíos

In [37]:
df_steam_games = df_steam_games.dropna(how='all').reset_index(drop=True)

Procesamos y eliminamos duplicados/nulos de la columna item ID 

renombramos columnas

In [38]:
df_steam_games = df_steam_games.rename(columns={'id': 'item_id'})
df_steam_games = df_steam_games.rename(columns={'app_name': 'item_name'})

In [39]:
#eliminamos nulos y duplicados
df_steam_games.drop_duplicates(subset='item_id',inplace=True)
df_steam_games.dropna(subset='item_id',inplace=True)

In [40]:
# Convertir la columna 'item_id' a int
df_steam_games['item_id'] = df_steam_games['item_id'].astype(int)

volvemos a verificar nulos

In [41]:
#Revisamos que los nulos hayan desaparecido
nulos_por_columna = df_steam_games.isnull().sum()
nulos_por_columna

publisher       8051
genres          3282
item_name          1
title           2049
url                0
release_date    2066
tags             162
reviews_url        0
specs            669
price              0
early_access       0
item_id            0
developer       3298
dtype: int64

In [42]:
# Asumiendo que df_steam_games es tu DataFrame y 'genres' es la columna con los géneros
df_steam_games = df_steam_games.explode('genres')



In [43]:
#Verificamos la cantidad de Developers que tenemos en nuestros datos
unique_developers_count = df_steam_games['developer'].nunique()

print(f"Total de desarrolladores únicos: {unique_developers_count}")

Total de desarrolladores únicos: 10992


***Eliminamos las columnas que no necesitamos y revisamos nuestro dataframe***

In [44]:
df_steam_games.drop(columns=["publisher","title","url","early_access","reviews_url","specs","price"], inplace=True)

Extraemos solo el año de release date

In [45]:
df_steam_games['release_date'] = df_steam_games['release_date'].str.extract(r'(\d{4})')

***Realizamos la transformación de nuestros datos para utilizar luego***

In [46]:

def clean_data(df_steam_games):
    df_steam_games = df_steam_games.astype({'item_name': 'string','tags': 'object','developer': 'string','tags': 'string','release_date': 'string'})
    return df_steam_games

df_steam_games = clean_data(df_steam_games)


Pasamos la columna "release_date" la cual tiene el año de lanzamiento a int y rellenamos sus nulos con 0 

In [47]:
#Rellenamos los nulos para convertir a int
df_steam_games['release_date'].fillna('0', inplace=True)
#Convertimos a int
df_steam_games['release_date'] = df_steam_games['release_date'].astype(int)

Observamos los cambios realizados


In [48]:
df_steam_games.info()

<class 'pandas.core.frame.DataFrame'>
Index: 74833 entries, 88310 to 120444
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   genres        71551 non-null  object
 1   item_name     74831 non-null  string
 2   release_date  74833 non-null  int32 
 3   tags          74649 non-null  string
 4   item_id       74833 non-null  int32 
 5   developer     71355 non-null  string
dtypes: int32(2), object(1), string(3)
memory usage: 3.4+ MB


In [49]:
df_steam_games.reset_index(drop=True, inplace=True)

## Aplicamos funciones que iteren en la columna tags, para rellenar los NaN en Genres con sus únicos

Es importante remarcar que tenemos que tener el mismo tipo de dato en nuestras columnas


In [50]:
# Rellenar los valores faltantes con una lista vacía
df_steam_games['genres'] = df_steam_games['genres'].fillna('[]')  
# Asegurarse de que los géneros estén en formato de lista
df_steam_games['genres'] = df_steam_games['genres'].apply(lambda x: x if isinstance(x, list) else [x])
# Convertir la lista de géneros a una cadena separada por comas
df_steam_games['genres'] = df_steam_games['genres'].apply(lambda x: ', '.join(x))  
# Crear variables ficticias para los géneros
dummy_genres = df_steam_games['genres'].str.get_dummies(', ')
# Concatenar las variables ficticias con el DataFrame original
df_steam_games = pd.concat([df_steam_games, dummy_genres], axis=1)

In [51]:
df_steam_games

Unnamed: 0,genres,item_name,release_date,tags,item_id,developer,Accounting,Action,Adventure,Animation &amp; Modeling,...,RPG,Racing,Simulation,Software Training,Sports,Strategy,Utilities,Video Production,Web Publishing,[]
0,Action,Lost Summoner Kitty,2018,['Strategy' 'Action' 'Indie' 'Casual' 'Simulat...,761140,Kotoshiro,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Casual,Lost Summoner Kitty,2018,['Strategy' 'Action' 'Indie' 'Casual' 'Simulat...,761140,Kotoshiro,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Indie,Lost Summoner Kitty,2018,['Strategy' 'Action' 'Indie' 'Casual' 'Simulat...,761140,Kotoshiro,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Simulation,Lost Summoner Kitty,2018,['Strategy' 'Action' 'Indie' 'Casual' 'Simulat...,761140,Kotoshiro,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,Strategy,Lost Summoner Kitty,2018,['Strategy' 'Action' 'Indie' 'Casual' 'Simulat...,761140,Kotoshiro,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74828,Racing,Russian Roads,2018,['Indie' 'Simulation' 'Racing'],610660,Laush Dmitriy Sergeevich,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
74829,Simulation,Russian Roads,2018,['Indie' 'Simulation' 'Racing'],610660,Laush Dmitriy Sergeevich,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
74830,Casual,EXIT 2 - Directions,2017,['Indie' 'Casual' 'Puzzle' 'Singleplayer' 'Atm...,658870,"xropi,stev3ns",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
74831,Indie,EXIT 2 - Directions,2017,['Indie' 'Casual' 'Puzzle' 'Singleplayer' 'Atm...,658870,"xropi,stev3ns",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Transformamos la columna "tags" para sacarla de lista quedándonos solo con sus valores separados por coma

In [52]:
# Función para transformar la columna 'tags'
def transform_tags(tags):
    try:
        if isinstance(tags, str):
            tags_list = ast.literal_eval(tags)
            return ', '.join(tags_list)
    except SyntaxError:
        pass
    return tags
# Aplica la función al DataFrame
df_steam_games['tags'] = df_steam_games['tags'].apply(transform_tags)

Creamos la funcion que rellenará los nulos en la columna "genres" con los valores unicos que posee, buscándolos en la columna "tags"

In [53]:
def fill_genres(row):
    if row['genres'] == '[, ]':  # o pd.isna(row['genres'])
        # divide la cadena de tags en una lista de tags
        if pd.notna(row['tags']):
            tags = row['tags'].split(', ')
            # encuentra la intersección de tags y unique_genres
            genres_in_tags = list(set(tags) & set(unique_genres))
            # si se encuentran géneros en tags, únelos en una cadena
            if genres_in_tags:
                return ', '.join(genres_in_tags)
        return row['genres']
    else:
        return row['genres']
# obtén los géneros únicos
unique_genres = df_steam_games['genres'].dropna().unique()
# aplica la función
df_steam_games['genres'] = df_steam_games.apply(fill_genres, axis=1)

In [54]:
df_steam_games

Unnamed: 0,genres,item_name,release_date,tags,item_id,developer,Accounting,Action,Adventure,Animation &amp; Modeling,...,RPG,Racing,Simulation,Software Training,Sports,Strategy,Utilities,Video Production,Web Publishing,[]
0,Action,Lost Summoner Kitty,2018,StrategyActionIndieCasualSimulation,761140,Kotoshiro,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Casual,Lost Summoner Kitty,2018,StrategyActionIndieCasualSimulation,761140,Kotoshiro,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Indie,Lost Summoner Kitty,2018,StrategyActionIndieCasualSimulation,761140,Kotoshiro,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Simulation,Lost Summoner Kitty,2018,StrategyActionIndieCasualSimulation,761140,Kotoshiro,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,Strategy,Lost Summoner Kitty,2018,StrategyActionIndieCasualSimulation,761140,Kotoshiro,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74828,Racing,Russian Roads,2018,IndieSimulationRacing,610660,Laush Dmitriy Sergeevich,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
74829,Simulation,Russian Roads,2018,IndieSimulationRacing,610660,Laush Dmitriy Sergeevich,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
74830,Casual,EXIT 2 - Directions,2017,IndieCasualPuzzleSingleplayerAtmosphericRelaxing,658870,"xropi,stev3ns",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
74831,Indie,EXIT 2 - Directions,2017,IndieCasualPuzzleSingleplayerAtmosphericRelaxing,658870,"xropi,stev3ns",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Cambiamos el tipo de dato de genres

In [55]:

def clean_data(df_steam_games):
    df_steam_games = df_steam_games.astype({'genres': 'string'})
    return df_steam_games

df_steam_games = clean_data(df_steam_games)
df_steam_games.head()

Unnamed: 0,genres,item_name,release_date,tags,item_id,developer,Accounting,Action,Adventure,Animation &amp; Modeling,...,RPG,Racing,Simulation,Software Training,Sports,Strategy,Utilities,Video Production,Web Publishing,[]
0,Action,Lost Summoner Kitty,2018,StrategyActionIndieCasualSimulation,761140,Kotoshiro,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Casual,Lost Summoner Kitty,2018,StrategyActionIndieCasualSimulation,761140,Kotoshiro,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Indie,Lost Summoner Kitty,2018,StrategyActionIndieCasualSimulation,761140,Kotoshiro,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Simulation,Lost Summoner Kitty,2018,StrategyActionIndieCasualSimulation,761140,Kotoshiro,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,Strategy,Lost Summoner Kitty,2018,StrategyActionIndieCasualSimulation,761140,Kotoshiro,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


***Las líneas que contienen "[, ]" en genre, son porque no poseen ningún género o tag específico en la columna tags***

Rellenamos los nulos con "SinDato" debido a que no podemos rellenarlos con un valor específico ubicado en otras columnas


In [56]:
df_steam_games.fillna('SinDato',inplace=True)

In [57]:
df_steam_games['genres'] = df_steam_games['genres'].str.replace("[", "", regex=False)
df_steam_games['genres'] = df_steam_games['genres'].str.replace("]", "", regex=False)



In [58]:
#Solucionamos el error de conversión a parquet de "halffloat" transformando a float32
for col in df_steam_games.select_dtypes(include=[np.float16]).columns:
    df_steam_games[col] = df_steam_games[col].astype('float32')


In [59]:
df_steam_games

Unnamed: 0,genres,item_name,release_date,tags,item_id,developer,Accounting,Action,Adventure,Animation &amp; Modeling,...,RPG,Racing,Simulation,Software Training,Sports,Strategy,Utilities,Video Production,Web Publishing,[]
0,Action,Lost Summoner Kitty,2018,StrategyActionIndieCasualSimulation,761140,Kotoshiro,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Casual,Lost Summoner Kitty,2018,StrategyActionIndieCasualSimulation,761140,Kotoshiro,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Indie,Lost Summoner Kitty,2018,StrategyActionIndieCasualSimulation,761140,Kotoshiro,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Simulation,Lost Summoner Kitty,2018,StrategyActionIndieCasualSimulation,761140,Kotoshiro,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,Strategy,Lost Summoner Kitty,2018,StrategyActionIndieCasualSimulation,761140,Kotoshiro,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74828,Racing,Russian Roads,2018,IndieSimulationRacing,610660,Laush Dmitriy Sergeevich,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
74829,Simulation,Russian Roads,2018,IndieSimulationRacing,610660,Laush Dmitriy Sergeevich,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
74830,Casual,EXIT 2 - Directions,2017,IndieCasualPuzzleSingleplayerAtmosphericRelaxing,658870,"xropi,stev3ns",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
74831,Indie,EXIT 2 - Directions,2017,IndieCasualPuzzleSingleplayerAtmosphericRelaxing,658870,"xropi,stev3ns",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [60]:
#Utilizamos la extensión data wrangler para borrar las columnas innecesarias
def clean_data(df_steam_games):
    # Drop columns: 'tags', 'price' and 22 other columns
    df_steam_games = df_steam_games.drop(columns=['tags', 'Accounting', 'Action', 'Adventure', 'Animation &amp; Modeling', 'Audio Production', 'Casual', 'Design &amp; Illustration', 'Early Access', 'Education', 'Free to Play', 'Indie', 'Massively Multiplayer', 'Photo Editing', 'RPG', 'Racing', 'Simulation', 'Software Training', 'Sports', 'Strategy', 'Utilities', 'Video Production', 'Web Publishing'])
    return df_steam_games
df_steam_games = clean_data(df_steam_games)

In [61]:
# Change column type to object for column: 'genres'
df_steam_games = df_steam_games.astype({'genres': 'object'})


Elimino los tags que se colaron en genres que afectan nuestro EDA 

In [62]:
df_steam_games = df_steam_games[df_steam_games['genres'] != 'Early Access']
df_steam_games = df_steam_games[df_steam_games['genres'] != 'free to play']

In [63]:
#Eliminamos todos los datos iguales a "sindato" y en release date los que tengan "0" 
df_steam_games = df_steam_games[(df_steam_games != 'SinDato').all(axis=1) & (df_steam_games['release_date'] != 0)]

In [64]:
df_steam_games.to_parquet('Steam_games.parquet')