ETL (Extracción, Transformación y Carga) es un proceso en el que se extraen datos de una fuente, se transforman y se cargan en otra fuente. En este caso, se extraen datos de un archivo json, se transforman y se cargan en un archivo csv.

In [1]:
# Cargamos las librerias necesarias para el desarrollo del ETL.
import sys
sys.path.append("../Lib")
import utils as ut
import pandas as pd
import gzip
import ast
import warnings
warnings.filterwarnings("ignore")

Creamos una función que se encargue de leer el archivo json y devolver un dataframe de pandas.

In [2]:
# Funcion para convertir un archivo .gz a un dataframe.
def gzip_to_df(file_path):                                          
    with gzip.open(file_path, 'rt', encoding='utf-8-sig') as file:  # Abrimos el archivo .gz
        return [ast.literal_eval(line) for line in file]            # Retornamos el archivo .gz como un dataframe.

In [3]:
# Cargamos los datos y los convertimos en un dataframe.
data = gzip_to_df('../Datasets/Raw/users_items.json.gz')     # Cargamos los datos.
df_users_items = pd.DataFrame(data)                                    # Convertimos los datos en un dataframe.
df_users_items.head()                                                  # Mostramos los primeros 5 registros del dataframe.

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864385,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,137,76561198007712555,http://steamcommunity.com/id/evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,328,76561197963445855,http://steamcommunity.com/id/Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,541,76561198002099482,http://steamcommunity.com/id/doctr,"[{'item_id': '300', 'item_name': 'Day of Defea..."


In [4]:
ut.data_summ(df_users_items)


Total rows:  88310

Total full null rows:  0


Unnamed: 0,Column,Data_type,No_miss_Qty,%Missing,Missing_Qty
0,user_id,[<class 'str'>],88310,0.0,0
1,items_count,[<class 'int'>],88310,0.0,0
2,steam_id,[<class 'str'>],88310,0.0,0
3,user_url,[<class 'str'>],88310,0.0,0
4,items,[<class 'list'>],88310,0.0,0


In [5]:
df_users_items = pd.json_normalize(data, record_path=['items'], meta=['user_id','items_count','steam_id','user_url'] )
df_users_items

Unnamed: 0,item_id,item_name,playtime_forever,playtime_2weeks,user_id,items_count,steam_id,user_url
0,10,Counter-Strike,6,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...
1,20,Team Fortress Classic,0,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...
2,30,Day of Defeat,7,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...
3,40,Deathmatch Classic,0,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...
4,50,Half-Life: Opposing Force,0,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...
...,...,...,...,...,...,...,...,...
5153204,346330,BrainBread 2,0,0,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...
5153205,373330,All Is Dust,0,0,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...
5153206,388490,One Way To Die: Steam Edition,3,3,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...
5153207,521570,You Have 10 Seconds 2,4,4,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...


In [6]:
ut.data_summ(df_users_items)


Total rows:  5153209

Total full null rows:  0


Unnamed: 0,Column,Data_type,No_miss_Qty,%Missing,Missing_Qty
0,item_id,[<class 'str'>],5153209,0.0,0
1,item_name,[<class 'str'>],5153209,0.0,0
2,playtime_forever,[<class 'int'>],5153209,0.0,0
3,playtime_2weeks,[<class 'int'>],5153209,0.0,0
4,user_id,[<class 'str'>],5153209,0.0,0
5,items_count,[<class 'int'>],5153209,0.0,0
6,steam_id,[<class 'str'>],5153209,0.0,0
7,user_url,[<class 'str'>],5153209,0.0,0


In [7]:
# Obteniendo datos duplicados
duplicated = df_users_items.loc[df_users_items.duplicated()]
duplicated

Unnamed: 0,item_id,item_name,playtime_forever,playtime_2weeks,user_id,items_count,steam_id,user_url
164294,20,Team Fortress Classic,5,0,Nikiad,109,76561198084006094,http://steamcommunity.com/id/Nikiad
164295,50,Half-Life: Opposing Force,0,0,Nikiad,109,76561198084006094,http://steamcommunity.com/id/Nikiad
164296,70,Half-Life,0,0,Nikiad,109,76561198084006094,http://steamcommunity.com/id/Nikiad
164297,130,Half-Life: Blue Shift,0,0,Nikiad,109,76561198084006094,http://steamcommunity.com/id/Nikiad
164298,220,Half-Life 2,198,0,Nikiad,109,76561198084006094,http://steamcommunity.com/id/Nikiad
...,...,...,...,...,...,...,...,...
4898223,213670,South Park™: The Stick of Truth™,725,0,76561198080057659,39,76561198080057659,http://steamcommunity.com/profiles/76561198080...
4898224,221910,The Stanley Parable,53,0,76561198080057659,39,76561198080057659,http://steamcommunity.com/profiles/76561198080...
4898225,261030,The Walking Dead: Season Two,253,0,76561198080057659,39,76561198080057659,http://steamcommunity.com/profiles/76561198080...
4898226,273110,Counter-Strike Nexon: Zombies,0,0,76561198080057659,39,76561198080057659,http://steamcommunity.com/profiles/76561198080...


In [8]:
# Eliminando datos duplicados
df_users_items.drop_duplicates(inplace=True)
df_users_items


Unnamed: 0,item_id,item_name,playtime_forever,playtime_2weeks,user_id,items_count,steam_id,user_url
0,10,Counter-Strike,6,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...
1,20,Team Fortress Classic,0,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...
2,30,Day of Defeat,7,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...
3,40,Deathmatch Classic,0,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...
4,50,Half-Life: Opposing Force,0,0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...
...,...,...,...,...,...,...,...,...
5153204,346330,BrainBread 2,0,0,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...
5153205,373330,All Is Dust,0,0,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...
5153206,388490,One Way To Die: Steam Edition,3,3,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...
5153207,521570,You Have 10 Seconds 2,4,4,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329...


In [9]:
# Eliminamos columnas que no se van a utilizar
df_users_items.drop(['playtime_2weeks', 'steam_id','user_url'], axis=1, inplace=True)
df_users_items

Unnamed: 0,item_id,item_name,playtime_forever,user_id,items_count
0,10,Counter-Strike,6,76561197970982479,277
1,20,Team Fortress Classic,0,76561197970982479,277
2,30,Day of Defeat,7,76561197970982479,277
3,40,Deathmatch Classic,0,76561197970982479,277
4,50,Half-Life: Opposing Force,0,76561197970982479,277
...,...,...,...,...,...
5153204,346330,BrainBread 2,0,76561198329548331,7
5153205,373330,All Is Dust,0,76561198329548331,7
5153206,388490,One Way To Die: Steam Edition,3,76561198329548331,7
5153207,521570,You Have 10 Seconds 2,4,76561198329548331,7


In [10]:
# Revisando si hay valores menores o iguales a 0 en la columna "playtime_forever"
df_users_items[df_users_items['playtime_forever'] <= 0]

Unnamed: 0,item_id,item_name,playtime_forever,user_id,items_count
1,20,Team Fortress Classic,0,76561197970982479,277
3,40,Deathmatch Classic,0,76561197970982479,277
4,50,Half-Life: Opposing Force,0,76561197970982479,277
5,60,Ricochet,0,76561197970982479,277
6,70,Half-Life,0,76561197970982479,277
...,...,...,...,...,...
5153199,519170,The Journey Home,0,76561198326700687,177
5153200,358390,Stargunner,0,76561198326700687,177
5153201,521570,You Have 10 Seconds 2,0,76561198326700687,177
5153204,346330,BrainBread 2,0,76561198329548331,7


Dado que en etapas posteriores utilizaremos la columna 'playtime_forever' para analizar el tiempo de juego, eliminaremos los registros con valores de 0, ya que no son relevantes para el análisis

In [11]:
df_users_items = df_users_items[df_users_items['playtime_forever'] > 0]
df_users_items

Unnamed: 0,item_id,item_name,playtime_forever,user_id,items_count
0,10,Counter-Strike,6,76561197970982479,277
2,30,Day of Defeat,7,76561197970982479,277
8,300,Day of Defeat: Source,4733,76561197970982479,277
9,240,Counter-Strike: Source,1853,76561197970982479,277
10,3830,Psychonauts,333,76561197970982479,277
...,...,...,...,...,...
5153202,304930,Unturned,677,76561198329548331,7
5153203,227940,Heroes & Generals,43,76561198329548331,7
5153206,388490,One Way To Die: Steam Edition,3,76561198329548331,7
5153207,521570,You Have 10 Seconds 2,4,76561198329548331,7


In [12]:
df_users_items.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3246375 entries, 0 to 5153208
Data columns (total 5 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   item_id           object
 1   item_name         object
 2   playtime_forever  int64 
 3   user_id           object
 4   items_count       object
dtypes: int64(1), object(4)
memory usage: 148.6+ MB


In [13]:
df_users_items

Unnamed: 0,item_id,item_name,playtime_forever,user_id,items_count
0,10,Counter-Strike,6,76561197970982479,277
2,30,Day of Defeat,7,76561197970982479,277
8,300,Day of Defeat: Source,4733,76561197970982479,277
9,240,Counter-Strike: Source,1853,76561197970982479,277
10,3830,Psychonauts,333,76561197970982479,277
...,...,...,...,...,...
5153202,304930,Unturned,677,76561198329548331,7
5153203,227940,Heroes & Generals,43,76561198329548331,7
5153206,388490,One Way To Die: Steam Edition,3,76561198329548331,7
5153207,521570,You Have 10 Seconds 2,4,76561198329548331,7


In [14]:
# Guardar el dataframe en un archivo csv en la carpeta Clean
df_users_items.to_csv('../Datasets/Clean/users_items.csv.gz', compression='gzip', index=False, encoding='utf-8')

In [15]:
# Guardar el dataframe en un archivo parquet en la carpeta Datasets
df_users_items.to_parquet('../Datasets/users_items.parquet.gz', compression='gzip', index=False)