In [1]:
import pandas as pd
import sys

sys.path.append("../utils/")
from myFunctions import jsonGzipToDataframe2

Extraemos los datos del archivo origina y lo cargamos en un dataframe

In [2]:
df = jsonGzipToDataframe2('../datasource/users_items.json.gz')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88310 entries, 0 to 88309
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      88310 non-null  object
 1   items_count  88310 non-null  int64 
 2   steam_id     88310 non-null  object
 3   user_url     88310 non-null  object
 4   items        88310 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.4+ MB


In [3]:
df.shape

(88310, 5)

Nos aseguramos de trabajar en otro espacio de memoria para no tener que volver a cargar el JSON

In [4]:
# Nos aseguramos de crear un nuevo objeto en memoria usando 'copy()'
dfUserItems = df[['user_id', 'items', 'items_count']].copy()

Si hay 'user_id' duplicados nos quedamos con el primero y los demas los eliminamos

In [5]:
# Eliminar filas duplicadas basadas en la columna 'user_id'
dfUserItems = dfUserItems.drop_duplicates(['user_id'], keep = 'first').reset_index(drop = True)
#dfUserItems = dfUserItems.drop_duplicates(subset = 'items', keep = 'first')


In [6]:
dfUserItems

Unnamed: 0,user_id,items,items_count
0,76561197970982479,"[{'item_id': '10', 'item_name': 'Counter-Strik...",277
1,js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik...",888
2,evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest...",137
3,Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik...",328
4,doctr,"[{'item_id': '300', 'item_name': 'Day of Defea...",541
...,...,...,...
87621,76561198323066619,"[{'item_id': '413850', 'item_name': 'CS:GO Pla...",22
87622,76561198326700687,"[{'item_id': '11020', 'item_name': 'TrackMania...",177
87623,XxLaughingJackClown77xX,[],0
87624,76561198329548331,"[{'item_id': '304930', 'item_name': 'Unturned'...",7


Vemos los juegos de un usuarios cualquiera

In [7]:
#Ver todos los diccionarios de la columna 'items' en la fila 77 
for dic in dfUserItems.loc[77, 'items']:
    print(dic) # con dic['item_name'] veo solo los nombres de los juegos

{'item_id': '8190', 'item_name': 'Just Cause 2', 'playtime_forever': 115, 'playtime_2weeks': 97}
{'item_id': '105600', 'item_name': 'Terraria', 'playtime_forever': 8592, 'playtime_2weeks': 198}
{'item_id': '212070', 'item_name': 'Star Conflict', 'playtime_forever': 15, 'playtime_2weeks': 0}
{'item_id': '218230', 'item_name': 'PlanetSide 2', 'playtime_forever': 53, 'playtime_2weeks': 0}
{'item_id': '230410', 'item_name': 'Warframe', 'playtime_forever': 22954, 'playtime_2weeks': 2099}
{'item_id': '236390', 'item_name': 'War Thunder', 'playtime_forever': 2544, 'playtime_2weeks': 0}
{'item_id': '214420', 'item_name': 'Gear Up', 'playtime_forever': 96, 'playtime_2weeks': 0}
{'item_id': '271290', 'item_name': 'HAWKEN', 'playtime_forever': 416, 'playtime_2weeks': 0}
{'item_id': '244850', 'item_name': 'Space Engineers', 'playtime_forever': 77, 'playtime_2weeks': 0}
{'item_id': '252490', 'item_name': 'Rust', 'playtime_forever': 52, 'playtime_2weeks': 0}
{'item_id': '263060', 'item_name': 'Block

Convirtamos las claves de los JSON que estan en 'reviews' en columnas

In [8]:
# Entre otras cosas, json_normalize corrige las comillas simples y dobles de JSON en 'items'
dfUserItems = pd.json_normalize(df.to_dict('records'), 'items', ['user_id', 'items_count'])

# Ordenamos las columnas:
dfUserItems = dfUserItems[
  [
    'user_id',
    'item_id',
    'item_name',
    'items_count',
    'playtime_forever',
    'playtime_2weeks'
  ]
]

In [9]:
dfUserItems.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5153209 entries, 0 to 5153208
Data columns (total 6 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   user_id           object
 1   item_id           object
 2   item_name         object
 3   items_count       object
 4   playtime_forever  int64 
 5   playtime_2weeks   int64 
dtypes: int64(2), object(4)
memory usage: 235.9+ MB


In [10]:
dfUserItems

Unnamed: 0,user_id,item_id,item_name,items_count,playtime_forever,playtime_2weeks
0,76561197970982479,10,Counter-Strike,277,6,0
1,76561197970982479,20,Team Fortress Classic,277,0,0
2,76561197970982479,30,Day of Defeat,277,7,0
3,76561197970982479,40,Deathmatch Classic,277,0,0
4,76561197970982479,50,Half-Life: Opposing Force,277,0,0
...,...,...,...,...,...,...
5153204,76561198329548331,346330,BrainBread 2,7,0,0
5153205,76561198329548331,373330,All Is Dust,7,0,0
5153206,76561198329548331,388490,One Way To Die: Steam Edition,7,3,3
5153207,76561198329548331,521570,You Have 10 Seconds 2,7,4,4


Antes de persistir los datos en un formatos csv, nos aseguraremos de que no hayan quedado datos NaN en el dataframe

In [11]:
# Aplicamos una mascara para marcar los valores NaN y los sumamos por columna
dfUserItems.isna().sum()

user_id             0
item_id             0
item_name           0
items_count         0
playtime_forever    0
playtime_2weeks     0
dtype: int64

Listo los datos 'masticados' para que los cinetificos de datos hagan lo suyo

In [12]:
dfUserItems.head(2)

Unnamed: 0,user_id,item_id,item_name,items_count,playtime_forever,playtime_2weeks
0,76561197970982479,10,Counter-Strike,277,6,0
1,76561197970982479,20,Team Fortress Classic,277,0,0


In [13]:
dfUserItems.reset_index(drop = True, inplace = True)

In [14]:
dfUserItems.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5153209 entries, 0 to 5153208
Data columns (total 6 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   user_id           object
 1   item_id           object
 2   item_name         object
 3   items_count       object
 4   playtime_forever  int64 
 5   playtime_2weeks   int64 
dtypes: int64(2), object(4)
memory usage: 235.9+ MB


In [15]:
# Guardar el DataFrame en un archivo .csv comprimido con gzip
dfUserItems.to_csv('../datasource/user_items_chewed.csv.gz', compression = 'gzip', index = False)