In [1]:
import pandas as pd
import numpy as np
import json
import gzip
import ast
from textblob import TextBlob

In [2]:
archivo = "user_reviews.json.gz"
data = []

# Read the gzip-compressed file as UTF-8 text and parse every line as a
# Python literal with ast.literal_eval; each parsed object becomes one row.
with gzip.open(archivo, 'rt', encoding='utf-8') as file:
    for line in file:
        try:
            json_data = ast.literal_eval(line)
        except (ValueError, SyntaxError):
            # literal_eval raises SyntaxError (not only ValueError) for blank
            # or syntactically invalid lines; report the line and keep going
            # instead of aborting the whole load.
            print(f"Error en la línea: {line}")
            continue
        data.append(json_data)

df_user_reviews = pd.DataFrame(data)

In [3]:
def analizar_sentimiento(review, umbral=0.1):
    """Classify a review's sentiment from its TextBlob polarity.

    Args:
        review: Review text. A missing value (anything ``pd.isnull`` reports
            as null, e.g. ``None`` or ``NaN``) is classified as neutral.
        umbral: Half-width of the neutral band around zero polarity.
            Defaults to 0.1.

    Returns:
        int: 0 when polarity < -umbral (negative), 2 when polarity > umbral
        (positive), 1 otherwise (neutral, including missing reviews).
    """
    if pd.isnull(review):  # Missing review: treat it as neutral
        return 1
    # Read the polarity once and reuse it for both comparisons.
    polaridad = TextBlob(review).sentiment.polarity
    if polaridad < -umbral:
        return 0  # Negative
    if polaridad > umbral:
        return 2  # Positive
    return 1  # Neutral

In [4]:
# Quick check of the frame's (rows, columns) dimensions.
df_user_reviews.shape

(25799, 3)

In [5]:
# Per-column count of missing values.
print(df_user_reviews.isna().sum())

user_id     0
user_url    0
reviews     0
dtype: int64


In [4]:
# Boolean mask: True for rows whose 'reviews' value repeats that of an earlier
# row (the first occurrence is not flagged).
duplicados_r = df_user_reviews.duplicated(subset=['reviews'])

In [5]:
# Rows whose (user_id, user_url) pair already appeared earlier in the frame.
duplicados_id = df_user_reviews.duplicated(subset=['user_id', 'user_url'], keep='first')

# A row counts as a duplicate only when both masks flag it.
# NOTE(review): the two masks are computed independently, so this does not
# prove that both repeats point at the same earlier row.
duplicados = duplicados_r & duplicados_id

print(f"Número de filas duplicadas: {duplicados.sum()}")

Número de filas duplicadas: 313


In [8]:
# Dimensions before the flagged duplicates are removed.
df_user_reviews.shape

(25799, 3)

In [6]:
# Keep only the rows that are NOT flagged in the `duplicados` mask.
df_user_reviews = df_user_reviews[~duplicados]

In [10]:
# Dimensions after the flagged duplicates were removed.
df_user_reviews.shape


(25486, 3)

In [7]:
# Turn each element of the 'reviews' lists into its own row and renumber the
# rows 0..n-1 in the same step.
user_reviews_expanded = df_user_reviews.explode('reviews', ignore_index=True)

In [8]:
# Spread each value of the 'reviews' column into separate columns, then swap
# the original 'reviews' column for those new columns.
campos_resena = user_reviews_expanded['reviews'].apply(pd.Series)
resto_columnas = user_reviews_expanded.drop(['reviews'], axis=1)
user_reviews_expanded = pd.concat([resto_columnas, campos_resena], axis=1)


In [9]:
# Score every review text with analizar_sentimiento and store the resulting
# class in a new 'sentiment_analysis' column.
user_reviews_expanded['sentiment_analysis'] = user_reviews_expanded['review'].map(analizar_sentimiento)

In [10]:
# Preview the first three expanded rows, including the new sentiment column.
user_reviews_expanded.head(3)

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review,0,sentiment_analysis
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,,2
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.,,2
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...,,1


In [None]:
# Dimensions of the expanded frame.
user_reviews_expanded.shape

In [None]:
# Select the columns to keep by name instead of dropping the unwanted ones.
# The drop list included the artifact column `0`, which is not guaranteed to
# exist for every input, so dropping it by name could raise KeyError.
columnas_finales = ['user_id', 'item_id', 'recommend', 'sentiment_analysis']
user_reviews_expanded2 = user_reviews_expanded[columnas_finales]
# Discard rows that lack any of the retained fields.
user_reviews_expanded2 = user_reviews_expanded2.dropna(subset=columnas_finales)
user_reviews_expanded2.head()

In [None]:
# Profile the 'recommend' column: distinct values, their frequencies and the
# number of distinct values.
col_recommend = user_reviews_expanded2['recommend']
valores_unicos = col_recommend.unique()
conteo_valores = col_recommend.value_counts()
num_valores_unicos = col_recommend.nunique()

# Frequency table first, then the distinct-value count.
print("Recuento de valores únicos de la columna 'recommend':", conteo_valores, sep="\n")
print(f"Número de valores únicos en la columna 'recommend': {num_valores_unicos}")

In [None]:
# Profile the 'sentiment_analysis' column: distinct values, their frequencies
# and the number of distinct values.
col_sentimiento = user_reviews_expanded2['sentiment_analysis']
valores_unicos = col_sentimiento.unique()
conteo_valores = col_sentimiento.value_counts()
num_valores_unicos = col_sentimiento.nunique()

# Frequency table first, then the distinct-value count.
print("Recuento de valores únicos de la columna 'sentiment_analysis':", conteo_valores, sep="\n")
print(f"Número de valores únicos en la columna 'sentiment_analysis': {num_valores_unicos}")

In [None]:
# Convert item_id to float.
# NOTE(review): the ids shown in the preview look integral; presumably float
# is chosen to match the key dtype of another dataset — confirm.
user_reviews_expanded2['item_id'] = user_reviews_expanded2['item_id'].astype(float)

In [None]:
# Save the DataFrame as a snappy-compressed Parquet file in the working directory.
user_reviews_expanded2.to_parquet('user_reviews.parquet', compression='snappy')

In [None]:
# Inspect the 'reviews' value stored under index label 0.
df_user_reviews.loc[0, 'reviews']