In [1]:
# Importa las librerias necesarias
import pandas as pd
import ast
import os
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
# Parse the raw reviews file into a list of dicts, one per non-empty line.
# ast.literal_eval is used to parse each line as a Python literal; the preview of the
# data shows single-quoted keys, so the lines are presumably not strict JSON.
rows_reviews = []
with open('../datasets/australian_user_reviews.json', 'r', encoding='UTF-8') as reviews_file:
    # Iterate over the file object directly so lines are streamed instead of
    # being loaded all at once with readlines()
    for line in reviews_file:
        # A blank line (e.g. a trailing newline at end of file) is not a valid
        # literal and would make literal_eval raise SyntaxError, so skip it
        if not line.strip():
            continue
        rows_reviews.append(ast.literal_eval(line))

# Build a DataFrame from the parsed rows (one dict per row)
users_reviews = pd.DataFrame(rows_reviews)

# Drop the 'user_url' column, which is not used in the rest of the notebook
users_reviews = users_reviews.drop(columns=['user_url'])

users_reviews.head(3)

Unnamed: 0,user_id,reviews
0,76561197970982479,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."


In [3]:
# Flatten the nested 'reviews' lists: every individual review becomes its own
# row, tagged with the user_id of the row it was nested under.
des_reviews = []

# Column order of the flattened table; also passed to the DataFrame constructor
# so the columns exist even when no review was collected.
review_columns = ['user_id', 'funny', 'posted', 'last_edited',
                  'item_id', 'helpful', 'recommend', 'review']

# Walk the two needed columns in parallel; this avoids building a Series per
# row, which is what iterrows() does.
for user_id, user_reviews in zip(users_reviews['user_id'], users_reviews['reviews']):
    # Each element of the nested list is a dict describing one review
    for r in user_reviews:
        des_reviews.append({
            'user_id': user_id,
            'funny': r.get('funny', ''),              # '' when the key is absent
            'posted': r.get('posted', ''),
            'last_edited': r.get('last_edited', ''),
            'item_id': r.get('item_id', ''),
            'helpful': r.get('helpful', ''),
            # A missing recommendation is stored as None so it shows up as a
            # null; the previous default was the `bool` type object itself,
            # which is truthy and would be read as True once cast to bool.
            'recommend': r.get('recommend', None),
            'review': r.get('review', ''),
        })

# Build the flattened DataFrame, replacing the nested one
users_reviews = pd.DataFrame(des_reviews, columns=review_columns)

users_reviews.head(3)

Unnamed: 0,user_id,funny,posted,last_edited,item_id,helpful,recommend,review
0,76561197970982479,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,76561197970982479,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,76561197970982479,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...


In [4]:
# Frequency of each distinct value found in the 'funny' column
funny_values = users_reviews['funny']
funny_values.value_counts()

funny
                                        51154
1 person found this review funny         5180
2 people found this review funny         1232
3 people found this review funny          491
4 people found this review funny          267
                                        ...  
58 people found this review funny           1
405 people found this review funny          1
105 people found this review funny          1
1,130 people found this review funny        1
825 people found this review funny          1
Name: count, Length: 186, dtype: int64

In [5]:
# Frequency of each distinct value found in the 'last_edited' column
last_edited_values = users_reviews['last_edited']
last_edited_values.value_counts()

last_edited
                                  53165
Last edited November 25, 2013.       99
Last edited October 17, 2015.        19
Last edited June 6, 2015.            18
Last edited January 3.               17
                                  ...  
Last edited May 30, 2015.             1
Last edited May 21, 2015.             1
Last edited February 11, 2014.        1
Last edited May 8, 2014.              1
Last edited August 15, 2014.          1
Name: count, Length: 1015, dtype: int64

In [6]:
# Frequency of each distinct value found in the 'helpful' column
helpful_values = users_reviews['helpful']
helpful_values.value_counts()

helpful
No ratings yet                                     30168
1 of 1 people (100%) found this review helpful      6730
0 of 1 people (0%) found this review helpful        4024
1 of 2 people (50%) found this review helpful       2493
2 of 2 people (100%) found this review helpful      1872
                                                   ...  
73 of 96 people (76%) found this review helpful        1
12 of 31 people (39%) found this review helpful        1
12 of 53 people (23%) found this review helpful        1
17 of 37 people (46%) found this review helpful        1
24 of 34 people (71%) found this review helpful        1
Name: count, Length: 1344, dtype: int64

In [7]:
# Remove the 'funny', 'last_edited' and 'helpful' columns from the DataFrame
columns_to_drop = ['funny', 'last_edited', 'helpful']
users_reviews = users_reviews.drop(columns=columns_to_drop)

In [8]:
# Create the VADER sentiment analyzer instance used by assign_sentiment below
analyzer = SentimentIntensityAnalyzer()

# Map a row's review text to a numeric sentiment label
def assign_sentiment(row):
    """Return a sentiment label for the 'review' field of ``row``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a 'review' key.

    Returns:
        2 when the VADER compound score is >= 0.05 (positive),
        0 when it is <= -0.05 (negative),
        1 otherwise, including when the review is null or empty (neutral).
    """
    text = row['review']

    # No review, or an empty one: treat as neutral without scoring it
    if pd.isnull(text) or not text:
        return 1

    # Only the 'compound' value of the analyzer output is used
    compound = analyzer.polarity_scores(text)['compound']

    # Positive
    if compound >= 0.05:
        return 2
    # Negative
    if compound <= -0.05:
        return 0
    # Neutral
    return 1

# Score every row with assign_sentiment and store the labels in 'sentiment_analysis'
sentiment_labels = users_reviews.apply(assign_sentiment, axis=1)
users_reviews['sentiment_analysis'] = sentiment_labels

# Remove the 'review' column
users_reviews = users_reviews.drop(columns=['review'])

users_reviews.head(3)

Unnamed: 0,user_id,posted,item_id,recommend,sentiment_analysis
0,76561197970982479,"Posted November 5, 2011.",1250,True,2
1,76561197970982479,"Posted July 15, 2011.",22200,True,2
2,76561197970982479,"Posted April 21, 2011.",43110,True,2


In [9]:
# Work on a string version of 'posted' so the .str accessor is available
posted_text = users_reviews['posted'].astype(str)
# Capture the first run of four digits as the year; values without one
# (e.g. 'Posted February 3.') yield a null in 'posted_year'
users_reviews['posted_year'] = posted_text.str.extract(r'(\d{4})')
# Remove the 'posted' column
users_reviews = users_reviews.drop(columns=['posted'])

users_reviews.head(2)

Unnamed: 0,user_id,item_id,recommend,sentiment_analysis,posted_year
0,76561197970982479,1250,True,2,2011
1,76561197970982479,22200,True,2,2011


In [10]:
# Count the null values in each column of the DataFrame
null_mask = users_reviews.isna()
null_mask.sum()

user_id                   0
item_id                   0
recommend                 0
sentiment_analysis        0
posted_year           10119
dtype: int64

In [11]:
# Fill the nulls in 'posted_year' with the column's mode (first one if there is a tie)
most_common_year = users_reviews['posted_year'].mode().iloc[0]
users_reviews['posted_year'] = users_reviews['posted_year'].fillna(most_common_year)

In [12]:
# Cast 'recommend' to bool (per the original note, this avoids an error in the EDA stage)
recommend_flags = users_reviews['recommend'].astype(bool)
users_reviews['recommend'] = recommend_flags

In [13]:
# Write the users_reviews DataFrame to a Parquet file, without the index
output_path = os.path.join('../datasets/', 'users_reviews.parquet')
users_reviews.to_parquet(output_path, index=False)