In [1]:
import pandas as pd
import numpy as np
import ast
import gzip
import json
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re

In [2]:
user_reviews_gz = '../Data/user_reviews.json.gz'

# Each line is parsed with ast.literal_eval (presumably because the records
# are Python literals rather than strict JSON -- confirm against the file).
# Iterating over the handle streams the file line by line instead of loading
# it all with readlines(); blank lines are skipped because literal_eval would
# raise SyntaxError on them.
with gzip.open(user_reviews_gz, 'rt', encoding='utf-8') as archivo:
    filas = [ast.literal_eval(line) for line in archivo if line.strip()]

# Build the DataFrame: one row per parsed record.
user_review = pd.DataFrame(filas)


In [3]:
# Show the raw DataFrame built from the gzip file.
user_review

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."
...,...,...,...
25794,76561198306599751,http://steamcommunity.com/profiles/76561198306...,"[{'funny': '', 'posted': 'Posted May 31.', 'la..."
25795,Ghoustik,http://steamcommunity.com/id/Ghoustik,"[{'funny': '', 'posted': 'Posted June 17.', 'l..."
25796,76561198310819422,http://steamcommunity.com/profiles/76561198310...,"[{'funny': '1 person found this review funny',..."
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,"[{'funny': '', 'posted': 'Posted July 21.', 'l..."


In [4]:
# Inspect what the 'reviews' column holds for the first row.

user_review.loc[0, 'reviews']

[{'funny': '',
  'posted': 'Posted November 5, 2011.',
  'last_edited': '',
  'item_id': '1250',
  'helpful': 'No ratings yet',
  'recommend': True,
  'review': 'Simple yet with great replayability. In my opinion does "zombie" hordes and team work better than left 4 dead plus has a global leveling system. Alot of down to earth "zombie" splattering fun for the whole family. Amazed this sort of FPS is so rare.'},
 {'funny': '',
  'posted': 'Posted July 15, 2011.',
  'last_edited': '',
  'item_id': '22200',
  'helpful': 'No ratings yet',
  'recommend': True,
  'review': "It's unique and worth a playthrough."},
 {'funny': '',
  'posted': 'Posted April 21, 2011.',
  'last_edited': '',
  'item_id': '43110',
  'helpful': 'No ratings yet',
  'recommend': True,
  'review': 'Great atmosphere. The gunplay can be a bit chunky at times but at the end of the day this game is definitely worth it and I hope they do a sequel...so buy the game so I get a sequel!'}]

In [5]:
# Create one row per element of each list stored in 'reviews'.

user_review_desanidar = user_review.explode('reviews')

In [6]:
# Expand each review dict into its own columns and attach them to the
# remaining (non-'reviews') columns of the exploded frame.
# NOTE(review): the extra column labelled 0 in the result presumably comes from
# rows whose 'reviews' value is missing rather than a dict -- confirm.

columnas_usuario = user_review_desanidar.drop(['reviews'], axis=1)
columnas_review = user_review_desanidar['reviews'].apply(pd.Series)
user_review_desanidar = pd.concat([columnas_usuario, columnas_review], axis=1)

In [7]:
# Show the frame after expanding the review dicts into columns.
user_review_desanidar

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review,0
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.,
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...,
1,js41637,http://steamcommunity.com/id/js41637,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...,
1,js41637,http://steamcommunity.com/id/js41637,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...,
...,...,...,...,...,...,...,...,...,...,...
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,Posted July 10.,,70,No ratings yet,True,a must have classic from steam definitely wort...,
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,Posted July 8.,,362890,No ratings yet,True,this game is a perfect remake of the original ...,
25798,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,1 person found this review funny,Posted July 3.,,273110,1 of 2 people (50%) found this review helpful,True,had so much fun plaing this and collecting res...,
25798,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,,Posted July 20.,,730,No ratings yet,True,:D,


In [8]:
# Drop the columns that will not be used (including the stray column named 0).

columnas_sin_uso = ['user_url', 'funny', 'helpful', 'posted', 'last_edited', 0]
user_review_desanidar.drop(columns=columnas_sin_uso, inplace=True)

In [9]:
# Show the frame after dropping the unused columns.
user_review_desanidar

Unnamed: 0,user_id,item_id,recommend,review
0,76561197970982479,1250,True,Simple yet with great replayability. In my opi...
0,76561197970982479,22200,True,It's unique and worth a playthrough.
0,76561197970982479,43110,True,Great atmosphere. The gunplay can be a bit chu...
1,js41637,251610,True,I know what you think when you see this title ...
1,js41637,227300,True,For a simple (it's actually not all that simpl...
...,...,...,...,...
25797,76561198312638244,70,True,a must have classic from steam definitely wort...
25797,76561198312638244,362890,True,this game is a perfect remake of the original ...
25798,LydiaMorley,273110,True,had so much fun plaing this and collecting res...
25798,LydiaMorley,730,True,:D


In [10]:
# Count the null values in each column.

user_review_desanidar.isna().sum()

user_id       0
item_id      28
recommend    28
review       28
dtype: int64

In [11]:
# Percentage of null values per column, used to decide how to handle them.

total_filas = len(user_review_desanidar)
nulos_por_columna = user_review_desanidar.isnull().sum()
porcentaje_nulos_por_columna = (nulos_por_columna / total_filas) * 100
print(porcentaje_nulos_por_columna)


user_id      0.000000
item_id      0.047191
recommend    0.047191
review       0.047191
dtype: float64


In [12]:
# Nulls are a very small share of the rows, so the rows containing them are dropped.

user_review_desanidar.dropna(inplace=True)

In [13]:
# Set up sentiment analysis for the 'review' column: download the VADER
# lexicon and create the analyzer instance used by `sentimiento`.
nltk.download('vader_lexicon')
model_sentimiento = SentimentIntensityAnalyzer()


def sentimiento(review, umbral=0.5):
    """Classify a review's sentiment as 0 (negative), 1 (neutral) or 2 (positive).

    Parameters
    ----------
    review : str or None
        Review text. Missing (None/NaN) or empty values are labelled neutral.
    umbral : float, default 0.5
        Absolute cut-off applied to the ``compound`` score returned by
        ``model_sentimiento.polarity_scores``: ``>= umbral`` is positive,
        ``<= -umbral`` is negative, anything in between is neutral.

    Returns
    -------
    int
        0 for negative, 1 for neutral (or missing/empty input), 2 for positive.
    """
    # Guard before scoring so that missing or empty values never reach the
    # analyzer, which expects text.
    if not review or pd.isnull(review):
        return 1

    compound = model_sentimiento.polarity_scores(review)['compound']

    # NOTE(review): VADER's own documentation suggests +/-0.05 as the usual
    # cut-offs; 0.5 stays the default so existing results do not change.
    if compound >= umbral:
        return 2  # Positive sentiment
    if compound <= -umbral:
        return 0  # Negative sentiment
    return 1  # Neutral sentiment

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\belen\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [14]:
# NOTE(review): this cell is a commented-out copy of `sentimiento` from the
# previous cell; it is dead code and a candidate for removal.
#def sentimiento(review):
    # Get the sentiment score using SentimentIntensityAnalyzer
    #sentimiento_score = model_sentimiento.polarity_scores(review)
    
    # Classify the sentiment
    
    #if review and not pd.isnull(review):
        #if sentimiento_score['compound'] >= 0.5:
            #return 2  # Positive sentiment
        #elif sentimiento_score['compound'] <= -0.5:
            #return 0  # Negative sentiment
        #else:
            #return 1  # Neutral sentiment
    #else:
        #return 1

In [15]:
# Create the 'sentiment_analysis' column by applying `sentimiento` to each review.

user_review_desanidar['sentiment_analysis']  = user_review_desanidar['review'].apply(sentimiento)


In [16]:
# Show the frame with the new 'sentiment_analysis' column.
user_review_desanidar

Unnamed: 0,user_id,item_id,recommend,review,sentiment_analysis
0,76561197970982479,1250,True,Simple yet with great replayability. In my opi...,2
0,76561197970982479,22200,True,It's unique and worth a playthrough.,1
0,76561197970982479,43110,True,Great atmosphere. The gunplay can be a bit chu...,2
1,js41637,251610,True,I know what you think when you see this title ...,2
1,js41637,227300,True,For a simple (it's actually not all that simpl...,2
...,...,...,...,...,...
25797,76561198312638244,70,True,a must have classic from steam definitely wort...,2
25797,76561198312638244,362890,True,this game is a perfect remake of the original ...,2
25798,LydiaMorley,273110,True,had so much fun plaing this and collecting res...,2
25798,LydiaMorley,730,True,:D,2


In [29]:
# Rows whose sentiment label is not 0 (0 = negative in `sentimiento`) and
# whose 'recommend' value is True.
# NOTE(review): this cell's execution count (29) is out of sequence; its
# displayed output has no 'review' column and shows float item_id values, so
# it was run after the later drop/astype cells. Re-run the notebook top to
# bottom to refresh it.
user_review_desanidar[(user_review_desanidar['sentiment_analysis']!=0)&(user_review_desanidar['recommend']==True)]

Unnamed: 0,user_id,item_id,recommend,sentiment_analysis
0,76561197970982479,1250.0,True,2
0,76561197970982479,22200.0,True,1
0,76561197970982479,43110.0,True,2
1,js41637,251610.0,True,2
1,js41637,227300.0,True,2
...,...,...,...,...
25797,76561198312638244,70.0,True,2
25797,76561198312638244,362890.0,True,2
25798,LydiaMorley,273110.0,True,2
25798,LydiaMorley,730.0,True,2


In [17]:
# Drop the 'review' column because it is no longer needed.

user_review_desanidar.drop(columns='review',inplace=True) 

In [18]:
# Show the frame after dropping 'review'.
user_review_desanidar

Unnamed: 0,user_id,item_id,recommend,sentiment_analysis
0,76561197970982479,1250,True,2
0,76561197970982479,22200,True,1
0,76561197970982479,43110,True,2
1,js41637,251610,True,2
1,js41637,227300,True,2
...,...,...,...,...
25797,76561198312638244,70,True,2
25797,76561198312638244,362890,True,2
25798,LydiaMorley,273110,True,2
25798,LydiaMorley,730,True,2


In [19]:
# Check dtypes and non-null counts before converting item_id.
user_review_desanidar.info()

<class 'pandas.core.frame.DataFrame'>
Index: 59305 entries, 0 to 25798
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   user_id             59305 non-null  object
 1   item_id             59305 non-null  object
 2   recommend           59305 non-null  object
 3   sentiment_analysis  59305 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [20]:
# Convert the item_id column to float.
# NOTE(review): no nulls remain in item_id at this point (see the info()
# output above), so an integer dtype would also be possible -- confirm which
# dtype downstream consumers of the exported CSV expect before changing it.

user_review_desanidar['item_id'] = user_review_desanidar['item_id'].astype(float)


In [21]:
# Confirm the item_id dtype after the conversion.
user_review_desanidar.info()

<class 'pandas.core.frame.DataFrame'>
Index: 59305 entries, 0 to 25798
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   user_id             59305 non-null  object 
 1   item_id             59305 non-null  float64
 2   recommend           59305 non-null  object 
 3   sentiment_analysis  59305 non-null  int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 2.3+ MB


In [22]:
# Save the cleaned frame as a plain CSV, without writing the index.
user_review_desanidar.to_csv('../Data/user_reviews_limpio.csv',index=False)

In [23]:
# Save a gzip-compressed copy of the cleaned frame. Giving to_csv the path
# lets pandas open the file and apply gzip itself (compression would also be
# inferred from the '.gz' suffix; it is spelled out for clarity), rather than
# passing it a binary gzip handle, which relies on pandas' support for
# binary file objects.
user_review_desanidar.to_csv(
    '../Data/user_reviews_limpio.csv.gz',
    index=False,
    encoding='utf-8',
    compression='gzip',
)