Steam User Reviews Cleaning

Exploración del dataset y limpieza

In [1]:
# One-time environment setup, left commented out; run manually when needed.
#nltk.download('all')
#pip install tqdm
#pip install emoji==2.0.0

In [2]:
#!pip install emot

In [3]:
# Importacion de librerias
import json 
import pandas as pd
import numpy as np
import pickle
import re
import ast
import warnings
warnings.filterwarnings('ignore')
import nltk
import string


Esto convertirá nuestro archivo JSON en un marco de datos de pandas.

In [4]:
# Each line of the input file is parsed as a Python literal, giving one
# record per line.
with open("australian_user_reviews.json", encoding='MacRoman') as f:
    rows = [ast.literal_eval(raw_line) for raw_line in f]

In [5]:
# Flatten the nested 'reviews' list: one output row per review, carrying the
# parent record's 'user_id' and 'user_url' along as extra columns.
data = pd.json_normalize(rows, record_path=['reviews'], meta=['user_id','user_url'] )


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59305 entries, 0 to 59304
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   funny        59305 non-null  object
 1   posted       59305 non-null  object
 2   last_edited  59305 non-null  object
 3   item_id      59305 non-null  object
 4   helpful      59305 non-null  object
 5   recommend    59305 non-null  bool  
 6   review       59305 non-null  object
 7   user_id      59305 non-null  object
 8   user_url     59305 non-null  object
dtypes: bool(1), object(8)
memory usage: 3.7+ MB


In [7]:
print(f"Numero de filas duplicadas = {data.duplicated().sum()}")

Numero de filas duplicadas = 874


In [8]:
# Remove fully duplicated rows and renumber the index from 0.
data = data.drop_duplicates().reset_index(drop=True)

In [9]:
print(f"Numero de filas duplicadas = {data.duplicated().sum()}")

Numero de filas duplicadas = 0


In [10]:
#Verificador de Dimension del DataFrame
data.shape

(58431, 9)

In [11]:
# Drop the 'helpful' and 'funny' columns: they are not explanatory for the
# project's analysis (verified in the EDA).
data = data.drop(columns=['helpful', 'funny'])

In [12]:
#Verificador de Dimension del DataFrame
data.shape

(58431, 7)

### Transformaciones de la columna Reviews


In [13]:
from emot.emo_unicode import UNICODE_EMO, EMOTICONS

In [14]:
# Function for converting emojis into word
def convert_emojis(text):
    """Replace every emoji listed in UNICODE_EMO with a word label.

    The label is the emoji's description from UNICODE_EMO with commas and
    colons removed and its words joined by underscores.

    Args:
        text: The string to process.

    Returns:
        The string with each known emoji substituted by its label.
    """
    for symbol, description in UNICODE_EMO.items():
        label = "_".join(description.replace(",", "").replace(":", "").split())
        text = text.replace(symbol, label)
    return text

In [15]:
def convert_emoticons(text, emoticons=None):
    """Replace every emoticon found in ``text`` with a word label.

    Each key of the emoticon table is used as a regular-expression pattern;
    each match is replaced by the entry's description with commas removed
    and its words joined by underscores.

    Args:
        text: The string to process.
        emoticons: Optional mapping of regex pattern -> description. When
            omitted, the module-level EMOTICONS table is used.

    Returns:
        The string with all known emoticons substituted by their labels.
    """
    if emoticons is None:
        emoticons = EMOTICONS
    for pattern, description in emoticons.items():
        label = "_".join(description.replace(",", "").split())
        text = re.sub(u'(' + pattern + ')', label, text)
    # The return must sit after the loop: returning inside it stops after the
    # first pattern and leaves every other emoticon untouched.
    return text

In [16]:
data["review"] = data['review'].apply(convert_emojis)

In [17]:
data["review"] = data['review'].apply(convert_emoticons)

In [18]:
def remove_urls(text):
    """Strip URLs from ``text``.

    Anything starting with ``http://``, ``https://`` or ``www.`` and running
    up to the next whitespace character is deleted.

    Args:
        text: The string to process.

    Returns:
        The string with every matched URL removed.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

In [19]:
data['review'] = data['review'].apply(remove_urls)

In [20]:
def remove_punctuations(text):
    """Delete every character of ``string.punctuation`` from ``text``.

    Args:
        text: The string to process.

    Returns:
        The string without ASCII punctuation characters.
    """
    deletion_table = str.maketrans('', '', string.punctuation)
    return text.translate(deletion_table)

In [21]:
data["review"] = data['review'].apply(remove_punctuations)

In [22]:
# Translation table that deletes every string.punctuation character.
# NOTE(review): remove_punctuations() was already applied to 'review', so a
# second pass with this table should change nothing.
translator = str.maketrans('', '', string.punctuation)

In [23]:
data['review'] = data['review'].apply(lambda x: x.translate(translator))

In [24]:
translation = str.maketrans('', '', string.digits)

In [25]:
data['review'] = data['review'].apply(lambda x: x.translate(translation))

In [26]:
data['review'] = data['review'].str.lower()

In [27]:
# Add an incremental 'id' column that numbers the rows starting at `start`.
start=1
data['id'] = range(start, start + data.shape[0])

In [28]:
data = data[["id","user_id", "user_url","item_id","posted", "recommend", "review"]]
data

Unnamed: 0,id,user_id,user_url,item_id,posted,recommend,review
0,1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1250,"Posted November 5, 2011.",True,simple yet with great replayability in my opin...
1,2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,22200,"Posted July 15, 2011.",True,its unique and worth a playthrough
2,3,76561197970982479,http://steamcommunity.com/profiles/76561197970...,43110,"Posted April 21, 2011.",True,great atmosphere the gunplay can be a bit chun...
3,4,js41637,http://steamcommunity.com/id/js41637,251610,"Posted June 24, 2014.",True,i know what you think when you see this title ...
4,5,js41637,http://steamcommunity.com/id/js41637,227300,"Posted September 8, 2013.",True,for a simple its actually not all that simple ...
...,...,...,...,...,...,...,...
58426,58427,76561198312638244,http://steamcommunity.com/profiles/76561198312...,70,Posted July 10.,True,a must have classic from steam definitely wort...
58427,58428,76561198312638244,http://steamcommunity.com/profiles/76561198312...,362890,Posted July 8.,True,this game is a perfect remake of the original ...
58428,58429,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,273110,Posted July 3.,True,had so much fun plaing this and collecting res...
58429,58430,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,730,Posted July 20.,True,d


In [29]:
from nltk.sentiment import SentimentIntensityAnalyzer
import tqdm

sia = SentimentIntensityAnalyzer()

In [30]:
# Compute the polarity scores for every review in the dataset, keyed by the
# row's 'id'. tqdm is used to show a progress bar while the analysis runs.
id_review_pairs = zip(data['id'], data['review'])
res = {
    review_id: sia.polarity_scores(review_text)
    for review_id, review_text in tqdm.tqdm(id_review_pairs, total=len(data))
}

  0%|          | 0/58431 [00:00<?, ?it/s]

100%|██████████| 58431/58431 [00:25<00:00, 2274.86it/s]


In [31]:

# Build a frame from the per-review score dicts; after the transpose each row
# is one review id.
vaders = pd.DataFrame(res).T
# Turn the id index into a regular 'id' column.
vaders = vaders.reset_index().rename(columns={'index': 'id'})

# Attach the sentiment scores to the review data. No key is given, so merge
# joins on the columns that both frames share.
dfVaders = vaders.merge(data, how='left')
print(dfVaders.columns)

Index(['id', 'neg', 'neu', 'pos', 'compound', 'user_id', 'user_url', 'item_id',
       'posted', 'recommend', 'review'],
      dtype='object')


In [32]:
#Eliminación de columnas no explicativas para el proyecto
dfVaders.drop(columns=['neg', 'neu', 'pos'], inplace=True)

In [33]:
dfVaders

Unnamed: 0,id,compound,user_id,user_url,item_id,posted,recommend,review
0,1,0.8481,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1250,"Posted November 5, 2011.",True,simple yet with great replayability in my opin...
1,2,0.2263,76561197970982479,http://steamcommunity.com/profiles/76561197970...,22200,"Posted July 15, 2011.",True,its unique and worth a playthrough
2,3,0.9062,76561197970982479,http://steamcommunity.com/profiles/76561197970...,43110,"Posted April 21, 2011.",True,great atmosphere the gunplay can be a bit chun...
3,4,0.8814,js41637,http://steamcommunity.com/id/js41637,251610,"Posted June 24, 2014.",True,i know what you think when you see this title ...
4,5,0.9792,js41637,http://steamcommunity.com/id/js41637,227300,"Posted September 8, 2013.",True,for a simple its actually not all that simple ...
...,...,...,...,...,...,...,...,...
58426,58427,0.5574,76561198312638244,http://steamcommunity.com/profiles/76561198312...,70,Posted July 10.,True,a must have classic from steam definitely wort...
58427,58428,0.9786,76561198312638244,http://steamcommunity.com/profiles/76561198312...,362890,Posted July 8.,True,this game is a perfect remake of the original ...
58428,58429,0.7635,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,273110,Posted July 3.,True,had so much fun plaing this and collecting res...
58429,58430,0.0000,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,730,Posted July 20.,True,d


Dependiendo del score compound, se tomó como referencia el siguiente rango de valores para crear la columna "sentiment_analysis":

In [34]:
#positivo sentiment : (compound score >= 0.05) 
#neutral sentiment : (compound score > -0.05) and (compound score < 0.05) 
#negativo sentiment : (compound score <= -0.05)

In [35]:
# create a list of our conditions
conditions = [
    (dfVaders['compound'] >= 0.05),
    (dfVaders['compound'] > -0.05) & (dfVaders['compound'] < 0.05),
    (dfVaders['compound'] <= -0.05)
    ]


In [36]:
# create a list of the values we want to assign for each condition
values = [2, 1, 0]

In [37]:
# Map each row to the value of the first condition it satisfies.
# NOTE(review): rows matching no condition (e.g. a NaN compound) get
# np.select's default of 0, the same code used for negative sentiment.
dfVaders['sentiment_analysis'] = np.select(conditions, values)

In [38]:
dfVaders.head(5)

Unnamed: 0,id,compound,user_id,user_url,item_id,posted,recommend,review,sentiment_analysis
0,1,0.8481,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1250,"Posted November 5, 2011.",True,simple yet with great replayability in my opin...,2
1,2,0.2263,76561197970982479,http://steamcommunity.com/profiles/76561197970...,22200,"Posted July 15, 2011.",True,its unique and worth a playthrough,2
2,3,0.9062,76561197970982479,http://steamcommunity.com/profiles/76561197970...,43110,"Posted April 21, 2011.",True,great atmosphere the gunplay can be a bit chun...,2
3,4,0.8814,js41637,http://steamcommunity.com/id/js41637,251610,"Posted June 24, 2014.",True,i know what you think when you see this title ...,2
4,5,0.9792,js41637,http://steamcommunity.com/id/js41637,227300,"Posted September 8, 2013.",True,for a simple its actually not all that simple ...,2


In [39]:
#Eliminación de columnas no explicativas para el proyecto
dfVaders.drop(columns=['review','id','compound'], inplace=True)

In [40]:
dfVaders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58431 entries, 0 to 58430
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   user_id             58431 non-null  object
 1   user_url            58431 non-null  object
 2   item_id             58431 non-null  object
 3   posted              58431 non-null  object
 4   recommend           58431 non-null  bool  
 5   sentiment_analysis  58431 non-null  int32 
dtypes: bool(1), int32(1), object(4)
memory usage: 2.1+ MB


In [41]:
dfVaders['posted']

0         Posted November 5, 2011.
1            Posted July 15, 2011.
2           Posted April 21, 2011.
3            Posted June 24, 2014.
4        Posted September 8, 2013.
                   ...            
58426              Posted July 10.
58427               Posted July 8.
58428               Posted July 3.
58429              Posted July 20.
58430               Posted July 2.
Name: posted, Length: 58431, dtype: object

In [42]:
dfVaders['posted'] = dfVaders['posted'].str.replace('Posted', '')

In [43]:
#Creación de una función para Normalizar el campo Fecha
from datetime import datetime

def getNormalizeDate(text):
    """Normalize a free-form date string to ``YYYY-MM-DD``.

    Surrounding whitespace is stripped. Only strings longer than four
    characters that contain a comma are parsed; anything else yields None.
    Periods and commas are removed before the candidate formats are tried
    in order.

    Args:
        text: The date string, e.g. ``" November 5, 2011."``.

    Returns:
        The date as a ``'%Y-%m-%d'`` string, or None when the text fails the
        length/comma check or matches none of the known formats.
    """
    text = str.strip(text)

    if len(text) <= 4 or "," not in text:
        return None

    formats = ["%B %d %Y", "%d %B %Y", "%b %d %Y", "%m/%d/%Y", "%m %d %Y", "%m/%d"]
    cleaned = text.replace(".", "").replace(",", "")

    for date_format in formats:
        try:
            return datetime.strptime(cleaned, date_format).strftime('%Y-%m-%d')
        except ValueError:
            # This format does not match; fall through to the next one
            # instead of aborting on the first mismatch.
            continue
    return None

In [44]:
dfVaders['posted'].head(10)

0      November 5, 2011.
1         July 15, 2011.
2        April 21, 2011.
3         June 24, 2014.
4     September 8, 2013.
5     November 29, 2013.
6            February 3.
7      December 4, 2015.
8      November 3, 2014.
9      October 15, 2014.
Name: posted, dtype: object

In [45]:
#Normalizando el campo Fecha
dfVaders['posted'] = dfVaders['posted'].astype(str).apply(getNormalizeDate)

In [46]:
dfVaders['posted'].head(10)

0    2011-11-05
1    2011-07-15
2    2011-04-21
3    2014-06-24
4    2013-09-08
5    2013-11-29
6          None
7    2015-12-04
8    2014-11-03
9    2014-10-15
Name: posted, dtype: object

In [47]:
#Conversion a integer
#dfVaders['item_id'] = dfVaders['item_id'].astype('Int64')

In [48]:
dfVaders.isnull().sum()

user_id                  0
user_url                 0
item_id                  0
posted                9933
recommend                0
sentiment_analysis       0
dtype: int64

In [49]:
dfVaders.head(5)

Unnamed: 0,user_id,user_url,item_id,posted,recommend,sentiment_analysis
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,1250,2011-11-05,True,2
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,22200,2011-07-15,True,2
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,43110,2011-04-21,True,2
3,js41637,http://steamcommunity.com/id/js41637,251610,2014-06-24,True,2
4,js41637,http://steamcommunity.com/id/js41637,227300,2013-09-08,True,2


In [50]:
dfVaders['posted'] = dfVaders['posted'].replace(np.nan, '')

In [51]:
dfVaders.isnull().sum()

user_id               0
user_url              0
item_id               0
posted                0
recommend             0
sentiment_analysis    0
dtype: int64

In [52]:
dfVaders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58431 entries, 0 to 58430
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   user_id             58431 non-null  object
 1   user_url            58431 non-null  object
 2   item_id             58431 non-null  object
 3   posted              58431 non-null  object
 4   recommend           58431 non-null  bool  
 5   sentiment_analysis  58431 non-null  int32 
dtypes: bool(1), int32(1), object(4)
memory usage: 2.1+ MB


In [53]:
# Sort by item_id.
# NOTE(review): info() reports item_id as dtype object; if the ids are stored
# as strings this ordering is lexicographic rather than numeric — confirm.
dfVaders = dfVaders.sort_values('item_id')

In [54]:
#Regenerar el indice del DataFrame
data = dfVaders.reset_index(drop=True)

In [55]:
### Create a Pickle file using serialization
# `pickle` is already imported in the notebook's import cell. The context
# manager closes the file handle even if pickle.dump raises part-way through.
with open("steamReviews.pkl", "wb") as pickle_out:
    pickle.dump(data, pickle_out)