"""
Created on Mon Oct 2021

@author: Alex Nascimento Rodrigues
"""

In [61]:
!pip install nltk
!pip install pyreadr

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [62]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from unicodedata import normalize 
import pyreadr, csv, string, os, nltk, json, re, statistics, pandas as pd, numpy as np
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [63]:
# Criação de um super léxico baseados em diversos léxicos #

# Foram utilizados os Léxicos Vader_lexicon_ptbr, SentiLex-lem-PT02, Oplexicon_v3.0

#https://github.com/rafjaa/LeIA/tree/master/lexicons
#https://github.com/sillasgonzaga/lexiconPT/blob/master/data-raw/SentiLex-lem-PT02.txt
#https://www.inf.pucrs.br/linatural/wordpress/recursos-e-ferramentas/oplexicon/


In [64]:
# Remove Stopwords
stop = stopwords.words('portuguese')

# Clean and Remove special characters
def clean(text):
  text = re.sub(r'#" ','', text)
  text = re.sub(r'[0-9]+', '', text)
  text = text.translate(str.maketrans('', '', string.punctuation))
  return text

def remover_acentos(text, codif='utf-8'):
  return normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII')

# Remove incorrect text
def clean_incorrect(text):
  text = re.sub(r'video player is loadingplay videoplaymutecurrent time duration loaded','', text)
  text = re.sub(r'stream type liveseek to live currently playing liveliveremaining time playback ratexchapterschaptersdescriptionsdescriptions','', text)
  text = re.sub(r'off selectedcaptionslegendas desativadas selectedaudio trackfullscreenthis is modal window','', text)
  return text

# Clean Stars column
def cleanStars(text):
  text = re.sub(r',0','', text)
  return text

##################
## Sentilex Lem ##
##################

sentiLexPath = "drive/MyDrive/TCC-Alex-Rodrigues/lexicos/sentilex_pt_02.txt"

sentiLexDict = pd.read_csv(sentiLexPath, sep=";", header=None, on_bad_lines='skip', names=["term", "tg", "polarity", "anot"])

# Clean lexicon #

def convertValueToFloat(dic):
  for key, val in dic.items():                                              
    dic[key] = float(val)
  return dic
              
def cleanTerm(text):
  text = text.split('.', 1)[0]
  return text

sentiLexDict['term'] = sentiLexDict['term'].apply(cleanTerm)
sentiLexDict['term'] = sentiLexDict['term'].apply(remover_acentos)

def cleanPolarity(text):
  text = re.sub(r'POL:N0=','', text)  
  text = re.sub(r'POL:N1=','', text)
  return text

sentiLexDict['polarity'] = sentiLexDict['polarity'].apply(cleanPolarity)
sentiLexDict = sentiLexDict.set_index('term')['polarity'].to_dict()
sentiLexDict = convertValueToFloat(sentiLexDict)
      
##################
## OpLexicon v3 ##
##################

opLexiconPath = 'drive/MyDrive/TCC-Alex-Rodrigues/lexicos/oplexicon_v3.0.rda'
opLexiconDict = pyreadr.read_r(opLexiconPath)
opLexiconDict = opLexiconDict['oplexicon_v3.0']
opLexiconDict.pop('polarity_revision')
opLexiconDict.pop('type')

opLexiconDict['term'] = opLexiconDict['term'].apply(clean)
opLexiconDict = opLexiconDict.set_index('term')['polarity'].to_dict()
opLexiconDict = convertValueToFloat(opLexiconDict)

###################
## Vader Lexicon ##
###################
vaderLexiconPTPath = 'drive/MyDrive/TCC-Alex-Rodrigues/lexicos/vader_lexicon_ptbr.txt'

with open(vaderLexiconPTPath, encoding='utf-8') as f:
    lexicon_full_filepath = f.read()

vaderDict = {}

for line in lexicon_full_filepath.split('\n'):
    if len(line) < 1:
        continue
    (word, measure) = line.strip().split('\t')[0:2]
    vaderDict[word] = float(measure)

vaderDict = convertValueToFloat(vaderDict)
vaderDict.update(sentiLexDict)
vaderDict.update(opLexiconDict)

lexicos_negativos = 0
lexicos_neutros = 0
lexicos_positivos = 0

def normalizeNotes(dic):
  for key, val in dic.items():
    if (val < 0):
      dic[key] = -1
    elif (val > 0):
      dic[key] = 1
  return dic

vaderDict = normalizeNotes(vaderDict)

for key, val in vaderDict.items():
  if (val == -1):
    vaderDict[key] = -1
    lexicos_negativos+=1
  elif (val == 0):
    lexicos_neutros+=1
    vaderDict[key] = 0
  elif (val == 1):
    lexicos_positivos+=1
    vaderDict[key] = 1
  else:
    print("Consertar = " + str(key) + ":" + str(val))

print("Negativos: " + str(lexicos_negativos))
print("Neutros: " + str(lexicos_neutros))
print("Positivos: " + str(lexicos_positivos))

completeDict = vaderDict

def review_analyzer(review):
  tokens = word_tokenize(review)
  noteDict = {'neg': 0,
              'neu': 0,
              'pos': 0}
  for token in tokens:
    if (token in completeDict.keys()):
      if (completeDict[token] == -1):
        noteDict['neg'] = noteDict['neg'] + 1
      elif (completeDict[token] == 0):
        noteDict['neu'] = noteDict['neu'] + 1
      elif (completeDict[token] == 1):
        noteDict['pos'] = noteDict['pos'] + 1
    # Token não está presente no léxico
    else:
      noteDict['neu'] = noteDict['neu'] + 1
  return noteDict


def calculate_stars_weight(stars):

  list_neg = []
  list_neu = []
  list_pos = []

  for star in stars:
    if(int(star) > 3):
      list_neg.append(0)
      list_neu.append(0)
      list_pos.append(1)
      
    elif (int(star) == 3):
      list_neg.append(0)
      list_neu.append(1)
      list_pos.append(0)
    else:
      list_neg.append(1)
      list_neu.append(0)
      list_pos.append(0)

  return list_neg, list_neu, list_pos

def calculateFinalResult(values):
  negative = values['Negative']
  positive = values['Positive']
  
  star_neg = values['Star_Neg']
  star_pos = values['Star_Pos']

  list_final_result = []
  
  for i in range(len(negative)):

    negative[i] = negative[i] + (0.3 * negative[i] * star_neg[i])
    positive[i] = positive[i] + (0.3 * positive[i] * star_neg[i])

    if(negative[i] > positive[i]):
      list_final_result.append("Negative")
    elif(negative[i] < positive[i]):
      list_final_result.append("Positive")
    else:
      list_final_result.append("Neutral")        
  
  return list_final_result
  
def percentage(part, whole):  
  if (whole == 0):
    return str(0) + "%"
  percentage = 100 * float(part)/float(whole)
  return str(round(percentage,2)) + "%"

def set_percentage_with_neutral(values):

  negative = values['Negative']
  neutral = values['Neutral']
  positive = values['Positive']

  list_neg = []
  list_neu = []
  list_pos = []

  for i in range(len(negative)):
    total = int(negative[i]+neutral[i]+positive[i])
    list_neg.append(percentage(negative[i], total))
    list_neu.append(percentage(neutral[i], total))
    list_pos.append(percentage(positive[i], total))
  return list_neg, list_neu, list_pos

def set_percentage(values):
  negative = values['Negative']
  positive = values['Positive']
  list_neg_pos = []

  for i in range(len(negative)):
    total = int(negative[i]+positive[i])
    list_neg_pos.append(percentage(negative[i], total) + ' / ' + percentage(positive[i], total))
  return list_neg_pos

Negativos: 18054
Neutros: 8981
Positivos: 10740


In [65]:
#######################################
### Limpeza do Dataset para análise ###
#######################################

with open('drive/MyDrive/TCC-Alex-Rodrigues/extract_reviews/eletronicos_Results.json', encoding='utf-8') as fh:
    data_eletronicos = json.load(fh)

data_all_categories = data_eletronicos

with open('drive/MyDrive/TCC-Alex-Rodrigues/extract_reviews/brinquedos_Results.json', encoding='utf-8') as fh:
    data_brinquedos = json.load(fh)
    
data_all_categories.update(data_brinquedos)

with open('drive/MyDrive/TCC-Alex-Rodrigues/extract_reviews/livros_Results.json', encoding='utf-8') as fh:
    data_livros = json.load(fh)

data_all_categories.update(data_livros)

# Products name
productsName = list(data_all_categories.keys())

df_complete = pd.DataFrame()
dic_complete = {}

cont_errors = 0

for product in productsName:
  try:
    #print("-----------PRODUTO ATUAL------------ " + str(product))
    raw_reviews = data_all_categories.get(product)
    new_df = pd.DataFrame(raw_reviews, columns=['Name', 'Stars', 'Date', 'Review'])
    new_df.dropna(inplace = True)
    
    new_df['Raw_Review'] = new_df['Review']
    
    new_df['Item'] = product
    # Clean and Remove special characters
    new_df['Processed_Review'] = new_df['Review'].apply(clean)
    # Lowercase all reviews
    new_df['Processed_Review'] = new_df['Processed_Review'].apply(lambda x: " ".join(x.lower() for x in x.split()))
    # Remove Stopwords
    new_df['Processed_Review'] = new_df['Processed_Review'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
    # Remoção da acentuação
    new_df['Processed_Review'] = new_df['Processed_Review'].apply(remover_acentos)
    # Clean Stars
    new_df['Stars'] = new_df['Stars'].apply(cleanStars)
    # Clean incorrect text
    new_df['Processed_Review'] = new_df['Processed_Review'].apply(clean_incorrect)

    new_df['Scores'] = new_df['Processed_Review'].apply(review_analyzer)

    new_df['Negative']  = new_df['Scores'].apply(lambda score_dict: score_dict['neg'])
    new_df['Neutral']  = new_df['Scores'].apply(lambda score_dict: score_dict['neu'])
    new_df['Positive']  = new_df['Scores'].apply(lambda score_dict: score_dict['pos'])
    del new_df['Scores']

    new_df['Star_Neg'], new_df['Star_Neu'], new_df['Star_Pos'] = calculate_stars_weight(new_df['Stars'])
        
    new_df['Final_Result'] = calculateFinalResult(new_df[['Negative', 'Positive','Star_Neg','Star_Neu', 'Star_Pos']])
    new_df['Neg(%)'], new_df['Neu(%)'], new_df['Pos(%)'] = set_percentage_with_neutral(new_df[['Negative','Neutral','Positive']])
    new_df['Neg/Pos(%)'] = set_percentage(new_df[['Negative','Positive']])  

    new_df = new_df.reindex(['Item', 'Name', 'Date', 'Raw_Review', 'Processed_Review', 'Stars', 'Star_Neg', 'Star_Neu', 'Star_Pos',
                             'Negative', 'Neutral', 'Positive', 'Neg(%)', 'Neu(%)', 'Pos(%)', 'Neg/Pos(%)', 'Final_Result'], axis=1)    
    
    df_complete = pd.concat([df_complete, new_df])
    
  except:
    print("-----Erro ao analisar o produto----- " + str(product))    
    cont_errors+=1
    continue

df_complete['Processed_Review'].replace('', np.nan, inplace=True)
df_complete.dropna(subset=['Processed_Review'], inplace=True)

print("Produtos não avaliados " + str(cont_errors) + " de " + str(len(productsName)))

#df_complete.to_csv('drive/My Drive/TCC-NEW/Reviews-Dataset.csv', sep=';', encoding='utf-8', header=True, index=False)

df_complete.to_excel('drive/MyDrive/TCC-Alex-Rodrigues/Reviews-Dataset.xlsx', encoding='utf-8', header=True, index=False)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Produtos não avaliados 0 de 2999


In [66]:

print('Negative Results: ',len(df_complete.loc[df_complete['Final_Result'] == 'Negative']))
print('Neutral Results: ',len(df_complete.loc[df_complete['Final_Result'] == 'Neutral']))
print('Positive Results: ',len(df_complete.loc[df_complete['Final_Result'] == 'Positive']))

print('Stars Average: ', round(statistics.mean(df_complete['Stars'].astype(int)),2))

Negative Results:  9449
Neutral Results:  8497
Positive Results:  38848
Stars Average:  4.4
