In [2]:
import pandas as pd
reviews_df = pd.read_csv('./data/reviews.csv')

In [17]:
reviews_df['Review'].sample(1).iloc[0]

"Located in Lisbon, the Capela de Santo Amaro is a charming chapel that exudes a peaceful and serene atmosphere. The beautiful architecture and intricate details of the chapel make it a must-visit spot for those interested in religious and historical sites. The Capela de Santo Amaro offers a tranquil escape from the bustling city, allowing visitors to find solace and connect with their spirituality. Whether you're religious or not, the Capela de Santo Amaro is a place of beauty and tranquility that shouldn't be missed."

In [19]:
#pandas and numpy for df manipulation
import pandas as pd
import numpy as np
import re
import nltk
import statistics

#Preprocessing: tokenization and lemmatization
from nltk.tokenize import word_tokenize
from nltk.tokenize import PunktSentenceTokenizer
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
sent_tokenizer = PunktSentenceTokenizer()

#Sentiment Analysis with VADER
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

#Regression Metrics
from scipy.stats import pearsonr
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error

#Visualization
import matplotlib.pyplot as plt


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\abdar\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [24]:
def sentiment_preprocesser(raw_text, lowercase=True, leave_punctuation = False, lemmatization=True, tokenized_output=True, sentence_output=True):
    
    if lowercase == True:
    #convert to lowercase
        clean_text = raw_text.lower()
    else:
        clean_text = raw_text
    #remove newline characters
    clean_text = re.sub(r'(\**\\[nrt]|</ul>)',' ',clean_text)
    if leave_punctuation == False:
    #remove punctuation:
        clean_text = re.sub(r'(\W)',' ',clean_text)
    #remove url:
    clean_text = re.sub(r'(\bhttp:.+\b)',' ',clean_text)
    #remove isolated consonants:
    clean_text = re.sub(r'\b([^aeiou])\b',' ',clean_text)
    #tokenize
    clean_text = word_tokenize(clean_text)
    #lemmatize
    if lemmatization == True:
        for pos_tag in ["v","n","a"]:
            clean_text = [lemmatizer.lemmatize(token, pos=pos_tag) for token in clean_text]
    
    if tokenized_output == False:
    #re-join
        clean_text = " ".join(clean_text)
    #Remove space before punctuation
        clean_text = re.sub(r'(\s)(?!\w)','',clean_text)

    if sentence_output == True:
        #split into sentences:
        clean_text = sent_tokenizer.tokenize(clean_text)
    
    return clean_text

In [25]:
reviews_df['CleanReview'] = reviews_df['Review'].apply(lambda review: sentiment_preprocesser(
    review, lowercase = False, 
    leave_punctuation = True, 
    lemmatization=False, 
    tokenized_output=False))

In [27]:
vader = SentimentIntensityAnalyzer()

In [30]:
reviews_df['CleanReview']

0      [The Oceanário de Lisboa is an absolute must v...
1      [Visiting the Oceanário de Lisboa was like div...
2      [If you re in Lisbon, you got ta check out the...
3      [The Quinta da Regaleira is a hidden gem in Li...
4      [Oh man, Quinta da Regaleira is a hidden gem i...
                             ...                        
582    [Mercado 31 de Janeiro is a vibrant market in ...
583    [Located in the heart of Lisbon, Mercado 31 de...
584                               [Palacio de Valflores]
585    [Palácio de Valflores is a majestic castle loc...
586    [The Palacio de Valflores is a hidden gem in L...
Name: CleanReview, Length: 587, dtype: object

In [28]:
# Analyse polarity and add results to dataframe
reviews_df['Vader'] = reviews_df['CleanReview'].apply(lambda x: vader.polarity_scores(x))
reviews_df['Negative_vader'] = reviews_df['Vader'].apply(lambda x: x['neg'])
reviews_df['Neutral_vader'] = reviews_df['Vader'].apply(lambda x: x['neu'])
reviews_df['Positive_vader'] = reviews_df['Vader'].apply(lambda x: x['pos'])
reviews_df['Compound_vader'] = reviews_df['Vader'].apply(lambda x: x['compound'])

AttributeError: 'list' object has no attribute 'encode'

In [None]:
# Drop column with polarity scores
reviews_df.drop('Vader', axis=1, inplace=True)

In [None]:
# Name of the columns related with vader
vader_cols = ['Negative_vader', 'Neutral_vader', 'Positive_vader', 'Compound_vader']

In [None]:
reviews_df[vader_cols].describe()