In [2]:
import pandas as pd
from geopy.distance import geodesic
from langdetect import detect, DetectorFactory
import matplotlib.pyplot as plt
import spacy
import re
import string
from textblob import TextBlob
import hashlib
from langdetect.lang_detect_exception import LangDetectException
import time
from deep_translator import GoogleTranslator
import numpy as np
import dateparser
from datetime import datetime
from spacy.lang.en.stop_words import STOP_WORDS


# === 1. LOADING FILES ===

In [5]:
# Folder that holds the CSVs produced by the data-collection step.
# This is a machine-specific absolute path: it is the single place to edit
# when the notebook runs on another computer.
PASTA_CSV = "C:/Users/Fernanda Costa/OneDrive - Universidade de Aveiro/Desktop/seminar_project/1_data_collection/google_places_API/csv"

caminho_avaliacoes = f"{PASTA_CSV}/google_places_AMP_with_coordinates.csv"
caminho_comentarios = f"{PASTA_CSV}/comments_google_maps_AMP.csv"

# Load the place ratings and the review comments into DataFrames
avaliacoes = pd.read_csv(caminho_avaliacoes)
comentarios = pd.read_csv(caminho_comentarios)

# === 2. PRE-PROCESSING OF RATINGS ===

In [6]:
avaliacoes.head()

Unnamed: 0,Cidade,Categoria,Nome,Rating,Endereço,Tipos,Latitude,Longitude,Total_Reviews
0,Arouca,restaurant,Tasquinha da Quinta,4.6,"R. 1º de Maio 3, 4540-121 Arouca, Portugal","restaurant, food, point_of_interest, establish...",40.929109,-8.245191,2154.0
1,Arouca,restaurant,A Assembleia,4.5,"Tv. da Ribeira 11, 4540-102 Arouca, Portugal","restaurant, bar, food, point_of_interest, esta...",40.928766,-8.247588,1788.0
2,Arouca,restaurant,Parlamento,4.6,"Tv. da Ribeira 2, 4540-148 Arouca, Portugal","restaurant, food, point_of_interest, establish...",40.929011,-8.247392,2469.0
3,Arouca,restaurant,Casa Testinha,4.5,"R. 1º de Maio 4, 4540-113 Arouca, Portugal","restaurant, food, point_of_interest, establish...",40.928876,-8.245147,623.0
4,Arouca,restaurant,Pedestre 142,4.4,"R. Dr. Ângelo Miranda 142 RC, 4540-140 Arouca,...","restaurant, food, point_of_interest, establish...",40.930185,-8.25321,1047.0


In [7]:
avaliacoes['Cidade'].unique()

array(['Arouca', 'Espinho', 'Gondomar', 'Maia', 'Matosinhos',
       'Oliveira de Azeméis', 'Paredes', 'Penafiel', 'Porto',
       'Póvoa de Varzim', 'Santa Maria da Feira', 'Santo Tirso',
       'São João da Madeira', 'Trofa', 'Valongo', 'Vila do Conde',
       'Vila Nova de Gaia'], dtype=object)

In [8]:
avaliacoes.columns

Index(['Cidade', 'Categoria', 'Nome', 'Rating', 'Endereço', 'Tipos',
       'Latitude', 'Longitude', 'Total_Reviews'],
      dtype='object')

In [9]:
avaliacoes['Rating'].isnull().sum()

140

In [10]:
avaliacoes['Total_Reviews'].isnull().sum()

4179

In [11]:
((avaliacoes["Total_Reviews"] == 0) & (avaliacoes["Total_Reviews"].notnull())).sum()

0

In [12]:
avaliacoes.duplicated(subset=["Endereço"]).sum()

1803

In [13]:
endereco_repetido = avaliacoes["Endereço"][avaliacoes["Endereço"].duplicated()].iloc[0]
avaliacoes[avaliacoes["Endereço"] == endereco_repetido]

Unnamed: 0,Cidade,Categoria,Nome,Rating,Endereço,Tipos,Latitude,Longitude,Total_Reviews
7,Arouca,restaurant,Café Arouquense,4.3,"Av. 25 de Abril, 4540-102 Arouca, Portugal","cafe, restaurant, food, point_of_interest, est...",40.928469,-8.245599,901.0
20,Arouca,restaurant,Sabores da serra,4.1,"Av. 25 de Abril, 4540-102 Arouca, Portugal","restaurant, food, point_of_interest, establish...",40.928561,-8.244947,16.0
94,Arouca,bar,BOM COPO - restaurante & bar,4.8,"Av. 25 de Abril, 4540-102 Arouca, Portugal","restaurant, food, point_of_interest, establish...",40.928572,-8.24446,11.0
119,Arouca,cafe,Café Arouquense,4.3,"Av. 25 de Abril, 4540-102 Arouca, Portugal","cafe, restaurant, food, point_of_interest, est...",40.928469,-8.245599,901.0
133,Arouca,cafe,Bakery Village II,4.4,"Av. 25 de Abril, 4540-102 Arouca, Portugal","cafe, store, food, point_of_interest, establis...",40.927949,-8.251804,378.0
151,Arouca,cafe,Sabores da serra,4.1,"Av. 25 de Abril, 4540-102 Arouca, Portugal","restaurant, food, point_of_interest, establish...",40.928561,-8.244947,16.0


In [14]:
avaliacoes.duplicated(subset=["Nome", "Endereço"]).sum()

1569

In [15]:
avaliacoes["Cidade"].nunique()

17

In [16]:
avaliacoes["Categoria"].unique()

array(['restaurant', 'museum', 'hotel', 'bar', 'tourist_attraction',
       'cafe', 'church', 'park', 'natural_feature', 'viewpoint', 'trail',
       'lodging'], dtype=object)

In [17]:
print("Mínimo:", avaliacoes["Rating"].min())
print("Máximo:", avaliacoes["Rating"].max())

Mínimo: 1.0
Máximo: 5.0


In [18]:
def gerar_hash(nome, endereco):
    """Return a short, deterministic identifier for a place.

    The identifier is the first 8 hexadecimal characters of the SHA-1 digest
    of the UTF-8 encoded concatenation ``nome + endereco``, so the same
    name/address pair always produces the same id.

    Args:
        nome: Place name. Missing values (None/NaN) are treated as "".
        endereco: Place address. Missing values (None/NaN) are treated as "".

    Returns:
        str: 8-character hexadecimal string.
    """
    # pandas hands missing cells over as float NaN, which cannot be
    # concatenated with a str; map missing values to "" and coerce anything
    # else to str so the function never raises on a dirty row.
    partes = ["" if pd.isna(valor) else str(valor) for valor in (nome, endereco)]
    texto = "".join(partes).encode("utf-8")
    # 8 hex characters = 32 bits: collisions are unlikely but not impossible
    return hashlib.sha1(texto).hexdigest()[:8]

# Tag every row with its place identifier (hash of name + address)
avaliacoes["id_unico"] = [
    gerar_hash(nome, endereco)
    for nome, endereco in zip(avaliacoes["Nome"], avaliacoes["Endereço"])
]

In [19]:
avaliacoes.head()

Unnamed: 0,Cidade,Categoria,Nome,Rating,Endereço,Tipos,Latitude,Longitude,Total_Reviews,id_unico
0,Arouca,restaurant,Tasquinha da Quinta,4.6,"R. 1º de Maio 3, 4540-121 Arouca, Portugal","restaurant, food, point_of_interest, establish...",40.929109,-8.245191,2154.0,d111a3c6
1,Arouca,restaurant,A Assembleia,4.5,"Tv. da Ribeira 11, 4540-102 Arouca, Portugal","restaurant, bar, food, point_of_interest, esta...",40.928766,-8.247588,1788.0,54221336
2,Arouca,restaurant,Parlamento,4.6,"Tv. da Ribeira 2, 4540-148 Arouca, Portugal","restaurant, food, point_of_interest, establish...",40.929011,-8.247392,2469.0,ca96ed20
3,Arouca,restaurant,Casa Testinha,4.5,"R. 1º de Maio 4, 4540-113 Arouca, Portugal","restaurant, food, point_of_interest, establish...",40.928876,-8.245147,623.0,348acace
4,Arouca,restaurant,Pedestre 142,4.4,"R. Dr. Ângelo Miranda 142 RC, 4540-140 Arouca,...","restaurant, food, point_of_interest, establish...",40.930185,-8.25321,1047.0,55a3ab0d


In [20]:
# Drop the places that have no rating. The explicit .copy() makes the result
# an independent DataFrame, so the column assignment below modifies it
# directly instead of raising pandas' SettingWithCopyWarning on a filtered frame.
avaliacoes = avaliacoes[avaliacoes["Rating"].notnull()].copy()

# A missing review count is treated as "no reviews": replace NaN with 0 and
# store the column as integers.
avaliacoes["Total_Reviews"] = avaliacoes["Total_Reviews"].fillna(0).astype(int)

In [21]:
avaliacoes.head()

Unnamed: 0,Cidade,Categoria,Nome,Rating,Endereço,Tipos,Latitude,Longitude,Total_Reviews,id_unico
0,Arouca,restaurant,Tasquinha da Quinta,4.6,"R. 1º de Maio 3, 4540-121 Arouca, Portugal","restaurant, food, point_of_interest, establish...",40.929109,-8.245191,2154,d111a3c6
1,Arouca,restaurant,A Assembleia,4.5,"Tv. da Ribeira 11, 4540-102 Arouca, Portugal","restaurant, bar, food, point_of_interest, esta...",40.928766,-8.247588,1788,54221336
2,Arouca,restaurant,Parlamento,4.6,"Tv. da Ribeira 2, 4540-148 Arouca, Portugal","restaurant, food, point_of_interest, establish...",40.929011,-8.247392,2469,ca96ed20
3,Arouca,restaurant,Casa Testinha,4.5,"R. 1º de Maio 4, 4540-113 Arouca, Portugal","restaurant, food, point_of_interest, establish...",40.928876,-8.245147,623,348acace
4,Arouca,restaurant,Pedestre 142,4.4,"R. Dr. Ângelo Miranda 142 RC, 4540-140 Arouca,...","restaurant, food, point_of_interest, establish...",40.930185,-8.25321,1047,55a3ab0d


In [22]:
avaliacoes['Cidade'].unique()

array(['Arouca', 'Espinho', 'Gondomar', 'Maia', 'Matosinhos',
       'Oliveira de Azeméis', 'Paredes', 'Penafiel', 'Porto',
       'Póvoa de Varzim', 'Santa Maria da Feira', 'Santo Tirso',
       'São João da Madeira', 'Trofa', 'Valongo', 'Vila do Conde',
       'Vila Nova de Gaia'], dtype=object)

In [23]:
avaliacoes["id_unico"].nunique()

3498

In [24]:
def classificar_por_categoria(cat):
    """Map a place category to its thematic group.

    The category is converted to a lowercase string before the lookup.

    Args:
        cat: Category label (any value; it is passed through ``str``).

    Returns:
        str: "Serviços", "Turismo Cultural", "Recursos Naturais" or
        "Alojamento"; "Outro" when the category is not listed.
    """
    grupo_por_categoria = {
        "restaurant": "Serviços",
        "cafe": "Serviços",
        "bar": "Serviços",
        "bakery": "Serviços",
        "store": "Serviços",
        "night_club": "Serviços",
        "museum": "Turismo Cultural",
        "art_gallery": "Turismo Cultural",
        "tourist_attraction": "Turismo Cultural",
        "church": "Turismo Cultural",
        "park": "Recursos Naturais",
        "natural_feature": "Recursos Naturais",
        "viewpoint": "Recursos Naturais",
        "trail": "Recursos Naturais",
        "scenic_spot": "Recursos Naturais",
        "hotel": "Alojamento",
        "lodging": "Alojamento",
        "hostel": "Alojamento",
        "guest_house": "Alojamento",
    }
    return grupo_por_categoria.get(str(cat).lower(), "Outro")


# Derive the thematic group of every place from its category
avaliacoes["Grupo_Tematico"] = avaliacoes["Categoria"].map(classificar_por_categoria)

In [25]:
avaliacoes["Grupo_Tematico"].unique()

array(['Serviços', 'Turismo Cultural', 'Alojamento', 'Recursos Naturais'],
      dtype=object)

In [26]:
avaliacoes['Cidade'].unique()

array(['Arouca', 'Espinho', 'Gondomar', 'Maia', 'Matosinhos',
       'Oliveira de Azeméis', 'Paredes', 'Penafiel', 'Porto',
       'Póvoa de Varzim', 'Santa Maria da Feira', 'Santo Tirso',
       'São João da Madeira', 'Trofa', 'Valongo', 'Vila do Conde',
       'Vila Nova de Gaia'], dtype=object)

In [27]:
# Count similar places (same category) close to a given place, ignoring rows without coordinates
def contar_proximos(row, df, raio=100):
    """Count the distinct other places of the same category within ``raio`` metres.

    Args:
        row: Row-like object exposing "Latitude", "Longitude", "Categoria"
            and "id_unico".
        df: DataFrame with those same columns; it is the set of candidate
            neighbours.
        raio: Search radius in metres. A neighbour counts when its geodesic
            distance to ``row`` is strictly smaller than this value.

    Returns:
        The number of distinct ``id_unico`` values (other than the one in
        ``row``) that satisfy the condition, or ``np.nan`` when ``row`` has
        no latitude or longitude.
    """
    # Without a reference position no distance can be measured
    if pd.isna(row["Latitude"]) or pd.isna(row["Longitude"]):
        return np.nan

    lat_lon_ref = (row["Latitude"], row["Longitude"])

    # Candidates: same category, a different place, usable coordinates.
    # drop_duplicates guarantees that a place present on several rows is
    # counted once instead of once per row.
    candidatos = df[
        (df["Categoria"] == row["Categoria"])
        & (df["id_unico"] != row["id_unico"])
        & df["Latitude"].notna()
        & df["Longitude"].notna()
    ].drop_duplicates(subset="id_unico")

    # The geodesic distance is only computed for the remaining candidates;
    # an empty candidate set simply yields 0.
    return sum(
        geodesic(lat_lon_ref, (lat, lon)).meters < raio
        for lat, lon in zip(candidatos["Latitude"], candidatos["Longitude"])
    )

# Compute the neighbour count of every place; the full table is handed to
# contar_proximos as its search set through ``args``.
avaliacoes["Locais_Semelhantes_Perto"] = avaliacoes.apply(
    contar_proximos, axis=1, args=(avaliacoes,)
)

In [28]:
avaliacoes.head()

Unnamed: 0,Cidade,Categoria,Nome,Rating,Endereço,Tipos,Latitude,Longitude,Total_Reviews,id_unico,Grupo_Tematico,Locais_Semelhantes_Perto
0,Arouca,restaurant,Tasquinha da Quinta,4.6,"R. 1º de Maio 3, 4540-121 Arouca, Portugal","restaurant, food, point_of_interest, establish...",40.929109,-8.245191,2154,d111a3c6,Serviços,5.0
1,Arouca,restaurant,A Assembleia,4.5,"Tv. da Ribeira 11, 4540-102 Arouca, Portugal","restaurant, bar, food, point_of_interest, esta...",40.928766,-8.247588,1788,54221336,Serviços,5.0
2,Arouca,restaurant,Parlamento,4.6,"Tv. da Ribeira 2, 4540-148 Arouca, Portugal","restaurant, food, point_of_interest, establish...",40.929011,-8.247392,2469,ca96ed20,Serviços,6.0
3,Arouca,restaurant,Casa Testinha,4.5,"R. 1º de Maio 4, 4540-113 Arouca, Portugal","restaurant, food, point_of_interest, establish...",40.928876,-8.245147,623,348acace,Serviços,5.0
4,Arouca,restaurant,Pedestre 142,4.4,"R. Dr. Ângelo Miranda 142 RC, 4540-140 Arouca,...","restaurant, food, point_of_interest, establish...",40.930185,-8.25321,1047,55a3ab0d,Serviços,0.0


In [29]:
# Sanity check: for how many places (id_unico) do the duplicated rows carry
# more than one distinct neighbour count?
variacoes = avaliacoes.groupby("id_unico")["Locais_Semelhantes_Perto"].nunique()
variacoes_multiplos = variacoes.loc[variacoes.gt(1)]
print(
    "Número de id_unico com valores diferentes de Locais_Semelhantes_Perto:",
    len(variacoes_multiplos),
)

Número de id_unico com valores diferentes de Locais_Semelhantes_Perto: 54


In [30]:
exemplo_id = variacoes_multiplos.index[0]
print("Exemplo de id_unico com valores diferentes:", exemplo_id)


Exemplo de id_unico com valores diferentes: 066473c6


In [31]:
avaliacoes[avaliacoes["id_unico"] == exemplo_id]

Unnamed: 0,Cidade,Categoria,Nome,Rating,Endereço,Tipos,Latitude,Longitude,Total_Reviews,id_unico,Grupo_Tematico,Locais_Semelhantes_Perto
5,Arouca,restaurant,Tasquinho do Parque,4.5,"R. Dom Afonso Henriques 57 4540, Arouca, Portugal","restaurant, food, point_of_interest, establish...",40.927645,-8.248854,203,066473c6,Serviços,1.0
87,Arouca,bar,Tasquinho do Parque,4.5,"R. Dom Afonso Henriques 57 4540, Arouca, Portugal","restaurant, food, point_of_interest, establish...",40.927645,-8.248854,203,066473c6,Serviços,0.0
132,Arouca,cafe,Tasquinho do Parque,4.5,"R. Dom Afonso Henriques 57 4540, Arouca, Portugal","restaurant, food, point_of_interest, establish...",40.927645,-8.248854,203,066473c6,Serviços,1.0
195,Arouca,park,Tasquinho do Parque,4.5,"R. Dom Afonso Henriques 57 4540, Arouca, Portugal","restaurant, food, point_of_interest, establish...",40.927645,-8.248854,203,066473c6,Recursos Naturais,2.0


In [32]:
# One row per place: keep the first occurrence of each id_unico, without the
# per-row neighbour count (it is re-attached below).
ratings_unicos = (
    avaliacoes
    .drop(columns=["Locais_Semelhantes_Perto"])
    .drop_duplicates(subset="id_unico")
)

# Highest neighbour count observed for each place across its duplicated rows
densidade_agrupada = (
    avaliacoes
    .groupby("id_unico", as_index=False)["Locais_Semelhantes_Perto"]
    .max()
)

# Attach that maximum to the de-duplicated table
ratings_final = pd.merge(ratings_unicos, densidade_agrupada, how="left", on="id_unico")

# From here on, `avaliacoes` is the de-duplicated table
avaliacoes = ratings_final.copy()

In [33]:
avaliacoes.head(5)

Unnamed: 0,Cidade,Categoria,Nome,Rating,Endereço,Tipos,Latitude,Longitude,Total_Reviews,id_unico,Grupo_Tematico,Locais_Semelhantes_Perto
0,Arouca,restaurant,Tasquinha da Quinta,4.6,"R. 1º de Maio 3, 4540-121 Arouca, Portugal","restaurant, food, point_of_interest, establish...",40.929109,-8.245191,2154,d111a3c6,Serviços,6.0
1,Arouca,restaurant,A Assembleia,4.5,"Tv. da Ribeira 11, 4540-102 Arouca, Portugal","restaurant, bar, food, point_of_interest, esta...",40.928766,-8.247588,1788,54221336,Serviços,6.0
2,Arouca,restaurant,Parlamento,4.6,"Tv. da Ribeira 2, 4540-148 Arouca, Portugal","restaurant, food, point_of_interest, establish...",40.929011,-8.247392,2469,ca96ed20,Serviços,7.0
3,Arouca,restaurant,Casa Testinha,4.5,"R. 1º de Maio 4, 4540-113 Arouca, Portugal","restaurant, food, point_of_interest, establish...",40.928876,-8.245147,623,348acace,Serviços,6.0
4,Arouca,restaurant,Pedestre 142,4.4,"R. Dr. Ângelo Miranda 142 RC, 4540-140 Arouca,...","restaurant, food, point_of_interest, establish...",40.930185,-8.25321,1047,55a3ab0d,Serviços,0.0


# === 3. PRE-PROCESSING COMMENTS ===

In [34]:
comentarios.head()

Unnamed: 0,Cidade,Categoria,Nome_Local,Autor,Texto,Data,Rating
0,Arouca,restaurante,Tasquinha da Quinta,IC,"I came here as part of a tour, and this restau...",10 months ago,5
1,Arouca,restaurante,Tasquinha da Quinta,Preetam Nath,Has to be the most delicious veal I've had in ...,a year ago,5
2,Arouca,restaurante,Tasquinha da Quinta,Roya MJ,Came here as part of a tour and very much appr...,a year ago,5
3,Arouca,restaurante,Tasquinha da Quinta,Jonathan lugo,Very good food and an excellent place to eat! ...,a year ago,5
4,Arouca,restaurante,Tasquinha da Quinta,Benjamim Nande,Great place for a good typical Portuguese food...,4 months ago,5


In [35]:
(comentarios["Nome_Local"] == "Tasquinha da Quinta").sum()

5

In [36]:
comentarios["Cidade"].unique()

array(['Arouca', 'Espinho', 'Gondomar', 'Maia', 'Matosinhos',
       'Oliveira de Azeméis', 'Paredes', 'Penafiel', 'Porto',
       'Póvoa de Varzim', 'Santa Maria da Feira', 'Santo Tirso',
       'São João da Madeira', 'Trofa', 'Valongo', 'Vila do Conde',
       'Vila Nova de Gaia'], dtype=object)

In [37]:
comentarios['Rating'].unique()

array([5, 3, 2, 1, 4], dtype=int64)

In [38]:
comentarios['Rating'].isnull().sum()

0

In [39]:
comentarios['Texto'].isnull().sum()

149

In [40]:
comentarios[comentarios['Texto'].isnull()].head()

Unnamed: 0,Cidade,Categoria,Nome_Local,Autor,Texto,Data,Rating
186,Arouca,restaurante,O Canastro,Cláudia Martins,,2 months ago,5
321,Arouca,hotel,Arouca Guest House 2,Chus,,10 months ago,5
334,Arouca,hotel,MS Collection Arouca - Mosteiro de Arouca,Vítor Neves,,a week ago,5
335,Arouca,hotel,MS Collection Arouca - Mosteiro de Arouca,Alexandra Barreiros,,a year ago,5
590,Arouca,praia,Casa do Tanque - Arouca,Hugo Cunha,,2 years ago,5


In [41]:
# Remove comments with missing text. .copy() yields an independent frame so
# the column assignment below does not raise SettingWithCopyWarning.
comentarios = comentarios.dropna(subset=["Texto"]).copy()

# Force the text type and trim surrounding whitespace
comentarios["Texto"] = comentarios["Texto"].astype(str).str.strip()

# A comment made only of whitespace is empty after trimming: it carries no
# content to analyse, so it is removed like a missing one.
comentarios = comentarios[comentarios["Texto"] != ""].copy()

In [42]:
comentarios.shape[0]

9063

In [43]:
comentarios.head()

Unnamed: 0,Cidade,Categoria,Nome_Local,Autor,Texto,Data,Rating
0,Arouca,restaurante,Tasquinha da Quinta,IC,"I came here as part of a tour, and this restau...",10 months ago,5
1,Arouca,restaurante,Tasquinha da Quinta,Preetam Nath,Has to be the most delicious veal I've had in ...,a year ago,5
2,Arouca,restaurante,Tasquinha da Quinta,Roya MJ,Came here as part of a tour and very much appr...,a year ago,5
3,Arouca,restaurante,Tasquinha da Quinta,Jonathan lugo,Very good food and an excellent place to eat! ...,a year ago,5
4,Arouca,restaurante,Tasquinha da Quinta,Benjamim Nande,Great place for a good typical Portuguese food...,4 months ago,5


In [44]:
# Language detection that tolerates missing/blank text and detector failures
def detetar_idioma(texto):
    """Return the language code that langdetect reports for ``texto``.

    Returns:
        "desconhecido" when ``texto`` is missing or blank, "erro" when
        langdetect raises ``LangDetectException``, otherwise the value
        returned by ``detect``.
    """
    sem_texto = pd.isna(texto) or str(texto).strip() == ""
    if sem_texto:
        return "desconhecido"

    try:
        idioma = detect(texto)
    except LangDetectException:
        idioma = "erro"
    return idioma

# langdetect is non-deterministic unless its seed is fixed before the first
# detection, so set it here to make this column reproducible between runs.
DetectorFactory.seed = 0

# Detect the language of every comment
comentarios["Idioma"] = comentarios["Texto"].apply(detetar_idioma)

In [45]:
comentarios.head()

Unnamed: 0,Cidade,Categoria,Nome_Local,Autor,Texto,Data,Rating,Idioma
0,Arouca,restaurante,Tasquinha da Quinta,IC,"I came here as part of a tour, and this restau...",10 months ago,5,en
1,Arouca,restaurante,Tasquinha da Quinta,Preetam Nath,Has to be the most delicious veal I've had in ...,a year ago,5,en
2,Arouca,restaurante,Tasquinha da Quinta,Roya MJ,Came here as part of a tour and very much appr...,a year ago,5,en
3,Arouca,restaurante,Tasquinha da Quinta,Jonathan lugo,Very good food and an excellent place to eat! ...,a year ago,5,en
4,Arouca,restaurante,Tasquinha da Quinta,Benjamim Nande,Great place for a good typical Portuguese food...,4 months ago,5,en


In [46]:
comentarios['Data'].unique()

array(['10 months ago', 'a year ago', '4 months ago', '5 months ago',
       '7 months ago', '3 years ago', '4 years ago', '2 years ago',
       '11 months ago', '6 years ago', '8 years ago', '5 years ago',
       '8 months ago', '3 weeks ago', '9 months ago', '6 months ago',
       'a month ago', 'a week ago', '7 years ago', '10 years ago',
       '3 months ago', 'in the last week', '2 months ago', '9 years ago',
       '2 weeks ago', '4 weeks ago', '11 years ago', '12 months ago'],
      dtype=object)

In [47]:
# The "Data" column holds relative labels ("a year ago", "3 weeks ago", ...),
# so absolute dates can only be approximated from a reference moment.
# A single reference is taken once: evaluating datetime.today() per row gave
# every comment a slightly different base timestamp.
# NOTE(review): the reference is "now", so the converted dates shift each time
# the notebook is re-run; the collection date of the comments is presumably
# the right anchor — confirm.
data_referencia = datetime.today()

# Only a handful of distinct labels exist, so parse each one once and map the
# result back instead of calling dateparser for every row.
datas_por_etiqueta = {
    etiqueta: dateparser.parse(etiqueta, settings={'RELATIVE_BASE': data_referencia})
    for etiqueta in comentarios['Data'].dropna().unique()
}
comentarios['Data_Convertida'] = comentarios['Data'].map(datas_por_etiqueta)

# Show an example
print(comentarios[['Data', 'Data_Convertida']].head())

            Data            Data_Convertida
0  10 months ago 2024-08-11 18:20:59.350385
1     a year ago 2024-06-11 18:20:59.441467
2     a year ago 2024-06-11 18:20:59.458231
3     a year ago 2024-06-11 18:20:59.458231
4   4 months ago 2025-02-11 18:20:59.458231


In [48]:
comentarios.head(3)

Unnamed: 0,Cidade,Categoria,Nome_Local,Autor,Texto,Data,Rating,Idioma,Data_Convertida
0,Arouca,restaurante,Tasquinha da Quinta,IC,"I came here as part of a tour, and this restau...",10 months ago,5,en,2024-08-11 18:20:59.350385
1,Arouca,restaurante,Tasquinha da Quinta,Preetam Nath,Has to be the most delicious veal I've had in ...,a year ago,5,en,2024-06-11 18:20:59.441467
2,Arouca,restaurante,Tasquinha da Quinta,Roya MJ,Came here as part of a tour and very much appr...,a year ago,5,en,2024-06-11 18:20:59.458231


In [49]:
# Keep only the calendar day: every timestamp is reset to midnight.
# The column remains datetime64 (it does not hold datetime.date objects).
comentarios['Data_Convertida'] = pd.to_datetime(comentarios['Data_Convertida']).dt.normalize()

In [50]:
comentarios['Data_Convertida'].isnull().sum()

0

In [51]:
comentarios.head(5)

Unnamed: 0,Cidade,Categoria,Nome_Local,Autor,Texto,Data,Rating,Idioma,Data_Convertida
0,Arouca,restaurante,Tasquinha da Quinta,IC,"I came here as part of a tour, and this restau...",10 months ago,5,en,2024-08-11
1,Arouca,restaurante,Tasquinha da Quinta,Preetam Nath,Has to be the most delicious veal I've had in ...,a year ago,5,en,2024-06-11
2,Arouca,restaurante,Tasquinha da Quinta,Roya MJ,Came here as part of a tour and very much appr...,a year ago,5,en,2024-06-11
3,Arouca,restaurante,Tasquinha da Quinta,Jonathan lugo,Very good food and an excellent place to eat! ...,a year ago,5,en,2024-06-11
4,Arouca,restaurante,Tasquinha da Quinta,Benjamim Nande,Great place for a good typical Portuguese food...,4 months ago,5,en,2025-02-11


# === 4. NLP OF COMMENTS ===

In [52]:
# Fix langdetect's seed so that language detection gives consistent results
# from one run to the next
DetectorFactory.seed = 0

# Robust language detection
def detectar_idioma(texto):
    """Return the language code that langdetect reports for ``texto``.

    Args:
        texto: Text to analyse; non-string values are converted with ``str``.

    Returns:
        The code returned by ``detect``, or "unknown" when ``texto`` is
        missing/blank or langdetect raises ``LangDetectException``.
    """
    # Missing values must be caught before str(): str(NaN) is "nan" and
    # str(None) is "None", which would be detected as if they were real text.
    if pd.isna(texto) or not str(texto).strip():
        return "unknown"
    try:
        return detect(str(texto))
    except LangDetectException:
        return "unknown"

# Store the detected language of each comment in the "Idioma" column
comentarios['Idioma'] = comentarios['Texto'].map(detectar_idioma)

In [53]:
# Translate a comment to English (best effort, with retries)
def traduzir_para_ingles(texto, pausa=1.2, tentativas=3):
    """Translate ``texto`` to English with GoogleTranslator (source auto-detected).

    Args:
        texto: Text to translate. Empty or missing values are returned as is.
        pausa: Seconds to wait before the first request; the wait grows
            linearly with each further attempt (pausa, 2*pausa, ...).
        tentativas: Maximum number of requests for this text; at least one
            request is always made.

    Returns:
        The translated text, or ``texto`` unchanged when it is empty/missing
        or when every attempt failed. Failures are printed, never raised.
    """
    if not texto or pd.isna(texto):
        return texto

    for tentativa in range(1, max(1, tentativas) + 1):
        # Throttle the requests; wait a little longer after each failure so a
        # transient connection problem has time to clear.
        time.sleep(pausa * tentativa)
        try:
            return GoogleTranslator(source='auto', target='en').translate(texto)
        except Exception as e:
            print(f"Erro ao traduzir: {e}")

    # Best effort: after the last failed attempt keep the original text
    return texto

# Build 'translated_text': comments already detected as English are kept as
# they are, all the others go through traduzir_para_ingles.
comentarios['translated_text'] = [
    texto if idioma == 'en' else traduzir_para_ingles(texto)
    for texto, idioma in zip(comentarios['Texto'], comentarios['Idioma'])
]

Erro ao traduzir: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Erro ao traduzir: ('Connection aborted.', ConnectionAbortedError(10053, 'Uma ligação estabelecida foi anulada pelo software no computador anfitrião', None, 10053, None))
Erro ao traduzir: HTTPSConnectionPool(host='translate.google.com', port=443): Max retries exceeded with url: /m?tl=en&sl=auto&q=Eu+n%C3%A3o+gostava+de+sushi+mas+fui+ao+sabores+da+%C3%81sia+contra+a+minha+vontade+e+quando+provei+eu+amei%2C+5+estrelas+pelo+antendimento+e+pela+comida. (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000023B1B5428D0>: Failed to resolve 'translate.google.com' ([Errno 11001] getaddrinfo failed)"))
Erro ao traduzir: HTTPSConnectionPool(host='translate.google.com', port=443): Max retries exceeded with url: /m?tl=en&sl=auto&q=Nice+pizza%21+Recommend (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000023B1B66ADE0>: Failed 

In [54]:
# Load spaCy's "en_core_web_sm" pipeline
nlp = spacy.load("en_core_web_sm")

# Stopword list with "not" taken out, so that negations survive stopword removal
custom_stopwords = STOP_WORDS.difference({"not"})

# Text normalisation
def normalizar_texto(texto):
    """Lowercase ``texto``, delete ASCII punctuation and collapse whitespace.

    Args:
        texto: Text to normalise.

    Returns:
        str: The normalised text, or "" when ``texto`` is not a string.
    """
    if not isinstance(texto, str):
        return ""
    texto = texto.lower()  # convert to lowercase
    # Delete every character of string.punctuation. A translation table is
    # used instead of interpolating the characters into a regex class: there,
    # the "\" followed by "]" was read as an escaped bracket, so backslashes
    # were never removed.
    texto = texto.translate(str.maketrans("", "", string.punctuation))
    texto = re.sub(r"\s+", " ", texto).strip()  # collapse extra whitespace
    return texto

# Normalise the (translated) text of every comment
comentarios["Texto_Normalizado"] = comentarios["translated_text"].map(normalizar_texto)

# Lemmatisation with selective stopword removal
def lematizar(texto):
    """Return the space-joined lemmas of ``texto``.

    Tokens whose lowercase text is in ``custom_stopwords`` and punctuation
    tokens are left out.

    Args:
        texto: Text to lemmatise.

    Returns:
        str: The lemmas separated by single spaces, or "" when ``texto`` is
        not a string or contains only whitespace.
    """
    if not isinstance(texto, str):
        return ""
    if texto.strip() == "":
        return ""

    lemas = []
    for token in nlp(texto):
        if token.is_punct or token.text.lower() in custom_stopwords:
            continue
        lemas.append(token.lemma_)
    return " ".join(lemas)

# Lemmatise the normalised text of every comment
comentarios["Texto_Lematizado"] = comentarios["Texto_Normalizado"].map(lematizar)

# TextBlob polarity of each lemmatised comment
comentarios["Polaridade"] = [
    TextBlob(texto_lematizado).sentiment.polarity
    for texto_lematizado in comentarios["Texto_Lematizado"]
]

# Show the first rows
comentarios[["translated_text", "Texto_Lematizado", "Polaridade"]].head()


Unnamed: 0,translated_text,Texto_Lematizado,Polaridade
0,"I came here as part of a tour, and this restau...",come tour restaurant save tour goat meat potat...,0.0
1,Has to be the most delicious veal I've had in ...,delicious veal ve life probably good meat ve l...,0.583333
2,Came here as part of a tour and very much appr...,come tour appreciate enjoy food vibe service t...,0.24
3,Very good food and an excellent place to eat! ...,good food excellent place eat go group food ta...,0.85
4,Great place for a good typical Portuguese food...,great place good typical portuguese food good ...,0.446667


**Refinement of the NLP Preprocessing Strategy:**

Throughout the development process, the text preprocessing pipeline was progressively improved based on feedback and established NLP best practices. In particular, we incorporated recommendations from experts like Julia Silge to ensure a balanced approach between data cleaning and semantic preservation. 

This involved:
- Retaining context-relevant stopwords (e.g., *not*) to maintain the polarity of expressions like *not good*.
- Ensuring that automatic translations preserved emotional tone and idiomatic expressions as faithfully as possible.
- Avoiding excessive simplification that might harm topic coherence or sentiment interpretation.

In [55]:
# Check that negated expressions keep their meaning after normalisation and lemmatisation
test_sentences = ["not good", "very bad", "absolutely amazing"]
lematizadas = [lematizar(normalizar_texto(frase)) for frase in test_sentences]
for resultado in lematizadas:
    print("Lematizado:", resultado)

Lematizado: not good
Lematizado: bad
Lematizado: absolutely amazing


| Original Phrase       | Lemmatized Output       | Observations                                                                 |
|-----------------------|--------------------------|------------------------------------------------------------------------------|
| `not good`            | `not good`               | ✅ Negation preserved — ideal for sentiment analysis.                        |
| `very bad`            | `bad`                    | ✅ Intensifier “very” correctly removed as a stopword; key term retained.    |
| `absolutely amazing`  | `absolutely amazing`     | ✅ “absolutely” preserved — may indicate emphasis (acceptable depending on the context). |

In [56]:
comentarios.head()

Unnamed: 0,Cidade,Categoria,Nome_Local,Autor,Texto,Data,Rating,Idioma,Data_Convertida,translated_text,Texto_Normalizado,Texto_Lematizado,Polaridade
0,Arouca,restaurante,Tasquinha da Quinta,IC,"I came here as part of a tour, and this restau...",10 months ago,5,en,2024-08-11,"I came here as part of a tour, and this restau...",i came here as part of a tour and this restaur...,come tour restaurant save tour goat meat potat...,0.0
1,Arouca,restaurante,Tasquinha da Quinta,Preetam Nath,Has to be the most delicious veal I've had in ...,a year ago,5,en,2024-06-11,Has to be the most delicious veal I've had in ...,has to be the most delicious veal ive had in m...,delicious veal ve life probably good meat ve l...,0.583333
2,Arouca,restaurante,Tasquinha da Quinta,Roya MJ,Came here as part of a tour and very much appr...,a year ago,5,en,2024-06-11,Came here as part of a tour and very much appr...,came here as part of a tour and very much appr...,come tour appreciate enjoy food vibe service t...,0.24
3,Arouca,restaurante,Tasquinha da Quinta,Jonathan lugo,Very good food and an excellent place to eat! ...,a year ago,5,en,2024-06-11,Very good food and an excellent place to eat! ...,very good food and an excellent place to eat w...,good food excellent place eat go group food ta...,0.85
4,Arouca,restaurante,Tasquinha da Quinta,Benjamim Nande,Great place for a good typical Portuguese food...,4 months ago,5,en,2025-02-11,Great place for a good typical Portuguese food...,great place for a good typical portuguese food...,great place good typical portuguese food good ...,0.446667


In [57]:
# Average polarity per place. A place is identified by city + name: grouping
# by the name alone would pool the comments of different places that happen
# to share a name.
chaves_local = ["Cidade", "Nome_Local"]
average_polarity = (
    comentarios
    .groupby(chaves_local)["Polaridade"]
    .mean()
    .reset_index()
    .rename(columns={"Polaridade": "Polaridade_Média"})
)

# Attach the average to every comment. A "Polaridade_Média" column left by a
# previous run is dropped first, so re-running the cell does not produce
# suffixed _x/_y columns.
comentarios = (
    comentarios
    .drop(columns=["Polaridade_Média"], errors="ignore")
    .merge(average_polarity, on=chaves_local, how="left")
)

# === 5. EXPORTING FILES ===

In [58]:
# Write both cleaned tables as CSV files (no index column, "utf-8-sig" encoding)
for nome_ficheiro, tabela in (
    ("ratings_clean.csv", avaliacoes),
    ("comments_clean.csv", comentarios),
):
    tabela.to_csv(nome_ficheiro, index=False, encoding="utf-8-sig")

print("✅ Ficheiros tratados com sucesso! Dois CSVs gerados: 'ratings_clean.csv' e 'comments_clean.csv'")

✅ Ficheiros tratados com sucesso! Dois CSVs gerados: 'ratings_clean.csv' e 'comments_clean.csv'


In [59]:
avaliacoes.head(20)

Unnamed: 0,Cidade,Categoria,Nome,Rating,Endereço,Tipos,Latitude,Longitude,Total_Reviews,id_unico,Grupo_Tematico,Locais_Semelhantes_Perto
0,Arouca,restaurant,Tasquinha da Quinta,4.6,"R. 1º de Maio 3, 4540-121 Arouca, Portugal","restaurant, food, point_of_interest, establish...",40.929109,-8.245191,2154,d111a3c6,Serviços,6.0
1,Arouca,restaurant,A Assembleia,4.5,"Tv. da Ribeira 11, 4540-102 Arouca, Portugal","restaurant, bar, food, point_of_interest, esta...",40.928766,-8.247588,1788,54221336,Serviços,6.0
2,Arouca,restaurant,Parlamento,4.6,"Tv. da Ribeira 2, 4540-148 Arouca, Portugal","restaurant, food, point_of_interest, establish...",40.929011,-8.247392,2469,ca96ed20,Serviços,7.0
3,Arouca,restaurant,Casa Testinha,4.5,"R. 1º de Maio 4, 4540-113 Arouca, Portugal","restaurant, food, point_of_interest, establish...",40.928876,-8.245147,623,348acace,Serviços,6.0
4,Arouca,restaurant,Pedestre 142,4.4,"R. Dr. Ângelo Miranda 142 RC, 4540-140 Arouca,...","restaurant, food, point_of_interest, establish...",40.930185,-8.25321,1047,55a3ab0d,Serviços,0.0
5,Arouca,restaurant,Tasquinho do Parque,4.5,"R. Dom Afonso Henriques 57 4540, Arouca, Portugal","restaurant, food, point_of_interest, establish...",40.927645,-8.248854,203,066473c6,Serviços,2.0
6,Arouca,restaurant,Varandinha,4.4,"Av. Reinaldo Noronha 39, 4540-181 Arouca, Port...","restaurant, food, point_of_interest, establish...",40.934069,-8.241372,721,dc4ee328,Serviços,1.0
7,Arouca,restaurant,Café Arouquense,4.3,"Av. 25 de Abril, 4540-102 Arouca, Portugal","cafe, restaurant, food, point_of_interest, est...",40.928469,-8.245599,901,47e0eea2,Serviços,6.0
8,Arouca,restaurant,Quinta D'Além da Ponte,4.5,"Lugar do, 4540-606 Arouca, Portugal","restaurant, food, point_of_interest, establish...",40.93906,-8.341836,423,61113e19,Serviços,0.0
9,Arouca,restaurant,Restaurante do Pedrogão,4.8,"4540-445 Arouca, Portugal","restaurant, food, point_of_interest, establish...",40.89902,-8.212277,272,223ec896,Serviços,0.0


In [60]:
comentarios.head(20)

Unnamed: 0,Cidade,Categoria,Nome_Local,Autor,Texto,Data,Rating,Idioma,Data_Convertida,translated_text,Texto_Normalizado,Texto_Lematizado,Polaridade,Polaridade_Média
0,Arouca,restaurante,Tasquinha da Quinta,IC,"I came here as part of a tour, and this restau...",10 months ago,5,en,2024-08-11,"I came here as part of a tour, and this restau...",i came here as part of a tour and this restaur...,come tour restaurant save tour goat meat potat...,0.0,0.424
1,Arouca,restaurante,Tasquinha da Quinta,Preetam Nath,Has to be the most delicious veal I've had in ...,a year ago,5,en,2024-06-11,Has to be the most delicious veal I've had in ...,has to be the most delicious veal ive had in m...,delicious veal ve life probably good meat ve l...,0.583333,0.424
2,Arouca,restaurante,Tasquinha da Quinta,Roya MJ,Came here as part of a tour and very much appr...,a year ago,5,en,2024-06-11,Came here as part of a tour and very much appr...,came here as part of a tour and very much appr...,come tour appreciate enjoy food vibe service t...,0.24,0.424
3,Arouca,restaurante,Tasquinha da Quinta,Jonathan lugo,Very good food and an excellent place to eat! ...,a year ago,5,en,2024-06-11,Very good food and an excellent place to eat! ...,very good food and an excellent place to eat w...,good food excellent place eat go group food ta...,0.85,0.424
4,Arouca,restaurante,Tasquinha da Quinta,Benjamim Nande,Great place for a good typical Portuguese food...,4 months ago,5,en,2025-02-11,Great place for a good typical Portuguese food...,great place for a good typical portuguese food...,great place good typical portuguese food good ...,0.446667,0.424
5,Arouca,restaurante,Parlamento,Helena Borges,Delicious food and super nice service. Generou...,a year ago,5,en,2024-06-11,Delicious food and super nice service. Generou...,delicious food and super nice service generous...,delicious food super nice service generous amo...,0.444444,0.449752
6,Arouca,restaurante,Parlamento,Greg Janes,We loved this place! We stopped here on our wa...,a year ago,5,en,2024-06-11,We loved this place! We stopped here on our wa...,we loved this place we stopped here on our way...,love place stop way town feel lucky stumble pl...,0.465385,0.449752
7,Arouca,restaurante,Parlamento,Tiago Ferreira,The food was delicious and the staff was frien...,5 months ago,5,en,2025-01-11,The food was delicious and the staff was frien...,the food was delicious and the staff was frien...,food delicious staff friendly offer dessert drink,0.6875,0.449752
8,Arouca,restaurante,Parlamento,Samuel Monteiro,Amazing food and wine. We had the house specia...,a year ago,5,en,2024-06-11,Amazing food and wine. We had the house specia...,amazing food and wine we had the house special...,amazing food wine house special vitela assada ...,0.365714,0.449752
9,Arouca,restaurante,Parlamento,Pini Shvartsman,we are the recommended steaks. very good. come...,a year ago,5,en,2024-06-11,we are the recommended steaks. very good. come...,we are the recommended steaks very good comes ...,recommend steak good come side local rice fren...,0.285714,0.449752


In [62]:
avaliacoes['Total_Reviews'].isnull().sum()

0