In [1]:
import pandas as pd
import pyarrow.parquet as pq
import torch

In [2]:
# Cargar el archivo Parquet con las reseñas
table = pq.read_table("../hoteles_review_estado_actualizado.parquet")
df = table.to_pandas()

# Filtrar las reseñas con rating igual a 1
reseñas_con_rating_1 = df[df['rating'] == 1]

# Filtrar las 70 primeras reseñas con rating igual a 1
primeras_40_reseñas = reseñas_con_rating_1.head(20)
# Extraer las reseñas y guardarlas en un archivo de texto
reviews = primeras_40_reseñas["comentario"].tolist()

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\USUARIO\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Topic 1: i, job, door, car, whole, staff, little, bad, few, auto
Topic 2: i, place, worst, hot, water, green, days, last, stay, leak
Topic 3: i, car, dent, insurance, contract, manager, company, location, claim, rental
Topic 4: staff, last, i, customer, service, hotel, green, days, due, place
Topic 5: i, hotel, groupon, smell, reservation, quote, filthy, day, phone, free
Topic 6: office, service, customer, lines, staff, nice, taylor, easton, rd, proud
Topic 7: rooms, pool, road, hilton, team, last, great, conditions, customers, concert
Topic 8: i, breakfast, money, day, elevator, front, next, manager, rude, mildew
Topic 9: hotel, floor, sheets, i, elevator, area, room, total, many, renovation
Topic 10: i, card, day, someone, extra, hold, charge, bill, charges, time


In [6]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Descargar el paquete de datos de NLTK para el etiquetado gramatical
nltk.download('averaged_perceptron_tagger')

# Crear una función para filtrar las palabras por categoría gramatical y formar pares sustantivo-adjetivo
def filter_and_pair_by_pos(text, pos_tags=['NN', 'NNS', 'JJ']):
    tagged_words = nltk.pos_tag(nltk.word_tokenize(text))
    pairs = []
    noun = None
    for word, pos in tagged_words:
        if pos.startswith('N'):
            noun = word
        elif pos.startswith('J') and noun:
            pairs.append(f"{noun} {word}")
            noun = None
    return pairs

# Crear una matriz de términos-documentos usando CountVectorizer y filtrar las palabras por sustantivo y adjetivo
vectorizer = CountVectorizer(tokenizer=lambda text: filter_and_pair_by_pos(text, pos_tags=['NN', 'NNS', 'JJ']))
X = vectorizer.fit_transform(primeras_40_reseñas["comentario"])

# Aplicar LDA para identificar los temas de quejas
num_topics = 10
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda.fit(X)

# Obtener los términos más importantes por cada tema
feature_names = vectorizer.get_feature_names_out()
top_pairs_by_topic = {}
for topic_idx, topic in enumerate(lda.components_):
    top_pairs_idx = topic.argsort()[:-11:-1]
    top_pairs_by_topic[topic_idx + 1] = [feature_names[i] for i in top_pairs_idx]

# Imprimir los pares sustantivo-adjetivo más importantes por cada tema
for topic, top_pairs in top_pairs_by_topic.items():
    print(f"Topic {topic}: {', '.join(top_pairs)}")


Topic 1: inspectors non-working, ceilings/walls filthy, bathrooms/rooms filthy, towels shower, orleans second, furnishings prior, rust filthy, telephones filthy, filth unhygienic, elevator whole
Topic 2: i disappointed, wifi fast, smell second, smell pervasive, smell bad, rates additional, rate free, positives free, kind sensitive, juices free
Topic 3: elevator unstable, i ready, i next, message horrible, manager rude, breakfast worth, front nice, bed hard, i important, fuse next
Topic 4: i worst, floors wet, hotel long, hotel double, groupon helpful, morning able, place negative, club/restaurant loud, attitude last, time i
Topic 5: way many, inns past, floor only, sheets uncomfortable, middle total, machines th, i non, renovation total, hotel many, elevator due
Topic 6: hold high, facebook several, bill same, phone next, people similar, day extra, week timely, i wrong, card lied, i sure
Topic 7: road better, lot more, rooms last, spot same, experience queen, i few, body worst, car slo

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\USUARIO\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
