In [1]:
import pandas as pd

# Read the cleaned reviews CSV into a DataFrame. The path is relative, so it
# resolves against whatever directory the kernel is started from.
REVIEWS_CSV_PATH = "../data/processed/cleaned_reviews.csv"
df = pd.read_csv(REVIEWS_CSV_PATH)


### FILTER NEGATIVE REVIEWS

In [2]:
# Select the "cleaned_review" text of every row whose sentiment label is exactly
# "Negative", then drop rows where that text is missing.
is_negative = df["sentiment"] == "Negative"
negative_reviews = df.loc[is_negative, "cleaned_review"].dropna()


### TOPIC MODELING

In [3]:
from gensim import corpora, models
from nltk.tokenize import word_tokenize

# Tokenise every negative review into a list of tokens.
texts = list(map(word_tokenize, negative_reviews))

# Build the token <-> id mapping, then encode each review as a bag-of-words.
dictionary = corpora.Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))

# LDA hyper-parameters kept together so they are easy to spot and tune.
# random_state is fixed so repeated runs use the same seed.
lda_params = {
    "num_topics": 5,
    "passes": 10,
    "random_state": 42,
}

# Fit the topic model on the bag-of-words corpus.
lda_model = models.LdaModel(corpus=corpus, id2word=dictionary, **lda_params)


In [4]:
# Fetch the five highest-weighted words of each topic and print one
# (topic id, weighted-terms string) pair per line.
topics = lda_model.print_topics(num_words=5)
for topic_id, topic_terms in topics:
    print((topic_id, topic_terms))


(0, '0.013*"flight" + 0.013*"ba" + 0.011*"food" + 0.009*"service" + 0.009*"airway"')
(1, '0.025*"seat" + 0.016*"flight" + 0.014*"ba" + 0.012*"class" + 0.010*"service"')
(2, '0.023*"flight" + 0.018*"ba" + 0.015*"hour" + 0.008*"luggage" + 0.008*"bag"')
(3, '0.039*"flight" + 0.021*"ba" + 0.009*"seat" + 0.008*"london" + 0.008*"service"')
(4, '0.031*"flight" + 0.015*"british" + 0.014*"airway" + 0.013*"ba" + 0.013*"customer"')


In [5]:
# Hand-written recommendation for each LDA topic id (position in the tuple
# below == topic id).
# NOTE(review): the keywords printed for the topics (e.g. topic 1 -> "seat" /
# "class", topic 2 -> "luggage" / "bag", topic 4 -> "customer") do not obviously
# line up with the recommendation stored under the same id -- confirm this
# mapping against the topic output before relying on it.
recommendations = dict(
    enumerate(
        (
            "Improve ground service efficiency and staff training",
            "Enhance in-flight meal quality and variety",
            "Reduce delays and improve communication",
            "Improve seat comfort and cabin cleanliness",
            "Improve baggage handling processes",
        )
    )
)


In [6]:
# Print every topic id next to its recommendation, in the dict's own order.
for topic_id in recommendations:
    rec = recommendations[topic_id]
    print(f"Topic {topic_id}: {rec}")


Topic 0: Improve ground service efficiency and staff training
Topic 1: Enhance in-flight meal quality and variety
Topic 2: Reduce delays and improve communication
Topic 3: Improve seat comfort and cabin cleanliness
Topic 4: Improve baggage handling processes
