In [1]:
import pandas as pd
import nltk
nltk.download('punkt')

from nltk.tokenize import word_tokenize
from gensim import corpora, models


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DEll\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
df = pd.read_csv("../data/processed/cleaned_reviews.csv")
df.head()


Unnamed: 0.1,Unnamed: 0,OverallRating,ReviewHeader,Name,Datetime,VerifiedReview,ReviewBody,TypeOfTraveller,SeatType,Route,...,GroundService,ValueForMoney,Recommended,Aircraft,Food&Beverages,InflightEntertainment,Wifi&Connectivity,cleaned_review,sentiment_score,sentiment
0,0,1.0,"""Service level far worse then Ryanair""",L Keele,19th November 2023,True,4 Hours before takeoff we received a Mail stat...,Couple Leisure,Economy Class,London to Stuttgart,...,1.0,1.0,no,,,,,hour takeoff received mail stating cryptic mes...,-0.8704,Negative
1,1,3.0,"""do not upgrade members based on status""",Austin Jones,19th November 2023,True,I recently had a delay on British Airways from...,Business,Economy Class,Brussels to London,...,1.0,2.0,no,A320,1.0,2.0,2.0,recently delay british airway bru lhr due staf...,-0.7635,Negative
2,2,8.0,"""Flight was smooth and quick""",M A Collie,16th November 2023,False,"Boarded on time, but it took ages to get to th...",Couple Leisure,Business Class,London Heathrow to Dublin,...,4.0,3.0,yes,A320,4.0,,,boarded time took age get runway due congestio...,0.4404,Positive
3,3,1.0,"""Absolutely hopeless airline""",Nigel Dean,16th November 2023,True,"5 days before the flight, we were advised by B...",Couple Leisure,Economy Class,London to Dublin,...,1.0,1.0,no,,,,,day flight advised ba cancelled asked u rebook...,-0.0508,Negative
4,4,1.0,"""Customer Service is non existent""",Gaylynne Simpson,14th November 2023,False,"We traveled to Lisbon for our dream vacation, ...",Couple Leisure,Economy Class,London to Lisbon,...,1.0,1.0,no,,1.0,1.0,1.0,traveled lisbon dream vacation cruise portugal...,0.8901,Positive


### Filter Negative Reviews

In [3]:
negative_reviews = df[df["sentiment"] == "Negative"]
negative_reviews = negative_reviews["cleaned_review"].dropna()

print("Total negative reviews:", len(negative_reviews))


Total negative reviews: 1212


### Tokenize Text

In [4]:
texts = [word_tokenize(text.lower()) for text in negative_reviews]
texts[:2]


[['hour',
  'takeoff',
  'received',
  'mail',
  'stating',
  'cryptic',
  'message',
  'disruption',
  'expected',
  'limit',
  'many',
  'plane',
  'leave',
  'time',
  'capacity',
  'heathrow',
  'airport',
  'really',
  'hit',
  'british',
  'airway',
  'surprise',
  'h',
  'departure',
  'anyhow',
  'took',
  'one',
  'hour',
  'delay',
  'forced',
  'check',
  'hand',
  'luggage',
  'travel',
  'hand',
  'luggage',
  'avoid',
  'waiting',
  'ultra',
  'slow',
  'processing',
  'checked',
  'luggage',
  'overall',
  'h',
  'later',
  'home',
  'planed',
  'really',
  'reason',
  'due',
  'incompetent',
  'people',
  'service',
  'level',
  'far',
  'worse',
  'ryanair',
  'triple',
  'price',
  'really',
  'never',
  'thanks',
  'nothing'],
 ['recently',
  'delay',
  'british',
  'airway',
  'bru',
  'lhr',
  'due',
  'staff',
  'shortage',
  'announced',
  'hour',
  'holding',
  'delay',
  'would',
  'board',
  'u',
  'immediately',
  'hope',
  'clearing',
  'gate',
  'leaving',


### Create Dictionary & Corpus

In [5]:
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

print("Total unique words:", len(dictionary))


Total unique words: 7674


### Training LDA Topic Model

In [6]:
lda_model = models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=5,
    passes=10,
    random_state=42
)


In [7]:
topics = lda_model.print_topics(num_words=6)

for topic in topics:
    print(topic)


(0, '0.013*"flight" + 0.013*"ba" + 0.011*"food" + 0.009*"service" + 0.009*"airway" + 0.009*"british"')
(1, '0.025*"seat" + 0.016*"flight" + 0.014*"ba" + 0.012*"class" + 0.010*"service" + 0.010*"business"')
(2, '0.023*"flight" + 0.018*"ba" + 0.015*"hour" + 0.008*"luggage" + 0.008*"bag" + 0.008*"time"')
(3, '0.039*"flight" + 0.021*"ba" + 0.009*"seat" + 0.008*"london" + 0.008*"service" + 0.008*"hour"')
(4, '0.031*"flight" + 0.015*"british" + 0.014*"airway" + 0.013*"ba" + 0.013*"customer" + 0.012*"service"')


### Recommendation Engine

In [8]:
recommendations = {
    0: "Improve flight scheduling and delay communication",
    1: "Enhance seat comfort and legroom in economy class",
    2: "Improve in-flight food quality and options",
    3: "Provide better customer service and staff training",
    4: "Improve baggage handling and tracking systems"
}

for topic_id, rec in recommendations.items():
    print(f"Topic {topic_id}: {rec}")


Topic 0: Improve flight scheduling and delay communication
Topic 1: Enhance seat comfort and legroom in economy class
Topic 2: Improve in-flight food quality and options
Topic 3: Provide better customer service and staff training
Topic 4: Improve baggage handling and tracking systems
