In [6]:
import pandas as pd
from gensim import corpora, models
import os
from loguru import logger

### Configure Loguru logger

In [7]:
logger.add(
    "../logs/topic_modeling.log",
    rotation="5 MB",
    retention="10 days",
    level="INFO",
    enqueue=True,
    backtrace=True,
    diagnose=True
)


1

### file paths

In [8]:
processed_data_path = "../data/processed/british_airways_processed_reviews.csv"
topic_model_path = "../data/analysis/british_airways_topic_model.model"
lda_visualization_path = "../data/analysis/lda_visualization.html"
topics_plot_dir = "../data/analysis/topic_plots/"

### Ensure analysis directories exist

In [9]:

os.makedirs("../data/analysis/", exist_ok=True)
os.makedirs(topics_plot_dir, exist_ok=True)

In [11]:
df = pd.read_csv(processed_data_path)
logger.info(f"Loaded processed data with {len(df)} reviews.")

[32m2024-11-01 23:11:25.579[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mLoaded processed data with 2000 reviews.[0m


### Reconstruct tokens list from 'tokens_str'

In [12]:
df['tokens'] = df['tokens_str'].apply(lambda x: x.split())

### Create a dictionary and corpus

In [13]:
logger.info("Creating dictionary and corpus for LDA.")
dictionary = corpora.Dictionary(df['tokens'])
corpus = [dictionary.doc2bow(text) for text in df['tokens']]

[32m2024-11-01 23:12:08.839[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mCreating dictionary and corpus for LDA.[0m


### Build LDA model

In [14]:
num_topics = 5
logger.info(f"Building LDA model with {num_topics} topics.")
lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10)        

[32m2024-11-01 23:12:39.946[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mBuilding LDA model with 5 topics.[0m


### Save the LDA model

In [15]:
lda_model.save(topic_model_path)
logger.info(f"LDA model saved to {topic_model_path}")

[32m2024-11-01 23:13:43.022[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mLDA model saved to ../data/analysis/british_airways_topic_model.model[0m


### Display the topics

In [17]:
for idx, topic in lda_model.print_topics(-1):
    # logger.info(f"Topic {idx}: {topic}")
    display(lda_model.show_topic(idx))

[('flight', 0.025005186),
 ('ba', 0.0166024),
 ('seat', 0.0162356),
 ('service', 0.010721413),
 ('crew', 0.009008567),
 ('food', 0.009003725),
 ('class', 0.008843072),
 ('cabin', 0.008061493),
 ('good', 0.0077409046),
 ('business', 0.0073857973)]

[('flight', 0.028360238),
 ('ba', 0.016220031),
 ('hour', 0.010988692),
 ('customer', 0.010270344),
 ('service', 0.009836239),
 ('told', 0.009042205),
 ('day', 0.0088401),
 ('london', 0.008535932),
 ('u', 0.008103978),
 ('airline', 0.0077981334)]

[('flight', 0.032481782),
 ('ba', 0.020719457),
 ('british', 0.010968167),
 ('airway', 0.010749453),
 ('ticket', 0.008358165),
 ('airline', 0.008239848),
 ('customer', 0.007918527),
 ('service', 0.0074725347),
 ('london', 0.007205171),
 ('cancelled', 0.006874632)]

[('flight', 0.02192964),
 ('time', 0.009269192),
 ('british', 0.00827848),
 ('ba', 0.008220757),
 ('airway', 0.007953745),
 ('seat', 0.007949523),
 ('staff', 0.007754319),
 ('hour', 0.007492683),
 ('london', 0.007338516),
 ('crew', 0.006786641)]

[('seat', 0.012894462),
 ('flight', 0.010891676),
 ('food', 0.009450885),
 ('ba', 0.008108108),
 ('crew', 0.00786671),
 ('good', 0.007348561),
 ('london', 0.006552466),
 ('cabin', 0.0064574876),
 ('service', 0.00641648),
 ('aircraft', 0.006025679)]