In [0]:
%%capture
%pip install bertopic openai

In [0]:
# Location of the input intents table and the output table (catalog.schema.table).
CATALOG = "cindy_demo_catalog"
SCHEMA = "airline_bookings"
INTENTS_TABLE = "raw_intents_1000_function"
OUTPUT_TABLE = "labeled_reviews_output_1000_all_airlines"

# API token taken from the current notebook context; falls back to None via getOrElse.
TOKEN = (
    dbutils.notebook.entry_point.getDbutils()
    .notebook()
    .getContext()
    .apiToken()
    .getOrElse(None)
)
# Base URL of this workspace, read from the Spark configuration.
WORKSPACE_URL = f"https://{spark.conf.get('spark.databricks.workspaceUrl')}"

# Model identifier passed to the LLM-based topic label generator.
MODEL_ID = "databricks-meta-llama-3-1-70b-instruct"


## Load Reviews from UC

In [0]:
# Read the intents table (CATALOG.SCHEMA.INTENTS_TABLE) and pull it into a pandas DataFrame.
reviews_df = spark.table(f"{CATALOG}.{SCHEMA}.{INTENTS_TABLE}").toPandas()

In [0]:
# One "<intent>: <text_summary>" string per row of reviews_df, in row order.
all_reviews = [
    f"{intent}: {summary}"
    for intent, summary in zip(reviews_df["intent"], reviews_df["text_summary"])
]

In [0]:
# Display the formatted review strings together with their count.
all_reviews, len(all_reviews)

(['Check-in experience: The queue was not organised, resulting in a longer wait than others.',
  'Check-in agent knowledge: The agent didn’t seem to know much about our delayed flight, nor whether she should check our bags and to where.',
  'Flight cancellation/removal: We received a text to say we had been removed from the flight due to baggage handler strikes in Brussels.',
  'Baggage collection: We waited for ages, and no sign of our bags. Even the staff there had no idea where they were.',
  'Accommodation vouchers: We expected to be given accommodation vouchers, but the staff didn’t seem to have any idea what was going on.',
  'Communication from airline: There was absolutely no information from the airline and we all felt abandoned.',
  'Compensation for new flights and hotel: The airline refused to pay us compensation for new flights we had to book, nor did they pay for our hotel, or refund our original booked flights.',
  'Flight booking and airline selection: I booked a flight

## Compute Embeddings

In [0]:
from sentence_transformers import SentenceTransformer

# Pre-calculate embeddings
# The document embeddings are computed once here and handed to
# topic_model.fit_transform later, alongside the same list of review strings.
embedding_model = SentenceTransformer("BAAI/bge-base-en-v1.5")
embeddings = embedding_model.encode(all_reviews, show_progress_bar=True)

Batches:   0%|          | 0/136 [00:00<?, ?it/s]

Optionally save the embeddings so they don't have to be regenerated every time.

In [0]:
import numpy as np

# specify path to save embeddings
embedding_file_path = 'embeddings_reviews.npy'
# The path already ends in .npy, so np.save writes to exactly this file name.
np.save(embedding_file_path, embeddings)

In [0]:
# import numpy as np
# embedding_file_path = 'embeddings_reviews.npy'  # same file the save cell above writes
# embeddings = np.load(embedding_file_path)

## Zero-shot Topic Modeling

### Predefined list of topics

In [0]:
# Candidate topic names handed to the topic model as its zero-shot topic list.
predefined_topics = [
    "Check-in and Boarding",
    "Seating Comfort",
    "In-Flight Wi-Fi",
    "Cabin Cleanliness",
    "Food and Beverage",
    "Flight Attendants and Crew Services",
    "Baggage Handling",
    "Flight Disruptions and Delays",
    "Loyalty Program and benefits",
    "Pricing Transparency and Fees",
    "Safety Measures",
    "Ground Services Assistance",
    "Accessibility and special assistance",
]

In [0]:
# Number of predefined zero-shot topics.
len(predefined_topics)

13

### BERTopic Model

#### Set up Databricks llm endpoint for topic generation

In [0]:
import openai
# OpenAI client pointed at this workspace's /serving-endpoints/ path and
# authenticated with the notebook API token from the config cell.
client = openai.OpenAI(base_url=f"{WORKSPACE_URL}/serving-endpoints/", api_key=TOKEN)

#### Create the representation model


In [0]:
# Custom prompt for the LLM-based topic label generator.
# [DOCUMENTS] and [KEYWORDS] are the same placeholder tags that appear in
# BERTopic's default prompt; they must stay in the text verbatim.
# The answer format ("topic: <topic label>") mirrors the default prompt as well.
prompt = """
Generate a short topic label based on reviews and key words describing the reviews. 

Make sure the response only contains the topic label. The response should be in the following format:
'''
topic: <topic label>
'''

Here are some examples of topic labels: 
'''
topic: Seating Comfort
topic: Food and Beverage Quality
topic: Baggage Handling
'''

Here are the reviews:
[DOCUMENTS]

Here are the keywords that are relevant to the reviews. Use them as reference for the topic label, but keep in mind the key words are not always the most representative. [KEYWORDS]

Now read the reviews and keywords carefully, and respond with the topic label that best categorizes the reviews. Remember to respond in the correct format.
"""

In [0]:
# import tiktoken
# Tokenizer
# tokenizer= tiktoken.encoding_for_model("gpt-3.5-turbo")

In [0]:
from bertopic.representation import OpenAI
from bertopic import BERTopic
# LLM-backed representation model: uses the workspace client, the MODEL_ID
# endpoint and the custom prompt defined above.
# NOTE(review): nr_docs=5 presumably controls how many documents are inserted
# for [DOCUMENTS] per topic; confirm against the BERTopic documentation.
openai_generator = OpenAI(
    client,
    model=MODEL_ID,
    chat=True,
    nr_docs=5,
    prompt=prompt
)

In [0]:
# Show the generator's built-in default prompt, for comparison with the custom one.
print(openai_generator.default_prompt_)


I have a topic that contains the following documents: 
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short topic label in the following format:
topic: <topic label>



In [0]:
# Show the prompt stored on the generator (the custom prompt passed at construction).
print(openai_generator.prompt)


Generate a short topic label based on reviews and key words describing the reviews. 

Make sure the response only contain the topic label. The response should be in the following format:
'''
topic: <topic label>
'''

Here are some examples of topic labels: 
'''
topic: Seating Comfort
topic: Food and Beverage Quality
topic: Baggage Handling
'''

Here are the reviews:
[DOCUMENTS]

Here are the keywords that are relevant to the reviews. Use them as reference for the topic label, but keep in mind the key words are not always the most representative. [KEYWORDS]

Now read the reviews and keywards carefully, and respond with the topic label that best categories the reviews. Remember to respond in the correct format.



In [0]:
# Temp helper function 
def fixed_topic_labels_(self):
    """Build the mapping from topic ID to display label.

    Every topic first receives a generated label: its ID followed by the first
    four words of its representation, all joined with '_'. When the model is in
    zero-shot mode, topics that correspond to a zero-shot topic are relabelled
    with the matching entry of self.zeroshot_topic_list, looked up under the ID
    given by self.topic_mapper_.get_mappings().

    Returns:
        topic_labels: a dict mapping a topic ID (int) to its label (str)
    """
    labels = {}
    for topic_id, word_entries in self.topic_representations_.items():
        # Each entry's first element is the word; only the leading four are used.
        leading_words = [entry[0] for entry in word_entries[:4]]
        labels[topic_id] = f"{topic_id}_" + "_".join(leading_words)

    if not self._is_zeroshot():
        return labels

    # Zero-shot topics keep their predefined names instead of the generated ones.
    zeroshot_labels = {}
    for topic_id, zeroshot_idx in self._topic_id_to_zeroshot_topic_idx.items():
        mapped_id = self.topic_mapper_.get_mappings()[topic_id]
        zeroshot_labels[mapped_id] = self.zeroshot_topic_list[zeroshot_idx]
    labels.update(zeroshot_labels)
    return labels
# Patch the class so every BERTopic instance resolves `topic_labels_` through
# fixed_topic_labels_ (applies to models created after this cell runs as well).
BERTopic.topic_labels_ = property(fixed_topic_labels_)


%md
##### *UPDATED: `representations` uses fewer topic representations to speed things up*

source code for zeroshot classification: https://github.com/MaartenGr/BERTopic/blob/master/bertopic/representation/_zeroshot.py


In [0]:
## UPDATED: only two aspects are active; the zero-shot classification aspect is commented out.
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, ZeroShotClassification
# Chained representation: MaximalMarginalRelevance first, then the LLM label generator.
ai_representation = [MaximalMarginalRelevance(diversity=0.3), openai_generator]
# text_model_representation = [MaximalMarginalRelevance(diversity=0.3),ZeroShotClassification(predefined_topics, model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli")] #, min_prob=0.8 


# The keys name the aspects; they appear as the AI_Generated and KeyBERT
# columns in the get_topic_info() output shown in the Results section.
representations = {
    "AI_Generated": ai_representation,
    # "Classification_zeroshot": text_model_representation,
    "KeyBERT": KeyBERTInspired()  
}


In [0]:
# import openai
# from bertopic.backend import OpenAIBackend
# embedding_model = OpenAIBackend(client, 'databricks-bge-large-en')
# embedding_model.embed('test')

In [0]:
# from umap import UMAP
# umap_model = UMAP(n_neighbors=100, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

In [0]:
## min_cluster_size=min_topic_size

# from hdbscan import HDBSCAN
# hdbscan_model = HDBSCAN(min_cluster_size=100, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

## Put together pipeline and train
Source Code : https://github.com/MaartenGr/BERTopic/blob/master/bertopic/_bertopic.py#L3802


##### *Set `min_topic_size` higher (the minimum number of samples needed for each cluster) for large datasets*

In [0]:
# The zero-shot candidates are the predefined topic names defined earlier.
zeroshot_topic_list = predefined_topics

topic_model = BERTopic(
    embedding_model="BAAI/bge-base-en-v1.5",
    min_topic_size=50,
    zeroshot_topic_list=zeroshot_topic_list,
    zeroshot_min_similarity=0.6,
    representation_model=representations,
    verbose=True,
    # umap_model=umap_model, hdbscan_model=hdbscan_model,
    # nr_topics=40, # reduces topic by clustering after topic generation
)

# Fit on the review strings, reusing the embeddings computed earlier.
topics, probs = topic_model.fit_transform(all_reviews, embeddings=embeddings)

2024-10-21 22:46:35,035 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-10-21 22:46:58,592 - BERTopic - Dimensionality - Completed ✓
2024-10-21 22:46:58,593 - BERTopic - Zeroshot Step 1 - Finding documents that could be assigned to either one of the zero-shot topics
2024-10-21 22:46:58,642 - BERTopic - Zeroshot Step 1 - Completed ✓
2024-10-21 22:47:14,937 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-10-21 22:47:15,005 - BERTopic - Cluster - Completed ✓
2024-10-21 22:47:15,005 - BERTopic - Zeroshot Step 2 - Combining topics from zero-shot topic modeling with topics from clustering...
2024-10-21 22:47:15,015 - BERTopic - Zeroshot Step 2 - Completed ✓
2024-10-21 22:47:15,016 - BERTopic - Representation - Extracting topics from clusters using representation models.
  0%|          | 0/16 [00:00<?, ?it/s]  6%|▋         | 1/16 [00:02<00:38,  2.54s/it] 12%|█▎        | 2/16 [00:05<00:36,  2.63s/it] 19%|█▉        | 3/16 [00:07<00:30,  

[Trace(request_id=tr-9b74742369b14c81a223ccf40bca4e77), Trace(request_id=tr-3d0b24e7a7a343a9a04ed66b7d1c1106), Trace(request_id=tr-cafa13dab8f84dcc9d9e33f6bc996964), Trace(request_id=tr-e282d53bb75e4748b1300b6648a2b838), Trace(request_id=tr-751e8bb42a63411f9857958ac45faaf3), Trace(request_id=tr-cd382528de8d4c4abb6f29eebc2902d8), Trace(request_id=tr-849d9e0edea44681ae6e99eaa91dc0ad), Trace(request_id=tr-ec43713054ab4a839339dcae8d6a0bdc), Trace(request_id=tr-e9c2e4bc9360427ab3a401752a21431f), Trace(request_id=tr-f7b48b6300644a7888cd62c0bcd4f456)]

## Results

- Name: predefined topics + top keywords for topic clusters 
- Default Representation: keywords based on c-TF-IDF (https://maartengr.github.io/BERTopic/algorithm/algorithm.html#5-topic-representation)
- AI_Generated: generates a label based on keywords and prompt 
- KeyBERT: keywords extracted with `KeyBERTInspired()` for comparison

In [0]:
# Convert the topic overview table to a Spark DataFrame and render it in the notebook.
spark.createDataFrame(topic_model.get_topic_info()).display()

Topic,Count,Name,Representation,AI_Generated,KeyBERT,Representative_Docs
-1,58,-1_entertainment_inflight_ife_system,"List(entertainment, inflight, ife, system, movies, of, limited, the, in, flight)",List(Inflight Entertainment),"List(entertainment, inflight, advertisements, screens, good, movie, flight, no, onboard, lacklustre)","List(Entertainment quality: entertainment good, Inflight Entertainment: No inflight Entertainment., Inflight entertainment: Inflight entertainment was good.)"
0,315,Check-in and Boarding,"List(check, in, boarding, experience, at, to, and, the, was, online)",List(Check-in Experience),"List(checkin, checked, check, booked, booking, experience, arrived, queue, hotel, paid)","List(Check-in experience: No delays at Check-in, Check-in experience: The check in staff didn, Check-in experience: The check-in was smooth.)"
1,241,Seating Comfort,"List(comfort, seat, seats, seating, comfortable, the, legroom, leg, and, was)",List(Seat Comfort),"List(seats, seating, comfort, seat, comfortable, sitting, comfy, recline, sit, amenities)","List(Seat comfort: The seats were comfortable, Seat comfort: The seats were comfortable, Seat comfort: Comfortable seats)"
2,163,In-Flight Wi-Fi,"List(entertainment, flight, in, amenities, wifi, inflight, the, ife, and, no)",List(In-flight Entertainment),"List(inflight, flight, entertainment, flights, takeoff, aircraft, onboard, screens, lounge, screen)","List(In-flight entertainment: Entertainment was limited., In-flight entertainment: It, In-flight entertainment: No entertainment)"
3,114,Cabin Cleanliness,"List(cleanliness, cabin, clean, aircraft, plane, condition, the, and, was, dirty)",List(Aircraft Cleanliness),"List(cleanliness, cleaned, clean, airplane, aircraft, tidy, cabin, flight, planes, plane)","List(Aircraft cleanliness: The Aircraft was clean, Cabin cleanliness: The aircraft was clean, Cabin cleanliness: The cabin was clean)"
4,150,Food and Beverage,"List(food, quality, and, drinks, beverage, drink, good, served, was, options)",List(In-Flight Meal Quality),"List(food, quality, good, meal, meals, catering, rice, portions, chicken, snacks)","List(Food quality: good food, Food quality: Food was good., Food quality: food)"
5,349,Flight Attendants and Crew Services,"List(crew, service, cabin, friendly, staff, and, flight, attendant, were, attendants)",List(Cabin Crew Friendliness),"List(cabin, passengers, attendants, passenger, attendant, crews, crew, onboard, friendly, service)","List(Cabin crew service: the cabin crew were friendly and helpful, Cabin crew service: The cabin crew were friendly and helpful., Cabin crew service: Crew were friendly.)"
6,139,Baggage Handling,"List(baggage, luggage, handling, bag, my, to, lost, the, policy, and)",List(Baggage Handling),"List(baggage, luggage, suitcase, bags, bag, handling, passengers, missing, airline, airport)","List(Baggage handling: They lost our luggage and nobody is looking for it!, Baggage handling: no luggage problem, Baggage handling: They lost my bag)"
7,910,Flight Disruptions and Delays,"List(flight, the, delay, to, and, was, delayed, cancellation, on, hours)",List(Flight Punctuality),"List(delays, delayed, flight, delay, flights, late, punctuality, passengers, missed, plane)","List(Flight delay: Flight was delayed for two hours, Flight punctuality: The flight was on time., Flight delay: The flight was delayed by one hour)"
8,16,Loyalty Program and benefits,"List(loyalty, program, flyer, frequent, benefits, customer, plus, again, repeat, promotion)",List(Frequent Flyer Program),"List(loyalty, promotion, af, program, airline, customers, loyal, discounts, flyer, customer)","List(Loyalty program experience: Never again will I fly Iberia, signed a former Iberian Plus member, Frequent flyer program: Minor drawbacks : frequent flyer program does not seem on par with AF one ;, Frequent flyer program: Sky miles, their frequent flyer program is very popular and has many other benefits like discounts in hotels, than award tickets.)"


#### Option to further reduce outliers with clustering

In [0]:
# Reduce outliers
# Produces a new list of topic assignments for the same documents; the model
# itself is only updated by the update_topics call in the next cell.
new_topics = topic_model.reduce_outliers(all_reviews, topics)

  0%|          | 0/1 [00:00<?, ?it/s]100%|██████████| 1/1 [00:00<00:00, 166.79it/s]


In [0]:
# Update the model
# NOTE(review): no representation_model is passed here; confirm whether the
# AI_Generated / KeyBERT aspects are meant to be recomputed for the new assignments.
topic_model.update_topics(all_reviews, topics=new_topics)



In [0]:
# Updated topics
# Same overview table as before, rendered again after the outlier reassignment.
spark.createDataFrame(topic_model.get_topic_info()).display()

Topic,Count,Name,Representation,AI_Generated,KeyBERT,Representative_Docs
0,317,Check-in and Boarding,"List(check, in, boarding, experience, at, to, the, and, was, online)",List(Check-in Experience),"List(checkin, checked, check, booked, booking, experience, arrived, queue, hotel, paid)","List(Check-in experience: No delays at Check-in, Check-in experience: The check in staff didn, Check-in experience: The check-in was smooth.)"
1,241,Seating Comfort,"List(comfort, seat, seats, seating, comfortable, the, legroom, leg, and, was)",List(Seat Comfort),"List(seats, seating, comfort, seat, comfortable, sitting, comfy, recline, sit, amenities)","List(Seat comfort: The seats were comfortable, Seat comfort: The seats were comfortable, Seat comfort: Comfortable seats)"
2,205,In-Flight Wi-Fi,"List(entertainment, flight, in, inflight, ife, amenities, wifi, system, the, movies)",List(In-flight Entertainment),"List(inflight, flight, entertainment, flights, takeoff, aircraft, onboard, screens, lounge, screen)","List(In-flight entertainment: Entertainment was limited., In-flight entertainment: It, In-flight entertainment: No entertainment)"
3,114,Cabin Cleanliness,"List(cleanliness, cabin, clean, aircraft, plane, condition, the, and, was, dirty)",List(Aircraft Cleanliness),"List(cleanliness, cleaned, clean, airplane, aircraft, tidy, cabin, flight, planes, plane)","List(Aircraft cleanliness: The Aircraft was clean, Cabin cleanliness: The aircraft was clean, Cabin cleanliness: The cabin was clean)"
4,150,Food and Beverage,"List(food, quality, and, drinks, beverage, drink, good, was, served, options)",List(In-Flight Meal Quality),"List(food, quality, good, meal, meals, catering, rice, portions, chicken, snacks)","List(Food quality: good food, Food quality: Food was good., Food quality: food)"
5,350,Flight Attendants and Crew Services,"List(crew, service, cabin, staff, friendly, and, flight, were, attendant, attendants)",List(Cabin Crew Friendliness),"List(cabin, passengers, attendants, passenger, attendant, crews, crew, onboard, friendly, service)","List(Cabin crew service: the cabin crew were friendly and helpful, Cabin crew service: The cabin crew were friendly and helpful., Cabin crew service: Crew were friendly.)"
6,140,Baggage Handling,"List(baggage, luggage, handling, bag, my, to, lost, the, and, policy)",List(Baggage Handling),"List(baggage, luggage, suitcase, bags, bag, handling, passengers, missing, airline, airport)","List(Baggage handling: They lost our luggage and nobody is looking for it!, Baggage handling: no luggage problem, Baggage handling: They lost my bag)"
7,912,Flight Disruptions and Delays,"List(flight, the, to, delay, and, was, delayed, cancellation, on, time)",List(Flight Punctuality),"List(delays, delayed, flight, delay, flights, late, punctuality, passengers, missed, plane)","List(Flight delay: Flight was delayed for two hours, Flight punctuality: The flight was on time., Flight delay: The flight was delayed by one hour)"
8,16,Loyalty Program and benefits,"List(loyalty, program, flyer, frequent, benefits, customer, plus, again, repeat, promotion)",List(Frequent Flyer Program),"List(loyalty, promotion, af, program, airline, customers, loyal, discounts, flyer, customer)","List(Loyalty program experience: Never again will I fly Iberia, signed a former Iberian Plus member, Frequent flyer program: Minor drawbacks : frequent flyer program does not seem on par with AF one ;, Frequent flyer program: Sky miles, their frequent flyer program is very popular and has many other benefits like discounts in hotels, than award tickets.)"
9,58,Pricing Transparency and Fees,"List(pricing, for, ticket, price, hidden, charges, fees, value, prices, and)",List(Additional Fees and Charges),"List(pricing, fees, prices, fee, charges, price, charged, costs, cost, paying)","List(Pricing transparency: Deceptive pricing, extra fees for checked bags not clearly indicated, Pricing: Great price, Pricing: high price for the service provided)"


Probabilities: 1) zero-shot topics: cosine similarity; 2) remaining topics: clustering 'confidence' (membership probability)

In [0]:
import pandas as pd

# Build the topic overview once and index it by topic id. Calling
# get_topic_info(topic_id) for every document rebuilt the same table
# twice per review, which is needlessly slow on thousands of documents.
topic_info = topic_model.get_topic_info().set_index("Topic")

# Per-document topic name, in the same order as new_topics.
topic_names = topic_info.loc[new_topics, "Name"].tolist()

# Per-document LLM label: first entry of the AI_Generated list, with the
# quote characters and newlines around the model's answer trimmed off.
llm_topic_names = [
    labels[0].strip("'''").strip('\n')
    for labels in topic_info.loc[new_topics, "AI_Generated"]
]

# NOTE(review): `probs` comes from fit_transform, while `topic_id` uses the
# assignments after outlier reduction; confirm this pairing is intended.
results_df = pd.DataFrame(
    data={
        "topic_id": new_topics,
        "probability": probs,
        "topic_name": topic_names,
        "llm_topic_name": llm_topic_names,
        "document": all_reviews,
    }
)
results_df

Unnamed: 0,topic_id,probability,topic_name,llm_topic_name,document
0,0,0.788810,Check-in and Boarding,Check-in Experience,Check-in experience: The queue was not organis...
1,7,0.741349,Flight Disruptions and Delays,Flight Punctuality,Check-in agent knowledge: The agent didn’t see...
2,7,0.752772,Flight Disruptions and Delays,Flight Punctuality,Flight cancellation/removal: We received a tex...
3,6,0.760215,Baggage Handling,Baggage Handling,"Baggage collection: We waited for ages, and no..."
4,14,0.685448,14_the_to_customer_experience,Customer Service Experience,Accommodation vouchers: We expected to be give...
...,...,...,...,...,...
4328,14,0.745417,14_the_to_customer_experience,Customer Service Experience,Previous negative experience: Last year my lug...
4329,14,0.711061,14_the_to_customer_experience,Customer Service Experience,Warning to others: I can only warn anybody not...
4330,14,0.704726,14_the_to_customer_experience,Customer Service Experience,Promotion and route: I was attracted to the pr...
4331,1,0.701608,Seating Comfort,Seat Comfort,Business class seats: Business class seats of ...


In [0]:
# Save results table to Unity Catalog
# Fixed: the schema placeholder was misspelled as `SCHEma`, which raised a
# NameError before anything was written.
# NOTE(review): no write mode is set, so re-running presumably fails once the
# table exists; choose append/overwrite explicitly if re-runs are expected.
spark.createDataFrame(results_df).write.format("delta").saveAsTable(f"{CATALOG}.{SCHEMA}.{OUTPUT_TABLE}")

## Visualize topics

In [0]:
# Use llm labels for visualization
# Map each topic id to the first entry of its AI_Generated aspect, trimming the
# quote characters and newlines that may surround the model's answer.
llm_topic_labels = {
    topic_id: aspect_entries[0][0].strip("'''").strip('\n')
    for topic_id, aspect_entries in topic_model.topic_aspects_['AI_Generated'].items()
}
# Topic -1 gets a fixed, readable name instead of an LLM label.
llm_topic_labels[-1] = "Outlier Topic"
topic_model.set_topic_labels(llm_topic_labels)
# llm_topic_labels

In [0]:
# Topic overview plot, using the custom labels registered via set_topic_labels.
topic_model.visualize_topics(custom_labels=True)

In [0]:
# Number of topics to include in the bar chart.
TOPIC_NUM = 10
# Bar charts for the top TOPIC_NUM topics, titled with the custom labels.
topic_model.visualize_barchart(top_n_topics=TOPIC_NUM, height=200,custom_labels=True)

In [0]:
# Visualize hierarchy with custom labels
topic_model.visualize_hierarchy(custom_labels=True)

In [0]:
# Visualize hierarchy with the default topic labels (custom labels are disabled
# here; compare with the previous cell, which passes custom_labels=True)
topic_model.visualize_hierarchy()#custom_labels=True)

In [0]:
# Render the topic heatmap (default labels).
topic_model.visualize_heatmap()

In [0]:
# Approximate the topic distribution for a single example review; the second
# return value is not needed here and is discarded.
topic_distr, _ = topic_model.approximate_distribution(["Cabin crew service: Felt like a nuisance and was deliberately ignored by male cabin crew."])
topic_distr

  0%|          | 0/1 [00:00<?, ?it/s]100%|██████████| 1/1 [00:00<00:00, 359.32it/s]


array([[0.        , 0.        , 0.        , 0.27162929, 0.        ,
        0.58466214, 0.        , 0.04827872, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.04636327, 0.04906658]])

In [0]:
# Visualize the topic-document distribution for a single document
# (topic_distr[0] is the row for the one example review passed above).
topic_model.visualize_distribution(topic_distr[0], custom_labels=True)

## Predict topics for new reviews

In [0]:
# topic_model.transform(new_reviews)

In [0]:
# Approximate topic distributions for every review, also requesting the
# token-level distributions (calculate_tokens=True).
topic_distr, topic_token_distr = topic_model.approximate_distribution(all_reviews, calculate_tokens=True)

# Visualize the token-level distributions
# Uses the second review (index 1) and its matching token-level distribution.
df = topic_model.visualize_approximate_distribution(all_reviews[1], topic_token_distr[1])
df

Unnamed: 0,Airport,experience,It,took,us,hours,just,to,go,from,T5,to.1,T3,and,clear,security,check,The,place,was,chaotic
Check-in and Boarding,0.13,0.261,0.261,0.261,0.24,0.109,0.109,0.109,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Seating Comfort,0.13,0.232,0.362,0.479,0.494,0.521,0.509,0.495,0.452,0.323,0.205,0.102,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
In-Flight Wi-Fi,0.0,0.101,0.101,0.101,0.101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.229,0.453,0.652,0.887,0.658,0.434,0.234,0.0
Baggage Handling,0.0,0.12,0.229,0.229,0.229,0.109,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14_security_be_aware_risk,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.158,0.448,0.731,0.984,0.826,0.536,0.252,0.0,0.0


In [0]:
# Predict the topic of one unseen review.
new_document_topic, topic_probabilities = topic_model.transform(
    ["the movie was great but was having some audio glitches along the way which put off the experience"]
)
# Only one document was passed in, so its prediction sits at index 0.
topic_id = new_document_topic[0]
# Look up the topic's entries and keep the first element (the word) of each pair.
topic_words = topic_model.get_topic(topic_id)
topic_string = ", ".join(word for word, _ in topic_words)
print(f"The new document is related to Topic {topic_id}: {topic_string}")
print(topic_probabilities)

## Save Model

In [0]:
# embedding_model = "BAAI/bge-base-en-v1.5"
# topic_model.save("my_model_dir", serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)
