In [None]:
%%time
%%capture
!pip install accelerate bertopic emoji
!pip install -U bitsandbytes

In [None]:
import glob
import os
import re

import emoji
import pandas as pd
import torch
import transformers
from bertopic import BERTopic
from bertopic.representation import (
    KeyBERTInspired,
    MaximalMarginalRelevance,
    PartOfSpeech,
    TextGeneration,
)
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
from umap import UMAP

In [None]:
folder_path = r"/path/to/google play reviews"

# glob returns matches in arbitrary (filesystem-dependent) order. Sorting keeps
# the row order of the merged frame, and everything derived from it, stable
# from one run to the next.
csv_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
print(f"Found {len(csv_files)} CSV files")

df_list = []
for f in tqdm(csv_files):
    try:
        if os.path.getsize(f) > 0:  # skip empty files
            df_list.append(pd.read_csv(f))
        else:
            print(f"Skipping empty file: {f}")
    except Exception as e:
        # Best effort: one unreadable file should not abort the whole load.
        print(f"Skipping {f} due to error: {e}")

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when nothing could be loaded.
if not df_list:
    raise ValueError(f"No readable, non-empty CSV files found in {folder_path!r}")

merged_df = pd.concat(df_list, ignore_index=True)
print(merged_df.shape)

# Keep only the rows that actually have review text, then build `documents`
# from that same filtered frame so that row i of merged_df_clean always
# describes documents[i].
valid_indices = merged_df["content"].dropna().index
merged_df_clean = merged_df.loc[valid_indices].reset_index(drop=True)
documents = merged_df_clean["content"].tolist()
print(f"Number of documents: {len(documents):,}")

print(f"Clean dataframe shape: {merged_df_clean.shape}")
print(f"Documents length: {len(documents):,}")

# Invariant relied on later when topic assignments are joined back to reviews.
assert len(merged_df_clean) == len(documents), (
    "Length mismatch between dataframe and documents!"
)

In [None]:
def clean_review(text):
    """Clean app review text for topic modeling.

    Steps, in order: emojis are converted to text, the string is lowercased,
    URLs and e-mail addresses are removed, and runs of whitespace are
    collapsed to single spaces (with leading/trailing whitespace trimmed).

    Args:
        text: Raw review text. Any value that is not a ``str`` is treated as
            missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # 1. Demojize - convert emojis to text, space-delimited so the emoji text
    #    does not fuse with neighbouring words
    text = emoji.demojize(text, delimiters=(" ", " "))

    # 2. Lowercase
    text = text.lower()

    # 3. Remove URLs. The scheme must be followed by "://" and "www" must start
    #    a token and be followed by a dot, so ordinary words that merely
    #    contain "www" or start with "http" (e.g. "awwww") are left intact.
    text = re.sub(r"https?://\S+|\bwww\.\S+", "", text)

    # 4. Remove email addresses
    text = re.sub(r"\S+@\S+", "", text)

    # 5. Collapse whitespace left behind by the removals above
    text = re.sub(r"\s+", " ", text).strip()

    return text


# Run every review through clean_review, with a tqdm progress bar
cleaned_documents = list(
    map(clean_review, tqdm(documents, desc="Cleaning reviews"))
)

print(f"\nOriginal documents: {len(documents):,}")
print(f"Cleaned documents: {len(cleaned_documents):,}")

# Print up to the first three reviews before and after cleaning, each
# truncated to 150 characters
print("\n=== Sample Cleaned Reviews ===")
for raw_text, cleaned_text in zip(documents[:3], cleaned_documents[:3]):
    print(f"\nOriginal: {raw_text[:150]}")
    print(f"Cleaned:  {cleaned_text[:150]}")

In [None]:
# cleaned_documents_small = cleaned_documents[:100_000]
# print(f"{len(cleaned_documents_small):,}")

In [None]:
%%time
# Pre-calculate embeddings
embedding_model = SentenceTransformer("all-MiniLM-L12-v2")
embeddings = embedding_model.encode(cleaned_documents, show_progress_bar=True)

In [None]:
# Dimensionality reduction (seeded via random_state)
umap_settings = {
    "n_neighbors": 15,
    "n_components": 5,
    "min_dist": 0.0,
    "metric": "cosine",
    "random_state": 42,
}
umap_model = UMAP(**umap_settings)

# Clustering; prediction_data=True is kept from the original configuration
hdbscan_settings = {
    "min_cluster_size": 150,
    "metric": "euclidean",
    "cluster_selection_method": "eom",
    "prediction_data": True,
}
hdbscan_model = HDBSCAN(**hdbscan_settings)

# Tokenisation for the topic keywords: English stop words dropped, terms must
# appear in at least two documents, unigrams and bigrams
vectorizer_settings = {
    "stop_words": "english",
    "min_df": 2,
    "ngram_range": (1, 2),
}
vectorizer_model = CountVectorizer(**vectorizer_settings)

In [None]:
%%time
model_id = "Qwen/Qwen2.5-7B-Instruct-1M"
device = f"cuda:{torch.cuda.current_device()}" if torch.cuda.is_available() else "cpu"
print(device)
print(torch.cuda.is_available())

bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map="auto",
)
model.eval()

In [None]:
# Generation settings applied to every call of the pipeline
generation_settings = {
    "temperature": 0.1,
    "max_new_tokens": 500,
    "repetition_penalty": 1.1,
}

# Text-generation pipeline around the already-loaded model and tokenizer
generator = transformers.pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    **generation_settings,
)

In [None]:
# Instruction template for the LLM-based labeller. [DOCUMENTS] and [KEYWORDS]
# are placeholders passed to TextGeneration as-is (presumably substituted per
# topic by BERTopic - confirm against its documentation).
prompt = """
I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
topic: <topic label>
"""

# One model per representation aspect
keybert_model = KeyBERTInspired()  # KeyBERT
pos_model = PartOfSpeech("en_core_web_sm")  # Part-of-Speech
mmr_model = MaximalMarginalRelevance(diversity=0.3)  # MMR
qwen_model = TextGeneration(generator, prompt=prompt)  # LLM

# Aspect name -> representation model (same insertion order as before)
representation_model = dict(
    KeyBERT=keybert_model,
    Qwen=qwen_model,
    MMR=mmr_model,
    POS=pos_model,
)

# Wire the pre-built components and hyperparameters into BERTopic
bertopic_settings = {
    # Pipeline models
    "embedding_model": embedding_model,
    "umap_model": umap_model,
    "hdbscan_model": hdbscan_model,
    "vectorizer_model": vectorizer_model,
    "representation_model": representation_model,
    # Hyperparameters
    "top_n_words": 10,
    "verbose": True,
}
topic_model = BERTopic(**bertopic_settings)

In [None]:
%%time
# Train the topic model on the cleaned reviews, handing it the embeddings that
# were computed earlier so the documents are not encoded a second time.
# `topics` and `probs` are the two values returned by fit_transform.
topics, probs = topic_model.fit_transform(cleaned_documents, embeddings)

In [None]:
# Preview the first rows of the fitted model's topic summary table
topic_model.get_topic_info().head()

In [None]:
# Dimensions of the topic summary table
topic_model.get_topic_info().shape

In [None]:
# Tally how many documents were assigned to each topic id (ascending by id)
topic_series = pd.Series(topics)
topic_counts = topic_series.value_counts().sort_index()

# Share of all documents per topic, in percent
topic_percentages = topic_counts.div(len(topics)).mul(100)

# Assemble the summary table: one row per topic id
topic_distribution = pd.DataFrame(
    {
        "Topic": topic_counts.index,
        "Count": topic_counts.to_numpy(),
        "Percentage (%)": topic_percentages.to_numpy(),
    }
)
topic_distribution

In [None]:
%%time
# Ask BERTopic for a revised list of topic assignments using its "embeddings"
# outlier-reduction strategy, reusing the precomputed document embeddings.
# The original `topics` list is left untouched; the result is kept separately.
new_topics = topic_model.reduce_outliers(
    cleaned_documents, topics, strategy="embeddings", embeddings=embeddings
)
# Displayed as a quick check of how many assignments came back
len(new_topics)

In [None]:
%%time
topic_model.update_topics(cleaned_documents, topics=new_topics)

In [None]:
# Topic assignments currently stored on the model
updated_topics = topic_model.topics_

# Documents per topic id, ascending by id
topic_series = pd.Series(updated_topics)
topic_counts = topic_series.value_counts().sort_index()

# Share of all documents per topic, in percent
topic_percentages = topic_counts.div(len(updated_topics)).mul(100)

# Assemble the summary table: one row per topic id
updated_topic_distribution = pd.DataFrame(
    {
        "Topic": topic_counts.index,
        "Count": topic_counts.to_numpy(),
        "Percentage (%)": topic_percentages.to_numpy(),
    }
)

updated_topic_distribution

In [None]:
# Preview the topic summary table again, now that the topics have been updated
topic_model.get_topic_info().head()

In [None]:
# Snapshot the model's topic summary table
topic_info = topic_model.get_topic_info()
# Export it as CSV, omitting the index column
topic_info.to_csv("topics_all_mini_qwen_instruct.csv", index=False)

In [None]:
def _first_nonempty_line(text):
    """Return the first non-blank line of ``text``, stripped, or "" if none."""
    for line in text.splitlines():
        stripped = line.strip()
        if stripped:
            return stripped
    return ""


qwen_topics = topic_model.get_topics(full=True)["Qwen"]
print(f"Number of topics with Qwen labels: {len(qwen_topics)}")
print(f"Topic IDs: {sorted(qwen_topics.keys())}")
print(f"Total topics in model: {len(topic_model.get_topic_info())}")

# Create labels: the text of the first entry for each topic, reduced to its
# first non-blank line. Generated text can begin with whitespace or a newline,
# in which case taking the literal first line would yield an empty label.
qwen_label_dict = {
    topic_id: _first_nonempty_line(label_list[0][0]) if label_list else ""
    for topic_id, label_list in qwen_topics.items()
}

# topic_model.set_topic_labels(qwen_label_dict)

In [None]:
# Render one "Topic <id>: <label>" line per topic, ordered by topic id
label_lines = [
    f"Topic {topic_id}: {qwen_label_dict[topic_id]}\n"
    for topic_id in sorted(qwen_label_dict)
]

# Write all lines to a UTF-8 text file in one go
with open("qwen_topic_labels.txt", "w", encoding="utf-8") as label_file:
    label_file.writelines(label_lines)

print("Labels saved to qwen_topic_labels.txt")

In [None]:
# Create a clean dataframe with only the essentials, one row per review.
#
# `probs` was returned by the initial fit, so it refers to the assignment in
# `topics`, not to the outlier-reduced `new_topics`. The original assignment is
# therefore exported alongside it: a row whose original_topic_id differs from
# topic_id was reassigned, and its topic_probability does not describe the
# final topic.
results_df = pd.DataFrame(
    {
        "reviewId": merged_df_clean["reviewId"],  # adjust column name if different
        "review_text": merged_df_clean["content"],
        "topic_id": new_topics,  # the topic assignment after outlier reduction
        "original_topic_id": topics,  # the assignment `probs` refers to
        "topic_probability": probs,  # confidence score of the original assignment
        "qwen_label": [qwen_label_dict.get(topic, "Unknown") for topic in new_topics],
    }
)

# Display first few rows
# print(results_df.head(10))
print(f"\nShape: {results_df.shape}")

In [None]:
# Preview the first rows of the per-review results table
results_df.head()

In [None]:
# Write the per-review results to disk as UTF-8 CSV, omitting the index column
results_df.to_csv("reviews_with_topics.csv", index=False, encoding="utf-8")
# print("\nSaved to reviews_with_topics.csv")