<a href="https://colab.research.google.com/github/arjonnill07/AI-ML-experiment-Notebooks/blob/main/12th_August_Best_Bangla_Topic_modeling_gradio_app_v1_4_30pm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%writefile requirements.txt
gradio
pandas
scikit-learn
bertopic[visualization]
sentence_transformers
torch
transformers
accelerate
bitsandbytes
huggingface_hub
requests
# --- New dependencies for the scraper ---
GoogleNews
dateparser
bnlp_toolkit
bangla-stemmer
nltk
# --- New dependencies for the dashboard ---
wordcloud
matplotlib

Writing requirements.txt


In [None]:
# This pip install command is for your Colab environment only.
# It allows you to run and test the cells interactively.
!pip install gradio pandas scikit-learn "bertopic[visualization]" sentence_transformers torch transformers accelerate bitsandbytes huggingface_hub requests -q


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m43.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:

%%writefile app.py
# --- IMPORTS & GLOBAL SETUP ---
import gradio as gr
import pandas as pd
import numpy as np
import torch
import re
import sqlite3
import json
import logging
import requests
from io import StringIO

# Transformers and BERTopic components
from transformers import pipeline, BitsAndBytesConfig
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer

# Hugging Face and Colab integration (optional, for LLM access)
from huggingface_hub import login
# from google.colab import userdata # We will disable this for HF Spaces deployment

# Configure root logging once at import time: INFO and above, each record
# stamped with its time and severity.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level store that carries results from one UI callback to the next.
# Every slot starts out as None.
# NOTE: this is a single module global, so it is shared by everything running
# in the same Python process.
APP_STATE = dict.fromkeys(("df", "bertopic_model", "topics_df", "final_df"))

print("✅ app.py created. Initial imports written.")
print("✅ Dependencies installed in Colab environment.")

Writing app.py


In [None]:
%%writefile -a app.py

# --- TEXT PREPROCESSING & NORMALIZATION ---

# Bangla stop words, tailored for news and general text.
# Entries are compared against whitespace-split tokens, so each one must be a
# bare Bangla-script word: no surrounding spaces and no other script.
# (Two entries were fixed accordingly: a leading space was removed from
# 'থাকলেও', and a Devanagari spelling was replaced by Bangla 'দ্বারা'.)
# A few words appear more than once; that is harmless for membership tests.
BANGLA_STOP_WORDS = [
    'অতএব', 'অথচ', 'অথবা', 'অনুযায়ী', 'অনেক', 'অনেকে', 'অনেকেই', 'অন্তত', 'অন্য', 'অবধি', 'অবশ্য',
    'অভিপ্রায়', 'একে', 'একই', 'একেবারে', 'একটি', 'একবার', 'এখন', 'এখনও', 'এখানে', 'এখানেই', 'এটি',
    'এতটাই', 'এতদূর', 'এতটুকু', 'এক', 'এবং', 'এবার', 'এমন', 'এমনভাবে', 'এর', 'এরা', 'এঁরা', 'এঁদের',
    'এই', 'এইভাবে', 'ও', 'ওঁরা', 'ওঁর', 'ওঁদের', 'ওকে', 'ওখানে', 'ওদের', 'ওর', 'কাছ', 'কাছে', 'কাজ',
    'কারণ', 'কিছু', 'কিছুই', 'কিন্তু', 'কিভাবে', 'কেন', 'কোন', 'কোনও', 'কোনো', 'ক্ষেত্রে', 'খুব',
    'গুলি', 'গিয়ে', 'চায়', 'ছাড়া', 'জন্য', 'জানা', 'ঠিক', 'তিনি', 'তিন', 'তিনিও', 'তাকে', 'তাঁকে',
    'তার', 'তাঁর', 'তারা', 'তাঁরা', 'তাদের', 'তাঁদের', 'তাহলে', 'থাকলেও', 'থেকে', 'মধ্যেই', 'মধ্যে',
    'দ্বারা', 'নয়', 'না', 'নিজের', 'নিজে', 'নিয়ে', 'পারেন', 'পারা', 'পারে', 'পরে', 'পর্যন্ত', 'পুনরায়',
    'ফলে', 'বজায়', 'বা', 'বাদে', 'বার', 'বিশেষ', 'বিভিন্ন', 'ব্যবহার', 'ব্যাপারে', 'ভাবে', 'ভাবেই', 'মাধ্যমে',
    'মতো', 'মতোই', 'যখন', 'যদি', 'যদিও', 'যা', 'যাকে', 'যাওয়া', 'যায়', 'যে', 'যেখানে', 'যেতে', 'যেমন',
    'যেহেতু', 'রহিছে', 'শিক্ষা', 'শুধু', 'সঙ্গে', 'সব', 'সমস্ত', 'সম্প্রতি', 'সহ', 'সাধারণ', 'সামনে', 'হতে',
    'হতেই', 'হবে', 'হয়', 'হয়তো', 'হয়', 'হচ্ছে', 'হত', 'হলে', 'হলেও', 'হয়নি', 'হাজার', 'হোওয়া', 'আরও', 'আমরা',
    'আমার', 'আমি', 'আর', 'আগে', 'আগেই', 'আছে', 'আজ', 'তাকে', 'তাতে', 'তাদের', 'তাহার', 'তাহাতে', 'তাহারই',
    'তথা', 'তথাপি', 'সে', 'সেই', 'সেখান', 'সেখানে', 'থেকে', 'নাকি', 'নাগাদ', 'দু', 'দুটি', 'সুতরাং',
    'সম্পর্কে', 'সঙ্গেও', 'সর্বাধিক', 'সর্বদা', 'সহ', 'হৈতে', 'হইবে', 'হইয়া', 'হৈল', 'জানিয়েছেন', 'প্রতিবেদক'
]

# Ordered (regex pattern, replacement) rules. They are applied one after the
# other, so each rule sees the output of the rules before it.
_BANGLA_NORMALIZATION_RULES = (
    ('[\u09F7]', '\u09B0'),
    ('[\u09F2]', '\u09B2'),
    ('[\u09E4]', '\u098B'),
    ('[\u09E5]', '\u09E1'),
    ('[\u09FA]', '\u09B8\u09CD\u09AE'),
    ('[\u09FB]', '\u0995\u09CD\u09B7'),
    ('[\u0970]', '\u0966'),
    ('[\u09F3]', '\u09B0\u09C2'),
    ('[\u09F8]', '\u09A3'),
    ('[\u09F9]', '\u09B6'),
    ('[\u0984]', ''),
    ('[\u0980]', '\u0981'),
    # Danda spacing: a danda already surrounded by whitespace is left as is,
    # one squeezed between two non-space characters gets a space on each side.
    (r'(\s)।(\s)', r'\1।\2'),
    (r'(\S)।(\S)', r'\1 । \2'),
    # Collapse a doubled danda and turn an ASCII pipe into a danda.
    ('[\u0964][\u0964]', '\u0964'),
    ('[|]', '\u0964'),
    # Rewrite the single-code-point letters U+09DC/U+09DD/U+09DF as their
    # base letter followed by U+09BC.
    ('[\u09DC]', '\u09A1\u09BC'),
    ('[\u09DD]', '\u09A2\u09BC'),
    ('[\u09DF]', '\u09AF\u09BC'),
)


def normalize_bangla_manual(text):
    """Apply the character-level Bangla normalization rules to ``text``.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string, or ``""`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""
    normalized = text
    for pattern, substitute in _BANGLA_NORMALIZATION_RULES:
        normalized = re.sub(pattern, substitute, normalized)
    return normalized

# Lazily built cache of the stop words in normalized form (see below).
_NORMALIZED_STOP_WORDS = None


def _get_normalized_stop_words():
    """Return BANGLA_STOP_WORDS as a set, normalized the same way as the text."""
    global _NORMALIZED_STOP_WORDS
    if _NORMALIZED_STOP_WORDS is None:
        candidates = (normalize_bangla_manual(word).strip() for word in BANGLA_STOP_WORDS)
        _NORMALIZED_STOP_WORDS = frozenset(word for word in candidates if word)
    return _NORMALIZED_STOP_WORDS


def preprocess_bangla_text(text):
    """Clean and normalize a single Bangla text string for NLP tasks.

    Steps, in order: character normalization, removal of URLs and of tokens
    containing ``@``, removal of every character outside U+0980-U+09FF other
    than whitespace, stop-word removal, and whitespace collapsing.

    Args:
        text: The raw text.

    Returns:
        The cleaned text with tokens separated by single spaces, or ``""``
        when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""
    text = normalize_bangla_manual(text)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\S*@\S*\s?', '', text)
    text = re.sub(r'[^\u0980-\u09FF\s]', '', text)
    # The text above has been normalized (some letters are rewritten into a
    # different code-point sequence), so the stop words must be put into the
    # same form or they would never match. A set also makes each lookup O(1)
    # instead of scanning the whole list for every token.
    stop_words = _get_normalized_stop_words()
    # split()/join() already collapses whitespace runs and trims the ends.
    return " ".join(word for word in text.split() if word not in stop_words)

print("✅ Helper functions appended to app.py")

Appending to app.py


In [None]:
%%writefile -a app.py

# --- APP BRANDING & CONFIGURATION ---
# User-facing branding strings, kept together so the application's title,
# tagline, and footer can be changed in one place.
APP_TITLE = "Social Perception Analyzer"
APP_TAGLINE = "Prepared for the Policymakers of Bangladesh Nationalist Party (BNP)"
APP_FOOTER = "Developed by Arjon and AI Studio"


# --- LOCAL LLM INITIALIZATION ---
def initialize_local_llm(hf_token=None):
    """Create the local text-generation pipeline used for AI topic naming.

    On a CUDA machine the model is loaded with 4-bit quantization and
    ``device_map="auto"``; otherwise it is loaded with default settings on
    CPU. If the first attempt fails, one retry is made with
    ``trust_remote_code=True`` using the *same* device settings, so a CPU
    machine is never asked for GPU-only quantization on the retry.

    Args:
        hf_token: Optional Hugging Face access token forwarded to the pipeline.

    Returns:
        The ``transformers`` pipeline, or ``None`` when both attempts fail
        (a Gradio warning is shown in that case).
    """
    model_id = "hishab/titulm-llama-3.2-1b-v1.1"

    pipeline_kwargs = {"model": model_id, "token": hf_token}
    if torch.cuda.is_available():
        logging.info(f"Initializing quantized local LLM: {model_id} on GPU.")
        # 4-bit quantization to reduce memory usage significantly.
        pipeline_kwargs["model_kwargs"] = {
            "quantization_config": BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
            )
        }
        pipeline_kwargs["device_map"] = "auto"
    else:
        logging.warning("GPU not available. LLM will run on CPU and be very slow.")

    try:
        return pipeline("text-generation", **pipeline_kwargs)
    except Exception as e:
        logging.error(f"Failed to initialize local LLM: {e}")
        logging.info("Trying again with 'trust_remote_code=True'.")

    try:
        # trust_remote_code is a pipeline() argument of its own; it is passed
        # as such rather than inside model_kwargs.
        return pipeline("text-generation", trust_remote_code=True, **pipeline_kwargs)
    except Exception as e2:
        logging.error(f"Secondary attempt failed: {e2}")
        gr.Warning("Could not initialize the local LLM. AI features will be disabled.")
        return None

# --- DATA LOADING HELPER ---
def _gsheet_csv_export_url(gsheet_url):
    """Turn a Google Sheets link into its CSV export URL.

    Handles ``/edit?usp=sharing``, ``/edit#gid=N`` and similar variants by
    extracting the spreadsheet id and, when present, the ``gid`` of the tab.
    Links that do not contain a ``/spreadsheets/d/<id>`` part fall back to the
    plain ``/edit?usp=sharing`` substitution.
    """
    # "(?!e/)" skips "publish to web" links (/spreadsheets/d/e/<token>/...),
    # whose path segment after /d/ is not a spreadsheet id.
    sheet_match = re.search(r'/spreadsheets/d/(?!e/)([A-Za-z0-9_-]+)', gsheet_url)
    if sheet_match is None:
        return gsheet_url.replace('/edit?usp=sharing', '/export?format=csv&gid=0')
    gid_match = re.search(r'[#?&]gid=(\d+)', gsheet_url)
    gid = gid_match.group(1) if gid_match else '0'
    return f"https://docs.google.com/spreadsheets/d/{sheet_match.group(1)}/export?format=csv&gid={gid}"


def load_data(file_obj, gsheet_url):
    """Load a DataFrame from an uploaded CSV file or a Google Sheets URL.

    Args:
        file_obj: The uploaded file. Either an object with a ``name``
            attribute holding the path, or the path itself. Takes priority
            over ``gsheet_url`` when not ``None``.
        gsheet_url: Link to a publicly readable Google Sheet; used only when
            no file was uploaded.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        ValueError: If neither input is provided, or the sheet cannot be
            downloaded or parsed.
    """
    if file_obj is not None:
        # Depending on how the upload component is configured, the value is a
        # file wrapper exposing `.name` or a plain path string.
        file_path = getattr(file_obj, "name", file_obj)
        logging.info(f"Loading data from uploaded file: {file_path}")
        return pd.read_csv(file_path)

    if gsheet_url and gsheet_url.strip():
        logging.info("Loading data from Google Sheets URL.")
        try:
            csv_url = _gsheet_csv_export_url(gsheet_url.strip())
            # A timeout keeps the UI from hanging forever on a stalled request.
            response = requests.get(csv_url, timeout=30)
            response.raise_for_status()  # Raise an exception for bad status codes
            # Decode the raw bytes explicitly: `response.text` guesses the
            # charset from the headers and can garble UTF-8 Bangla text when
            # the server does not declare one.
            return pd.read_csv(StringIO(response.content.decode("utf-8-sig")))
        except Exception as e:
            raise ValueError(f"Failed to load from Google Sheets URL. Please ensure the link is correct and publicly accessible. Error: {e}") from e

    raise ValueError("Please upload a CSV file or provide a public Google Sheets URL.")

print("✅ App branding, LLM initialization, and data loading functions appended to app.py")

Appending to app.py


In [None]:
%%writefile -a app.py

# --- MAIN ANALYSIS ENGINE ---

# Module-level cache for the LLM pipeline. It starts empty and is filled on
# demand by run_analysis_pipeline the first time AI refinement is requested,
# so the model is not reloaded on every run.
LLM_PIPELINE = None

def _build_plot_safely(plot_name, build_plot):
    """Return ``build_plot()``, or ``None`` (after logging) if it raises."""
    try:
        return build_plot()
    except Exception as e:
        logging.warning(f"Could not build the '{plot_name}' visualization: {e}")
        return None


def _detect_date_columns(df, excluded_columns=()):
    """List the columns of ``df`` holding at least one parseable date.

    Numeric columns are skipped: ``pd.to_datetime`` happily converts plain
    numbers (it reads them as epoch offsets), which would make every numeric
    column, including the topic ids, look like a date column.
    """
    date_columns = []
    for col in df.columns:
        if col in excluded_columns:
            continue
        series = df[col]
        if pd.api.types.is_datetime64_any_dtype(series):
            date_columns.append(col)
            continue
        if pd.api.types.is_numeric_dtype(series):
            continue
        try:
            parsed = pd.to_datetime(series, errors='coerce')
        except Exception:
            continue
        if parsed.notna().any():
            date_columns.append(col)
    return date_columns


def _match_seeds_to_topics(seed_ids, topics):
    """Pair discovered topics with seeds by where the seeded documents landed.

    The clustering step numbers the topics it finds on its own, so a seed's
    position in the JSON is not a topic id. (seed, topic) pairs are ranked by
    how many seeded documents they share and matched greedily one-to-one,
    largest overlap first.

    Returns:
        dict mapping topic id -> seed id.
    """
    overlap = {}
    for seed_id, topic_id in zip(seed_ids, topics):
        if seed_id == -1 or topic_id == -1:
            continue
        overlap[(seed_id, topic_id)] = overlap.get((seed_id, topic_id), 0) + 1

    topic_to_seed, claimed_seeds = {}, set()
    for (seed_id, topic_id), _count in sorted(overlap.items(), key=lambda item: -item[1]):
        if topic_id in topic_to_seed or seed_id in claimed_seeds:
            continue
        topic_to_seed[topic_id] = seed_id
        claimed_seeds.add(seed_id)
    return topic_to_seed


def run_analysis_pipeline(file_obj, gsheet_url, text_columns, analysis_mode, manual_seeds,
                          top_n_topics_slider, enable_ai_merging, hf_token, progress=gr.Progress()):
    """Run the full topic-modelling pipeline and return the UI updates.

    Steps: load and clean the data, optionally derive guidance labels from
    manual seed keywords, embed the documents, fit BERTopic, name the topics
    (with the LLM or with keyword labels), apply seed names, and build the
    tables and plots. Results are also stored in ``APP_STATE``.

    Args:
        file_obj: Uploaded CSV (see ``load_data``).
        gsheet_url: Google Sheets link used when no file is uploaded.
        text_columns: Name(s) of the column(s) whose text is analysed.
        analysis_mode: ``"Manual Seeding"`` enables the seed keywords.
        manual_seeds: JSON object mapping a topic name to a list of keywords.
        top_n_topics_slider: Number of topics shown in the summary charts.
        enable_ai_merging: Whether to run the LLM refinement step.
        hf_token: Hugging Face token used when the LLM has to be initialised.
        progress: Gradio progress tracker.

    Returns:
        dict mapping Gradio components to their new values. On a data or
        seed-parsing error only the log output is updated.
    """
    global LLM_PIPELINE
    if enable_ai_merging and LLM_PIPELINE is None:
        progress(0, desc="Initializing LLM...")
        LLM_PIPELINE = initialize_local_llm(hf_token)
        if LLM_PIPELINE is None:
            gr.Warning("AI features enabled, but LLM failed to initialize. Skipping AI steps.")
            enable_ai_merging = False

    # === STEP 1: LOAD AND VALIDATE DATA ===
    progress(0.1, desc="Step 1/8: Loading and Validating Data...")
    try:
        df = load_data(file_obj, gsheet_url)
        # A single-select control hands over one name rather than a list.
        if isinstance(text_columns, str):
            text_columns = [text_columns]
        if not text_columns: raise ValueError("Please select at least one text column to analyze.")
        df['combined_text'] = df[text_columns].fillna('').astype(str).agg(' '.join, axis=1)
        df['processed_text'] = df['combined_text'].apply(preprocess_bangla_text)

        # Keep only documents with more than two tokens left after cleaning.
        df_analysis = df[df['processed_text'].str.split().str.len() > 2].copy()
        if df_analysis.empty:
            raise ValueError("No documents with sufficient content found after cleaning. Please check your data and column selection.")
        documents = df_analysis['processed_text'].tolist()
        APP_STATE["df"] = df_analysis # Save the analyzable dataframe
    except Exception as e:
        logging.error(f"Data Loading Error: {e}")
        return {log_output: f"Error during data loading: {e}"}

    # === STEP 2: PREPARE GUIDANCE (IF MANUAL SEEDING) ===
    progress(0.2, desc="Step 2/8: Preparing Analysis Mode...")
    y_guidance = None
    seed_names = []
    if analysis_mode == "Manual Seeding" and manual_seeds:
        try:
            seed_topics_dict = json.loads(manual_seeds)
            if not isinstance(seed_topics_dict, dict):
                raise ValueError("expected a JSON object mapping topic names to keyword lists")
            seed_names = list(seed_topics_dict.keys())
            seed_keywords = []
            for keywords in seed_topics_dict.values():
                if isinstance(keywords, str):
                    keywords = [keywords]
                # Documents were normalized during preprocessing; normalize the
                # keywords the same way so that substring matching can succeed.
                normalized = [normalize_bangla_manual(str(keyword)).strip() for keyword in keywords]
                seed_keywords.append([keyword for keyword in normalized if keyword])

            y_guidance = [-1] * len(documents)
            for i, doc in enumerate(documents):
                for seed_id, keywords in enumerate(seed_keywords):
                    if any(keyword in doc for keyword in keywords):
                        y_guidance[i] = seed_id
                        break # Prioritizes the first match in the JSON
        except Exception as e:
            return {log_output: f"Error: Invalid JSON in Manual Seeds. Details: {e}"}

        if all(label == -1 for label in y_guidance):
            gr.Warning("None of the manual seed keywords matched a document. Running without seed guidance.")
            y_guidance = None
            seed_names = []

    # === STEP 3: EMBEDDINGS & MODEL SETUP ===
    progress(0.3, desc="Step 3/8: Calculating Document Embeddings...")
    embedding_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
    embeddings = embedding_model.encode(documents, show_progress_bar=True)

    hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
    # max_df / min_df prune overly common and overly rare terms.
    vectorizer_model = CountVectorizer(tokenizer=lambda doc: doc.split(), ngram_range=(1, 3), max_df=0.90, min_df=5)
    umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
    representation_model = KeyBERTInspired()

    # === STEP 4: TRAIN TOPIC MODEL ===
    progress(0.5, desc="Step 4/8: Training BERTopic Model...")
    topic_model = BERTopic(
        embedding_model=embedding_model, umap_model=umap_model, hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model, representation_model=representation_model,
        language="multilingual", verbose=False
    )
    topics, _ = topic_model.fit_transform(documents, embeddings, y=y_guidance)

    # === STEP 5: AI REFINEMENT (IF ENABLED) ===
    if enable_ai_merging and LLM_PIPELINE:
        progress(0.6, desc="Step 5/8: Running AI Refinement Agent...")
        topic_model = run_ai_refinement(topic_model, LLM_PIPELINE, progress)
    else:
        progress(0.6, desc="Step 5/8: Skipping AI Refinement...")
        # Fallback to keyword-based naming if AI is disabled
        generated_labels = topic_model.generate_topic_labels(nr_words=4, separator=", ")
        topic_model.set_topic_labels(generated_labels)

    # === STEP 6: APPLY MANUAL SEED NAMES ===
    progress(0.7, desc="Step 6/8: Finalizing Topic Names...")
    if seed_names and y_guidance is not None:
        # Name each topic after the seed whose documents it actually contains,
        # instead of assuming that seed number N became topic number N.
        topic_to_seed = _match_seeds_to_topics(y_guidance, topics)
        if topic_to_seed:
            # The "Topic N: " prefix matches the labels produced by the AI
            # naming step and by manual renaming.
            topic_model.set_topic_labels({
                int(topic_id): f"Topic {int(topic_id)}: {seed_names[seed_id]}"
                for topic_id, seed_id in topic_to_seed.items()
            })

    # === STEP 7: PREPARE FINAL OUTPUTS & VISUALIZATIONS ===
    progress(0.85, desc="Step 7/8: Preparing Visualizations...")
    APP_STATE["bertopic_model"] = topic_model
    df_analysis['Topic'] = topics
    APP_STATE["final_df"] = df_analysis
    topics_df = topic_model.get_topic_info()
    APP_STATE["topics_df"] = topics_df

    # Visualizing every document of a very large dataset is memory hungry;
    # plot a random sample instead.
    if len(documents) > 50000:
        gr.Info("Dataset is large. Visualizing a sample of 50,000 documents for performance.")
        indices = np.random.choice(len(documents), 50000, replace=False)
        plot_docs = [documents[i] for i in indices]
        plot_embeddings = embeddings[indices]
    else:
        plot_docs, plot_embeddings = documents, embeddings

    # Each plot is built independently so that one failing chart (for example
    # on a result with very few topics) does not discard the whole analysis.
    doc_topic_landscape_plot = _build_plot_safely(
        "document landscape", lambda: topic_model.visualize_documents(plot_docs, embeddings=plot_embeddings))
    inter_topic_map_plot = _build_plot_safely(
        "inter-topic map", lambda: topic_model.visualize_topics())
    num_chart_topics = int(top_n_topics_slider)
    top_topics_barchart_plot = _build_plot_safely(
        "top topics bar chart", lambda: topic_model.visualize_barchart(top_n_topics=num_chart_topics))
    topic_similarity_heatmap_plot = _build_plot_safely(
        "topic similarity heatmap", lambda: topic_model.visualize_heatmap(top_n_topics=num_chart_topics))
    topic_hierarchy_plot = _build_plot_safely(
        "topic hierarchy", lambda: topic_model.visualize_hierarchy(top_n_topics=num_chart_topics))

    # Labels set through set_topic_labels are reported in 'CustomName';
    # 'Name' always holds the automatically generated name.
    label_column = 'CustomName' if 'CustomName' in topics_df.columns else 'Name'
    review_topic_table = topics_df[['Topic', label_column, 'Count']].rename(
        columns={'Topic': 'ID', label_column: 'Topic Name', 'Count': 'Documents'})

    # Check for date columns for the temporal analysis tab
    date_columns = _detect_date_columns(
        df_analysis, excluded_columns=('combined_text', 'processed_text', 'Topic'))

    # Count real topics only; the outlier row (-1) is not always present.
    topic_count = int((topics_df['Topic'] != -1).sum())

    # === STEP 8: UPDATE UI WITH RESULTS ===
    progress(1.0, desc="Step 8/8: Finalizing UI...")
    return {
        log_output: f"✅ Analysis Complete! Discovered {topic_count} topics.",
        # Make result tabs visible
        review_tab: gr.update(visible=True),
        visualize_tab: gr.update(visible=True),
        # Populate the review tab
        review_topic_table_df: gr.update(value=review_topic_table),
        # Populate the visualization tab
        doc_topic_landscape_plot_ui: doc_topic_landscape_plot,
        inter_topic_map_plot_ui: inter_topic_map_plot,
        top_topics_barchart_plot_ui: top_topics_barchart_plot,
        topic_similarity_heatmap_ui: topic_similarity_heatmap_plot,
        topic_hierarchy_plot_ui: topic_hierarchy_plot,
        # Update and enable the temporal analysis tab if date columns exist
        temporal_analysis_group: gr.update(visible=len(date_columns) > 0),
        date_column_dropdown: gr.update(choices=date_columns, value=date_columns[0] if date_columns else None),
    }

print("✅ Main analysis pipeline function appended to app.py")

Appending to app.py


In [None]:
%%writefile -a app.py

# --- AI REFINEMENT AGENT ---

def _extract_generated_title(completion, prompt):
    """Pull the model's one-line title out of a raw completion.

    Strips an echoed prompt if present, then returns the first non-empty line.
    Returns ``""`` when the model produced nothing usable, e.g. when it
    immediately started another example block.
    """
    if completion.startswith(prompt):
        completion = completion[len(prompt):]
    output_marker = "আউটপুট শিরোনাম:"
    for line in completion.splitlines():
        candidate = line.strip()
        if candidate.startswith(output_marker):
            candidate = candidate[len(output_marker):].strip()
        if not candidate:
            continue
        # A separator or a new "input keywords" line means the model moved on
        # without writing a title.
        if candidate.startswith("---") or candidate.startswith("ইনপুট"):
            return ""
        return candidate
    return ""


def _keyword_topic_label(topic_id, keywords):
    """Build a 'Topic N: kw1, kw2, ...' label from up to four keywords."""
    words = []
    if isinstance(keywords, (list, tuple)):
        words = [str(word).strip() for word in keywords if str(word).strip()][:4]
    return f"Topic {topic_id}: {', '.join(words)}" if words else f"Topic {topic_id}"


def run_ai_refinement(topic_model, llm_pipeline, progress=gr.Progress()):
    """Name every topic with the LLM and store the names as custom labels.

    For each topic except the outlier topic (-1) the topic's keywords are put
    into a few-shot Bangla prompt and the model's one-line answer becomes the
    label ``"Topic <id>: <title>"``. When generation fails or yields nothing,
    a keyword-based label is used instead.

    Args:
        topic_model: A fitted BERTopic model.
        llm_pipeline: A text-generation pipeline, called with the prompt.
        progress: Gradio progress tracker.

    Returns:
        The same ``topic_model``, with its custom labels set.
    """
    logging.info("Starting AI Refinement Agent...")

    # --- Task 1: AI-Powered Topic Naming ---
    progress(0, desc="AI Agent: Generating Topic Names...")
    topic_info_df = topic_model.get_topic_info()
    new_labels = {}

    # Few-shot Bangla prompt, used for each topic.
    prompt_template = """
আপনি একজন পেশাদার সংবাদ সম্পাদক। আপনার কাজ হলো বাংলাদেশের রাজনৈতিক ঘটনাবলী, বিশেষ করে বিএনপির 'তারুণ্যের সমাবেশ' সংক্রান্ত সংবাদের জন্য একটি সংক্ষিপ্ত ও প্রাসঙ্গিক শিরোনাম তৈরি করা। প্রদত্ত কীওয়ার্ডগুলো ব্যবহার করে একটি (৩-৫ শব্দের) সারগর্ভ বাংলা শিরোনাম লিখুন, যেখানে সমাবেশের মূল বিষয় বা স্থান স্পষ্টভাবে ফুটে উঠবে। উদাহরণগুলো দেখুন।

--- উদাহরণ ---
ইনপুট কীওয়ার্ড: ['খুলনা', 'তারুণ্যের', 'সমাবেশ', 'বিএনপি']
আউটপুট শিরোনাম: খুলনায় বিএনপির তারুণ্যের সমাবেশ

ইনপুট কীওয়ার্ড: ['ঢাকা', 'নয়াপল্টন', 'তারুণ্যের', 'স্রোত', 'বৃষ্টি']
আউটপুট শিরোনাম: ঢাকায় তারুণ্যের সমাবেশে জনতার ঢল

ইনপুট কীওয়ার্ড: ['চট্টগ্রাম', 'বক্তব্য', 'মির্জা ফখরুল', 'শোডাউন']
আউটপুট শিরোনাম: চট্টগ্রামে মির্জা ফখরুলের তারুণ্যের সমাবেশ
--- উদাহরণের শেষ ---

--- আপনার কাজ ---
ইনপুট কীওয়ার্ড: {keywords}
আউটপুট শিরোনাম:
"""

    # Tuned parameters for reliable, non-creative naming
    generation_params = {
        "temperature": 0.3,
        "max_new_tokens": 30,
        "repetition_penalty": 1.2,
        "do_sample": True
    }

    total_topics = len(topic_info_df)
    # enumerate() gives a position that is independent of the frame's index.
    for position, (_, row) in enumerate(topic_info_df.iterrows()):
        topic_id = row['Topic']
        if topic_id == -1:
            # We don't rename the outlier topic
            new_labels[topic_id] = "Topic -1: Outliers"
        else:
            keywords = row['Representation']
            prompt = prompt_template.format(keywords=keywords)
            generated_name = ""
            try:
                # return_full_text=False asks for the completion only. The
                # prompt itself contains the output marker several times (once
                # per example), so splitting the echoed prompt on that marker
                # would return the first example's answer instead of the
                # model's.
                response = llm_pipeline(prompt, return_full_text=False, **generation_params)
                generated_name = _extract_generated_title(response[0]['generated_text'], prompt)
            except Exception as e:
                logging.error(f"LLM failed for Topic {topic_id}. Error: {e}")

            if generated_name:
                new_labels[topic_id] = f"Topic {topic_id}: {generated_name}"
                logging.info(f"Generated name for Topic {topic_id}: {generated_name}")
            else:
                # Fall back to a keyword-based label built from data we
                # already have, so the fallback itself cannot fail.
                new_labels[topic_id] = _keyword_topic_label(topic_id, keywords)

        progress((position + 1) / total_topics, desc="AI Agent: Generating Topic Names...")

    # Apply all the new labels at once
    topic_model.set_topic_labels(new_labels)
    logging.info("✅ AI Naming complete.")

    # --- Task 2: AI-Powered Merging (Conceptual Hook) ---
    # Placeholder for a future enhancement. The logic would be:
    # 1. Calculate topic similarity matrix.
    # 2. Identify pairs with similarity > threshold (e.g., 0.85).
    # 3. Use a "Judge" prompt to ask the LLM if they should be merged.
    # 4. If LLM says "YES", call `topic_model.merge_topics()`.
    logging.info("Skipping AI Topic Merging (conceptual feature).")

    return topic_model

print("✅ AI Refinement Agent function appended to app.py")

Appending to app.py


In [None]:
%%writefile -a app.py

# --- FINAL BACKEND HANDLERS & HELPERS ---

def get_topic_details(topic_id: int):
    """Fetch the details of one topic for the review tab.

    Args:
        topic_id: Id of the selected topic (anything ``int()`` accepts).

    Returns:
        dict of updates for the name textbox, the keyword chart and the
        representative-documents table. All three are empty when no model is
        loaded, the topic is unknown, or an error occurs; for the outlier
        topic (-1) only the name is filled in.
    """
    empty_return = {topic_name_textbox: "", topic_word_cloud_plot: None, topic_docs_df: pd.DataFrame()}
    model = APP_STATE.get("bertopic_model")
    if model is None or topic_id is None: return empty_return
    try:
        topic_id = int(topic_id)
        # Passed positionally: get_topic_info has no `topic_id` keyword, so
        # the keyword form raised and every lookup ended in the except branch.
        topic_info = model.get_topic_info(topic_id)
        if topic_info.empty: return empty_return

        # Labels set with set_topic_labels are reported in 'CustomName';
        # 'Name' always holds the automatically generated name.
        label_column = 'CustomName' if 'CustomName' in topic_info.columns else 'Name'
        topic_name = str(topic_info[label_column].iloc[0])
        # Strip the "Topic X: " prefix (X may be -1) for cleaner editing
        cleaned_name = re.sub(r'^Topic -?\d+:\s*', '', topic_name)

        # For the outlier topic, don't generate plots
        if topic_id == -1:
            return {topic_name_textbox: cleaned_name, topic_word_cloud_plot: None, topic_docs_df: pd.DataFrame()}

        word_cloud_fig = model.visualize_barchart(top_n_topics=1, topics=[topic_id])
        docs_df = pd.DataFrame(model.get_representative_docs(topic_id), columns=['Representative Document'])
        return {topic_name_textbox: cleaned_name, topic_word_cloud_plot: word_cloud_fig, topic_docs_df: docs_df}
    except Exception as e:
        logging.error(f"Error getting topic details for ID {topic_id}: {e}")
        return empty_return

def update_topic_name(topic_id, new_name):
    """Rename one topic and return the refreshed review table.

    Args:
        topic_id: Id of the topic to rename.
        new_name: The new name, without the ``"Topic N: "`` prefix.

    Returns:
        A Gradio update carrying the new topic table, or a no-op update when
        there is no model, no topic id, an unusable id, or an empty name.
    """
    model = APP_STATE.get("bertopic_model")
    cleaned_name = new_name.strip() if isinstance(new_name, str) else ""
    if model is None or topic_id is None or not cleaned_name:
        return gr.update() # No change

    try:
        topic_id = int(topic_id)
    except (TypeError, ValueError):
        gr.Warning(f"'{topic_id}' is not a valid topic ID.")
        return gr.update()

    # Add the prefix back for consistency
    full_name = f"Topic {topic_id}: {cleaned_name}"
    model.set_topic_labels({topic_id: full_name})
    topics_df = model.get_topic_info()
    APP_STATE["topics_df"] = topics_df
    gr.Info(f"Topic {topic_id} renamed to '{cleaned_name}'")

    # Custom labels are reported in 'CustomName'; reading 'Name' would keep
    # showing the automatically generated name after a rename.
    label_column = 'CustomName' if 'CustomName' in topics_df.columns else 'Name'
    return gr.update(value=topics_df[['Topic', label_column, 'Count']].rename(
        columns={'Topic': 'ID', label_column: 'Topic Name', 'Count': 'Documents'}))

def merge_selected_topics(topics_to_merge):
    """Merge the topics selected in the review tab into one.

    Args:
        topics_to_merge: The selected entries; each must contain the topic id
            as its first integer (e.g. ``"Topic 3: ..."``).

    Returns:
        dict of Gradio updates. On success the topic table is refreshed and
        the selection and detail views are cleared; otherwise the table and
        the selection are left unchanged.
    """
    unchanged = {review_topic_table_df: gr.update(), topic_merger_checkboxgroup: gr.update()}
    model = APP_STATE.get("bertopic_model")
    final_df = APP_STATE.get("final_df")

    # Convert entries like "Topic 0: ..." to unique integer ids. The optional
    # minus sign keeps the outlier topic (-1) from being read as topic 1.
    topic_ids = []
    for selection in (topics_to_merge or []):
        id_match = re.search(r'-?\d+', str(selection))
        if id_match is None:
            continue
        parsed_id = int(id_match.group())
        if parsed_id not in topic_ids:
            topic_ids.append(parsed_id)

    if model is None or final_df is None or len(topic_ids) < 2:
        gr.Warning("Please select at least two topics to merge.")
        return unchanged

    try:
        # merge_topics needs the documents the model was fitted on in order
        # to recompute the topic representations.
        model.merge_topics(final_df['processed_text'].tolist(), topics_to_merge=[topic_ids])
    except Exception as e:
        logging.error(f"Merging topics {topic_ids} failed: {e}")
        gr.Warning(f"Could not merge topics {topic_ids}: {e}")
        return unchanged

    # Keep the per-document assignments in step with the merged model.
    if len(model.topics_) == len(final_df):
        final_df['Topic'] = list(model.topics_)

    # After merging, refresh the state and UI components
    topics_df = model.get_topic_info()
    APP_STATE["topics_df"] = topics_df
    label_column = 'CustomName' if 'CustomName' in topics_df.columns else 'Name'
    review_topic_table = topics_df[['Topic', label_column, 'Count']].rename(
        columns={'Topic': 'ID', label_column: 'Topic Name', 'Count': 'Documents'})

    gr.Info(f"Successfully merged topics: {topic_ids}")
    return {
        review_topic_table_df: gr.update(value=review_topic_table),
        # Clear the selection and the details view
        topic_merger_checkboxgroup: gr.update(value=[]),
        topic_name_textbox: "",
        topic_word_cloud_plot: None,
        topic_docs_df: pd.DataFrame(),
    }


def generate_temporal_plot(date_column, progress=gr.Progress()):
    """Build the topics-over-time plot for the chosen date column.

    Args:
        date_column: Name of the column holding the document dates.
        progress: Gradio progress tracker.

    Returns:
        The figure, or ``None`` when nothing can be plotted (no column
        selected, no analysis available, no valid dates, or plotting failed).
    """
    progress(0, desc="Preparing time data...")
    if not date_column: return None
    model, df = APP_STATE.get("bertopic_model"), APP_STATE.get("final_df")
    if model is None or df is None: return None
    if date_column not in df.columns:
        gr.Warning(f"The column '{date_column}' was not found in the analyzed data.")
        return None

    timestamps = pd.to_datetime(df[date_column], errors='coerce')
    has_date = timestamps.notna()
    if not has_date.any():
        gr.Warning(f"The column '{date_column}' contains no valid dates after conversion.")
        return None

    progress(0.6, desc="Generating topic trends over time...")
    try:
        # Rows without a valid date are dropped, so the topic assignments must
        # be filtered with the same mask; otherwise the documents and the
        # model's full topic list would differ in length. The model's own
        # assignments are preferred because they reflect later merges.
        model_topics = getattr(model, "topics_", None)
        if model_topics is not None and len(model_topics) == len(df):
            all_topics = list(model_topics)
        else:
            all_topics = df['Topic'].tolist()

        docs_temporal = df.loc[has_date, 'processed_text'].tolist()
        timestamps_temporal = timestamps[has_date].tolist()
        topics_temporal = [topic for topic, keep in zip(all_topics, has_date.tolist()) if keep]

        topics_over_time = model.topics_over_time(
            docs=docs_temporal, timestamps=timestamps_temporal, topics=topics_temporal)
        return model.visualize_topics_over_time(topics_over_time)
    except Exception as e:
        logging.error(f"Temporal plot failed: {e}")
        # gr.Error is an exception class and shows nothing unless raised;
        # gr.Warning displays the message and still lets us return None.
        gr.Warning(f"Could not generate temporal plot. This can happen if topics are not found in the selected time range. Error: {e}")
        return None

def generate_media_analysis(media_column):
    """Build a bar chart of the 20 most frequent values of ``media_column``.

    Args:
        media_column: Name of the column identifying each article's source.

    Returns:
        A ``gr.BarPlot``, or ``None`` when no column is selected, no data is
        loaded, the column does not exist, or it holds no values.
    """
    if not media_column:
        gr.Warning("Please select a media column to analyze.")
        return None
    df = APP_STATE.get("df")
    if df is None or media_column not in df.columns:
        return None

    counts = df[media_column].value_counts().nlargest(20) # Get top 20 sources
    if counts.empty:
        gr.Warning(f"The column '{media_column}' contains no values to count.")
        return None

    # Using Gradio's built-in plotting for simplicity.
    # NOTE(review): the previous `vertical_guides` argument was dropped; it is
    # not among the BarPlot constructor parameters and would be rejected as an
    # unexpected keyword. Confirm against the installed Gradio version.
    plot_df = pd.DataFrame({'Media Source': counts.index, 'Article Count': counts.values})
    return gr.BarPlot(
        plot_df,
        x='Media Source',
        y='Article Count',
        title=f'Top 20 Media Sources by Article Count',
        tooltip=['Media Source', 'Article Count'],
        height=500,
    )

def finalize_and_save():
    """Persist the labeled documents and topic definitions to disk.

    Reads ``APP_STATE["final_df"]`` and ``APP_STATE["topics_df"]`` and writes:

    * ``topic_analysis_results.sqlite`` with the tables ``topic_definitions``
      and ``enriched_documents`` (existing tables are replaced), and
    * ``labeled_documents.csv`` (UTF-8 with BOM) containing the documents plus
      a ``Topic_Name`` column looked up from the topic definitions.

    Returns:
        ``[db_path, csv_path]`` on success, or ``None`` (after a warning toast)
        when either DataFrame is missing from ``APP_STATE``.
    """
    final_df = APP_STATE.get("final_df")
    topics_df = APP_STATE.get("topics_df")
    if final_df is None or topics_df is None:
        gr.Warning("No data available to save.")
        return None

    # Work on copies so the in-memory state keeps its original list columns.
    final_df_to_save, topics_df_to_save = final_df.copy(), topics_df.copy()

    # Convert list columns to JSON strings for compatibility.
    # ensure_ascii=False keeps non-ASCII (e.g. Bangla) keywords readable in the
    # saved tables instead of \uXXXX escapes; json.loads() reads both forms.
    for col in ['Representation', 'Representative_Docs']:
        if col in topics_df_to_save.columns:
            topics_df_to_save[col] = topics_df_to_save[col].apply(
                lambda x: json.dumps(x, ensure_ascii=False) if isinstance(x, list) else x
            )

    db_path, csv_path = "topic_analysis_results.sqlite", "labeled_documents.csv"

    # sqlite3's connection context manager only commits/rolls back; it does not
    # close the connection, so close it explicitly to release the file handle.
    conn = sqlite3.connect(db_path)
    try:
        with conn:
            topics_df_to_save.to_sql("topic_definitions", conn, if_exists="replace", index=False)
            final_df_to_save.to_sql("enriched_documents", conn, if_exists="replace", index=False)
    finally:
        conn.close()

    # Topic_Name is added after the SQLite write, so it appears only in the CSV.
    topic_map = topics_df_to_save.set_index('Topic')['Name'].to_dict()
    final_df_to_save['Topic_Name'] = final_df_to_save['Topic'].map(topic_map)
    final_df_to_save.to_csv(csv_path, index=False, encoding='utf-8-sig')

    gr.Info(f"Results saved to {db_path} and {csv_path}")
    return [db_path, csv_path]

# NOTE(review): this line sits in a cell whose output is "Appending to app.py",
# i.e. it appears to be written into app.py rather than executed by the
# notebook, so the message is printed whenever app.py runs — confirm intent.
print("✅ Final backend handlers appended to app.py")

Appending to app.py


In [None]:
%%writefile -a app.py

# --- GRADIO UI LAYOUT & EVENT HANDLERS ---

# Build the whole UI inside one Blocks context. Components are bound to names
# because the event handlers registered later in this same context refer to
# them as inputs/outputs.
with gr.Blocks(theme=gr.themes.Soft(), title=APP_TITLE) as app:
    gr.Markdown(f"# {APP_TITLE}")
    gr.Markdown(f"*{APP_TAGLINE}*")

    with gr.Tabs() as tabs:
        # === SETUP & RUN TAB ===
        # The only tab that is visible from the start; the others are created
        # with visible=False.
        with gr.TabItem("1. Setup & Run Analysis", id=0):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### 1. Data Input")
                    file_upload = gr.File(label="Upload CSV File", file_types=[".csv"])
                    gsheet_url = gr.Textbox(label="Or Paste Google Sheets URL", placeholder="https://docs.google.com/spreadsheets/d/...")

                    gr.Markdown("### 2. Select Columns")
                    # Choices are filled in by update_column_selector once data is loaded.
                    text_columns_checkboxgroup = gr.CheckboxGroup(label="Select Text Columns for Analysis", interactive=True)

                    gr.Markdown("### 3. Configure Analysis")
                    analysis_mode_radio = gr.Radio(["Discovery Mode", "Manual Seeding"], value="Discovery Mode", label="Analysis Mode")
                    # Both seed widgets start hidden; toggle_manual_seeding_ui shows
                    # them when the radio is switched to "Manual Seeding".
                    manual_seeds_textbox = gr.Textbox(label="Manual Seed Topics (JSON format)", visible=False, lines=5)
                    # FIX: Assign the markdown to a variable so we can target it directly
                    manual_seeds_example = gr.Markdown("Example: `{\"Topic A\": [\"keyword1\", \"keyword2\"], \"Topic B\": [\"wordA\", \"wordB\"]}`", visible=False)

                    top_n_topics_slider = gr.Slider(label="Number of Topics for Charts", minimum=5, maximum=50, value=15, step=1)

                    gr.Markdown("### 4. Advanced (Optional)")
                    enable_ai_merging_checkbox = gr.Checkbox(label="Enable AI Topic Naming (Requires GPU & HF Token)", value=False)
                    # type="password" masks the token in the browser.
                    hf_token_textbox = gr.Textbox(label="Hugging Face Token", type="password", placeholder="hf_...", info="Required if AI is enabled.")

                    start_button = gr.Button("Start Analysis", variant="primary")

                with gr.Column(scale=2):
                    # Read-only log area; it is the first output of start_button.click.
                    log_output = gr.Textbox(label="Pipeline Progress", lines=25, interactive=False, autoscroll=True)

        # === REVIEW & FINALIZE TAB ===
        # Hidden initially; review_tab is an output of start_button.click, so
        # presumably run_analysis_pipeline reveals it — confirm in that function.
        with gr.TabItem("2. Review & Finalize", id=1, visible=False) as review_tab:
            gr.Markdown("### Review, Refine, and Finalize Your Topic Model")
            with gr.Row():
                with gr.Column(scale=2):
                    gr.Markdown("**Topics Found**")
                    # Selecting a cell here triggers on_select_topic.
                    review_topic_table_df = gr.DataFrame(headers=["ID", "Topic Name", "Documents"], interactive=True, wrap=True, scale=2)
                with gr.Column(scale=3):
                    gr.Markdown("**Selected Topic Details**")
                    topic_id_state = gr.State() # Hidden state to store the selected topic ID
                    topic_name_textbox = gr.Textbox(label="Topic Name (Editable)")
                    update_name_button = gr.Button("Update Name")
                    topic_word_cloud_plot = gr.Plot(label="Top Words for Selected Topic")
                    topic_docs_df = gr.DataFrame(headers=["Representative Document"], wrap=True)

            with gr.Row():
                gr.Markdown("### Manual Topic Merging")
            with gr.Row():
                # Choices are refreshed from the "Topic Name" column whenever the
                # review table changes.
                topic_merger_checkboxgroup = gr.CheckboxGroup(label="Select 2 or more topics to merge", interactive=True)
                merge_button = gr.Button("Merge Selected Topics", variant="stop")
            with gr.Row():
                finalize_button = gr.Button("Save Final Results to Files", variant="primary")
                # Receives the file paths returned by finalize_and_save.
                download_link = gr.File(label="Download Results (SQLite DB and CSV)", file_count="multiple")


        # === VISUALIZE & EXPLORE TAB ===
        # Hidden initially; visualize_tab is also an output of start_button.click.
        with gr.TabItem("3. Visualize & Explore", id=2, visible=False) as visualize_tab:
            with gr.Tabs():
                with gr.TabItem("Document Landscape"):
                    gr.Markdown("A 2D map of every document, colored by its assigned topic. This shows the overall structure of your data.")
                    doc_topic_landscape_plot_ui = gr.Plot()
                with gr.TabItem("Topic Relationships"):
                    gr.Markdown("Visualizations showing how topics relate to each other.")
                    inter_topic_map_plot_ui = gr.Plot(label="Inter-Topic Distance Map")
                    topic_hierarchy_plot_ui = gr.Plot(label="Hierarchical Clustering of Topics")
                    topic_similarity_heatmap_ui = gr.Plot(label="Topic Similarity Heatmap")
                with gr.TabItem("Topic Keywords"):
                    gr.Markdown("A bar chart showing the most important keywords for the most prominent topics.")
                    top_topics_barchart_plot_ui = gr.Plot()
                with gr.TabItem("Temporal Analysis"):
                    # The group starts hidden and is an output of start_button.click,
                    # together with date_column_dropdown.
                    with gr.Group(visible=False) as temporal_analysis_group:
                        gr.Markdown("Select a date column from your data to see how topic popularity has changed over time.")
                        with gr.Row():
                            date_column_dropdown = gr.Dropdown(label="Select Date Column")
                            generate_trends_button = gr.Button("Generate Trend Plot")
                        temporal_plot_ui = gr.Plot()

        # === SOURCE ANALYSIS TAB ===
        # Hidden until update_column_selector successfully loads data.
        with gr.TabItem("4. Source Analysis", id=3, visible=False) as source_tab:
            gr.Markdown("### Analyze the Distribution of News Sources")
            with gr.Row():
                media_column_dropdown = gr.Dropdown(label="Select Your Media/Source Column")
                analyze_media_button = gr.Button("Analyze Sources")
            with gr.Row():
                # Filled with the plot returned by generate_media_analysis.
                media_plot = gr.BarPlot()

    gr.Markdown(f"<div style='text-align: center;'>{APP_FOOTER}</div>")

    # --- EVENT HANDLERS ---

    def update_column_selector(file, url):
        """Populate the column pickers after a data source is provided.

        Args:
            file: Value of the CSV upload component (``None`` when empty).
            url: Google Sheets URL typed by the user (may be empty).

        Returns:
            A dict of component updates: text-like columns are offered (and
            pre-selected) in the text-column checklist, every column is offered
            in the media dropdown, and the Source Analysis tab is shown. If no
            source is given or loading fails, both pickers are cleared and the
            tab is hidden.
        """
        # Shared "nothing loaded" response for the empty-input and failure paths.
        cleared = {
            text_columns_checkboxgroup: gr.update(choices=[], value=None),
            media_column_dropdown: gr.update(choices=[], value=None),
            source_tab: gr.update(visible=False),
        }
        if file is None and not url:
            return cleared
        try:
            df = load_data(file, url)
            # Accept both classic object columns and pandas' dedicated string
            # dtypes; comparing against 'object' alone would offer no text
            # columns when strings are stored with a string dtype.
            text_cols = [
                col for col in df.columns
                if pd.api.types.is_object_dtype(df[col].dtype)
                or pd.api.types.is_string_dtype(df[col].dtype)
            ]
            return {
                text_columns_checkboxgroup: gr.update(choices=text_cols, value=text_cols if text_cols else None),
                media_column_dropdown: gr.update(choices=df.columns.tolist()),
                source_tab: gr.update(visible=True)
            }
        except Exception as e:
            gr.Warning(f"Failed to read columns: {e}")
            return cleared

    # Refresh the pickers when a file finishes uploading or a URL is submitted.
    file_upload.upload(fn=update_column_selector, inputs=[file_upload, gsheet_url], outputs=[text_columns_checkboxgroup, media_column_dropdown, source_tab])
    gsheet_url.submit(fn=update_column_selector, inputs=[file_upload, gsheet_url], outputs=[text_columns_checkboxgroup, media_column_dropdown, source_tab])

    # One handler drives the visibility of every manual-seeding widget.
    def toggle_manual_seeding_ui(mode):
        """Show the seed-topic inputs only while "Manual Seeding" is selected.

        Args:
            mode: Current value of the analysis-mode radio.

        Returns:
            A dict mapping the seed textbox and its example text to a
            visibility update.
        """
        show_seed_inputs = (mode == "Manual Seeding")
        return {
            component: gr.update(visible=show_seed_inputs)
            for component in (manual_seeds_textbox, manual_seeds_example)
        }

    analysis_mode_radio.change(
        fn=toggle_manual_seeding_ui,
        inputs=analysis_mode_radio,
        outputs=[manual_seeds_textbox, manual_seeds_example],
    )

    # Components whose values are handed to the pipeline, in argument order.
    pipeline_inputs = [
        file_upload,
        gsheet_url,
        text_columns_checkboxgroup,
        analysis_mode_radio,
        manual_seeds_textbox,
        top_n_topics_slider,
        enable_ai_merging_checkbox,
        hf_token_textbox,
    ]
    # Components the pipeline updates; the order must stay aligned with what
    # run_analysis_pipeline produces.
    pipeline_outputs = [
        log_output,
        review_tab,
        visualize_tab,
        review_topic_table_df,
        doc_topic_landscape_plot_ui,
        inter_topic_map_plot_ui,
        top_topics_barchart_plot_ui,
        topic_similarity_heatmap_ui,
        topic_hierarchy_plot_ui,
        temporal_analysis_group,
        date_column_dropdown,
    ]
    start_button.click(fn=run_analysis_pipeline, inputs=pipeline_inputs, outputs=pipeline_outputs)

    def on_select_topic(evt: gr.SelectData):
        """Load the detail widgets for the topic clicked in the review table.

        Args:
            evt: Gradio selection event; ``evt.index`` holds the selected
                cell position, whose first element is used as the row number.

        Returns:
            A dict of component values: the result of ``get_topic_details`` for
            the selected topic plus the topic ID stored in ``topic_id_state``.
            When the selection cannot be resolved, every detail widget is reset
            to an empty value.
        """
        empty_details = {topic_id_state: None, topic_name_textbox: "", topic_word_cloud_plot: None, topic_docs_df: pd.DataFrame()}
        # The event payload is JSON-decoded, so a table selection normally
        # arrives as a list ([row, col]) rather than a tuple; accept both.
        if not isinstance(evt.index, (list, tuple)) or len(evt.index) == 0:
            return empty_details
        try:
            # NOTE(review): assumes APP_STATE["topics_df"] has an 'ID' column and
            # the same row order as the displayed table — confirm where it is built.
            topic_id_val = APP_STATE["topics_df"].iloc[evt.index[0]]['ID']
            details = get_topic_details(topic_id_val)
            details[topic_id_state] = topic_id_val # Store the ID in the hidden state
            return details
        except Exception as e:
            # Surface the reason instead of silently clearing the panel.
            gr.Warning(f"Could not load details for the selected topic: {e}")
            return empty_details

    review_topic_table_df.select(fn=on_select_topic, outputs=[topic_id_state, topic_name_textbox, topic_word_cloud_plot, topic_docs_df])

    # Manual refinement: rename the currently selected topic.
    update_name_button.click(
        fn=update_topic_name,
        inputs=[topic_id_state, topic_name_textbox],
        outputs=[review_topic_table_df],
    )

    # Keep the merge checklist in sync with the topic names shown in the table.
    review_topic_table_df.change(
        lambda table: gr.update(choices=table['Topic Name'].tolist()),
        inputs=review_topic_table_df,
        outputs=topic_merger_checkboxgroup,
    )

    # Manual refinement: merge the checked topics and refresh the detail panel.
    merge_button.click(
        fn=merge_selected_topics,
        inputs=[topic_merger_checkboxgroup],
        outputs=[
            review_topic_table_df,
            topic_merger_checkboxgroup,
            topic_name_textbox,
            topic_word_cloud_plot,
            topic_docs_df,
        ],
    )

    # Source Analysis tab.
    analyze_media_button.click(
        fn=generate_media_analysis,
        inputs=[media_column_dropdown],
        outputs=[media_plot],
    )

    # Temporal plot and final export.
    generate_trends_button.click(
        fn=generate_temporal_plot,
        inputs=[date_column_dropdown],
        outputs=[temporal_plot_ui],
    )
    finalize_button.click(
        fn=finalize_and_save,
        inputs=[],
        outputs=[download_link],
    )

# --- LAUNCH THE APP ---
# Start the server only when app.py is run as a script (e.g. `python app.py`),
# not when it is imported.
if __name__ == "__main__":
    # Per Gradio's launch() documentation: share=True requests a publicly
    # reachable link, and debug=True blocks the main thread so errors are
    # printed to the console/cell output.
    app.launch(debug=True, share=True)

Appending to app.py


In [None]:
!python app.py

2025-08-11 21:41:35.417492: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754948495.694693    1278 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754948495.770746    1278 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1754948496.324810    1278 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1754948496.324861    1278 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1754948496.324866    1278 computation_placer.cc:177] computation placer alr