# Goal
Apply an unsupervised approach to the thematic analysis of pain mentions, then compare and contrast the discovered themes with our LLM-derived biopsychosocial pain dimensions. This shows whether our current approach is comprehensive or whether there are themes, such as additional dimensions of pain, that we are not capturing.

# Installations

In [None]:
# !pip install bertopic sentence-transformers

In [None]:
# !pip install hvplot

# Necessary Imports

In [None]:
import pandas as pd
from tabulate import tabulate
import ast
import numpy as np
from collections import Counter
import panel as pn
import hvplot.pandas
from IPython.display import display
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
from bertopic import BERTopic
import os
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
import plotly.express as px
from typing import List, Tuple, Dict
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Exploratory Data Analysis

In [None]:
'''
The dataset being read below contains all sentences with matches in the pain lexicon, and only the matched sentences.
Pain Lexicons: https://docs.google.com/spreadsheets/d/1GoV-g_38ntJ4X6q3TmCIoWazb2JPSu-QptFDRiP5sYo
'''

# Location of the lexicon-matched sentence dataset used for the exploratory cells.
matched_posts_csv = 'reddit-data/matched-datasets/pain_lexicon_matched_subreddit_lupus_posts_full.csv'

# The first CSV column is used as the DataFrame index.
df = pd.read_csv(matched_posts_csv, index_col=0)

In [None]:
# Report the (rows, columns) shape, then render the first three rows.
dataset_shape = df.shape
print(f"shape of dataset: {dataset_shape}")

preview_rows = df.head(3)
display(preview_rows)

In [None]:
pn.extension()

# Tally how often each matched pain term occurs. Every 'matches' cell holds the
# string form of a Python list, so it is parsed with ast.literal_eval.
frequency_counter = Counter()
for entry in df['matches']:
    if not isinstance(entry, str):
        # A missing cell is read as NaN (a float), which literal_eval cannot parse.
        continue
    parsed_list = ast.literal_eval(entry)
    frequency_counter.update(parsed_list)

# Rank the terms once and reuse the result for both columns.
ranked_terms = frequency_counter.most_common()
terms = [term for term, _ in ranked_terms]
frequencies = [frequency for _, frequency in ranked_terms]
data = pd.DataFrame({'Term': terms, 'Frequency': frequencies})

# Upper bound of the slider; falls back to 1 when no terms were counted so the
# cell does not crash on an empty dataset.
max_frequency = max(frequencies, default=1)
slider = pn.widgets.IntSlider(name='Frequency Threshold',
                              start=1,
                              end=max_frequency,
                              step=1,
                              # Keep the starting threshold inside the slider range;
                              # a fixed 10 would filter out every term whenever the
                              # most frequent term occurs fewer than 10 times.
                              value=min(10, max_frequency))


# Dynamic table and plot with increased size
@pn.depends(slider)
def filter_data(threshold):
    """Return a table and bar chart of the terms whose frequency is >= threshold.

    Re-evaluated by Panel whenever the slider value changes.
    """
    filtered_data = data[data['Frequency'] >= threshold]
    plot = filtered_data.hvplot.bar(
        x='Term',
        y='Frequency',
        rot=45,
        title=f'Terms with Frequency >= {threshold}',
        height=400,  # Increased height
        width=1000,   # Increased width
        fontsize={'title': 16, 'labels': 12, 'xticks': 10, 'yticks': 10}  # Larger fonts
    )
    return pn.Column(filtered_data, plot)


dashboard = pn.Column(slider, filter_data)
display(dashboard)

# Single Sentence Thematic Analysis using BERTopic

In [None]:
class PainTopicAnalyzer:
    """Fit a BERTopic model on pain-related sentences and compare topics to known pain dimensions.

    All output files are written inside ``output_folder``.
    """

    def __init__(self, output_folder: str = "single_sentence_pain_topic_analysis"):
        """Create the output folder (if needed) and define the pain-dimension keywords.

        Args:
            output_folder: Directory that receives every CSV/HTML/TXT artifact.
        """
        self.output_folder = output_folder
        os.makedirs(output_folder, exist_ok=True)

        # Dimension name -> keywords searched for in each topic's top words.
        self.pain_dimensions = {
            'biological': ['location', 'severity', 'duration', 'sex', 'gender', 'age', 'comorbidities'],
            'psychological': ['affective', 'cognitive', 'behavioral', 'existential', 'spiritual'],
            'social': ['functional', 'economic', 'sociocultural'],
            'nociplastic': ['fatigue', 'brain fog', 'widespread pain', 'sensitivity'],
            'management': ['medication', 'treatment', 'therapy', 'management']
        }

    def load_and_prepare_data(self, filepath: str) -> Tuple[pd.DataFrame, List[str]]:
        """Load the matched-sentence CSV and return (context frame, sentence list).

        Rows with a missing or whitespace-only ``sentence`` are dropped. The
        returned frame has a fresh 0..n-1 index so that row ``i`` always
        corresponds to ``docs[i]``.

        Args:
            filepath: CSV containing at least the ``sentence``, ``matches`` and
                ``created_date_cleaned`` columns.

        Returns:
            ``(context_df, docs)`` where ``context_df`` additionally carries
            ``matched_pain_terms`` and a parsed ``post_date`` column (NaT where
            the date could not be parsed).
        """
        df = pd.read_csv(filepath)
        print(f"Loaded {len(df)} rows")

        # Test for missing values before casting: astype(str) would turn NaN into
        # the literal text "nan", which would then be modelled as a document.
        sentence_text = df['sentence'].astype(str)
        has_text = df['sentence'].notna() & (sentence_text.str.strip().str.len() > 0)
        # Reset the index so labels equal positions in `docs`.
        df = df.loc[has_text].reset_index(drop=True)
        df['sentence'] = df['sentence'].astype(str)

        context_df = df.copy()
        context_df['matched_pain_terms'] = df['matches']
        # Unparseable dates become NaT; analyze_topics skips those rows.
        context_df['post_date'] = pd.to_datetime(df['created_date_cleaned'], errors='coerce')

        docs = df['sentence'].tolist()

        print(f"Prepared {len(docs)} sentences for analysis")
        return context_df, docs

    def map_topics_to_dimensions(self, topic_words: Dict[int, List[str]]) -> Dict:
        """Map discovered topics to pain dimensions framework.

        A dimension matches a topic when one of its keywords occurs as a whole
        word or phrase (optionally with a plural ``s``/``es`` suffix) in the
        topic's space-joined, lower-cased words. Whole-word matching prevents
        short keywords from firing inside unrelated words, e.g. ``age`` inside
        ``management`` or ``damage``.

        Args:
            topic_words: Topic id -> list of that topic's representative words.

        Returns:
            Topic id -> dict with ``topic_words``, ``matched_dimensions`` and
            ``unknown_dimension`` (True when no dimension matched).
        """
        import re

        # Compile every keyword once instead of re-scanning per topic.
        dimension_patterns = {
            dimension: [
                re.compile(rf"\b{re.escape(keyword.lower())}(?:s|es)?\b")
                for keyword in keywords
            ]
            for dimension, keywords in self.pain_dimensions.items()
        }

        topic_dimension_mapping = {}
        for topic_id, words in topic_words.items():
            topic_text = ' '.join(words).lower()
            matched_dimensions = [
                dimension
                for dimension, patterns in dimension_patterns.items()
                if any(pattern.search(topic_text) for pattern in patterns)
            ]
            topic_dimension_mapping[topic_id] = {
                'topic_words': words,
                'matched_dimensions': matched_dimensions,
                'unknown_dimension': not matched_dimensions
            }

        return topic_dimension_mapping

    def save_analysis_results(self, topic_words: Dict[int, List[str]],
                            topic_dimension_mapping: Dict,
                            context_df: pd.DataFrame,
                            base_filename: str):
        """Write the dimension mapping, per-sentence topics and a text summary.

        Produces ``<base_filename>_dimension_mapping.csv``,
        ``<base_filename>_sentences_with_topics.csv`` and
        ``<base_filename>_analysis_summary.txt``. ``context_df`` is not modified.

        Args:
            topic_words: Accepted for interface compatibility; not used here.
            topic_dimension_mapping: Output of ``map_topics_to_dimensions``.
            context_df: Frame with ``sentence``, ``matches``, ``topic``,
                ``post_id`` and ``created_date_cleaned`` columns.
            base_filename: Path prefix shared by all output files.
        """
        # Save topic-dimension mapping
        mapping_df = pd.DataFrame([
            {
                'Topic': topic_id,
                'Words': ', '.join(info['topic_words']),
                'Matched_Dimensions': ', '.join(info['matched_dimensions']),
                'New_Dimension': info['unknown_dimension']
            }
            for topic_id, info in topic_dimension_mapping.items()
        ])
        mapping_df.to_csv(f"{base_filename}_dimension_mapping.csv", index=False)

        # Save sentences with their topics and matched pain terms
        analysis_df = context_df[['sentence', 'matches', 'topic', 'post_id', 'created_date_cleaned']]
        analysis_df.to_csv(f"{base_filename}_sentences_with_topics.csv", index=False)

        # Count topics by dimension
        dimension_counts = {}
        for info in topic_dimension_mapping.values():
            for dim in info['matched_dimensions']:
                dimension_counts[dim] = dimension_counts.get(dim, 0) + 1

        # Count potentially new dimensions
        new_dims = sum(1 for info in topic_dimension_mapping.values()
                       if info['unknown_dimension'])

        # Group by a standalone month Series rather than adding a 'month_year'
        # column, so the caller's DataFrame is left untouched.
        post_months = (
            pd.to_datetime(context_df['created_date_cleaned'], errors='coerce')
            .dt.to_period('M')
            .rename('month_year')
        )
        temporal_dist = context_df.groupby(post_months).size()

        # Save summary statistics
        with open(f"{base_filename}_analysis_summary.txt", 'w', encoding='utf-8') as f:
            f.write("Pain Topic Analysis Summary\n")
            f.write("=========================\n\n")

            f.write("Topics per Pain Dimension:\n")
            for dim, count in dimension_counts.items():
                f.write(f"{dim}: {count} topics\n")

            f.write(f"\nPotential New Dimensions: {new_dims} topics\n")

            # Temporal analysis
            f.write("\nTemporal Distribution:\n")
            f.write(temporal_dist.to_string())

    def create_topic_model(self,
                          docs: List[str],
                          min_cluster_size: int = 50,
                          n_neighbors: int = 10,
                          n_components: int = 2,
                          min_distance: float = 0.1) -> Tuple["BERTopic", np.ndarray, List[int]]:
        """Embed the sentences and fit a BERTopic model on them.

        Args:
            docs: Sentences to model.
            min_cluster_size: Passed to HDBSCAN as ``min_cluster_size``.
            n_neighbors: Passed to UMAP as ``n_neighbors``.
            n_components: Passed to UMAP as ``n_components``.
            min_distance: Passed to UMAP as ``min_dist``.

        Returns:
            ``(topic_model, embeddings, topics)`` where ``topics`` holds one
            topic id per entry of ``docs``.
        """
        # Embeddings are computed up front so they can be reused for plotting.
        embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
        embeddings = embedding_model.encode(docs, show_progress_bar=True)

        # NOTE(review): no random_state is passed to UMAP, so results presumably
        # vary between runs - confirm whether reproducibility is required.
        umap_model = UMAP(
            n_neighbors=n_neighbors,
            n_components=n_components,
            min_dist=min_distance,
            metric='cosine'
        )

        hdbscan_model = HDBSCAN(
            min_cluster_size=min_cluster_size,
            metric='euclidean',
            cluster_selection_method='eom',
            prediction_data=True
        )

        # Topic representations may contain 1- to 4-word phrases.
        vectorizer_model = CountVectorizer(
            stop_words="english",
            min_df=2,
            ngram_range=(1, 4)
        )

        topic_model = BERTopic(
            embedding_model=embedding_model,
            vectorizer_model=vectorizer_model,
            umap_model=umap_model,
            hdbscan_model=hdbscan_model
        )

        topics, _ = topic_model.fit_transform(docs, embeddings)

        return topic_model, embeddings, topics

    def save_visualizations(self, topic_model: "BERTopic", docs: List[str], embeddings: np.ndarray):
        """Write the BERTopic interactive figures as HTML files in the output folder.

        Args:
            topic_model: Fitted model to visualise.
            docs: Sentences the model was fitted on.
            embeddings: Sentence embeddings aligned with ``docs``.
        """
        base_filename = os.path.join(self.output_folder, "pain_topic_analysis")

        # Save interactive visualizations
        topic_vis = topic_model.visualize_topics()
        topic_vis.write_html(f"{base_filename}_topic_visualization.html")

        doc_vis = topic_model.visualize_documents(docs, embeddings=embeddings)
        doc_vis.write_html(f"{base_filename}_document_visualization.html")

        barchart = topic_model.visualize_barchart(top_n_topics=50)
        barchart.write_html(f"{base_filename}_barchart.html")

        heatmap = topic_model.visualize_heatmap()
        heatmap.write_html(f"{base_filename}_heatmap.html")

        hierarchical_tree = topic_model.visualize_hierarchy()
        hierarchical_tree.write_html(f"{base_filename}_hierarchy.html")

    def calculate_topic_coherence(self, topic_model: "BERTopic", docs: List[str]) -> pd.DataFrame:
        """Calculate coherence scores for topics and add interpretation.

        The outlier topic ``-1`` is excluded. Results are also written to
        ``topic_coherence_scores.csv`` in the output folder.

        Args:
            topic_model: Fitted model whose topics are scored.
            docs: Sentences the model was fitted on.

        Returns:
            DataFrame with ``Topic``, ``Coherence_Score`` and ``Interpretation``.
        """
        # Define interpretation based on coherence score thresholds
        def interpret_coherence(score):
            if score >= 0.7:
                return "Highly Coherent"
            elif score >= 0.4:
                return "Moderately Coherent"
            else:
                return "Low Coherence"

        output_file = os.path.join(self.output_folder, "topic_coherence_scores.csv")

        # Resolve the topic ids once so ids and scores stay aligned.
        topic_ids = [topic_id for topic_id in topic_model.get_topics() if topic_id != -1]
        if not topic_ids:
            # Only the outlier topic exists, so there is nothing to score.
            coherence_df = pd.DataFrame(columns=['Topic', 'Coherence_Score', 'Interpretation'])
            coherence_df.to_csv(output_file, index=False)
            print(f"Coherence scores with interpretations saved to {output_file}")
            return coherence_df

        topic_words = [
            [word for word, _ in topic_model.get_topic(topic_id)]
            for topic_id in topic_ids
        ]

        # Tokenize with the model's own vectorizer analyzer so document tokens
        # match the topic words, which str.split() would not.
        analyzer = topic_model.vectorizer_model.build_analyzer()
        tokenized_docs = [analyzer(doc) for doc in docs]
        dictionary = corpora.Dictionary(tokenized_docs)

        # Calculate coherence
        coherence_model = CoherenceModel(
            topics=topic_words,
            texts=tokenized_docs,
            dictionary=dictionary,
            coherence='c_v'  # Use 'c_v' for human-interpretative coherence
        )
        coherence_scores = coherence_model.get_coherence_per_topic()

        # Combine topic IDs, coherence scores, and interpretations into a DataFrame
        coherence_df = pd.DataFrame({
            'Topic': topic_ids,
            'Coherence_Score': coherence_scores,
            'Interpretation': [interpret_coherence(score) for score in coherence_scores]
        })

        # Save coherence scores to file
        coherence_df.to_csv(output_file, index=False)
        print(f"Coherence scores with interpretations saved to {output_file}")

        return coherence_df

    def analyze_topics(self, topic_model: "BERTopic", context_df: pd.DataFrame,
                      docs: List[str], embeddings: np.ndarray, topics: List[int]):
        """Run the full reporting pipeline for a fitted topic model.

        Saves topic info, visualisations, coherence scores, the dimension
        mapping and a topics-over-time figure. A ``topic`` column is added to
        ``context_df`` in place.

        Args:
            topic_model: Fitted model.
            context_df: Frame whose rows are positionally aligned with ``docs``
                and which has a ``post_date`` column.
            docs: Sentences the model was fitted on.
            embeddings: Sentence embeddings aligned with ``docs``.
            topics: Topic id per entry of ``docs``.
        """
        base_filename = os.path.join(self.output_folder, "pain_topic_analysis")

        topic_info = topic_model.get_topic_info()
        print("\nTopic Information:")
        topic_df = pd.DataFrame(topic_info)
        display(topic_df)
        topic_df.to_csv(f"{base_filename}.csv", index=False)

        self.save_visualizations(topic_model, docs, embeddings)
        context_df['topic'] = topics
        coherence_df = self.calculate_topic_coherence(topic_model, docs)
        print("\nTopic Coherence Scores:")
        display(coherence_df)

        # Representative words per topic, skipping the outlier topic -1.
        topic_words = {
            topic: [word for word, _ in topic_model.get_topic(topic)]
            for topic in sorted(set(topics))
            if topic != -1
        }
        topic_dimension_mapping = self.map_topics_to_dimensions(topic_words)

        self.save_analysis_results(topic_words, topic_dimension_mapping,
                                 context_df, base_filename)

        # Select dated rows with a positional mask. Using the DataFrame's index
        # labels as list positions picks the wrong rows (or raises IndexError)
        # whenever the index is not exactly 0..n-1.
        post_dates = pd.to_datetime(context_df['post_date'], errors='coerce')
        has_date = post_dates.notna().tolist()
        filtered_docs = [doc for doc, keep in zip(docs, has_date) if keep]
        filtered_topics = [topic for topic, keep in zip(topics, has_date) if keep]
        filtered_timestamps = post_dates[post_dates.notna()].tolist()

        if not filtered_docs:
            print("No valid post dates found; skipping topics-over-time visualization.")
            return

        print("Creating dynamic visualization of topic trends over time...")
        topics_over_time = topic_model.topics_over_time(
            filtered_docs, filtered_timestamps, filtered_topics, nr_bins=20)
        timeline_vis = topic_model.visualize_topics_over_time(topics_over_time)
        timeline_vis.write_html(f"{base_filename}_topics_over_time.html")
        print("Dynamic visualization saved successfully.")
        
def main():
    """Run the end-to-end pipeline: load sentences, fit the topic model, write reports."""
    dataset_path = "reddit-data/matched-datasets/pain_lexicon_matched_subreddit_lupus_posts_full.csv"
    pipeline = PainTopicAnalyzer()

    # Load data
    sentence_context, sentences = pipeline.load_and_prepare_data(dataset_path)

    # Fit the model, then hand every artifact to the reporting step.
    fitted_model, sentence_embeddings, assignments = pipeline.create_topic_model(sentences)
    pipeline.analyze_topics(
        fitted_model, sentence_context, sentences, sentence_embeddings, assignments
    )


if __name__ == "__main__":
    main()