### BERTopic

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
import openai
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech

# Read the captions CSV into a DataFrame
data = pd.read_csv("data/Clotho/clotho_captions_development.csv")

# Column names caption_1 ... caption_5 that the later cells iterate over
caption_columns = ["caption_" + str(index) for index in range(1, 6)]

In [None]:
# Sentence-embedding model used to encode the captions
embedding_model = SentenceTransformer(
    "dunzhang/stella_en_400M_v5",
    trust_remote_code=True,
)

# Dimensionality reduction (UMAP) and clustering (HDBSCAN) components
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
hdbscan_model = HDBSCAN(
    min_cluster_size=100,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Bag-of-words vectorizer. NOTE: it is built here but not handed to BERTopic
# below, because the corresponding keyword argument is commented out.
vectorizer_model = CountVectorizer(
    stop_words="english",
    min_df=2,
    ngram_range=(1, 2),
)

# Client for the OpenAI-compatible endpoint at the base_url below
# (Make sure your OpenAI setup is correct)
openai_client = openai.OpenAI(
    base_url='http://172.18.176.1:11434/v1',
    api_key='ollama',  # required, but unused
)

# Prompt template; [DOCUMENTS] and [KEYWORDS] are placeholders inside the text
prompt = """
I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
topic: <topic label>
"""
openai_model = OpenAI(
    client=openai_client,
    model="llama3.1",
    exponential_backoff=True,
    chat=True,
    prompt=prompt,
    diversity=1,
    nr_docs=9,
)

# One entry per representation; the keys become column names in get_topic_info()
# output that the later cells read ('KeyBERT', 'OpenAI', 'MMR', 'POS')
representation_models = dict(
    KeyBERT=KeyBERTInspired(),
    OpenAI=openai_model,
    MMR=MaximalMarginalRelevance(diversity=0.6),
    POS=PartOfSpeech("en_core_web_sm"),
)

# Assemble the BERTopic model; fitting happens in a later cell
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    # vectorizer_model=vectorizer_model,
    representation_model=representation_models,
    top_n_words=10,
    verbose=True,
)

In [None]:
from collections import Counter
# Per caption column: {topic id -> {"Count": ..., "KeyBERT": ..., "OpenAI": ..., "MMR": ..., "POS": ...}}
all_topic_counts = {}

# Representation columns copied out of the topic-info table for every caption column
representation_names = ["KeyBERT", "OpenAI", "MMR", "POS"]
# Value used whenever a topic id has no row in the topic-info table
missing_label = "No topic assigned"

# Fit the model once per caption column. The same `topic_model` object is refitted
# on every pass, so after the loop it (and `topics`) reflect the last column only.
for caption_column in caption_columns:
    # Extract the captions from the current column
    captions = data[caption_column].tolist()

    # Generate embeddings for captions
    embeddings = embedding_model.encode(captions, show_progress_bar=True)

    # Fit the BERTopic model to the current column's captions
    topics, probs = topic_model.fit_transform(captions, embeddings)

    # Topic-info table for this fit, indexed by topic id
    topic_labels = topic_model.get_topic_info()
    topic_info_by_id = topic_labels.set_index('Topic')
    topic_labels_dict = topic_info_by_id['Name'].to_dict()

    # Handle -1 topics (documents not assigned to any topic)
    topic_labels_dict[-1] = "IRRelevant"

    # Build one plain dict per representation up front; looking a topic up in a
    # dict replaces a boolean-mask scan of the table for every single document.
    representation_lookup = {
        name: topic_info_by_id[name].to_dict() for name in representation_names
    }

    # Add topic labels and representations to the original DataFrame
    data[f'{caption_column}_topic'] = [
        topic_labels_dict.get(topic, missing_label) for topic in topics
    ]
    for name in representation_names:
        lookup = representation_lookup[name]
        data[f'{caption_column}_{name}'] = [
            lookup.get(topic, missing_label) for topic in topics
        ]

    # Count the number of occurrences of each topic
    topic_counts = Counter(topics)

    # Store the count and the representations of every topic for this caption column
    all_topic_counts[caption_column] = {
        topic: {
            "Count": count,
            **{
                name: representation_lookup[name].get(topic, missing_label)
                for name in representation_names
            },
        }
        for topic, count in topic_counts.items()
    }

In [None]:
# `topics` holds one topic id per caption (from the last fit in the loop above),
# whereas visualize_barchart expects a selection of topic ids. Pass each id once
# and leave out the -1 outlier bucket; fall back to the library default selection
# when nothing but outliers was found.
fig = topic_model.visualize_barchart(topics=sorted(set(topics) - {-1}) or None)

In [None]:
# Render the bar-chart figure stored in `fig`.
fig.show()

In [None]:
import pandas as pd

# Flatten the nested {caption column -> {topic -> info dict}} mapping into one
# record per (topic, caption column) pair. Building the records directly avoids
# going through a wide frame of dict cells, where topics missing from a caption
# column turn into NaN cells that cannot be expanded with dict().
info_fields = ['Count', 'KeyBERT', 'OpenAI', 'MMR', 'POS']
topic_records = [
    {
        'Topic': topic,
        'Caption Column': column,
        **{field: info[field] for field in info_fields},
    }
    for column, topic_counts in all_topic_counts.items()
    for topic, info in topic_counts.items()
]

# Columns are named explicitly (no positional rename). Rows are ordered by topic;
# the stable sort keeps the caption columns in their original order within a topic.
topic_counts_df = (
    pd.DataFrame(topic_records, columns=['Topic', 'Caption Column', *info_fields])
    .sort_values('Topic', kind='stable')
    .reset_index(drop=True)
)

In [None]:
# `df` is a copy of topic_counts_df; the cells below read from `df`.
df = topic_counts_df.copy()

In [None]:
import pandas as pd

# `df` is expected to have a 'Caption Column' column.
# Distinct values of that column, in order of first appearance
unique_captions = df['Caption Column'].unique()

# Maps each caption-column name to the rows of `df` that belong to it
caption_dfs = {}

for caption in unique_captions:
    belongs_to_caption = df['Caption Column'].eq(caption)
    # Re-number the rows from 0 inside every per-caption frame
    caption_dfs[caption] = df.loc[belongs_to_caption].reset_index(drop=True)

# Each per-caption frame is available by name, e.g. the one for 'caption_1':
df_caption_1 = caption_dfs['caption_1']

In [None]:
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag

# Download necessary NLTK data: tokenizer models, stop words and the tagger used
# by pos_tag. 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the resource
# names newer NLTK releases look up; the classic names cover older installations.
for nltk_resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(nltk_resource)

def _representation_words(value, representation, stop_words):
    """Return the words held in one cell of a representation column.

    Args:
        value: Cell content; either a single string or an iterable of strings.
        representation: Name of the representation column the cell came from.
        stop_words: Lower-cased words to drop from tokenized representations.

    Returns:
        list[str]: Whole entries for 'OpenAI', individual tokens otherwise.
    """
    # A bare string (such as a "No topic assigned" placeholder) is one entry;
    # iterating it directly would split it into single characters.
    entries = [value] if isinstance(value, str) else list(value)
    if representation == 'OpenAI':
        # Labels are kept whole; blank entries must not become an empty "word".
        return [entry for entry in entries if entry.strip()]
    # Only the tokens are needed, so no part-of-speech tagging is performed.
    return [
        token
        for entry in entries
        for token in word_tokenize(entry)
        if token.lower() not in stop_words
    ]


def create_combined_wordcloud(df, ignore_topic='-1', representation='KeyBERT'):
    """Plot a word cloud of one topic representation, weighted by topic size.

    Args:
        df: DataFrame with a 'Topic' column, a 'Count' column and the column
            named by `representation`.
        ignore_topic: Topic whose rows are left out. It is matched against
            'Topic' directly first and by string form second, so both -1 and
            '-1' select the same rows. Pass None to keep every row.
        representation: One of 'KeyBERT', 'OpenAI', 'MMR', 'POS'.

    Raises:
        ValueError: If `ignore_topic` matches no row, if `representation` is
            not one of the supported names, or if no words remain to plot.
    """
    # Validate ignore_topic and representation
    if ignore_topic is None:
        df_filtered = df
    else:
        ignored_rows = df['Topic'] == ignore_topic
        if not ignored_rows.any():
            # Fall back to comparing string forms so '-1' also matches integer ids
            ignored_rows = df['Topic'].astype(str) == str(ignore_topic)
        if not ignored_rows.any():
            raise ValueError(f"Invalid ignore_topic '{ignore_topic}'. It should be one of {df['Topic'].unique()}")
        df_filtered = df[~ignored_rows]
    if representation not in ['KeyBERT', 'OpenAI', 'MMR', 'POS']:
        raise ValueError(f"Invalid representation '{representation}'. It should be one of ['KeyBERT', 'OpenAI', 'MMR', 'POS']")

    stop_words = set(stopwords.words('english'))

    # Add each row's Count to every word it contains; this gives the same totals
    # as repeating the word list Count times without building the long list.
    word_freqs = Counter()
    for cell, weight in zip(df_filtered[representation], df_filtered['Count']):
        for word in _representation_words(cell, representation, stop_words):
            word_freqs[word] += weight

    if not word_freqs:
        raise ValueError(f"No words to plot for representation '{representation}' after ignoring topic '{ignore_topic}'")

    # Generate and display the word cloud
    wc = WordCloud(width=800, height=500, max_font_size=110, background_color='white', colormap='viridis').generate_from_frequencies(word_freqs)
    plt.figure(figsize=(12, 8))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"{representation} Representation")
    plt.show()

In [None]:
# One OpenAI-label word cloud per caption column, leaving out topic -1
for caption_name in ('caption_1', 'caption_2', 'caption_3', 'caption_4', 'caption_5'):
    create_combined_wordcloud(caption_dfs[caption_name], ignore_topic=-1, representation='OpenAI')

In [None]:
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
import numpy as np
import matplotlib as mpl

# Download necessary NLTK data: tokenizer models, stop words and the tagger used
# by pos_tag. 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the resource
# names newer NLTK releases look up; the classic names cover older installations.
for nltk_resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(nltk_resource)

def _representation_words(value, representation, stop_words):
    """Return the words held in one cell of a representation column.

    Args:
        value: Cell content; either a single string or an iterable of strings.
        representation: Name of the representation column the cell came from.
        stop_words: Lower-cased words to drop from tokenized representations.

    Returns:
        list[str]: Whole entries for 'OpenAI', individual tokens otherwise.
    """
    # A bare string (such as a "No topic assigned" placeholder) is one entry;
    # iterating it directly would split it into single characters.
    entries = [value] if isinstance(value, str) else list(value)
    if representation == 'OpenAI':
        # Labels are kept whole; blank entries must not become an empty "word".
        return [entry for entry in entries if entry.strip()]
    # Only the tokens are needed, so no part-of-speech tagging is performed.
    return [
        token
        for entry in entries
        for token in word_tokenize(entry)
        if token.lower() not in stop_words
    ]


def create_combined_wordcloud(df, ignore_topic='-1', representation='KeyBERT'):
    """Plot a frequency-coloured word cloud of one topic representation with a colour bar.

    This notebook defines a function with this name more than once; calls made
    after this cell has run use this definition.

    Args:
        df: DataFrame with a 'Topic' column, a 'Count' column and the column
            named by `representation`.
        ignore_topic: Topic whose rows are left out. It is matched against
            'Topic' directly first and by string form second, so both -1 and
            '-1' select the same rows. Pass None to keep every row.
        representation: One of 'KeyBERT', 'OpenAI', 'MMR', 'POS'.

    Raises:
        ValueError: If `ignore_topic` matches no row, if `representation` is
            not one of the supported names, or if no words remain to plot.
    """
    # Validate ignore_topic and representation
    if ignore_topic is None:
        df_filtered = df
    else:
        ignored_rows = df['Topic'] == ignore_topic
        if not ignored_rows.any():
            # Fall back to comparing string forms so '-1' also matches integer ids
            ignored_rows = df['Topic'].astype(str) == str(ignore_topic)
        if not ignored_rows.any():
            raise ValueError(f"Invalid ignore_topic '{ignore_topic}'. It should be one of {df['Topic'].unique()}")
        df_filtered = df[~ignored_rows]
    if representation not in ['KeyBERT', 'OpenAI', 'MMR', 'POS']:
        raise ValueError(f"Invalid representation '{representation}'. It should be one of ['KeyBERT', 'OpenAI', 'MMR', 'POS']")

    stop_words = set(stopwords.words('english'))

    # Add each row's Count to every word it contains; this gives the same totals
    # as repeating the word list Count times without building the long list.
    word_freqs = Counter()
    for cell, weight in zip(df_filtered[representation], df_filtered['Count']):
        for word in _representation_words(cell, representation, stop_words):
            word_freqs[word] += weight

    if not word_freqs:
        raise ValueError(f"No words to plot for representation '{representation}' after ignoring topic '{ignore_topic}'")

    # Shared colour scale for the words and the colour bar
    color_map = plt.get_cmap('viridis')
    lowest, highest = min(word_freqs.values()), max(word_freqs.values())
    norm = mpl.colors.Normalize(vmin=lowest, vmax=highest)

    def frequency_color(word, **_layout_details):
        # Colour every word by its own frequency so that the colour bar describes
        # the cloud; a plain colormap would hand out colours at random.
        red, green, blue, _alpha = color_map(float(norm(word_freqs.get(word, lowest))))
        return f"rgb({int(red * 255)}, {int(green * 255)}, {int(blue * 255)})"

    # Generate the word cloud
    wc = WordCloud(width=800, height=500, max_font_size=110, background_color='white', color_func=frequency_color).generate_from_frequencies(word_freqs)

    # Create figure and axis
    fig, ax = plt.subplots(1, 2, figsize=(14, 8))

    # Plot the word cloud
    ax[0].imshow(wc, interpolation='bilinear')
    ax[0].axis('off')
    ax[0].set_title(f"{representation} Representation")

    # Create color bar
    sm = plt.cm.ScalarMappable(cmap=color_map, norm=norm)
    sm.set_array([])  # Only needed for older versions of Matplotlib

    # Plot the color bar; the host axes only carries the title, so its frame is hidden
    cbar = fig.colorbar(sm, ax=ax[1], orientation='vertical', fraction=0.02, pad=0.04)
    cbar.set_label('Weighted word frequency')
    ax[1].axis('off')
    ax[1].set_title('Word Frequency Color Bar')

    plt.show()


In [None]:
# ### Common Sounds
# Unique topics derived from the captions:

# 1. **Human Activity**: talking, people, speaking, walking, running, chatting, conversing, footsteps, crowd, conversations
# 2. **Machinery**: machine, machinery, whirring, factory, grinding, mechanical, motor, industrial
# 3. **Bird and Insect Sounds**: chirping, chirps, chirp, birds, squawking, crickets, bird, crows, quacking, singing, buzzing
# 4. **Water Sounds**: shower, waterfall, faucet, water, splashing, splashes, sink, flowing, fountain, dripping, pouring
# 5. **Vehicle Engines**: revving, car, cars, engine, engines, vehicle, vehicles, driving, motor, drive
# 6. **Rain and Storm**: downpour, raining, rain, rainstorm, rainfall, thunderstorm, raindrops, torrential, thunder, hail, storm
# 7. **Crowd Noise**: crowd, crowded, talk, talks, conversations, chatter, speaks
# 8. **Footsteps**: walking, walks, walk, walked, footsteps, stepping, hiking, shoes, boots, leaves, snow, underfoot
# 9. **Door Sounds**: door, doors, hinges, opened, opens, creaking, opening, creaks, open, hinge, banging
# 10. **Train and Railway Sounds**: train, trains, locomotive, railway, railroad, rail, tracks, whistle, track, squealing, sounds, station, passing, clacking, trolley, honking
# 11. **Wind and Ocean Sounds**: windy, wind, waves, blowing, ocean, breeze, beach, shore, blows, storm, sea, gusts, whistling, gust
# 12. **General Noises and Loud Sounds**: noises, noise, sounds, sound, loud, louder, tapping, metallic, blaring, sound, while, they
# 13. **Water Flowing Sounds**: water, faucet, waterfall, splashing, pouring, sink, fountain, flowing, splashes, dripping, drips
# 14. **Bird and Wind Sounds**: chirping, chirps, birds, chirp, whistling, bird, squawking, wind, crickets, chimes

# This list combines the distinct topics from all provided captions.