In [None]:
# Clone the repository whose files (stop-word list, CSV datasets) are read from
# /content/Multi-environment-Topic-Models/ by the cells below.
!git clone https://github.com/anonymousindividual007/Multi-environment-Topic-Models

In [None]:
import numpy as np
import pandas as pd
import itertools as it
import math
import csv

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import torch.distributions as dist
from torch.utils.data import TensorDataset, DataLoader
from torch.distributions import Normal, Distribution, HalfCauchy, Laplace

import nltk
nltk.download('punkt')
from collections import Counter
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from scipy.sparse import csr_matrix

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

The political_stopwords.txt is used for preprocessing in all of our experiments.


In [None]:
# Stop-word list shipped inside the cloned repository (one entry per line).
file_path = "/content/Multi-environment-Topic-Models/political_stopwords.txt"

# Keep the raw lines around, then strip surrounding whitespace/newlines from each entry.
with open(file_path, 'r') as file:
    stopwords_list = list(file)

all_stopwords = list(map(str.strip, stopwords_list))

In [None]:
class LemmaTokenizer:
    """Callable tokenizer: splits a document with NLTK and keeps alphabetic tokens only."""

    def __init__(self):
        # NOTE(review): the lemmatizer is instantiated here but __call__ never applies it,
        # so despite the class name no lemmatization takes place.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the tokens of `doc` for which `str.isalpha` is true, in order."""
        tokens = word_tokenize(doc)
        return list(filter(str.isalpha, tokens))

To use your own data, replace the file path. Ensure there is a column called 'source' which indicates the environments of your dataset. The cell below loads the data for the Political Advertisements experiment.

In [None]:
file_path = '/content/Multi-environment-Topic-Models/local_channels.csv'

train_data = pd.read_csv(file_path)

# Hold out 20% of each environment ('right' -> test1, 'left' -> test2) with a fixed seed.
is_right = train_data['source'] == 'right'
is_left = train_data['source'] == 'left'
test1 = train_data[is_right].sample(frac=0.2, random_state=42)
test2 = train_data[is_left].sample(frac=0.2, random_state=42)

# Whatever was sampled into a test split is removed from the training frame.
train_data = train_data.drop(test1.index).drop(test2.index)

The data in the cell below is for the ideology dataset.


In [None]:
# train_data= pd.read_csv('/content/Multi-environment-Topic-Models/channels_ideology_train.csv')
# channels_ideology_test = pd.read_csv('/content/Multi-environment-Topic-Models/channels_ideology_test.csv')
# test1 = channels_ideology_test[channels_ideology_test['source'] == 'Republican']
# test2 = channels_ideology_test[channels_ideology_test['source'] == 'Democratic']
# test3 = channels_ideology_test[channels_ideology_test['source'] == 'balanced']


The code below represents the preprocessing for the Style dataset.

In [None]:
# Specify the path to the zip file and the name of the CSV file inside it
# zip_file_path = '/content/Multi-environment-Topic-Models/style_train_large.csv.zip'
# csv_file_name = 'style_train_large.csv'  # Change this if the CSV file has a different name inside the zip

# # Specify the temporary directory to extract the CSV file
# temp_dir = '/content/temp_dir'

# # Create a temporary directory if it doesn't exist
# if not os.path.exists(temp_dir):
#     os.makedirs(temp_dir)

# # Extract the CSV file
# with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
#     zip_ref.extract(csv_file_name, temp_dir)

# # Full path to the extracted CSV file
# csv_file_path = os.path.join(temp_dir, csv_file_name)

# # Load the CSV file into a Pandas DataFrame
# train_data = pd.read_csv(csv_file_path, encoding='ISO-8859-1')
# style_test_df = pd.read_csv('/content/Multi-environment-Topic-Models/style_test.csv', encoding='ISO-8859-1')


In [None]:
# # 2. Map the 'source' values to environments
# env_map = {'articles': 'env_0', 'speeches': 'env_1', 'tweets': 'env_2'}
# style_test_df['source'] = style_test_df['source'].map(env_map)

# # Count number of 'articles' and 'speeches' in train_data
# num_articles = len(train_data[train_data['source'] == 'articles'])
# num_speeches = len(train_data[train_data['source'] == 'speeches'])
# num_tweets = len(train_data[train_data['source'] == 'tweets'])


# # Determine the lesser count
# min_count = min(num_articles, num_speeches, num_tweets)

# # Randomly sample that many from both sources
# sampled_articles = train_data[train_data['source'] == 'articles'].sample(min_count, random_state=42)
# sampled_speeches = train_data[train_data['source'] == 'speeches'].sample(min_count, random_state=42)
# sampled_tweets = train_data[train_data['source'] == 'tweets'].sample(min_count, random_state=42)

# # Combine the two sampled dataframes to create a balanced train_data
# train_data = pd.concat([sampled_articles, sampled_speeches, sampled_tweets], ignore_index=True)

# #call it combined for the ood test
# # combined_data = pd.concat([sampled_articles, sampled_speeches, sampled_tweets], ignore_index=True)

# # # no tweets, but test on tweets
# # train_data = pd.concat([sampled_articles, sampled_speeches], ignore_index=True)

# # Now, map the 'source' values to the environments (assuming env_map is already defined)

# train_data['source'] = train_data['source'].map(env_map)

# test1 = style_test_df[style_test_df['source'] == 'env_0']
# test2 = style_test_df[style_test_df['source'] == 'env_1']
# test3 = style_test_df[style_test_df['source'] == 'env_2']


Creating environment covariates

In [None]:
# Give every distinct 'source' value an integer id, in order of first appearance.
env_mapping = {source: idx for idx, source in enumerate(train_data['source'].unique())}

num_docs, num_envs = len(train_data), len(env_mapping)

# One-hot indicators: row d carries a single 1 in the column of document d's environment.
env_ids = np.asarray([env_mapping[source] for source in train_data['source']], dtype=int)
env_index_matrix = np.zeros((num_docs, num_envs), dtype=int)
env_index_matrix[np.arange(num_docs), env_ids] = 1

env_index_tensor = torch.from_numpy(env_index_matrix).float().to(device)

Preprocessing the ideology and channels dataset

In [None]:
# Unigram bag-of-words; terms are dropped when they occur in more than 40% of the
# documents or in fewer than 0.06% of them.
vectorizer = CountVectorizer(
    tokenizer=LemmaTokenizer(),
    ngram_range=(1, 1),
    stop_words=all_stopwords,
    max_df=0.4,
    min_df=0.0006,
)

docs_word_matrix_raw = vectorizer.fit_transform(train_data['text'])

# Densify the sparse count matrix and move it to the training device as float32.
docs_word_matrix_tensor = (
    torch.from_numpy(docs_word_matrix_raw.toarray())
    .float()
    .to(device)
)

Uncomment the code below to preprocess the IID style data.

In [None]:
#style tok iid
# vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), ngram_range=(1, 1), stop_words=all_stopwords, max_df=0.5, min_df=0.006)


# docs_word_matrix_raw = vectorizer.fit_transform(train_data['text'])
# docs_word_matrix_tensor = torch.from_numpy(docs_word_matrix_raw.toarray()).float().to(device)

Uncomment the code below to preprocess the OOD data.

In [None]:

# vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(),
#                              ngram_range=(1, 1),
#                              stop_words=all_stopwords,
#                              max_df=0.5,
#                              min_df=0.006)

# # Fit the vectorizer on the combined dataset
# vectorizer.fit(train_data['text'])

# # Only transform the train_data['text'] without fitting again
# docs_word_matrix_raw = vectorizer.transform(train_data['text'])

# env_mapping = {value: index for index, value in enumerate(train_data['source'].unique())}
# env_index = train_data['source'].apply(lambda x: env_mapping[x])

# docs_word_matrix_tensor = torch.from_numpy(docs_word_matrix_raw.toarray()).float().to(device)
# env_index_tensor = torch.from_numpy(env_index.to_numpy()).long().to(device)


In [None]:
class EnvTM(nn.Module):
    """Topic model with global topic-word weights plus per-environment offsets.

    Learnable pieces visible in this class:
      * ``beta`` / ``beta_logvar``: mean and log-variance of the global topic-word
        weights, shape ``[num_topics, num_words]``.
      * ``gamma``: mean of the environment-specific offsets, shape
        ``[num_envs, num_topics, num_words]``.
      * ``theta_global_net``: encoder mapping a bag-of-words row to the mean and
        log-variance of the document-topic Gaussian (``2 * num_topics`` outputs).

    ``sigma``, ``gamma_logvar`` and ``gamma_prior`` are built once in ``__init__`` from
    a single Gamma draw and are plain tensors/distributions, not registered parameters
    or buffers.
    """

    def __init__(self, num_topics, num_words, num_envs, device='cpu', empirical_bayes=True):
        """Create parameters, priors and the encoder network.

        Args:
            num_topics: number of topics.
            num_words: vocabulary size.
            num_envs: number of environments (first axis of ``gamma``).
            device: device on which the parameter and prior tensors are created.
                The encoder ``theta_global_net`` is built without a device argument.
            empirical_bayes: if True, ``log_alpha_a`` / ``log_alpha_b`` are learnable
                ``nn.Parameter`` scalars initialised to 1.0; otherwise they are the
                fixed tensors hard-coded below.
        """
        super(EnvTM, self).__init__()

        def init_param(shape):
            """Parameter of the given shape drawn from a standard normal."""
            return nn.Parameter(torch.randn(shape, device=device))

        def init_param_zeros(shape):
            """Parameter of the given shape filled with zeros."""
            return nn.Parameter(torch.zeros(shape, device=device))

        self.num_topics, self.num_words, self.num_envs = num_topics, num_words, num_envs

        # Global topic-word weights: variational mean, log-variance and a N(0, 1) prior.
        self.beta = init_param([num_topics, num_words])
        self.beta_logvar = init_param_zeros([num_topics, num_words])
        self.beta_prior = Normal(torch.zeros([num_topics, num_words], device=device), torch.ones([num_topics, num_words], device=device))

        if empirical_bayes:
            self.log_alpha_a = nn.Parameter(torch.tensor(1.0, device=device))
            self.log_alpha_b = nn.Parameter(torch.tensor(1.0, device=device))
        else:
            # alpha_a_fixed = torch.tensor(4.0, device=device) #Ideology dataset
            # alpha_b_fixed = torch.tensor(0.11, device=device) #Ideology dataset

            # alpha_a_fixed = torch.tensor(3.8, device=device) #general model local channels dataset (15ep)
            # alpha_b_fixed = torch.tensor(0.13, device=device) #general model local channels dataset (15ep)

            # alpha_a_fixed = torch.tensor(3.7, device=device) #general model style dataset (50ep)
            # alpha_b_fixed = torch.tensor(0.34, device=device) #general model style dataset (50ep)

            # alpha_a_fixed = torch.tensor(2.87, device=device) #style tr:ads, articles, test: tweets dataset (30ep)
            # alpha_b_fixed = torch.tensor(0.25, device=device) #style  dataset (30ep)

            alpha_a_fixed = torch.tensor(2.92, device=device) #oodtrain:speeches,articles  (50ep)
            alpha_b_fixed = torch.tensor(0.25, device=device) #train:speeches,articles  (50ep)

            # alpha_a_fixed = torch.tensor(2.87, device=device) #oodtr:ads, articles, test: tweets dataset (30ep)
            # alpha_b_fixed = torch.tensor(0.25, device=device) #general model style dataset (30ep)

            # NOTE(review): values named alpha_* are stored under log_alpha_* and are
            # exponentiated on the next statement — confirm which scale is intended.
            self.log_alpha_a = alpha_a_fixed
            self.log_alpha_b = alpha_b_fixed

        # Single reparameterised Gamma draw made at construction time. With learnable
        # log_alpha_* this tensor stays attached to their autograd graph.
        # NOTE(review): exp() is used here, whereas empirical_bayes_update maps the same
        # attributes through softplus.
        self.sigma = torch.distributions.Gamma(torch.exp(self.log_alpha_a), torch.exp(self.log_alpha_b)).rsample([num_envs, num_topics, num_words])

        # Initialize gamma with variance given by the inverse of sigma
        self.gamma = init_param_zeros([num_envs, num_topics, num_words])
        # As written this evaluates to -(log(sigma) + 1e-8): the epsilon is added to the
        # logarithm, not to sigma.
        self.gamma_logvar = -torch.log(self.sigma).add(1e-8)
        self.gamma_prior = Normal(torch.zeros_like(self.gamma), torch.sqrt(1.0/self.sigma).add(1e-8))

        # Global Theta, θ_{d} ~ 𝒩(·,·)
        self.theta_global_prior = Normal(torch.zeros(num_topics, device=device), torch.ones(num_topics, device=device))

        # Encoder: bag-of-words -> [mu, logvar] of the document-topic Gaussian.
        self.theta_global_net = nn.Sequential(
            nn.Linear(num_words, 50),
            nn.BatchNorm1d(50),
            nn.ReLU(),
            nn.Linear(50, num_topics * 2)
        )


    def forward(self, bow, x_d):
        """Return per-document word probabilities.

        Args:
            bow: 2-D tensor unpacked as (batch_size, vocab_size) and fed to the encoder.
            x_d: tensor contracted with gamma via ``'be,etv->btv'``, i.e. indexed as
                (batch, environment).

        Returns:
            ``eta_d`` of shape (batch, vocab): for each document, the topic-proportion
            weighted mixture of the softmaxed, environment-adjusted topic rows.

        Side effect: the raw encoder output is cached on ``self.theta_global_params``,
        which ``calculate_kl_divergences`` reads afterwards.
        """
        batch_size, vocab_size = bow.size()

        self.theta_global_params = self.theta_global_net(bow)
        theta_global_mu, theta_global_logvar = self.theta_global_params.split(self.num_topics, dim=-1)
        theta_global_logvar = theta_global_logvar.add(1e-8)
        # Reparameterised sample of the document-topic logits, then softmax to proportions.
        theta_sample = Normal(theta_global_mu, torch.exp(0.5 * theta_global_logvar).add(1e-8)).rsample()
        theta_softmax = F.softmax(theta_sample, dim=-1)

        beta_dist = Normal(self.beta, torch.exp(0.5 * self.beta_logvar).add(1e-8))
        # NOTE(review): beta_sample is drawn but not used below; adjusted_beta is built
        # from the mean self.beta. The draw still advances the RNG.
        beta_sample = beta_dist.rsample()

        gamma_dist = Normal(self.gamma, torch.exp(0.5 * self.gamma_logvar).add(1e-8))
        gamma_sample = gamma_dist.rsample()
        # Weight the per-environment offsets by each document's environment row.
        gamma_effect = torch.einsum('be,etv->btv', x_d, gamma_sample)

        adjusted_beta = self.beta.unsqueeze(0) + gamma_effect
        adjusted_beta_softmax = F.softmax(adjusted_beta, dim=-1)
        eta_d = torch.einsum('bt,btv->bv', theta_softmax, adjusted_beta_softmax)

        return eta_d

In [None]:
def calculate_kl_divergences(EnvTM, env, empirical_bayes=True):
    """Summed KL terms between the model's variational Gaussians and their priors.

    Args:
        EnvTM: model instance; ``theta_global_params`` must already hold the encoder
            output cached by the latest forward pass.
        env: accepted for interface compatibility; not read by this function.
        empirical_bayes: when True the gamma term is skipped and returned as ``0``.

    Returns:
        ``(theta_global_kl, beta_kl, gamma_kl)``; the first two are scalar tensors,
        the third is a scalar tensor or the integer ``0``.
    """
    kl_divergence = torch.distributions.kl.kl_divergence

    # Document-topic term, rebuilt from the cached encoder output.
    mu, logvar = EnvTM.theta_global_params.split(EnvTM.num_topics, dim=-1)
    logvar = logvar.add(1e-8)
    theta_posterior = Normal(mu, torch.exp(0.5 * logvar).add(1e-8))
    theta_global_kl = kl_divergence(theta_posterior, EnvTM.theta_global_prior).sum()

    # Global topic-word term.
    beta_posterior = Normal(EnvTM.beta, torch.exp(0.5 * EnvTM.beta_logvar))
    beta_kl = kl_divergence(beta_posterior, EnvTM.beta_prior).sum()

    # Environment-offset term (only outside empirical-Bayes mode).
    if empirical_bayes:
        gamma_kl = 0
    else:
        gamma_posterior = Normal(EnvTM.gamma, torch.exp(0.5 * EnvTM.gamma_logvar))
        gamma_kl = kl_divergence(gamma_posterior, EnvTM.gamma_prior).sum()

    return theta_global_kl, beta_kl, gamma_kl


In [None]:
def bbvi_update(minibatch, env_index, EnvTM, optimizer, n_samples):
    """Run one stochastic-gradient step on the (negative) ELBO for a minibatch.

    Args:
        minibatch: bag-of-words counts for the documents in the batch.
        env_index: environment indicators for the same documents; forwarded to the model.
        EnvTM: the model being trained. The KL terms are computed on this object (the
            function previously read the notebook-level global ``env_tm_model`` instead,
            which only worked when that global happened to be the same model).
        optimizer: optimizer stepped after the backward pass.
        n_samples: factor by which the per-document log-likelihood is multiplied.

    Returns:
        The accumulated ELBO for the batch as a Python float.
    """
    optimizer.zero_grad()
    elbo_accumulator = torch.zeros(1, device=minibatch.device)

    # Forward pass; this also caches the encoder output that the KL computation reads.
    z = EnvTM(minibatch, env_index)

    # The gamma KL term is always included here (empirical_bayes=False).
    kl_theta, kl_beta, kl_gamma = calculate_kl_divergences(EnvTM, env_index, empirical_bayes=False)

    elbo = (minibatch * z.log()).sum(-1).mul(n_samples).sub(kl_theta + kl_beta + kl_gamma)
    elbo_accumulator += elbo.sum()

    # retain_graph=True: tensors built once in the model constructor (gamma_logvar and the
    # gamma prior) can carry an autograd graph that is traversed again on every call.
    (-elbo_accumulator).backward(retain_graph=True)
    optimizer.step()

    return elbo_accumulator.item()

In [None]:
def empirical_bayes_update(EnvTM, optimizer_hyper, empirical_bayes=True, num_epochs_hyper=2, kl_threshold=1e-5):
    """Empirical Bayes update for the hyperparameters of the Gamma distribution.

    The Gamma concentration and rate are ``softplus(log_alpha_a)`` and
    ``softplus(log_alpha_b)``. Each inner step draws a fresh precision sample, builds
    the Gaussian prior on gamma from it and takes one optimizer step that *reduces*
    KL(q(gamma) || prior). The KL enters the ELBO with a negative sign, so maximising
    the ELBO over the hyperparameters means minimising this term; the earlier
    ``(-gamma_kl).backward()`` pushed the hyperparameters in the opposite direction.

    Args:
        EnvTM: model exposing ``log_alpha_a``, ``log_alpha_b``, ``gamma``,
            ``gamma_logvar``, ``num_envs``, ``num_topics`` and ``num_words``.
        optimizer_hyper: optimizer over the two hyperparameters (unused when
            ``empirical_bayes`` is False).
        empirical_bayes: if False, overwrite the hyperparameters with fixed values
            and return without optimising.
        num_epochs_hyper: maximum number of hyperparameter steps per call.
        kl_threshold: stop early once the KL changes by less than this between steps.

    Returns:
        None. The model's hyperparameters are updated in place.
    """

    if not empirical_bayes:
        # Store the fixed values in the softplus parameterisation used below, i.e. as
        # softplus^{-1}(alpha) = log(exp(alpha) - 1), so that softplus(log_alpha_*)
        # gives alpha back. The earlier log(alpha - 1) is NaN for alpha < 1 (0.29).
        # NOTE(review): assumes 3.1 / 0.29 are the intended Gamma concentration / rate.
        alpha_a = torch.tensor(3.1, device=EnvTM.log_alpha_a.device)
        alpha_b = torch.tensor(0.29, device=EnvTM.log_alpha_b.device)
        EnvTM.log_alpha_a = torch.log(torch.expm1(alpha_a))
        EnvTM.log_alpha_b = torch.log(torch.expm1(alpha_b))
        return

    previous_gamma_kl = float('inf')

    for _ in range(num_epochs_hyper):
        optimizer_hyper.zero_grad()

        concentration = torch.nn.functional.softplus(EnvTM.log_alpha_a)
        rate = torch.nn.functional.softplus(EnvTM.log_alpha_b)
        sigma_sample = torch.distributions.Gamma(concentration, rate).rsample([EnvTM.num_envs, EnvTM.num_topics, EnvTM.num_words])

        # Prior on gamma: zero mean, standard deviation 1/sqrt(sampled precision).
        gamma_prior = Normal(torch.zeros_like(EnvTM.gamma), torch.sqrt(1.0/sigma_sample).add(1e-8))

        gamma = Normal(EnvTM.gamma, torch.exp(0.5 * EnvTM.gamma_logvar))
        gamma_kl = torch.distributions.kl.kl_divergence(gamma, gamma_prior).sum()

        current_gamma_kl = gamma_kl.item()
        # On the first pass the difference to +inf is inf, so the check cannot fire.
        if abs(current_gamma_kl - previous_gamma_kl) < kl_threshold:
            print("Early stopping of hyperparameter updates based on gamma KL divergence stability.")
            break

        # Minimise the KL. retain_graph=True because gamma_logvar may still reference
        # the graph built in the model constructor, which is reused on every call.
        gamma_kl.backward(retain_graph=True)
        optimizer_hyper.step()

        previous_gamma_kl = current_gamma_kl

In [None]:
def train_model(EnvTM, docs_word_matrix_tensor, env_index_tensor, num_epochs=80, minibatch_size=16, lr=0.01, empirical_bayes=True):
    """Train the model with minibatch updates, printing the average ELBO per epoch.

    Args:
        EnvTM: model to train; moved to the notebook-level ``device``.
        docs_word_matrix_tensor: document-term count matrix, one row per document.
        env_index_tensor: environment indicators, row-aligned with the documents.
        num_epochs: number of passes over the data.
        minibatch_size: number of documents per gradient step.
        lr: learning rate for both Adam optimizers.
        empirical_bayes: if True, run ``empirical_bayes_update`` after every epoch.

    Returns:
        None.
    """
    EnvTM = EnvTM.to(device)
    # The evaluation helpers in this notebook switch the module to eval mode; make sure
    # training (e.g. BatchNorm statistics) is active again when this cell is re-run.
    EnvTM.train()

    optimizer = torch.optim.Adam(EnvTM.parameters(), lr=lr, betas=(0.9, 0.999))
    optimizer_hyper = torch.optim.Adam([EnvTM.log_alpha_a, EnvTM.log_alpha_b], lr=lr, betas=(0.9, 0.999))

    docs_word_matrix_tensor = docs_word_matrix_tensor.to(device)
    env_index_tensor = env_index_tensor.to(device)
    num_docs = docs_word_matrix_tensor.size()[0]

    for epoch in range(num_epochs):
        elbo_accumulator = 0.0
        num_batches = 0
        # Fresh random document order every epoch.
        permutation = torch.randperm(num_docs)

        for start in range(0, num_docs, minibatch_size):
            indices = permutation[start:start + minibatch_size]
            minibatch = docs_word_matrix_tensor[indices]
            minibatch_env_index = env_index_tensor[indices]

            elbo_accumulator += bbvi_update(minibatch, minibatch_env_index, EnvTM, optimizer, num_docs)
            num_batches += 1

        if empirical_bayes:
            empirical_bayes_update(EnvTM, optimizer_hyper)

        # Average over the batches actually processed. Dividing by
        # num_docs / minibatch_size (a fraction) overstated the average whenever the
        # last batch was partial.
        avg_elbo = elbo_accumulator / max(num_batches, 1)

        print(f'Epoch: {epoch+1}, Average ELBO: {avg_elbo}')


In [None]:
# Toggle between learning the Gamma hyperparameters (True) and using fixed ones (False).
empirical_bayes = False

num_topics = 20
# Take the number of environments from the data (env_mapping is built from the distinct
# 'source' values) instead of hard-coding 2, so that datasets with three environments
# produce a gamma tensor that matches the one-hot environment matrix.
num_envs = len(env_mapping)

num_epochs = 15 if empirical_bayes else 150

In [None]:
# Vocabulary size comes from the fitted vectorizer.
env_tm_model = EnvTM(num_topics=num_topics, num_words=len(vectorizer.get_feature_names_out()), num_envs=num_envs, device=device, empirical_bayes=empirical_bayes)

# Forward the empirical_bayes flag: without it train_model falls back to its default
# (True) and runs the hyperparameter update even when the model was built with fixed ones.
train_model(env_tm_model, docs_word_matrix_tensor, env_index_tensor, num_epochs=num_epochs, minibatch_size=1024, lr=0.01, empirical_bayes=empirical_bayes)

In [None]:
def softplus(x):
    """Return log(1 + exp(x)) for a Python number, without overflowing.

    Uses the identity softplus(x) = max(x, 0) + log1p(exp(-|x|)); the exponent is
    never positive, so large inputs no longer raise OverflowError from math.exp.
    """
    return max(x, 0.0) + math.log1p(math.exp(-abs(x)))

# Report the Gamma hyperparameters: mapped through softplus when they were learned,
# raw stored values otherwise.
if empirical_bayes:
    alpha_a_softplus = softplus(env_tm_model.log_alpha_a.item())
    alpha_b_softplus = softplus(env_tm_model.log_alpha_b.item())
    print(f"After Training (softplus): log_alpha_a = {alpha_a_softplus}, log_alpha_b = {alpha_b_softplus}")
else:
    alpha_a = env_tm_model.log_alpha_a.item()
    alpha_b = env_tm_model.log_alpha_b.item()
    print(alpha_a)
    print(alpha_b)


In [None]:
# Project the held-out test2 documents onto the vocabulary fitted on the training data.
test_data_word_matrix_raw = vectorizer.transform(test2['text'])
test_data_word_matrix_tensor = (
    torch.from_numpy(test_data_word_matrix_raw.toarray())
    .float()
    .to(device)
)


In [None]:
def evaluate_model(env_tm_model, test_data_word_matrix_tensor):
    """Perplexity of the test documents under the global topics only (no gamma offsets).

    The model is moved to CUDA when available (otherwise CPU) and left in eval mode.
    Topic proportions come from one reparameterised draw of the encoder's Gaussian.

    Returns:
        ``(perplexity, theta_test_softmax)``: a scalar tensor and the per-document
        topic proportions.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    env_tm_model.to(device)
    env_tm_model.eval()

    with torch.no_grad():
        # Encoder output -> mean / log-variance halves.
        encoder_out = env_tm_model.theta_global_net(test_data_word_matrix_tensor)
        mu, logvar = encoder_out.split(env_tm_model.num_topics, dim=-1)
        std = torch.exp(0.5 * logvar).add(1e-8)
        theta_test_softmax = F.softmax(Normal(mu, std).rsample(), dim=-1)

        # Per-document word probabilities: topic proportions times topic-word probabilities.
        topic_word_probs = F.softmax(env_tm_model.beta.to(device), dim=-1)
        likelihood = torch.mm(theta_test_softmax, topic_word_probs)

        # exp of the negative count-weighted mean log-probability.
        total_tokens = torch.sum(test_data_word_matrix_tensor)
        weighted_log_prob = torch.log(likelihood) * test_data_word_matrix_tensor
        perplexity = torch.exp(-torch.sum(weighted_log_prob) / total_tokens)

    return perplexity, theta_test_softmax

def evaluate_model_with_gamma_per_env(env_tm_model, test_data_word_matrix_tensor, env_index=0):
    """Perplexity of the test documents when one environment's gamma offsets are added.

    Args:
        env_tm_model: trained model exposing ``theta_global_net``, ``num_topics``,
            ``beta`` and ``gamma``.
        test_data_word_matrix_tensor: document-term counts of the test documents.
        env_index: which environment's learned offsets (``gamma[env_index]``) to add to
            the global topics. Defaults to 0, the environment that was previously
            hard-coded, so existing calls behave as before.

    Returns:
        Scalar tensor with the perplexity.

    The model is moved to CUDA when available (otherwise CPU) and left in eval mode.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    env_tm_model.to(device)
    env_tm_model.eval()

    with torch.no_grad():
        # One reparameterised draw of the topic proportions from the encoder's Gaussian.
        theta_test_params = env_tm_model.theta_global_net(test_data_word_matrix_tensor)
        theta_test_mu, theta_test_logvar = theta_test_params.split(env_tm_model.num_topics, dim=-1)
        theta_test_dist = Normal(theta_test_mu, torch.exp(0.5 * theta_test_logvar).add(1e-8))
        theta_test = theta_test_dist.rsample()
        theta_test_softmax = F.softmax(theta_test, dim=-1)

        # Mean offsets learned for the requested environment.
        gamma_learned = env_tm_model.gamma[env_index]

        beta_gamma_test_softmax = F.softmax(env_tm_model.beta.to(device) + gamma_learned, dim=-1)
        # Per-document word probabilities (a likelihood, not yet in log space).
        likelihood = torch.mm(theta_test_softmax, beta_gamma_test_softmax)
        N = torch.sum(test_data_word_matrix_tensor)
        log_perplex = -torch.sum(torch.log(likelihood) * test_data_word_matrix_tensor) / N
        perplexity = torch.exp(log_perplex)

    return perplexity


In [None]:
# Both helpers draw a random sample of the topic proportions, so the reported numbers
# can vary between runs; the call order below determines the order of those draws.
perplexity, theta_test_softmax = evaluate_model(env_tm_model, test_data_word_matrix_tensor)
# Same test documents, scored with environment 0's gamma offsets added to the global topics.
perplexities_by_env = evaluate_model_with_gamma_per_env(env_tm_model, test_data_word_matrix_tensor)

print(f'Perplexity for environment {0} effects: {perplexities_by_env}')

print(f'Test Perplexity: {perplexity}')

In [None]:
def print_top_words(env_tm_model, vectorizer, num_top_words):
    """Print the highest-weighted words per topic for beta and for each environment's gamma.

    Args:
        env_tm_model: model exposing ``beta`` ([topics, words]) and ``gamma``
            ([environments, topics, words]).
        vectorizer: fitted vectorizer; ``get_feature_names_out()`` supplies the vocabulary.
        num_top_words: number of words listed per topic.
    """
    # Look the vocabulary up once; it was previously re-fetched for every printed word.
    feature_names = vectorizer.get_feature_names_out()

    def words_for(weights):
        """Vocabulary entries of the `num_top_words` largest weights, largest first."""
        return [feature_names[idx] for idx in weights.topk(num_top_words).indices.tolist()]

    global_beta = torch.nn.functional.softmax(env_tm_model.beta, dim=1)  # Convert to probabilities
    gamma = env_tm_model.gamma

    # Print top words for global beta
    print("Top words for global beta:")
    for topic_number, topic in enumerate(global_beta, start=1):
        print(f'Topic {topic_number}: {words_for(topic)}')

    # Print top words for gamma
    print("\nTop words for gamma:")
    for env_number, env_gamma in enumerate(gamma, start=1):
        print(f"Environment {env_number}:")
        for topic_number, topic in enumerate(env_gamma, start=1):
            print(f'Topic {topic_number}: {words_for(topic)}')
        print()

In [None]:
# List the 11 highest-weighted words per topic, for the global beta and for every environment's gamma.
print_top_words(env_tm_model, vectorizer, num_top_words=11)

In [None]:
def normalize(data):
    """Min-max scale `data` into [0, 1].

    A constant input (max == min) has no spread to scale by; it is mapped to all zeros
    instead of dividing by zero and producing NaNs.
    """
    lowest = np.min(data)
    span = np.max(data) - lowest
    if span == 0:
        return np.zeros_like(data, dtype=float)
    return (data - lowest) / span

def get_top_indices_values(arr, top_n=8):
    """Return (indices, values) of the `top_n` largest entries of `arr`, largest first."""
    descending_order = np.argsort(-arr)
    top_indices = descending_order[:top_n]
    return top_indices, arr[top_indices]

def get_words(vocabulary, indices):
    """Look up each index in `vocabulary` and return the entries as a list, in order."""
    words = []
    for position in indices:
        words.append(vocabulary[position])
    return words

def plot_gamma_beta_heatmaps(gamma_data, beta_data, words, title):
    """Draw two side-by-side heatmaps: gamma (words x environments) and beta (words x 1).

    Args:
        gamma_data: array indexed (environment, word); transposed before plotting.
        beta_data: array of beta values, reshaped into a single column.
        words: y-axis labels, one per plotted word.
        title: title placed above the gamma panel.
    """
    fig, (gamma_ax, beta_ax) = plt.subplots(1, 2, figsize=(12, 6))

    # Defining the color scale between 0 and 1
    heat_kwargs = dict(cmap='hot', interpolation='nearest', vmin=0, vmax=1)
    gamma_img = gamma_ax.imshow(gamma_data.T, **heat_kwargs)
    beta_img = beta_ax.imshow(beta_data.T.reshape(-1, 1), **heat_kwargs)

    num_environments = gamma_data.shape[0]
    environments = [f'Environment {i}' for i in range(num_environments)]

    # (axis, image, x tick positions, x tick labels, colorbar label) for each panel.
    panels = (
        (gamma_ax, gamma_img, np.arange(num_environments), environments, "Normalized Gamma"),
        (beta_ax, beta_img, [0], ['Beta'], "Normalized Beta"),
    )
    for axis, image, x_positions, x_labels, colorbar_label in panels:
        axis.set_yticks(np.arange(len(words)))
        axis.set_xticks(x_positions)
        axis.set_yticklabels(words)
        axis.set_xticklabels(x_labels)
        plt.setp(axis.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
        colorbar = fig.colorbar(image, ax=axis)
        colorbar.ax.set_ylabel(colorbar_label, rotation=-90, va="bottom")

    gamma_ax.set_title(title)
    fig.tight_layout()
    plt.show()

def analyze_topic(lda, vocabulary, topic_index, top_n=8):
    """Print and plot the top words of one topic under beta and under every environment's gamma.

    Beta and each environment's gamma row for `topic_index` are min-max normalized over
    the whole vocabulary. For every environment the `top_n` gamma words are printed and
    plotted next to their beta values; a final figure does the same for the `top_n`
    beta words.
    """
    num_environments = lda.gamma.shape[0]

    def normalized_row(tensor):
        """Detach a tensor row to NumPy and normalize it."""
        return normalize(tensor.cpu().detach().numpy())

    beta_values = normalized_row(lda.beta[topic_index, :])
    gamma_values = [normalized_row(lda.gamma[env, topic_index, :]) for env in range(num_environments)]

    # The top beta words do not depend on the environment.
    beta_indices, _ = get_top_indices_values(beta_values, top_n)
    beta_words = get_words(vocabulary, beta_indices)

    for env_index, gamma_value in enumerate(gamma_values):
        gamma_indices, _ = get_top_indices_values(gamma_value, top_n)
        gamma_words = get_words(vocabulary, gamma_indices)

        print(f"Top words in gamma environment {env_index}:", gamma_words)
        print("Top words in beta:               ", beta_words)

        # Values of every environment's gamma, and of beta, at this environment's top words.
        gamma_at_top_words = np.array([per_env[gamma_indices] for per_env in gamma_values])
        beta_at_top_words = beta_values[gamma_indices]

        plot_gamma_beta_heatmaps(gamma_at_top_words, beta_at_top_words, gamma_words, f"Environment {env_index}: Top Words")

    # Same comparison, anchored on the top beta words.
    gamma_at_beta_words = np.array([per_env[beta_indices] for per_env in gamma_values])
    plot_gamma_beta_heatmaps(gamma_at_beta_words, beta_values[beta_indices], beta_words, "Beta: Top Words")


# Vocabulary as a plain list so that word indices map directly to terms.
vocabulary = list(vectorizer.get_feature_names_out())

# Analyze the topic at index 10 (zero-based), showing its 8 top words
analyze_topic(env_tm_model, vocabulary, topic_index=10, top_n=8)

In [None]:
def analyze_gamma_per_environment(model, threshold=1e-2):
    """Print sparsity/mean/std of each environment's gamma and show a histogram of its values.

    An entry counts as "close to zero" when its absolute value is below `threshold`.
    """
    all_gamma = model.gamma.detach().cpu().numpy()

    for env_index, gamma_env_values in enumerate(all_gamma):
        print(f"Environment {env_index}:")

        near_zero_count = np.sum(np.abs(gamma_env_values) < threshold)
        sparsity_percentage = 100 * near_zero_count / gamma_env_values.size

        print(f"Sparsity Percentage: {sparsity_percentage}%")
        print(f"Mean of Gamma: {np.mean(gamma_env_values)}")
        print(f"Standard Deviation of Gamma: {np.std(gamma_env_values)}")

        plt.hist(gamma_env_values.flatten(), bins=50)
        plt.title(f"Histogram of Gamma Values for Environment {env_index}")
        plt.show()


In [None]:
# Summarize how sparse the learned gamma offsets are in each environment (default threshold 1e-2).
analyze_gamma_per_environment(env_tm_model)
