In [None]:
# Fetch the repository that holds the stopword list and the CSV datasets read below.
# Later cells use absolute paths under /content/Multi-environment-Topic-Models, so this
# presumably runs with /content as the working directory (Colab default) — confirm.
!git clone https://github.com/anonymousindividual007/Multi-environment-Topic-Models

In [None]:
import numpy as np
import pandas as pd
import itertools as it
import math
import csv

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import torch.distributions as dist
from torch.utils.data import TensorDataset, DataLoader
from torch.distributions import Normal, Distribution, HalfCauchy, Laplace

import nltk
nltk.download('punkt')
from collections import Counter
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from scipy.sparse import csr_matrix

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Load the domain-specific stopword list shipped with the cloned repository (one word per line).
file_path = "/content/Multi-environment-Topic-Models/political_stopwords.txt"

with open(file_path, 'r') as file:
    # Keep the raw lines (trailing newlines included) under their original name.
    stopwords_list = list(file)

# Strip surrounding whitespace/newlines from every entry.
all_stopwords = list(map(str.strip, stopwords_list))

In [None]:
class LemmaTokenizer:
    """Callable tokenizer that keeps only purely alphabetic NLTK word tokens.

    NOTE(review): despite the name, no lemmatization is applied. A
    ``WordNetLemmatizer`` is created and stored on ``self.wnl`` but is never
    called, so tokens are returned exactly as ``word_tokenize`` produces them.
    """

    def __init__(self):
        # Stored but unused by __call__ (see class docstring).
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Split ``doc`` with ``word_tokenize`` and drop tokens containing non-letters."""
        alphabetic_tokens = []
        for token in word_tokenize(doc):
            if token.isalpha():
                alphabetic_tokens.append(token)
        return alphabetic_tokens

To use your own data, replace the file path in the cell below. As written, that cell loads the data for the Political Advertisements experiment.



In [None]:
# Political-advertisement data: one row per document, with `source` and `text` columns.
file_path = '/content/Multi-environment-Topic-Models/local_channels.csv'

train_data = pd.read_csv(file_path)

# Hold out 20% of each environment ('right' -> test1, 'left' -> test2) with a fixed seed.
test1 = train_data.loc[train_data['source'] == 'right'].sample(frac=0.2, random_state=42)
test2 = train_data.loc[train_data['source'] == 'left'].sample(frac=0.2, random_state=42)

# Remove every held-out row from the training frame in a single drop.
train_data = train_data.drop(index=test1.index.append(test2.index))

The data in the cell below is for the ideology dataset.

In [None]:
# train_data= pd.read_csv('/content/Multi-environment-Topic-Models/channels_ideology_train.csv')
# channels_ideology_test = pd.read_csv('/content/Multi-environment-Topic-Models/channels_ideology_test.csv')
# test1 = channels_ideology_test[channels_ideology_test['source'] == 'Republican']
# test2 = channels_ideology_test[channels_ideology_test['source'] == 'Democratic']
# test3 = channels_ideology_test[channels_ideology_test['source'] == 'balanced']

In [None]:
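# Specify the path to the zip file and the name of the CSV file inside it.
# NOTE: this cell uses `os` and `zipfile`, which are not imported above; add `import os` and `import zipfile` before uncommenting it.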
# zip_file_path = '/content/Multi-environment-Topic-Models/style_train_large.csv.zip'
# csv_file_name = 'style_train_large.csv'  # Change this if the CSV file has a different name inside the zip

# # Specify the temporary directory to extract the CSV file
# temp_dir = '/content/temp_dir'

# # Create a temporary directory if it doesn't exist
# if not os.path.exists(temp_dir):
#     os.makedirs(temp_dir)

# # Extract the CSV file
# with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
#     zip_ref.extract(csv_file_name, temp_dir)

# # Full path to the extracted CSV file
# csv_file_path = os.path.join(temp_dir, csv_file_name)

# # Load the CSV file into a Pandas DataFrame
# train_data = pd.read_csv(csv_file_path, encoding='ISO-8859-1')
# style_test_df = pd.read_csv('/content/Multi-environment-Topic-Models/style_test.csv', encoding='ISO-8859-1')

In [None]:
# Assign every distinct `source` label a column index, in order of first appearance.
env_mapping = {label: column for column, label in enumerate(train_data['source'].unique())}

num_docs, num_envs = len(train_data), len(env_mapping)

# One-hot encode the environment of each document: row d has a single 1 in the
# column of that document's source. Rows are picked from an identity matrix.
env_codes = [env_mapping[label] for label in train_data['source']]
env_index_matrix = np.eye(num_envs, dtype=int)[env_codes]

env_index_tensor = torch.from_numpy(env_index_matrix).float().to(device)


Ads and ideology data preprocessing

In [None]:
# Unigram bag-of-words over the training texts; the vocabulary drops the custom
# stopwords plus terms that are too frequent (max_df) or too rare (min_df).
vectorizer = CountVectorizer(
    tokenizer=LemmaTokenizer(),
    ngram_range=(1, 1),
    stop_words=all_stopwords,
    max_df=0.4,
    min_df=0.0006,
)

docs_word_matrix_raw = vectorizer.fit_transform(train_data['text'])

# Densify the sparse count matrix and move it to the compute device as float32.
docs_word_matrix_tensor = (
    torch.from_numpy(docs_word_matrix_raw.toarray())
    .float()
    .to(device)
)

Style tokenizer IID

In [None]:
# vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), ngram_range=(1, 1), stop_words=all_stopwords, max_df=0.5, min_df=0.006)
# docs_word_matrix_raw = vectorizer.fit_transform(train_data['text'])
# docs_word_matrix_tensor = torch.from_numpy(docs_word_matrix_raw.toarray()).float().to(device)


Style tokenizer OOD data

In [None]:
# vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(),
#                              ngram_range=(1, 1),
#                              stop_words=all_stopwords,
#                              max_df=0.5,
#                              min_df=0.006)

# vectorizer.fit(train_data['text'])
# docs_word_matrix_raw = vectorizer.transform(train_data['text'])
# docs_word_matrix_tensor = torch.from_numpy(docs_word_matrix_raw.toarray()).float().to(device)

In [None]:
class EnvTM(nn.Module):
    """Topic model with global topic-word weights plus per-environment offsets.

    Learnable quantities:
      * ``beta`` / ``beta_logvar``: mean and log-variance of the global
        topic-word weights, shape ``[num_topics, num_words]``.
      * ``gamma`` / ``gamma_logvar``: mean and log-variance of the
        environment-specific offsets, shape ``[num_envs, num_topics, num_words]``.
      * ``theta_global_net``: encoder mapping a bag-of-words row to
        ``2 * num_topics`` values (mean and log-variance of the topic logits).

    ``forward`` caches the encoder output on ``self.theta_global_params``;
    ``calculate_kl_divergences`` reads that attribute, so it must be called
    after a forward pass on the same minibatch.
    """

    def __init__(self, num_topics, num_words, num_envs, device='cpu'):
        """Create parameters and priors.

        Args:
            num_topics: number of topics.
            num_words: vocabulary size.
            num_envs: number of environments (columns of the one-hot ``x_d``).
            device: device on which the parameters and prior tensors are created.
        """
        super(EnvTM, self).__init__()

        def init_param(shape):
            # Learnable tensor initialised from a standard normal.
            return nn.Parameter(torch.randn(shape, device=device))

        def init_param_zeros(shape):
            # Learnable tensor initialised to zeros.
            return nn.Parameter(torch.zeros(shape, device=device))

        self.num_topics, self.num_words, self.num_envs = num_topics, num_words, num_envs

        # Global Beta, β_{0,k} ~ 𝒩(·,·)
        # Variational mean/log-variance with a standard-normal prior.
        self.beta = init_param([num_topics, num_words])
        self.beta_logvar = init_param_zeros([num_topics, num_words])
        self.beta_prior = Normal(torch.zeros([num_topics, num_words], device=device), torch.ones([num_topics, num_words], device=device))

        # Lambda parameters, λ_{e,k} ~ Half-Cauchy(0,_)
        # Drawn once here and kept as a plain tensor attribute (not an nn.Parameter or
        # registered buffer), so the optimizer does not update it.
        self.lambda_ek = torch.distributions.HalfCauchy(scale=torch.tensor(0.4, device=device)).rsample([num_envs, num_topics])

        # Expand lambda_ek to have the same shape across words
        self.lambda_ek = self.lambda_ek.unsqueeze(-1).expand(-1, -1, num_words)

        # Tau parameter, τ ~ Half-Cauchy(0, _)
        # Single scalar draw, also a plain tensor attribute.
        self.tau = torch.distributions.HalfCauchy(scale=torch.tensor(0.5, device=device)).rsample()

        # Gamma parameters, γ_{e,k} ~ 𝒩(0, λ_{e,k}^2 τ^2) --> hMTM
        self.gamma = init_param_zeros([num_envs, num_topics, num_words])
        self.gamma_logvar = init_param_zeros([num_envs, num_topics, num_words])
        gamma_prior_variance = (self.lambda_ek ** 2) * (self.tau ** 2)
        # Prior std is sqrt(variance) plus a small epsilon to keep the scale strictly positive.
        self.gamma_prior = Normal(torch.zeros_like(gamma_prior_variance), torch.sqrt(gamma_prior_variance).add(1e-8))

        # Gamma parameters, γ_{e,k} ~ 𝒩(0, 1) --> nEATM
        # self.gamma = init_param([num_envs, num_topics, num_words])  # Initialize with normal distribution
        # self.gamma_logvar = init_param_zeros([num_envs, num_topics, num_words]) # Initialize log variance
        # self.gamma_prior = Normal(torch.zeros([num_envs, num_topics, num_words], device=device), torch.ones([num_envs, num_topics, num_words], device=device))

        # Standard-normal prior over the per-document topic logits.
        self.theta_global_prior = Normal(torch.zeros(num_topics, device=device), torch.ones(num_topics, device=device))

        # Encoder: bag-of-words -> [mu, logvar] of the topic logits.
        # These layers are created without the `device` argument, so the caller has to
        # move the module with `.to(...)` before using it on another device.
        self.theta_global_net = nn.Sequential(
            nn.Linear(num_words, 50),
            nn.BatchNorm1d(50),
            nn.ReLU(),
            nn.Linear(50, num_topics * 2)
        )

    def forward(self, bow, x_d):
        """Compute per-document word probabilities.

        Args:
            bow: 2-D tensor of word counts, ``(batch, vocab)``.
            x_d: environment indicator of shape ``(batch, num_envs)``; it weights
                the rows of ``gamma`` in the einsum below.

        Returns:
            ``eta_d`` of shape ``(batch, vocab)``: topic proportions mixed with the
            softmax-normalised (environment-adjusted) topic-word distributions.
        """
        # Unpacking only; the two names are not used afterwards (it does require `bow` to be 2-D).
        batch_size, vocab_size = bow.size()
        # Cached on the module so the KL helper can rebuild the same posterior.
        self.theta_global_params = self.theta_global_net(bow)
        theta_global_mu, theta_global_logvar = self.theta_global_params.split(self.num_topics, dim=-1)
        theta_global_logvar = theta_global_logvar.add(1e-8)
        # Reparameterised sample of the topic logits, then softmax to proportions.
        theta_sample = Normal(theta_global_mu, torch.exp(0.5 * theta_global_logvar).add(1e-8)).rsample()
        theta_softmax = F.softmax(theta_sample, dim=-1)

        beta_dist = Normal(self.beta, torch.exp(0.5 * self.beta_logvar).add(1e-8))
        # NOTE(review): beta_sample is drawn but not used below; adjusted_beta is built
        # from the mean self.beta. Confirm whether the sample was meant to be used.
        beta_sample = beta_dist.rsample()

        gamma_dist = Normal(self.gamma, torch.exp(0.5 * self.gamma_logvar).add(1e-8))
        gamma_sample = gamma_dist.rsample()
        # Weight the sampled per-environment offsets by each document's x_d row.
        gamma_effect = torch.einsum('be,etv->btv', x_d, gamma_sample)

        # Per-document topic-word weights: global mean plus the environment offset.
        adjusted_beta = self.beta.unsqueeze(0) + gamma_effect
        adjusted_beta_softmax = F.softmax(adjusted_beta, dim=-1)
        # Mix the per-topic word distributions with the document's topic proportions.
        eta_d = torch.einsum('bt,btv->bv', theta_softmax, adjusted_beta_softmax)

        return eta_d


In [None]:
def calculate_kl_divergences(EnvTM, x_d):
    """Return the summed KL terms between the variational posteriors and their priors.

    Must be called after ``EnvTM(...)`` has run on the current minibatch, because
    the theta posterior is rebuilt from ``EnvTM.theta_global_params``, which the
    forward pass stores on the model.

    Args:
        EnvTM: model instance exposing ``theta_global_params``, ``num_topics``,
            ``beta``/``beta_logvar``/``beta_prior``, ``gamma``/``gamma_logvar``/
            ``gamma_prior`` and ``theta_global_prior``.
        x_d: unused; kept so existing call sites keep working.

    Returns:
        Tuple ``(theta_global_kl, beta_kl, gamma_kl)`` of scalar tensors.
    """
    # Same floor that the forward pass adds to every posterior scale. Without it a
    # very negative log-variance underflows to a zero scale, which Normal rejects
    # (or which yields a non-finite KL).
    eps = 1e-8

    theta_global_mu, theta_global_logvar = EnvTM.theta_global_params.split(EnvTM.num_topics, dim=-1)
    theta_global_logvar = theta_global_logvar.add(eps)
    theta_global = Normal(theta_global_mu, torch.exp(0.5 * theta_global_logvar).add(eps))
    theta_global_kl = torch.distributions.kl.kl_divergence(theta_global, EnvTM.theta_global_prior).sum()

    # The beta scale now carries the same epsilon as in forward(), so the KL is taken
    # for the same distribution the model samples from.
    beta = Normal(EnvTM.beta, torch.exp(0.5 * EnvTM.beta_logvar).add(eps))
    beta_kl = torch.distributions.kl.kl_divergence(beta, EnvTM.beta_prior).sum()

    gamma = Normal(EnvTM.gamma, torch.exp(0.5 * EnvTM.gamma_logvar).add(eps))
    gamma_kl = torch.distributions.kl.kl_divergence(gamma, EnvTM.gamma_prior).sum()

    return theta_global_kl, beta_kl, gamma_kl

In [None]:
def bbvi_update(minibatch, env_index, EnvTM, optimizer, n_samples):
    """Run one gradient step on the negative ELBO of a minibatch.

    Args:
        minibatch: word-count tensor for the documents in the batch.
        env_index: environment indicator rows for the same documents.
        EnvTM: the model being trained.
        optimizer: optimizer holding the model parameters.
        n_samples: multiplier applied to each document's log-likelihood.

    Returns:
        The minibatch ELBO as a Python float.
    """
    optimizer.zero_grad()

    # The forward pass must come first: it caches the encoder output that the KL
    # helper reads back from the model.
    word_probs = EnvTM(minibatch, env_index)
    kl_theta, kl_beta, kl_gamma = calculate_kl_divergences(EnvTM, env_index)
    kl_total = kl_theta + kl_beta + kl_gamma

    # Per-document objective: scaled log-likelihood minus the total KL.
    doc_log_lik = (minibatch * word_probs.log()).sum(-1)
    doc_elbo = doc_log_lik.mul(n_samples).sub(kl_total)

    elbo_accumulator = torch.zeros(1, device=minibatch.device)
    elbo_accumulator += doc_elbo.sum()

    # Maximise the ELBO by minimising its negation.
    loss = -elbo_accumulator
    loss.backward()
    optimizer.step()

    return elbo_accumulator.item()

In [None]:
def train_model(EnvTM, docs_word_matrix_tensor, env_index_tensor, num_epochs=80, minibatch_size=1024, lr=0.01):
    """Train the model with Adam on shuffled minibatches and print the ELBO per epoch.

    Args:
        EnvTM: model to train; it is moved to the module-level ``device``.
        docs_word_matrix_tensor: document-by-word count tensor (one row per document).
        env_index_tensor: environment indicator tensor with one row per document.
        num_epochs: number of passes over the data.
        minibatch_size: number of documents per gradient step.
        lr: Adam learning rate.

    Returns:
        None. Progress is reported through ``print``.
    """
    EnvTM = EnvTM.to(device)
    # The evaluation helpers switch the model to eval(); put it back in training
    # mode so BatchNorm uses batch statistics when training is (re)started.
    EnvTM.train()
    optimizer = torch.optim.Adam(EnvTM.parameters(), lr=lr, betas=(0.9, 0.999))

    docs_word_matrix_tensor = docs_word_matrix_tensor.to(device)
    env_index_tensor = env_index_tensor.to(device)
    total_docs = docs_word_matrix_tensor.size()[0]

    for epoch in range(num_epochs):
        elbo_accumulator = 0.0
        # New random document order every epoch.
        permutation = torch.randperm(total_docs)

        for start in range(0, total_docs, minibatch_size):
            indices = permutation[start:start + minibatch_size]
            minibatch = docs_word_matrix_tensor[indices]
            minibatch_env_index = env_index_tensor[indices]

            elbo = bbvi_update(minibatch, minibatch_env_index, EnvTM, optimizer, total_docs)
            elbo_accumulator += elbo

        # Normalise by the (possibly fractional) number of full-size batches per epoch.
        avg_elbo = elbo_accumulator / (total_docs / minibatch_size)
        print(f'Epoch: {epoch+1}, Average ELBO: {avg_elbo}')

In [None]:
num_topics = 20
# Take the number of environments from the one-hot matrix built from the data rather
# than hard-coding it; a mismatch makes the einsum in EnvTM.forward fail.
num_envs = env_index_tensor.shape[1]
num_epoch = 150

# Seed PyTorch so parameter initialisation, the Half-Cauchy draws and the minibatch
# shuffling are repeatable across runs.
torch.manual_seed(42)
env_tm_model = EnvTM(num_topics=num_topics, num_words=len(vectorizer.get_feature_names_out()), num_envs=num_envs, device=device)

train_model(env_tm_model, docs_word_matrix_tensor, env_index_tensor, num_epochs=num_epoch, minibatch_size=1024, lr=0.01)

In [None]:
# Encode the held-out `test1` texts with the vocabulary fitted on the training data.
test_data_word_matrix_raw = vectorizer.transform(test1['text'])

# Dense float32 copy on the compute device.
test_data_word_matrix_tensor = (
    torch.from_numpy(test_data_word_matrix_raw.toarray())
    .float()
    .to(device)
)


In [None]:
def evaluate_model(env_tm_model, test_data_word_matrix_tensor):
    """Compute held-out perplexity using only the global topic-word weights.

    The model is moved to the GPU if one is available and switched to eval mode.
    Topic proportions are a single sample from the encoder's Gaussian, so the
    result is stochastic.

    Args:
        env_tm_model: trained model exposing ``theta_global_net``, ``num_topics`` and ``beta``.
        test_data_word_matrix_tensor: document-by-word count tensor.

    Returns:
        Tuple ``(perplexity, theta_test_softmax)``: a scalar tensor and the sampled
        topic proportions for every document.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    env_tm_model.to(device)
    env_tm_model.eval()

    num_topics = env_tm_model.num_topics
    with torch.no_grad():
        # Encoder output holds the mean followed by the log-variance of the topic logits.
        encoder_out = env_tm_model.theta_global_net(test_data_word_matrix_tensor)
        mu, logvar = torch.split(encoder_out, num_topics, dim=-1)
        scale = (0.5 * logvar).exp() + 1e-8
        theta_test_softmax = F.softmax(Normal(mu, scale).rsample(), dim=-1)

        # Word probabilities per document: topic proportions times topic-word distributions.
        topic_word_probs = F.softmax(env_tm_model.beta.to(device), dim=-1)
        word_probs = theta_test_softmax.mm(topic_word_probs)

        # Perplexity = exp(-(sum of count-weighted log-probabilities) / total token count).
        total_tokens = test_data_word_matrix_tensor.sum()
        neg_log_lik_per_token = -(word_probs.log() * test_data_word_matrix_tensor).sum() / total_tokens
        perplexity = neg_log_lik_per_token.exp()

    return perplexity, theta_test_softmax

def evaluate_model_with_gamma_per_env(env_tm_model, test_data_word_matrix_tensor, env_idx=0):
    """Compute held-out perplexity with one environment's offsets added to the topics.

    Same computation as ``evaluate_model``, except that the topic-word weights are
    ``beta + gamma[env_idx]`` before the softmax. Topic proportions are a single
    sample from the encoder's Gaussian, so the result is stochastic.

    Args:
        env_tm_model: trained model exposing ``theta_global_net``, ``num_topics``,
            ``beta`` and ``gamma``.
        test_data_word_matrix_tensor: document-by-word count tensor.
        env_idx: index along the first axis of ``gamma`` selecting the environment
            whose offsets are applied. Defaults to 0, the previously hard-coded value.

    Returns:
        Scalar tensor holding the perplexity.

    Raises:
        IndexError: if ``env_idx`` is outside the first axis of ``gamma``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    env_tm_model.to(device)
    env_tm_model.eval()

    with torch.no_grad():
        theta_test_params = env_tm_model.theta_global_net(test_data_word_matrix_tensor)
        theta_test_mu, theta_test_logvar = theta_test_params.split(env_tm_model.num_topics, dim=-1)
        theta_test_dist = Normal(theta_test_mu, torch.exp(0.5 * theta_test_logvar).add(1e-8))
        theta_test = theta_test_dist.rsample()
        theta_test_softmax = F.softmax(theta_test, dim=-1)

        # Posterior-mean offsets of the requested environment.
        gamma_learned = env_tm_model.gamma[env_idx]

        beta_gamma_test_softmax = F.softmax(env_tm_model.beta.to(device) + gamma_learned, dim=-1)
        # Word probabilities per document (not yet in log space).
        likelihood = torch.mm(theta_test_softmax, beta_gamma_test_softmax)
        N = torch.sum(test_data_word_matrix_tensor)
        log_perplex = -torch.sum(torch.log(likelihood) * test_data_word_matrix_tensor) / N
        perplexity = torch.exp(log_perplex)

    return perplexity

# Perplexity of the held-out matrix using the global topics only; also keeps the
# sampled topic proportions for the test documents.
perplexity, theta_test_softmax = evaluate_model(env_tm_model, test_data_word_matrix_tensor)
# Perplexity of the same documents with the offsets of gamma[0] added to the topics.
# NOTE(review): environment 0 is whichever `source` value appears first in the
# training data; confirm that it matches the environment of the test split.
perplexities_by_env = evaluate_model_with_gamma_per_env(env_tm_model, test_data_word_matrix_tensor)

print(f'Perplexity for environment {0}: {perplexities_by_env}')

print(f'Test Perplexity: {perplexity}')

In [None]:
def print_top_words(env_tm_model, vectorizer, num_top_words):
    """Print the highest-weighted words of every topic, globally and per environment.

    Global topics are ranked by ``softmax(beta)``; environment-specific topics are
    ranked by the raw ``gamma`` offsets of each environment.

    Args:
        env_tm_model: model exposing ``beta`` (topics x words) and ``gamma``
            (environments x topics x words).
        vectorizer: fitted vectorizer whose ``get_feature_names_out()`` maps word
            indices to strings.
        num_top_words: number of words to list per topic.

    Returns:
        None. Output goes to stdout.
    """
    # Look the vocabulary up once instead of once per printed word.
    feature_names = vectorizer.get_feature_names_out()

    def top_words_of(scores):
        # Indices of the largest scores, mapped to plain Python strings so the
        # printed list does not show NumPy scalar wrappers.
        top_indices = scores.topk(num_top_words).indices.tolist()
        return [str(feature_names[word_idx]) for word_idx in top_indices]

    global_beta = torch.nn.functional.softmax(env_tm_model.beta, dim=1)  # Convert to probabilities

    # Print top words for global beta
    print("Top words for global beta:")
    for topic_idx, topic in enumerate(global_beta):
        print(f'Topic {topic_idx+1}: {top_words_of(topic)}')

    # Print top words for gamma
    print("\nTop words for gamma:")
    for env_index, env_gamma in enumerate(env_tm_model.gamma):
        print(f"Environment {env_index+1}:")
        for topic_idx, topic in enumerate(env_gamma):
            print(f'Topic {topic_idx+1}: {top_words_of(topic)}')
        print()

In [None]:
# List the 12 highest-weighted words per topic for the global topics and for each environment.
print_top_words(env_tm_model, vectorizer, num_top_words=12)