In [None]:
!git clone https://github.com/anonymousindividual007/Multi-environment-Topic-Models

In [None]:
import numpy as np
import pandas as pd
import math
import csv
import itertools as it

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import torch.distributions as dist
from torch.utils.data import TensorDataset, DataLoader
from torch.distributions import Normal, Distribution, HalfCauchy, Laplace

import nltk
nltk.download('punkt')
from collections import Counter
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from scipy.sparse import csr_matrix

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Stop-word file shipped with the cloned repository; every line of the file
# is treated as one stop-word entry.
file_path = "/content/Multi-environment-Topic-Models/political_stopwords.txt"

with open(file_path, 'r') as file:
    # Iterating a text file yields its lines, newline included.
    stopwords_list = list(file)

# Remove the trailing newline (and any surrounding whitespace) from each entry.
all_stopwords = list(map(str.strip, stopwords_list))

In [None]:
class LemmaTokenizer:
    """Callable tokenizer that keeps only purely alphabetic tokens.

    Instances are passed as the ``tokenizer`` argument of ``CountVectorizer``.

    NOTE(review): despite the class name, no lemmatization is performed.
    The ``WordNetLemmatizer`` stored in ``self.wnl`` is never used by
    ``__call__``. Applying it would change the vocabulary and would need the
    WordNet corpus, which this notebook does not download -- confirm intent.
    """

    def __init__(self):
        # Created but unused by __call__; kept so the attribute stays available.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Tokenize ``doc`` with ``word_tokenize`` and drop non-alphabetic tokens."""
        tokens = word_tokenize(doc)
        return list(filter(str.isalpha, tokens))

To use your own data, replace the file path. The cell below loads the data for the Political Advertisements experiment.



In [None]:
file_path = '/content/Multi-environment-Topic-Models/local_channels.csv'

train_data = pd.read_csv(file_path)

# Hold out 20% of the rows of each source as test sets; the fixed
# random_state makes the sampled rows repeatable.
test1 = train_data.loc[train_data['source'] == 'right'].sample(frac=0.2, random_state=42)
test2 = train_data.loc[train_data['source'] == 'left'].sample(frac=0.2, random_state=42)

# Remove the held-out rows (matched by index label) from the training frame.
train_data = train_data.drop(index=test1.index).drop(index=test2.index)

The data in the cell below is for the ideology dataset.

In [None]:
# train_data= pd.read_csv('/content/Multi-environment-Topic-Models/channels_ideology_train.csv')
# channels_ideology_test = pd.read_csv('/content/Multi-environment-Topic-Models/channels_ideology_test.csv')
# test1 = channels_ideology_test[channels_ideology_test['source'] == 'Republican']
# test2 = channels_ideology_test[channels_ideology_test['source'] == 'Democratic']
# test3 = channels_ideology_test[channels_ideology_test['source'] == 'balanced']

The code below represents the preprocessing for the Style dataset.

In [None]:
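# Specify the path to the zip file and the name of the CSV file inside it
# (uncommenting this cell also requires `import os` and `import zipfile`, which the import cell above does not include)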
# zip_file_path = '/content/Multi-environment-Topic-Models/style_train_large.csv.zip'
# csv_file_name = 'style_train_large.csv'  # Change this if the CSV file has a different name inside the zip

# # Specify the temporary directory to extract the CSV file
# temp_dir = '/content/temp_dir'

# # Create a temporary directory if it doesn't exist
# if not os.path.exists(temp_dir):
#     os.makedirs(temp_dir)

# # Extract the CSV file
# with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
#     zip_ref.extract(csv_file_name, temp_dir)

# # Full path to the extracted CSV file
# csv_file_path = os.path.join(temp_dir, csv_file_name)

# # Load the CSV file into a Pandas DataFrame
# train_data = pd.read_csv(csv_file_path, encoding='ISO-8859-1')
# style_test_df = pd.read_csv('/content/Multi-environment-Topic-Models/style_test.csv', encoding='ISO-8859-1')

Preprocessing the ideology and channels dataset

In [None]:
# Unigram bag-of-words vocabulary built with the custom tokenizer and the
# stop-word list loaded above; max_df / min_df prune terms by document frequency.
vectorizer = CountVectorizer(
    tokenizer=LemmaTokenizer(),
    ngram_range=(1, 1),
    stop_words=all_stopwords,
    max_df=0.4,
    min_df=0.0006,
)

# Fit the vocabulary on the training text and get the sparse count matrix.
docs_word_matrix_raw = vectorizer.fit_transform(train_data['text'])

# Densify, cast to float32 and move to the selected device.
docs_word_matrix_tensor = (
    torch.from_numpy(docs_word_matrix_raw.toarray())
    .float()
    .to(device)
)

The code below represents the preprocessing for the iid Style experiment.



In [None]:
#style tok iid
# vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), ngram_range=(1, 1), stop_words=all_stopwords, max_df=0.5, min_df=0.006)


# docs_word_matrix_raw = vectorizer.fit_transform(train_data['text'])
# docs_word_matrix_tensor = torch.from_numpy(docs_word_matrix_raw.toarray()).float().to(device)


Uncomment the code below to preprocess the OOD data.



In [None]:
# vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(),
#                              ngram_range=(1, 1),
#                              stop_words=all_stopwords,
#                              max_df=0.5,
#                              min_df=0.006)

# vectorizer.fit(train_data['text'])

# docs_word_matrix_raw = vectorizer.transform(train_data['text'])

# env_mapping = {value: index for index, value in enumerate(train_data['source'].unique())}
# env_index = train_data['source'].apply(lambda x: env_mapping[x])

# docs_word_matrix_tensor = torch.from_numpy(docs_word_matrix_raw.toarray()).float().to(device)
# env_index_tensor = torch.from_numpy(env_index.to_numpy()).long().to(device)


In [None]:
class LDA(nn.Module):
    """Topic model with Gaussian variational posteriors over topics and words.

    ``beta`` / ``beta_logvar`` hold the mean and log-variance of a Normal
    distribution over the (num_topics, num_words) topic-word logits.
    ``theta_network`` maps a bag-of-words row to the mean and log-variance of
    a Normal distribution over that document's ``num_topics`` topic logits.

    Args:
        num_topics: number of topics.
        num_words: vocabulary size (input width of ``theta_network``).
        hidden_dim: width of the hidden layer of ``theta_network``.
        device: device on which parameters and prior tensors are created.

    Note:
        ``beta_prior`` and ``theta_prior`` are plain attributes, not registered
        parameters or buffers, so ``Module.to()`` does not move them; they stay
        on the ``device`` given at construction time.
    """

    def __init__(self, num_topics, num_words, hidden_dim, device='cpu'):
        super(LDA, self).__init__()

        self.num_topics = num_topics
        self.num_words = num_words
        self.hidden_dim = hidden_dim

        # Variational parameters of the topic-word logits.
        self.beta = nn.Parameter(torch.randn([num_topics, num_words], device=device))
        self.beta_logvar = nn.Parameter(torch.zeros([num_topics, num_words], device=device))

        # Standard-normal priors used by the KL terms of the objective.
        self.beta_prior = Normal(torch.zeros(num_topics, num_words, device=device), torch.ones(num_topics, num_words, device=device))
        self.theta_prior = Normal(torch.zeros(num_topics, device=device), torch.ones(num_topics, device=device))

        # Encoder: bag-of-words -> (mean, log-variance) of the topic logits.
        # The hidden width follows `hidden_dim` instead of a hard-coded 50, and
        # the network is placed on the same device as the other parameters.
        self.theta_network = nn.Sequential(
            nn.Linear(num_words, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_topics * 2)
        ).to(device)

    def forward(self, bow):
        """Return per-document word probabilities for a batch of count rows.

        Args:
            bow: float tensor whose last dimension is ``num_words``.

        Returns:
            Tensor with one row per document; each row is a mixture of the
            sampled topic-word distributions and sums to 1.
        """
        theta_params = self.theta_network(bow)
        # First half of the output is the mean, second half the log-variance.
        theta_mu, theta_logvar = theta_params.split(self.num_topics, dim=-1)
        # 1e-8 keeps the scale strictly positive if exp() underflows.
        theta = Normal(theta_mu, torch.exp(0.5 * theta_logvar).add(1e-8)).rsample()

        beta_dist = Normal(self.beta, torch.exp(0.5 * self.beta_logvar).add(1e-8))
        beta_sample = beta_dist.rsample()

        # Map sampled logits to the simplex: topic proportions per document
        # and word distribution per topic.
        theta_softmax = F.softmax(theta, dim=-1)
        beta_softmax = F.softmax(beta_sample, dim=-1)

        return theta_softmax @ beta_softmax

In [None]:
def calculate_kl_divergences(model, minibatch):
    """Return the summed KL terms between the variational posteriors and priors.

    Args:
        model: object exposing ``theta_network``, ``num_topics``, ``beta``,
            ``beta_logvar``, ``theta_prior`` and ``beta_prior``.
        minibatch: batch of bag-of-words rows fed to ``model.theta_network``.

    Returns:
        ``(theta_kl, beta_kl)``: scalar tensors, each the sum over all
        elements of the KL divergence from the corresponding prior.
    """
    # Only the encoder output is needed here. A full model forward pass would
    # be wasted work and would draw extra random samples.
    theta_mu, theta_logvar = model.theta_network(minibatch).split(model.num_topics, dim=-1)

    # Use the same 1e-8 floor on the standard deviation as the model's forward
    # pass, so the scale never underflows to an invalid value of zero.
    theta = Normal(theta_mu, torch.exp(0.5 * theta_logvar).add(1e-8))
    beta = Normal(model.beta, torch.exp(0.5 * model.beta_logvar).add(1e-8))

    theta_kl = torch.distributions.kl.kl_divergence(theta, model.theta_prior).sum()
    beta_kl = torch.distributions.kl.kl_divergence(beta, model.beta_prior).sum()

    return theta_kl, beta_kl

In [None]:
def bbvi_update(minibatch, lda_model, optimizer, total_samples):
    """Run one optimisation step on ``minibatch`` and return its ELBO.

    The objective is the count-weighted log-probability of the observed words
    minus the two KL terms from ``calculate_kl_divergences``.

    Args:
        minibatch: batch of bag-of-words count rows.
        lda_model: model being trained; called to obtain word probabilities.
        optimizer: optimizer holding ``lda_model``'s parameters.
        total_samples: accepted but not used by this function.
            NOTE(review): the KL terms are not rescaled by
            minibatch size / total_samples -- confirm this is intended.

    Returns:
        The ELBO of this minibatch as a Python float.
    """
    optimizer.zero_grad()

    word_probs = lda_model(minibatch)
    theta_kl, beta_kl = calculate_kl_divergences(lda_model, minibatch)

    # Reconstruction term: log-probability of each word weighted by its count.
    log_likelihood = (minibatch * word_probs.log()).sum()
    elbo = log_likelihood - theta_kl - beta_kl

    # The optimizer minimises, so step on the negative ELBO.
    loss = -elbo
    loss.backward()
    optimizer.step()

    return elbo.item()

In [None]:
def train_model(lda_model, docs_word_matrix_tensor, num_epochs=150, minibatch_size=1024, lr=0.01, device='cpu'):
    """Train ``lda_model`` with Adam on shuffled minibatches of the count matrix.

    Prints the average per-minibatch ELBO after every epoch.

    Args:
        lda_model: model to optimise; moved to ``device`` and set to train mode.
        docs_word_matrix_tensor: (documents x vocabulary) count tensor.
        num_epochs: number of passes over the data.
        minibatch_size: number of documents per optimisation step.
        lr: Adam learning rate.
        device: device on which training runs.

    Returns:
        None.
    """
    lda = lda_model.to(device)
    # An earlier evaluation may have left the model in eval mode; make sure
    # mode-dependent layers behave as in training.
    lda.train()
    optimizer = torch.optim.Adam(lda.parameters(), lr=lr, betas=(0.9, 0.999))

    docs_word_matrix_tensor = docs_word_matrix_tensor.to(device)
    num_docs = docs_word_matrix_tensor.size(0)

    for epoch in range(num_epochs):
        elbo_accumulator = 0.0
        num_batches = 0
        # Fresh document order every epoch.
        permutation = torch.randperm(num_docs)

        for start in range(0, num_docs, minibatch_size):
            indices = permutation[start:start + minibatch_size]
            minibatch = docs_word_matrix_tensor[indices]

            elbo_accumulator += bbvi_update(minibatch, lda, optimizer, num_docs)
            num_batches += 1

        # Average over the minibatches actually processed. Dividing by
        # num_docs / minibatch_size is wrong when the last batch is partial.
        avg_elbo = elbo_accumulator / max(num_batches, 1)
        print(f'Epoch: {epoch+1}, Average ELBO: {avg_elbo}')

In [None]:
# Experiment settings.
num_topics = 20
num_epoch = 150
hidden = 50  # passed to LDA as hidden_dim

# Seed PyTorch's RNG so parameter initialisation, minibatch shuffling and the
# sampled latents are repeatable across "Restart & Run All" (as far as the
# backend is deterministic). 42 matches the random_state of the data split.
torch.manual_seed(42)

lda_model = LDA(num_topics=num_topics, num_words=len(vectorizer.get_feature_names_out()), hidden_dim = hidden, device=device)

train_model(lda_model, docs_word_matrix_tensor, num_epochs=num_epoch, minibatch_size=1024, lr=0.01, device=device)

In [None]:
# Encode the `test1` split with the vocabulary already fitted on the training
# text (transform only, no refit).
test_data_word_matrix_raw = vectorizer.transform(test1['text'])

# Densify, cast to float32 and move to the selected device.
test_data_word_matrix_tensor = (
    torch.from_numpy(test_data_word_matrix_raw.toarray())
    .float()
    .to(device)
)


In [None]:
def evaluate_model(env_tm_model, test_data_word_matrix_tensor):
    """Compute the perplexity of ``env_tm_model`` on a bag-of-words test matrix.

    Topic proportions are sampled from the encoder's Normal posterior and
    combined with the softmax of the mean topic-word logits
    (``env_tm_model.beta``) to obtain per-document word probabilities.

    Args:
        env_tm_model: trained model exposing ``theta_network``, ``num_topics``
            and ``beta``. This argument is the model that gets evaluated; no
            global model variable is used.
        test_data_word_matrix_tensor: (documents x vocabulary) count tensor.

    Returns:
        Scalar tensor: exp of the negative count-weighted mean log-probability.

    Side effects:
        Moves the model to the device of the test tensor and leaves it in
        eval mode.
    """
    # Evaluate on whatever device the test data lives on, with no dependence
    # on a global `device` variable.
    env_tm_model.to(test_data_word_matrix_tensor.device)
    env_tm_model.eval()

    with torch.no_grad():
        theta_test_params = env_tm_model.theta_network(test_data_word_matrix_tensor)
        theta_test_mu, theta_test_logvar = theta_test_params.split(env_tm_model.num_topics, dim=-1)
        theta_test_dist = Normal(theta_test_mu, torch.exp(0.5 * theta_test_logvar).add(1e-8))
        # The topic logits are sampled, so repeated calls can differ slightly.
        theta_test = theta_test_dist.rsample()
        theta_test_softmax = F.softmax(theta_test, dim=-1)

        beta_test_softmax = F.softmax(env_tm_model.beta, dim=-1)
        likelihood = torch.mm(theta_test_softmax, beta_test_softmax)

        # Perplexity = exp(-(sum of count * log p) / total token count).
        N = torch.sum(test_data_word_matrix_tensor)
        log_perplex = -torch.sum(torch.log(likelihood) * test_data_word_matrix_tensor) / N
        perplexity = torch.exp(log_perplex)
        return perplexity

In [None]:
perplexity = evaluate_model(lda_model, test_data_word_matrix_tensor)

In [None]:
print(perplexity)

In [None]:
def print_top_words(lda, vectorizer, num_top_words):
    """Print the highest-probability words of every topic.

    Args:
        lda: model whose ``beta`` holds the (topics x vocabulary) word logits.
        vectorizer: fitted vectorizer; ``get_feature_names_out()`` maps column
            indices to words.
        num_top_words: number of words to print per topic; capped at the
            vocabulary size.
    """
    # Look the vocabulary up once instead of once per printed word.
    feature_names = vectorizer.get_feature_names_out()

    beta = torch.nn.functional.softmax(lda.beta, dim=1)  # Convert to probabilities
    # topk raises if asked for more entries than the dimension holds.
    k = min(num_top_words, beta.size(1))

    for topic_idx, topic in enumerate(beta):
        top_word_ids = topic.topk(k).indices.tolist()
        print(f'Topic {topic_idx+1}: {[feature_names[word_id] for word_id in top_word_ids]}')


In [None]:
print_top_words(lda_model, vectorizer, num_top_words=12)