In [1]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import gensim
from gensim.corpora import Dictionary
from gensim.models.wrappers import LdaMallet
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import LdaModel
import pyLDAvis
import pyLDAvis.gensim as gensimvis
from matplotlib import pyplot as plt
import ast

def load_data(file_path):
    """Read the keyphrase CSV and derive underscore-joined bigram tokens.

    The ``bigram_keyphrases`` column holds the string form of a Python
    literal. Non-missing cells are parsed with ``ast.literal_eval``; missing
    cells become ``None``. A ``cleaned_bigrams`` column is then added which,
    for each row, contains the first element of every parsed item with spaces
    replaced by underscores (an empty list when the row has no keyphrases).

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The loaded DataFrame with ``bigram_keyphrases`` parsed and a
        ``cleaned_bigrams`` column added.
    """
    def _parse(cell):
        # Missing cells come back from read_csv as NaN; keep them as None.
        if pd.notna(cell):
            return ast.literal_eval(cell)
        return None

    def _to_tokens(items):
        # None and empty containers both map to "no tokens".
        if not items:
            return []
        # presumably each item is a (phrase, score) pair -- only item[0] is used
        return [entry[0].replace(' ', '_') for entry in items]

    frame = pd.read_csv(file_path)
    frame['bigram_keyphrases'] = frame['bigram_keyphrases'].apply(_parse)
    frame['cleaned_bigrams'] = frame['bigram_keyphrases'].apply(_to_tokens)
    return frame

def filter_data(df):
    """Keep rows whose text mentions 'religion' but not 'spirituality'.

    Both checks are case-insensitive substring matches on the ``clean_text``
    column. Rows whose ``clean_text`` is missing are excluded: ``na=False``
    makes ``str.contains`` return ``False`` for them instead of NaN, which
    would otherwise break the ``~`` negation and the boolean mask.

    Args:
        df: DataFrame with a ``clean_text`` column of strings.

    Returns:
        The subset of ``df`` matching the filter, with the original index.
    """
    text = df['clean_text']
    mentions_spirituality = text.str.contains('spirituality', case=False, na=False)
    mentions_religion = text.str.contains('religion', case=False, na=False)
    return df[~mentions_spirituality & mentions_religion]

def create_doc_term_matrix(data):
    """Build a gensim vocabulary and bag-of-words corpus from the bigrams.

    Args:
        data: DataFrame with a ``cleaned_bigrams`` column; each entry is the
            list of tokens for one document.

    Returns:
        A ``(dictionary, doc_term_matrix)`` tuple: the ``Dictionary`` built
        from all documents, and a list with one ``doc2bow`` result per
        document, in row order.
    """
    documents = data['cleaned_bigrams']
    vocabulary = Dictionary(documents)
    bag_of_words = list(map(vocabulary.doc2bow, documents))
    return vocabulary, bag_of_words

def train_lda_model(mallet_path, doc_term_matrix, dictionary, num_topics,
                    model_dir='../models', model_name='lda_religion'):
    """Train an LDA model through the gensim Mallet wrapper and save it.

    Args:
        mallet_path: Path to the Mallet executable.
        doc_term_matrix: Bag-of-words corpus passed to ``LdaMallet`` as
            ``corpus``.
        dictionary: Mapping passed to ``LdaMallet`` as ``id2word``.
        num_topics: Number of topics to fit.
        model_dir: Directory the model is saved into; created if it does not
            exist. Defaults to ``'../models'``, the previously hard-coded
            location.
        model_name: File name of the saved model inside ``model_dir``.
            Defaults to ``'lda_religion'``.

    Returns:
        The trained ``LdaMallet`` model.
    """
    lda = LdaMallet(mallet_path, corpus=doc_term_matrix, num_topics=num_topics, id2word=dictionary)
    # The relative default is resolved against the current working directory.
    os.makedirs(model_dir, exist_ok=True)
    lda.save(os.path.join(model_dir, model_name))
    return lda

def convert_to_gensim_lda(lda_model_mallet):
    """Convert a Mallet-wrapper LDA model into a gensim LDA model.

    Args:
        lda_model_mallet: Model produced by the gensim Mallet wrapper.

    Returns:
        Whatever ``malletmodel2ldamodel`` returns for the given model.
    """
    to_gensim = gensim.models.wrappers.ldamallet.malletmodel2ldamodel
    return to_gensim(lda_model_mallet)

def visualize_lda_model(lda_model_gensim, doc_term_matrix, dictionary, num_topics,
                        html_dir='../visualizations'):
    """Prepare a pyLDAvis visualization of the model and save it as HTML.

    The file is written to ``<html_dir>/lda_religion_<num_topics>_topics.html``.

    Args:
        lda_model_gensim: gensim LDA model to visualize.
        doc_term_matrix: Bag-of-words corpus passed to the pyLDAvis
            ``prepare`` call.
        dictionary: gensim dictionary passed to the pyLDAvis ``prepare`` call.
        num_topics: Number of topics; used only to build the output file name.
        html_dir: Directory the HTML file is written into; created if it does
            not exist. Defaults to ``'../visualizations'``, the previously
            hard-coded location.
    """
    vis_data = gensimvis.prepare(lda_model_gensim, doc_term_matrix, dictionary)
    # The relative default is resolved against the current working directory.
    os.makedirs(html_dir, exist_ok=True)
    html_filename = os.path.join(html_dir, f'lda_religion_{num_topics}_topics.html')
    pyLDAvis.save_html(vis_data, html_filename)


# configuration: number of topics, Mallet executable, input CSV
num_topics = 10
mallet_path = '/Users/shtosti/Dropbox/study/UZH/FW23/SMA/topic_modelling_DEPO/Mallet/mallet-2.0.8/bin/mallet'
file_path = "/Users/shtosti/Dropbox/study/UZH/FW23/SMA/topic_modelling_DEPO/data/with_clean_keybert_bigrams.csv"

# read the CSV, then keep only the rows selected by filter_data
df = load_data(file_path=file_path)
filtered_df = filter_data(df=df)

# build the vocabulary and the bag-of-words corpus from the filtered rows
dictionary, doc_term_matrix = create_doc_term_matrix(data=filtered_df)

# fit the Mallet LDA model (train_lda_model also saves it to disk)
lda_model_mallet = train_lda_model(
    mallet_path=mallet_path,
    doc_term_matrix=doc_term_matrix,
    dictionary=dictionary,
    num_topics=num_topics,
)

# convert the Mallet model to a gensim LDA model for pyLDAvis
lda_model_gensim = convert_to_gensim_lda(lda_model_mallet=lda_model_mallet)

# render the pyLDAvis visualization and write it out as HTML
visualize_lda_model(
    lda_model_gensim=lda_model_gensim,
    doc_term_matrix=doc_term_matrix,
    dictionary=dictionary,
    num_topics=num_topics,
)


  from imp import reload
Mallet LDA: 10 topics, 4 topic bits, 1111 topic mask
Data loaded.
max tokens: 5
total tokens: 524355
<10> LL/token: -14.54632
<20> LL/token: -14.424
<30> LL/token: -14.36332
<40> LL/token: -14.32906

0	5	religion muslimadvocates muslimban_support nilc_muslimadvocates muslimban hindu religion_embeddedurl quran new_religion religion_political organized_religion humanrights_freedom believe_religion embeddedurl entitled_humanrights vaccine greencard_lottery cult religion_religion allah 
1	5	religion freedom_religion religion_embeddedurl true_religion secular hate_religion religion_politics hindu islam embeddedurl taliban terrorist freedom religion_cult monastery religion_peace hindutva terrorists_stormed stormed_monastery monastery_thegone 
2	5	religion islam islam_religion religious embeddedurl cult state_religion hindus religion_embeddedurl religion_islam god_religion faith discrimination religion_politics losing_religion muhammad caste_religion embeddedurl_embed