In [18]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import gensim
from gensim.corpora import Dictionary
from gensim.models.wrappers import LdaMallet
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import LdaModel
import pyLDAvis
import pyLDAvis.gensim as gensimvis
from matplotlib import pyplot as plt
import ast

def load_data(file_path):
    """Read the keyphrase CSV and derive underscore-joined bigram tokens.

    The ``bigram_keyphrases`` column is stored in the CSV as the text form of
    a Python literal; it is parsed back with ``ast.literal_eval``.  Missing
    cells become ``None``.  A new ``cleaned_bigrams`` column holds, for every
    row, the first element of each parsed item with spaces replaced by
    underscores (an empty list when there are no keyphrases).

    Args:
        file_path: Path of the CSV file; it must contain a
            ``bigram_keyphrases`` column.

    Returns:
        The loaded DataFrame with ``bigram_keyphrases`` parsed and the
        ``cleaned_bigrams`` column added.
    """

    def _parse_keyphrases(raw):
        # Only non-missing cells carry a literal worth evaluating.
        if pd.notna(raw):
            return ast.literal_eval(raw)
        return None

    def _to_tokens(keyphrases):
        # None and empty containers both collapse to "no tokens".
        if not keyphrases:
            return []
        return [entry[0].replace(' ', '_') for entry in keyphrases]

    frame = pd.read_csv(file_path)
    frame['bigram_keyphrases'] = frame['bigram_keyphrases'].apply(_parse_keyphrases)
    frame['cleaned_bigrams'] = frame['bigram_keyphrases'].apply(_to_tokens)
    return frame

def filter_data(df, keywords=('spirituality', 'religion')):
    """Keep only the rows whose ``clean_text`` mentions every keyword.

    Matching is case-insensitive and uses ``Series.str.contains`` with its
    default pattern semantics, so each keyword is interpreted as a regular
    expression (plain words behave as substring searches).

    Args:
        df: DataFrame with a ``clean_text`` column.
        keywords: Patterns that must all occur in ``clean_text``.  Defaults
            to ``('spirituality', 'religion')``.  An empty sequence keeps
            every row.

    Returns:
        The subset of ``df`` whose ``clean_text`` matches all keywords.
    """
    text = df['clean_text']
    mask = pd.Series(True, index=df.index)
    for keyword in keywords:
        # na=False: rows with missing text are explicitly treated as
        # non-matching, so the mask is always strictly boolean instead of
        # relying on how NaN happens to be coerced when masks are combined.
        mask &= text.str.contains(keyword, case=False, na=False)
    return df[mask]

def create_doc_term_matrix(data):
    """Build a gensim vocabulary and bag-of-words corpus from the bigrams.

    Args:
        data: Table-like object whose ``cleaned_bigrams`` entry yields one
            list of tokens per document.

    Returns:
        A ``(dictionary, doc_term_matrix)`` pair: the gensim ``Dictionary``
        built from all documents, and a list with the ``doc2bow`` result of
        each document in the same order.
    """
    documents = data['cleaned_bigrams']
    vocabulary = Dictionary(documents)
    bag_of_words = list(map(vocabulary.doc2bow, documents))
    return vocabulary, bag_of_words

def train_lda_model(mallet_path, doc_term_matrix, dictionary, num_topics,
                    model_dir='../models',
                    model_name='lda_religion_AND_spirituality'):
    """Train a MALLET LDA model and save it to disk.

    Args:
        mallet_path: Path to the MALLET executable handed to ``LdaMallet``.
        doc_term_matrix: Bag-of-words corpus used for training.
        dictionary: Mapping from token ids to tokens (``id2word``).
        num_topics: Number of topics to fit.
        model_dir: Directory the model is saved into; created if missing.
            Defaults to ``'../models'``.
        model_name: File name of the saved model inside ``model_dir``.
            Defaults to ``'lda_religion_AND_spirituality'``.  The default
            does not include ``num_topics``, so repeated runs overwrite the
            same file unless a different name is passed.

    Returns:
        The trained ``LdaMallet`` model.
    """
    lda = LdaMallet(mallet_path, corpus=doc_term_matrix, num_topics=num_topics, id2word=dictionary)
    # The directory is created only after training has succeeded.
    os.makedirs(model_dir, exist_ok=True)
    lda.save(os.path.join(model_dir, model_name))
    return lda

def convert_to_gensim_lda(lda_model_mallet):
    """Convert a MALLET-wrapper model with gensim's ``malletmodel2ldamodel``.

    Args:
        lda_model_mallet: Trained MALLET-wrapper model.

    Returns:
        Whatever ``malletmodel2ldamodel`` returns for the given model.
    """
    to_gensim = gensim.models.wrappers.ldamallet.malletmodel2ldamodel
    return to_gensim(lda_model_mallet)

def visualize_lda_model(lda_model_gensim, doc_term_matrix, dictionary, num_topics):
    """Render the model with pyLDAvis and write the result as an HTML file.

    The file is written to
    ``../visualizations/lda_religion_AND_spirituality_<num_topics>_topics.html``;
    the directory is created if it does not exist.

    Args:
        lda_model_gensim: LDA model passed to ``pyLDAvis.gensim.prepare``.
        doc_term_matrix: Bag-of-words corpus for the model.
        dictionary: Dictionary matching the corpus.
        num_topics: Number of topics; only used to name the output file.
    """
    output_dir = '../visualizations'
    report_name = f'lda_religion_AND_spirituality_{num_topics}_topics.html'

    prepared = gensimvis.prepare(lda_model_gensim, doc_term_matrix, dictionary)

    os.makedirs(output_dir, exist_ok=True)
    pyLDAvis.save_html(prepared, os.path.join(output_dir, report_name))


# --- Pipeline configuration -------------------------------------------------
# NOTE(review): both paths are absolute and specific to one machine; adjust
# them before running elsewhere.
# CSV read by load_data(); it must provide the 'bigram_keyphrases' column
# (parsed there) and the 'clean_text' column (used by filter_data()).
file_path = "/Users/shtosti/Dropbox/study/UZH/FW23/SMA/topic_modelling_DEPO/data/with_clean_keybert_bigrams.csv"
# Location of the MALLET launcher passed to LdaMallet.
mallet_path = '/Users/shtosti/Dropbox/study/UZH/FW23/SMA/topic_modelling_DEPO/Mallet/mallet-2.0.8/bin/mallet'

# Number of topics to fit; also embedded in the visualization file name.
num_topics = 10

# Load the CSV and derive the underscore-joined 'cleaned_bigrams' column.
df = load_data(file_path)

# Keep only documents whose clean_text mentions both 'spirituality' and
# 'religion' (case-insensitive).
filtered_df = filter_data(df)

# Build the vocabulary and the bag-of-words corpus from the filtered bigrams.
dictionary, doc_term_matrix = create_doc_term_matrix(filtered_df)

# Train the MALLET LDA model; the model is also saved under ../models.
lda_model_mallet = train_lda_model(mallet_path, doc_term_matrix, dictionary, num_topics)

# Convert the MALLET wrapper model to a Gensim LDA model for pyLDAvis.
lda_model_gensim = convert_to_gensim_lda(lda_model_mallet)

# Prepare the pyLDAvis view and save it as HTML under ../visualizations.
visualize_lda_model(lda_model_gensim, doc_term_matrix, dictionary, num_topics)


Mallet LDA: 10 topics, 4 topic bits, 1111 topic mask
Data loaded.
max tokens: 5
total tokens: 57644
<10> LL/token: -11.08716
<20> LL/token: -10.93397
<30> LL/token: -10.8458
<40> LL/token: -10.80546

0	5	spirituality religion_spirituality spiritual spirituality_religion christian_kindle morhafalachkar_religion embeddedurl_morhafalachkar religions religion_spiritual books church spirituality_christian supernatural_inspirational morhafalachkar supernatural religion mennonite_fiction catholic spirituality_related organised_religion 
1	5	mary_the_blessed_virgin_jesus mary_the_blessed_virgin spirituality salvation_inspiration religion_scripture bible_mary_the_blessed_virgin african religion_spirituality religion_book kindle_vanceshepperson spirituality_spiritualityquotes africans want_religion selflove_religion love_hatred bible_exposition hatred_info hindu african_spirituality reading_kindle 
2	5	spirituality religion_spirituality retirement_finances retirement makingmoney_retirement finan