In [1]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.corpora import Dictionary
from gensim.models.wrappers import LdaMallet
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

import gensim
from gensim.corpora import Dictionary
from gensim.models.wrappers import LdaMallet
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import LdaModel
import pyLDAvis
import pyLDAvis.gensim as gensimvis
from matplotlib import pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


# Preprocess a single tweet/text
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared ``(stop_words, lemmatizer)`` pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        stop_words = set(stopwords.words('english'))
        # Extra corpus-specific stop word.
        # NOTE(review): presumably the placeholder left where URLs were
        # stripped during cleaning -- confirm against the cleaning step.
        stop_words.add("embeddedurl")
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stop_words)
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Tokenize, normalize and lemmatize a single text.

    Steps, in order: NLTK word tokenization, dropping tokens that are not
    purely alphabetic, lowercasing, removing English stop words (plus
    ``"embeddedurl"``), and WordNet lemmatization.

    Args:
        text: The raw text to process.

    Returns:
        A list of lemmatized, lowercased tokens. Non-string input (for
        example a NaN from a missing DataFrame cell) yields an empty list.
    """
    if not isinstance(text, str):
        return []

    # The stop-word set and lemmatizer are identical for every call, so they
    # are built once and reused instead of being recreated per document.
    stop_words, lemmatizer = _get_preprocess_resources()

    tokens = [word.lower() for word in word_tokenize(text) if word.isalpha()]
    return [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

# Load the data and transform it into a DataFrame
df = pd.read_csv("/Users/shtosti/Dropbox/study/UZH/FW23/SMA/topic_modelling_DEPO/data/full_year.csv")

# Keep tweets that mention "spirituality" but do NOT mention "religion"
# (case-insensitive substring match on the cleaned text).
# na=False makes rows with missing text count as "no match", so the mask is
# strictly boolean. .copy() gives an independent frame, so the column added
# below is not written onto a slice of the original DataFrame.
df = df[
    df['clean_text'].str.contains('spirituality', case=False, na=False)
    & ~df['clean_text'].str.contains('religion', case=False, na=False)
].copy()
df['tokenized_text'] = df['clean_text'].apply(preprocess_text)

# TF-IDF weighting of the cleaned (untokenized) text
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['clean_text'])

# Expose the sparse TF-IDF matrix through Gensim's corpus interface;
# documents_columns=False tells Gensim that each row of X is one document.
# NOTE(review): the LdaMallet call further down is fed doc_term_matrix,
# not this corpus -- confirm whether the TF-IDF corpus is still needed.
corpus = gensim.matutils.Sparse2Corpus(X, documents_columns=False)

# Vocabulary plus bag-of-words vectors built from the tokenized tweets
dictionary = Dictionary(df['tokenized_text'])
doc_term_matrix = list(map(dictionary.doc2bow, df['tokenized_text']))

# Mallet launcher, given relative to the current working directory
mallet_path = '../mallet-2.0.8/bin/mallet'

# Train the Mallet LDA model on the bag-of-words corpus, then persist it
num_topics = 10
lda = LdaMallet(
    mallet_path,
    id2word=dictionary,
    num_topics=num_topics,
    corpus=doc_term_matrix,
)
os.makedirs('models', exist_ok=True)
# NOTE(review): the model is stored as 'lda_religion' while the HTML export
# below is named 'lda_spirituality_...' -- confirm the intended file name.
lda.save(os.path.join('models', 'lda_religion'))


# Convert the Mallet wrapper model into a Gensim LdaModel and prepare the
# pyLDAvis data from it
lda_model_mallet = lda
lda_model_gensim = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(
    lda_model_mallet
)
vis_data = gensimvis.prepare(lda_model_gensim, doc_term_matrix, dictionary)

# Save the visualization to HTML, creating the output directory if needed
html_dir = 'visualizations'
html_filename = os.path.join(html_dir, f'lda_spirituality_{num_topics}_topics.html')
os.makedirs(html_dir, exist_ok=True)
pyLDAvis.save_html(vis_data, html_filename)

[nltk_data] Downloading package punkt to /Users/shtosti/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shtosti/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/shtosti/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  from imp import reload
Mallet LDA: 10 topics, 4 topic bits, 1111 topic mask
Data loaded.
max tokens: 61
total tokens: 1366629
<10> LL/token: -10.40699
<20> LL/token: -10.07835
<30> LL/token: -9.72283
<40> LL/token: -9.43628

0	5	spirituality life book spiritual work love experience gt real success freedom guide start business faith gratitude lifestyle month miracle writingcommunity 
1	5	spirituality christian music church faith christianity thing kindle people rt amazon life thought medium share listen good catholic bible time 
2	5	spirituality day ji true saint world humanity path guru make india relig