# LDA Topic Modelling using Gensim

In [8]:
import pandas as pd

#nlp libraries
import nltk
from nltk.corpus import stopwords

#gensim libraries
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel

#spacy for lemmatisation
import spacy

import pyLDAvis
import pyLDAvis.gensim

import warnings
warnings.filterwarnings("ignore")

In [9]:
# Load data and add a column 'anonymity_class' for grouping
data = pd.read_csv("threads_master_data.csv")

In [10]:
# Download NLTK stopwords if necessary
nltk.download("stopwords")
stopwords = stopwords.words("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akshathailangovan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Preprocessing: Lemmatization function
def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    try:
        nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    except OSError:
        # If model is not found, download it
        import subprocess
        subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
        nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
        
    texts_out = []
    for text in texts:
        doc = nlp(text)
        new_text = [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        texts_out.append(" ".join(new_text))
    return texts_out

In [12]:
# Tokenization and removal of stopwords
def gen_words(texts):
    final = []
    for text in texts:
        new = gensim.utils.simple_preprocess(text, deacc=True)
        final.append(new)
    return final

In [13]:
# Function to process and build LDA for each anonymity class
def process_group(df_group):
    # Preprocessing step: Lemmatize the data
    lemmatized_texts = lemmatization(df_group['Post_Content'].values.tolist())

    # Tokenization and removal of stopwords
    data_words = gen_words(lemmatized_texts)

    # Create Dictionary and Corpus
    id2word = corpora.Dictionary(data_words)
    corpus = [id2word.doc2bow(text) for text in data_words]

    # Build LDA model for this group
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=10,  # Adjust number of topics as needed
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto')
    # Print coherence score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words, dictionary=id2word, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()
    print(f"Coherence Score for Group {df_group['Anonymity_Class'].iloc[0]}: {coherence_score}")

    # Visualize the LDA model using pyLDAvis
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds="mmds", R=10)
    
    return vis

In [14]:
# Grouping data by 'Anonymity_Class'
grouped_data = data.groupby('Anonymity_Class')

# Iterating over each group, processing it, and displaying the LDA visualization
lda_vis_per_class = {}
for anonymity_class, group in grouped_data:
    print(f"Processing Anonymity Class: {anonymity_class}")
    lda_vis_per_class[anonymity_class] = process_group(group)

# Saving each LDA visualization separately
for anonymity_class, vis in lda_vis_per_class.items():
    pyLDAvis.save_html(vis, f'lda_topics_visualization_{anonymity_class}.html')

Processing Anonymity Class: Anonymous
Coherence Score for Group Anonymous: 0.5353695473697434
Processing Anonymity Class: Highly Identifiable
Coherence Score for Group Highly Identifiable: 0.47332960453758827
Processing Anonymity Class: Identifiable
Coherence Score for Group Identifiable: 0.39321859876535725
Processing Anonymity Class: Partially Anonymous
Coherence Score for Group Partially Anonymous: 0.46587067272202126
