In [2]:
import pandas as pd
import numpy as np
import sklearn
import ast
import validators
import string
import demoji
import pyLDAvis.sklearn
import re

from nltk.corpus import stopwords
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

In [3]:
def DFToDocument(df):
    """Build one cleaned, lemmatized string per row of ``df``.

    For every row, the ``tokenized_text`` cell (a string holding a Python
    list literal) is parsed with ``ast.literal_eval`` and POS-tagged. Tokens
    are dropped when they:

    * carry one of the excluded POS tags (adjectives, adverbs, base-form verbs),
    * are in NLTK's English stop-word list,
    * validate as a URL,
    * are found in ``string.punctuation``,
    * contain "gpt" or any digit (checked on the lower-cased token).

    Surviving tokens are lemmatized and joined with single spaces, then
    emojis are stripped from the resulting sentence.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``tokenized_text`` column whose values are string
        representations of token lists.

    Returns
    -------
    list of str
        One cleaned sentence per row, in row order.
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    # Loop-invariant helpers, built once instead of once per token.
    excluded_pos = {'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS', 'VB'}
    blocked_pattern = re.compile(r'(chat[-\s])?gpt|\d+')

    document = []
    for _, row in df.iterrows():
        tokenized_text = ast.literal_eval(row['tokenized_text'])
        # Fix: the filtered list used to be stored under a misspelled name
        # ("tonkenized_text"), so the POS filter was silently never applied.
        tokenized_text = [token for token, pos in pos_tag(tokenized_text)
                          if pos not in excluded_pos]
        # NOTE(review): the stop-word test is case-sensitive while the regex
        # test lower-cases the token — confirm whether tokens arrive lower-cased.
        lemmas = [
            lemmatizer.lemmatize(token)
            for token in tokenized_text
            if token not in stop_words
            and not validators.url(token)
            and token not in string.punctuation
            and not blocked_pattern.search(token.lower())
        ]
        sentence = demoji.replace(string=' '.join(lemmas), repl="")
        document.append(sentence)
    return document

In [4]:
# Load the dataset consumed by DFToDocument (which reads its 'tokenized_text' column).
# The path is relative to the notebook's working directory.
df = pd.read_csv('translated_tokenized/translated_dataframe.csv')
df.shape

(35371, 12)

In [None]:
document = DFToDocument(df)
# Preview only the first few cleaned documents: the list has one entry per
# row of df, so displaying it whole would flood the cell output.
document[:5]

In [None]:
# TfidfVectorizer converts the text documents to a matrix of TF-IDF weighted term features.
# Alternative (raw token counts; CountVectorizer is not imported in the visible import cell): vectorizer = CountVectorizer()
vectorizer = TfidfVectorizer()

docs_vecs = vectorizer.fit_transform(document)

# get_feature_names_out returns the vocabulary terms. This allows us to see the words in each topic later
feature_names = vectorizer.get_feature_names_out()

In [None]:
# Fit the LDA topic model on the vectorized documents and keep the
# per-document topic distribution returned by fit_transform.
docs_lda = LatentDirichletAllocation(
    n_components=10,           # number of topics
    learning_method='online',
    n_jobs=-1,                 # use all available CPUs
    random_state=1,            # fixed seed so results are reproducible
)
lda_output = docs_lda.fit_transform(docs_vecs)

In [None]:
# Print each topic of a fitted model together with its highest-weighted words.
def display_topics(model, feature_names, no_top_words):
    """Print the top-weighted terms of every topic in ``model``.

    Parameters
    ----------
    model : object exposing ``components_``
        Iterated row by row; each row holds one topic's per-term weights
        and must support ``argsort()``.
    feature_names : sequence of str
        Term for each column index of ``components_``.
    no_top_words : int
        How many of the highest-weighted terms to print per topic.

    Prints two lines per topic: a ``Topic <idx>:`` header, then the terms
    separated by single spaces, highest weight first.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        top_indices = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[idx] for idx in top_indices]
        print("Topic %d:" % (topic_idx))
        print(" ".join(top_terms))

# Number of top-weighted words to show per topic; pass the variable itself so
# changing it here actually changes the output (a literal 15 was passed before).
no_top_words = 15
display_topics(docs_lda, feature_names, no_top_words)

In [None]:
# Pair each cleaned document with its dominant topic, i.e. the column index of
# the largest value in that document's row of lda_output.
lda_df = pd.DataFrame({"text": document, "topic": lda_output.argmax(axis=1)})

In [None]:
# For each topic, show up to 50 of the documents assigned to it, strongest first.
# The previous version sorted by 'topic' after filtering to a single topic value,
# so the sort key was constant and the 50 rows shown were not meaningfully ranked.
# Here documents are ranked by their weight for the topic, taken from lda_output
# (rows of lda_output correspond positionally to rows of lda_df).
for topic in range(docs_lda.n_components):
    in_topic = (lda_df['topic'] == topic).to_numpy()
    ranked = (
        lda_df[in_topic]
        .assign(weight=lda_output[in_topic, topic])
        .sort_values(by='weight', ascending=False)
    )
    print(f"Topic {topic}:")
    print(ranked.head(50))
    print()

# Visualizations

In [None]:
# Set up the environment to display the graphical outputs inside the notebook.
# NOTE(review): pyLDAvis.sklearn is already imported in the first import cell;
# consider moving `import pyLDAvis` there too so all imports live in one place.
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [None]:
# Generate the visuals from the fitted LDA model, the vectorized documents
# it was fitted on, and the vectorizer that produced them.
visual = pyLDAvis.sklearn.prepare(docs_lda, docs_vecs, vectorizer)

In [None]:
# Display the prepared pyLDAvis object as this cell's output.
visual