In [14]:
import pandas as pd
import numpy as np
import sklearn
import ast
import validators
import string
import demoji
import pyLDAvis.sklearn
import re

from nltk.corpus import stopwords
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

In [19]:
def DFToDocument(df):
    document = []
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    for i, row in df.iterrows():
        sentence = ""
        tokenized_text = ast.literal_eval(row['tokenized_text'])
        pos_tags = pos_tag(tokenized_text)
        tonkenized_text = [token for token, pos in pos_tags if pos not in ['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS', 'VB']]
        for token in tokenized_text:
            if token not in stop_words and not validators.url(token) and token not in string.punctuation and not re.search(r'(chat[-\s])?gpt|\d+', token.lower()):
                sentence += lemmatizer.lemmatize(token) + ' '
        sentence = demoji.replace(string = sentence, repl = "")
        document.append(sentence[:-1])
    return document

In [3]:
df = pd.read_csv('./translated_dataframe.csv')
df.shape

(35371, 12)

In [20]:
document = DFToDocument(df)
document

['I want increase Twitter follower At least',
 'Overall using AI create art expand boundary creativity allow artist explore new direction work  ',
 'This prompt created new  chat model following #MartesDeDatos talk text-davinci new model  ',
 "Just played around OpenAI's new model asked write poem embodied carbon building This I got ... Try #embodiedcarbon #poetry",
 'I asked write story Sova Jett falling love ...',
 "OpenAI's new good creative writing #openAI #poetry #Bitcoin",
 'This great improvement Unlike text-davinci understands context previous user ’ answer',
 '#OpenAI Wow',
 'My mind blown I cannot believe good OpenAI I see already replacing lot googling task I right box I ’ even know say anymore I must something wrong Someone please break',
 'Absolutely insane I asked generate simple personal website It showed step step create added HTML CSS The output used ',
 'It incredible manages propose analysis magnitude lot sense',
 'pretty mind-blowing',
 ':) #機械学習 #自然言語処理',
 'release

In [21]:
# CountVectorizer converts the text documents to a matrix of token counts
vectorizer = CountVectorizer()
docs_vecs = vectorizer.fit_transform(document)

# get_feature_names saves all the words. This allows us to see the words in each topic later
feature_names = vectorizer.get_feature_names_out()

In [22]:
docs_lda = LatentDirichletAllocation(n_components=10,               # Number of topics
                                      learning_method='online',   
                                      n_jobs = -1, random_state=1)              # use all available CPU
lda_output = docs_lda.fit_transform(docs_vecs)

In [23]:
# write a function to display the 10 topics and for each topic we choose the top most frequent words. 
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

no_top_words = 15
display_topics(docs_lda, feature_names, 15)

Topic 0:
video japan found copy quite launch bit group sorry scary car only capable okay tesla
Topic 1:
ai would microsoft google take next bing chatbot business change the new week current case
Topic 2:
one intelligence artificial ai the here like via content using in response love story write
Topic 3:
it ai time use thing can work think good write want asked this know still
Topic 4:
to end news one the written from school high teacher self first and episode including
Topic 5:
ai openai tool technology the language model seems tech artificialintelligence like human world data so
Topic 6:
day information twitter market want service user keep provide become at update order increase million
Topic 7:
answer make question ask if use way you need get it give people what prompt
Topic 8:
also but well topic saying fact getting hot poem true turn tell yesterday it else
Topic 9:
chat like know search better right asked think going even when much google already look


In [24]:
lda_df = pd.DataFrame({"text": document, "topic": lda_output.argmax(axis=1)})

In [25]:
for topic in range(docs_lda.n_components):
    print(f"Topic {topic}:")
    print(lda_df[lda_df['topic'] == topic].sort_values(by='topic', ascending=False).head(50))
    print()

Topic 0:
                                                    text  topic
21                                                 Flipo      0
25523                                                         0
25526                                                         0
25529  I kinda prefer Bing's belligerent gaslighting ...      0
25536                                                         0
25541                                                         0
25546                                                         0
25558                                                         0
25564                                                         0
25565                           It's Mia Khalifa I asked      0
25568                                                         0
25569  TRG RDD Irritating US Media Bot DISSES Meghan ...      0
25570                                                         0
25574                                                         0
25577                          

# Visualizations

In [10]:
#Set up the environment to display the graphical outputs
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [11]:
# Generate the visuals
visual = pyLDAvis.sklearn.prepare(docs_lda, docs_vecs, vectorizer)

  default_term_info = default_term_info.sort_values(


In [12]:
visual