In [1]:
import pandas as pd
import numpy as np
import sklearn
import ast
import validators
import string
import demoji
import pyLDAvis.sklearn
import re
import nltk

from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import GridSearchCV
from nltk import pos_tag

In [2]:
def DFToDocument(df):
    document = []
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    for i, row in df.iterrows():
        sentence = ""
        tokenized_text = ast.literal_eval(row['tokenized_text'])
        pos_tags = pos_tag(tokenized_text)
        tonkenized_text = [token for token, pos in pos_tags if pos not in ['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS', 'VB']]
        for token in tokenized_text:
            if token not in stop_words and not validators.url(token) and token not in string.punctuation and not re.search(r'(chat[-\s])?gpt|\d+', token.lower()):
                sentence += lemmatizer.lemmatize(token) + ' '
        sentence = demoji.replace(string = sentence, repl = "")
        document.append(sentence[:-1])
    return document

In [3]:
def ProcessInput(inputArray):
    document = []
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    for text in inputArray:
        sentence = ""
        tokenized_text = nltk.word_tokenize(text)
        pos_tags = pos_tag(tokenized_text)
        tonkenized_text = [token for token, pos in pos_tags if pos not in ['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS', 'VB']]
        for token in tokenized_text:
            if token not in stop_words and not validators.url(token) and token not in string.punctuation and not re.search(r'(chat[-\s])?gpt|\d+', token.lower()):
                sentence += lemmatizer.lemmatize(token) + ' '
        sentence = demoji.replace(string = sentence, repl = "")
        document.append(sentence[:-1])
    return document

In [4]:
def ShowDocInTopic(topic, count, model_output, model):
    lda_df = pd.DataFrame({"text": document_train, "topic": model_output.argmax(axis=1)})
    pd.set_option('display.max_colwidth', None)
    return lda_df[lda_df['topic'] == topic].head(count)

In [5]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [6]:
df = pd.read_csv('./translated_dataframe.csv')
document = DFToDocument(df)
document_train, document_test = train_test_split(document, test_size=0.3, random_state=1)

In [7]:
vectorizer = CountVectorizer()
train_vecs = vectorizer.fit_transform(document_train)
test_vecs = vectorizer.transform(document_test)
feature_names = vectorizer.get_feature_names_out()

In [8]:
lda = LatentDirichletAllocation(n_components=10, learning_method='online', n_jobs = -1, random_state=1)
lda_output = lda.fit_transform(train_vecs)
display_topics(lda, feature_names, 10)

Topic 0:
ai tool via could make code it data business like
Topic 1:
google better chatbot next company ai point the bard created
Topic 2:
make take time based research yet fun asked way life
Topic 3:
want and writing ai create love crypto high correct from
Topic 4:
think good like used still much know when learning try
Topic 5:
use chat using many way asked work it great version
Topic 6:
ai you technology world interesting future new language problem the
Topic 7:
it answer question ai bing ask time chat like write
Topic 8:
the information even use one say that student thing is
Topic 9:
ai openai using service model video new image generative language


In [9]:
print("Perplexity: ", lda.perplexity(test_vecs))

Perplexity:  9756.231697967969


In [13]:
search_params = {'learning_decay': np.arange(0.1, 0.9, 0.1), 'doc_topic_prior': np.arange(0.1, 0.9, 0.1), 'topic_word_prior': np.arange(0.1, 0.9, 0.1)}
docs_lda = LatentDirichletAllocation(n_components=6, learning_decay=0.8, topic_word_prior=0.1, learning_method="online", n_jobs=-1)  

model = GridSearchCV(docs_lda, param_grid=search_params, n_jobs=8, verbose=3)

model.fit(train_vecs)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


0.8

learning_decay = 0.8
doc_topic_prior = 0.8
topic_word_prior = 0.1

In [14]:
best_lda = LatentDirichletAllocation(n_components=6, learning_method="online", learning_decay= 0.8, doc_topic_prior= 0.8, topic_word_prior= 0.1, n_jobs=-1)
best_lda_output = best_lda.fit_transform(train_vecs)

In [15]:
display_topics(best_lda, feature_names, 15)

Topic 0:
make you say people code it time in what bot love life for is point
Topic 1:
article good post morning problem chat high generated blog sentence note lol asked link called
Topic 2:
ai openai new google bing tool technology model microsoft artificialintelligence search how language via text
Topic 3:
use using asked write chat want day way if one prompt take come help create
Topic 4:
the intelligence artificial world future human what year nft crypto english language information source become
Topic 5:
like it answer know question think ask would get much even good thing also really


In [16]:
best_lda_df = pd.DataFrame({"text": document_train, "topic": best_lda_output.argmax(axis=1)})
print("Perplexity: ", best_lda.perplexity(test_vecs))

Perplexity:  6139.150702191745


In [17]:
new_doc = vectorizer.transform(ProcessInput(['I hate chatgpt, it is going to take my job']))

In [18]:
topic_probs = best_lda.transform(new_doc)
topic_probs.argmax()

3

In [19]:
ShowDocInTopic(topic_probs.argmax(), 50, best_lda_output, best_lda)

Unnamed: 0,text,topic
1,I'm exploring using get way help #teaching lot task asked #faculty day take needed necessary time teaching researching appropriaterelevant meaningful tech ARMtech help effectivity quality might I use tech,3
4,using igl,3
7,Here old prompt engineered solution I created multiplication seems better v one tested extensively,3
8,The longest English word containing one vowel strength,3
16,Artists beware I asked recreate Return Jedi movie poster using David Bowie Luke Skywalker Royal family rest cast #StarWars,3
21,Chat Ground-Shattering technology But people stuck beginner mode dead simple way use productivity copy-and-paste,3
25,Mr violates content policy If question guilty person's answer I'm guilty I wonder treated well seems penalty,3
27,Regarding utilization case I sold day We distributed free charge wrote review Inheritance Brain ⇧ I told I distribute due mistake Sorry Currently available Inheritance Review Privilege please check,3
32,exclusive stable account handmade One number shared account hour self-service mall Smile level politics safe,3
39,Using create script video lesson I programming,3
