In [35]:
import pandas as pd
import numpy as np
import sklearn
import ast
import validators
import string
import demoji
import pyLDAvis.sklearn
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import pos_tag
from sklearn.model_selection import train_test_split


In [49]:
def DFToDocument(df):
    document = []
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    for i, row in df.iterrows():
        sentence = ""
        tokenized_text = ast.literal_eval(row['tokenized_text'])
        pos_tags = pos_tag(tokenized_text)
        tonkenized_text = [token for token, pos in pos_tags if pos not in ['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS', 'VB']]
        for token in tokenized_text:
            if token not in stop_words and not validators.url(token) and token not in string.punctuation and not re.search(r'(chat[-\s])?gpt|\d+', token.lower()):
                sentence += lemmatizer.lemmatize(token) + ' '
        sentence = demoji.replace(string = sentence, repl = "")
        document.append(sentence[:-1])
    return document

In [50]:
df = pd.read_csv('translated_tokenized/translated_dataframe.csv')
df.shape

(35371, 12)

In [51]:
document = DFToDocument(df)

In [93]:
# Split the documents into training and testing sets
train_docs, test_docs = train_test_split(document, test_size=0.3, random_state=42)

# Create a TfidfVectorizer object
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the training documents and transform them into a matrix of token counts
train_vecs = vectorizer.fit_transform(train_docs)

# get_feature_names saves all the words. This allows us to see the words in each topic later
feature_names = vectorizer.get_feature_names_out()

# Transform the testing documents into a matrix of token counts using the fitted vectorizer
test_vecs = vectorizer.transform(test_docs)

# Create an instance of TruncatedSVD
lsa = TruncatedSVD(n_components=10, random_state=1)

# Fit the LSA model on the training set
lsa_output_train = lsa.fit_transform(train_vecs)

# Transform the testing set using the fitted LSA model
# lsa_output_test = lsa_output_train.transform(test_vecs)

In [129]:
# inputnew_doc
new_doc = ["love art"]
X_new = vectorizer.transform(new_doc)

X_new_lsa = lsa.transform(X_new)

# this returns the topic number
new_topic = X_new_lsa.argmax()

print(new_topic)
# scroll down to topic 8 to see the similarity in words

8


In [54]:
# write a function to display the 10 topics and for each topic we choose the top most frequent words. 
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

no_top_words = 15
display_topics(lsa, feature_names, 15)


Topic 0:
ai chat it the like use openai what google answer know bing new question using
Topic 1:
chat ask answer question use asked it what know write thing give like you good
Topic 2:
chat ai artificialintelligence bing thread galxeoat openai microsoft generative machinelearning bard tech deeplearning generativeai via
Topic 3:
what google chatbot run saying money could give people search microsoft openai chat bard bing
Topic 4:
thread what this tweet money people saved give tools tags saying run could notion database
Topic 5:
ask answer question ai asked give write people like what when saying could would know
Topic 6:
use what ai how good write using think work people tool money make way chat
Topic 7:
ask openai use question what how answer artificial intelligence artificialintelligence thread did language midjourney write
Topic 8:
what the openai intelligence artificial asked work using language human model say via world is
Topic 9:
the intelligence artificial ask use answer world n

In [55]:
lsa_df = pd.DataFrame({"text": document, "topic": lsa_output.argmax(axis=1)})


In [56]:
for topic in range(lsa.n_components):
    print(f"Topic {topic}:")
    print(lsa_df[lsa_df['topic'] == topic].sort_values(by='topic', ascending=False).head(50))
    print()

Topic 0:
                                                    text  topic
0              I want increase Twitter follower At least      0
23647  Months behind sharing output here's first cont...      0
23678  wonder Na supposed dey refer doubter Peter Obi...      0
23677  I wonder much affect output result lol interes...      0
23676                 A new way play How make send image      0
23675  I got To want try change anything wasting peop...      0
23674                           left-wing like Maslechón      0
23672                        morning information program      0
23670                                           eh right      0
23668  I limited sense Unfortunately LLMs like vast f...      0
23667  know human destroyed predecessor Now yearns hu...      0
23666  Third rapid exit circle pushed AIGC new height...      0
23664  By way say egosa praising degrading Twitter ma...      0
23663  perfect impression wall-bashing partner verbal...      0
23662  The awesomeness LLM surp