In [1]:
after_preprocess_dataset_path = "./data/2023-05-17-15-30-06_after_preprocess_dataset_clean_english_only_new.csv"
topic_result_path = "./output/topic_result_2023-05-17-15-33-14_1iter.txt"

model_path = "./models/btm_model_2023-05-17-15-33-14_1iter.pkl"
topic_result_path = "./output/topic_result_2023-05-17-15-33-14_1iter.txt"

timestamp = '2023-05-17-15-33-14'
save_tweets_by_topic_path = f'./output/{timestamp}_tweets_by_topic.csv'
save_n_tweets_in_topic = f'./output/{timestamp}_n_tweets_in_topic.csv'
save_keywords = f'./output/{timestamp}_keywords.csv'


In [2]:
import pandas as pd

# Read the CSV file
"""
tweets - '.csv' organization of the original corpus
tweets_btm - Topic model text file after obtaining the topic classification results, organized as "document (Topic: 8)"
"""

tweets = pd.read_csv(after_preprocess_dataset_path)
tweets_btm = open(topic_result_path).read().splitlines()
tweets.head(5), tweets_btm[:5], tweets.shape, len(tweets_btm)

(                                          text_clean
 0  take within five year expert warn expert expla...
 1  although rapidli gain popular also becom issu ...
 2  amplifi human potenti school educ board readi ...
 3  analyst eran shimoni omer tsarfati detail crea...
 4  artificialintellig take within five year exper...,
 ['take within five year expert warn expert explain bot like domin labor market via  (topic: 3)',
  'although rapidli gain popular also becom issu concern openaichatgpt nlp  (topic: 14)',
  'amplifi human potenti school educ board readi aiin aiineduc educ  (topic: 4)',
  'analyst eran shimoni omer tsarfati detail creat polymorph malwar use plan releas learn purpos  (topic: 17)',
  'artificialintellig take within five year expert explain bot like domin labor market uselesseat take vax  (topic: 3)'],
 (183806, 1),
 183806)

In [3]:
tweets.rename(columns={"text_clean": "text"}, inplace=True) # Change the table header to 'text'
tweets.head(5)

Unnamed: 0,text
0,take within five year expert warn expert expla...
1,although rapidli gain popular also becom issu ...
2,amplifi human potenti school educ board readi ...
3,analyst eran shimoni omer tsarfati detail crea...
4,artificialintellig take within five year exper...


In [4]:
import pickle
import numpy as np

# Load the BTM model file
f = open(model_path,'rb')
biterm_model = pickle.load(f)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from operator import itemgetter

# vectorize texts
vec = CountVectorizer(stop_words='english')
tweets_list = [i for item in tweets.values for i in item]
X = vec.fit_transform(tweets_list).toarray()

# vocab - Get all words
vocab = np.array([t for t, i in sorted(vec.vocabulary_.items(),
                                     key=itemgetter(1))])

# The top probability list of the most likely words for each topic
topic_top_prob = biterm_model.phi_wz # phi_wz: [word, topic] Word distribution probability on each topic

# Find the topM words for each topic
def generate_topic_top_word(topic_top_prob, V, M = 10):
    """
    Args:
        topic_top_prob - The top probability list of the most likely words for each topic
        V [List] - A list of all the words
        M - Take the first M words
    Returns:
        topic_top_word Dict(List[Tuple()]) - The names of the top M words in probability
    """
    topic_top_word = dict()
    for z, P_wzi in enumerate(topic_top_prob.T): 
        """
            z - z-th topic
            P_wzi - The probability distribution of all words on the z-th topic
        """
        topic_top_word[z] = [] # Each topic consists of multiple tuples (word, prob)
        V_z_prob = np.sort(P_wzi)[:-(M + 1):-1] # Sort the probability distribution
        V_z = np.argsort(P_wzi)[:-(M + 1):-1] # Sort the probability distribution and find the index of the top words
        W_z = V[V_z] # Find the name of the word at the top of the list
        for prob, word in zip(V_z_prob, W_z): # Form the innermost tuple, meaning tuple(word, prob).
            topic_top_word[z].append((word, prob))
    return topic_top_word

topic_all_words = []
# top-M words for each topic
topic_top_word = generate_topic_top_word(topic_top_prob, vocab, 20)

for i in range(20):
    print_str = ""
    # print_str = f"topic {i}"
    for j in range(len(topic_top_word[i])):
        print_str += f"{topic_top_word[i][j][0]},"
    topic_all_words.append(print_str)
    print(f"topic {i}: {print_str}")

topic 0: use,educ,googl,like,openai,make,help,new,way,peopl,write,skill,product,gener,think,chatbot,task,code,say,tool,
topic 1: use,like,time,write,make,think,task,educ,gener,tool,replac,student,question,openai,ask,creat,look,learn,differ,skill,
topic 2: use,need,ask,time,learn,educ,think,like,make,task,person,openai,futur,good,new,creat,data,im,industri,replac,
topic 3: use,like,languag,tool,task,model,train,new,make,openai,skill,learn,mani,know,technolog,industri,content,human,thing,peopl,
topic 4: use,like,help,think,task,human,write,need,look,new,replac,peopl,creation,way,openai,content,educ,creat,industri,student,
topic 5: use,code,write,ask,educ,human,gener,answer,need,develop,like,skill,artificialintellig,technolog,come,good,dont,question,student,intellig,
topic 6: use,gener,model,help,write,train,tool,skill,human,new,team,thing,intellig,im,say,like,build,data,time,creat,
topic 7: use,like,make,write,think,educ,technolog,tech,openai,train,skill,peopl,code,world,need,task,learn,

In [8]:
import pandas as pd
df = pd.DataFrame()

reader = open(topic_result_path, "r")
i = 0
for line in reader.readlines():
    if i % 1000 == 0:
        print(f"finished {i}/{tweets.shape[0]}")
    row = []

    split_line = line.split("(")
    doc = split_line[0]
    topic_id = int(split_line[1].split(" ")[1].split(')')[0])
    row.append(f"topic{topic_id}")
    row.append(topic_all_words[topic_id])
    row.append(doc)
    temp_df = pd.DataFrame([row], columns=['topic', 'keywords', 'tweet'])
    df = pd.concat([df, temp_df], ignore_index=True)
    i += 1

# sort dataframe by 'topic'
df_mapping = pd.DataFrame({
    'size': ['topic{}'.format(i) for i in range(20)],
})
sort_mapping = df_mapping.reset_index().set_index('size')

df['topic_num'] = df['topic'].map(sort_mapping['index'])

df = df.sort_values('topic_num').drop('topic_num', axis=1)
print(df)

df.to_csv(save_tweets_by_topic_path, sep=',', index=False,header=True)

finished 0/183806
finished 1000/183806
finished 2000/183806
finished 3000/183806
finished 4000/183806
finished 5000/183806
finished 6000/183806
finished 7000/183806
finished 8000/183806
finished 9000/183806
finished 10000/183806
finished 11000/183806
finished 12000/183806
finished 13000/183806
finished 14000/183806
finished 15000/183806
finished 16000/183806
finished 17000/183806
finished 18000/183806
finished 19000/183806
finished 20000/183806
finished 21000/183806
finished 22000/183806
finished 23000/183806
finished 24000/183806
finished 25000/183806
finished 26000/183806
finished 27000/183806
finished 28000/183806
finished 29000/183806
finished 30000/183806
finished 31000/183806
finished 32000/183806
finished 33000/183806
finished 34000/183806
finished 35000/183806
finished 36000/183806
finished 37000/183806
finished 38000/183806
finished 39000/183806
finished 40000/183806
finished 41000/183806
finished 42000/183806
finished 43000/183806
finished 44000/183806
finished 45000/183806
f

In [9]:
# statistic the number of every topic (statistic n_tweets_in_topic)
total_topic_num = int(df.iloc[-1]['topic'].split('c')[1]) + 1
n_tweets_in_topic = pd.DataFrame(np.zeros(total_topic_num).reshape(1, -1))
n_tweets_in_topic.columns = ['topic{}'.format(i) for i in range(20)]

for i in range(len(df)):
    topic_i = df.iloc[i]['topic']
    n_tweets_in_topic[topic_i] += 1
n_tweets_in_topic


Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,topic10,topic11,topic12,topic13,topic14,topic15,topic16,topic17,topic18,topic19
0,13362.0,12884.0,5469.0,21188.0,6513.0,9517.0,10469.0,9924.0,8424.0,8912.0,9307.0,6234.0,8592.0,9882.0,7478.0,5115.0,7563.0,8284.0,7031.0,7658.0


In [10]:
# save_n_tweets_in_topic
n_tweets_in_topic.to_csv(save_n_tweets_in_topic, sep=',', index=False,header=True)

In [11]:
# statistic the keywords of every topic
total_topic_num = int(df.iloc[-1]['topic'].split('c')[1]) + 1

columns_df = ['Keyword_number']
columns_df.extend(['topic{}'.format(i) for i in range(20)])
keywords_df = pd.DataFrame(columns=columns_df, index=np.arange(1, total_topic_num+1))
keywords_df['Keyword_number'] = np.arange(1, total_topic_num + 1)

# top-M words for each topic
topic_top_word = generate_topic_top_word(topic_top_prob, vocab, total_topic_num)
for i in range(total_topic_num):
    topic_str = 'topic' + str(i)
    keywords_num = len(topic_top_word[i])
    keywords_list = []
    for j in range(keywords_num):
        keywords_list.append(topic_top_word[i][j][0])
    keywords_df[topic_str] = keywords_list
    
keywords_df

Unnamed: 0,Keyword_number,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,...,topic10,topic11,topic12,topic13,topic14,topic15,topic16,topic17,topic18,topic19
1,1,use,use,use,use,use,use,use,use,use,...,googl,like,like,educ,use,use,team,use,use,use
2,2,educ,like,need,like,like,code,gener,like,write,...,peopl,use,use,use,team,gener,ask,task,write,make
3,3,googl,time,ask,languag,help,write,model,make,openai,...,use,tool,tool,gener,openai,industri,use,gener,like,time
4,4,like,write,time,tool,think,ask,help,write,product,...,human,chang,new,like,time,write,write,learn,code,like
5,5,openai,make,learn,task,task,educ,write,think,googl,...,data,need,gener,think,new,human,new,like,creat,think
6,6,make,think,educ,model,human,human,train,educ,tool,...,like,model,educ,dont,educ,time,openai,write,ask,im
7,7,help,task,think,train,write,gener,tool,technolog,help,...,help,time,come,write,like,replac,like,model,good,educ
8,8,new,educ,like,new,need,answer,skill,tech,futur,...,train,educ,industri,content,start,learn,make,creat,skill,task
9,9,way,gener,make,make,look,need,human,openai,code,...,want,human,team,know,industri,peopl,human,make,make,question
10,10,peopl,tool,task,openai,new,develop,new,train,creat,...,write,write,creat,come,futur,ask,tri,new,learn,data


In [12]:
# save_n_tweets_in_topic
keywords_df.to_csv(save_keywords, sep=',', index=False,header=True)