In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing

!pip install top2vec
!pip install top2vec[sentence_encoders]
from top2vec import Top2Vec
from sklearn.model_selection import GridSearchCV




In [2]:
!pip install redditcleaner
import re
import redditcleaner



In [3]:
def clean_data(csv_file_name):
    import re
    import redditcleaner
    df = pd.read_csv(csv_file_name, lineterminator="\n") ## reading the csv file
    df = df[~df['selftext'].isin(['[removed]', '[deleted]' ])].dropna(subset=['selftext']) ## dropping removed and deleted posts
    df['selftext'] = df['selftext'].map(redditcleaner.clean) ## cleaning text of reddit specific punctuations
    df['selftext'] = df['selftext'].map(lambda x: re.sub(r"[^A-Za-z ]", '', x)) ## cleaning punctuations
    df['selftext'] = df['selftext'].map(lambda x: re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', x)) ## cleaning url
    df['selftext'] = df['selftext'].apply(lambda x: x.lower()) ## lowercase
    df['selftext'] = df.selftext.values.tolist()
#    df['date'] = pd.to_datetime(df['date']).dt.to_period('D') 
#    df['date'] = df['date'].dt.strftime('%d-%m-%Y') ##formatting date into d-m-y 
    df = df[['url', 'selftext']]
    return(df)

In [5]:
df_pre = clean_data("file_name.csv")
df_pre['post_wordcount'] = df_pre['selftext'].str.count(' ') + 1
df_pre_clean = df_pre[df_pre.post_wordcount > 3]


In [6]:
frames = [df_pre_clean] 
df = pd.concat(frames)
display(df)

Unnamed: 0,url,selftext,post_wordcount
0,https://19thnews.org/2021/07/the-covid-delta-v...,the highly contagious delta variant of covid ...,775
1,https://19thnews.org/2020/11/karen-bass-addres...,were the only newsroom dedicated to writing ab...,913
2,https://19thnews.org/2020/08/kamala-harris-com...,read the latest story on how lgbtq americans a...,1582
3,https://19thnews.org/2020/11/kim-ng-mlb-first-...,were the only newsroom dedicated to writing ab...,868
4,https://19thnews.org/2020/12/shirley-sherrod-h...,as a nonprofit newsroom members are critical t...,1150
...,...,...,...
2714,https://www.zerohedge.com/political/state-wash...,authored by jonathan turley the house democrat...,341
2715,https://www.zerohedge.com/geopolitical/biden-k...,the biden administration has said it will cont...,325
2716,https://www.zerohedge.com/political/we-want-re...,update et someone on twitter points out that a...,810
2717,https://www.zerohedge.com/technology/here-are-...,over the past week president trump has been ki...,825


In [None]:
## Finding the Optimal Top2Vec Model

model_list = []
num_top = []
model_topics = []
df["selftext"] = df["selftext"].values.astype('str')

for min_count in range(5, 50, 5):
    model = Top2Vec(list(df["selftext"]), min_count, chunk_length=100, embedding_model="universal-sentence-encoder")
    model_topics.append(min_count)
    model_list.append(model)
    num_top.append(model.get_num_topics())
    
    ## Top2Vec Coherence Score, code from: https://github.com/Datanaught & https://github.com/ddangelov/Top2Vec/issues/158
    import gensim.corpora as corpora
    from gensim.utils import tokenize
    from gensim.models import CoherenceModel

    def t2vCoherence(df, topic_words):
        tokenized = [list(tokenize(doc)) for doc in df.selftext.tolist()]
        id2word = corpora.Dictionary(tokenized)
        corpus = [id2word.doc2bow(text) for text in tokenized]
    # make sure you grab the topic words from the topic model and convert them to a list
        coherence_model = CoherenceModel(topics=topic_words, texts=tokenized, 
                                     corpus=corpus, dictionary=id2word, coherence='c_v', topn=50)  
        coherence = coherence_model.get_coherence()
        # print("Model Coherence C_V is:{0}".format(coherence))
        return coherence

#    topic_words, word_scores, topic_nums = model.get_topics(model.get_num_topics())
    topic_words, word_scores = model._find_topic_words_and_scores(model.topic_vectors) 
    
    print("# Min Count: " + str(min_count) + " & Number of Topics: " + str(model.get_num_topics()) + " & Coherence:" + str(t2vCoherence(df, topic_words)))

2022-05-18 14:40:37,575 - top2vec - INFO - Pre-processing documents for training
2022-05-18 14:40:46,032 - top2vec - INFO - Downloading universal-sentence-encoder model
2022-05-18 14:41:45.797641: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-05-18 14:41:58,967 - top2vec - INFO - Creating joint document/word embedding
INFO:top2vec:Creating joint document/word embedding
2022-05-18 14:42:19,125 - top2vec - INFO - Creating lower dimension embedding of documents
INFO:top2vec:Creating lower dimension embedding of documents
2022-05-18 14:42:35,159 - top2vec - INFO - Finding dense areas of documents
INFO:top2vec:Finding dense areas of documents
2022-05-18 14:42:35,318 - top2vec - INFO - Finding topics
INFO:top2vec:Finding to

# Min Count: 5 & Number of Topics: 2 & Coherence:0.8464550645125578


2022-05-18 14:43:13,749 - top2vec - INFO - Downloading universal-sentence-encoder model
INFO:top2vec:Downloading universal-sentence-encoder model
2022-05-18 14:43:27,870 - top2vec - INFO - Creating joint document/word embedding
INFO:top2vec:Creating joint document/word embedding
2022-05-18 14:43:47,075 - top2vec - INFO - Creating lower dimension embedding of documents
INFO:top2vec:Creating lower dimension embedding of documents
2022-05-18 14:44:01,337 - top2vec - INFO - Finding dense areas of documents
INFO:top2vec:Finding dense areas of documents
2022-05-18 14:44:01,501 - top2vec - INFO - Finding topics
INFO:top2vec:Finding topics
2022-05-18 14:45:01,456 - top2vec - INFO - Pre-processing documents for training
INFO:top2vec:Pre-processing documents for training


# Min Count: 10 & Number of Topics: 34 & Coherence:0.6514580155603966


2022-05-18 14:45:10,064 - top2vec - INFO - Downloading universal-sentence-encoder model
INFO:top2vec:Downloading universal-sentence-encoder model
2022-05-18 14:45:25,935 - top2vec - INFO - Creating joint document/word embedding
INFO:top2vec:Creating joint document/word embedding
2022-05-18 14:45:47,474 - top2vec - INFO - Creating lower dimension embedding of documents
INFO:top2vec:Creating lower dimension embedding of documents
2022-05-18 14:46:02,850 - top2vec - INFO - Finding dense areas of documents
INFO:top2vec:Finding dense areas of documents
2022-05-18 14:46:03,026 - top2vec - INFO - Finding topics
INFO:top2vec:Finding topics
2022-05-18 14:46:33,898 - top2vec - INFO - Pre-processing documents for training
INFO:top2vec:Pre-processing documents for training


# Min Count: 15 & Number of Topics: 2 & Coherence:0.7538062869096442


2022-05-18 14:46:42,417 - top2vec - INFO - Downloading universal-sentence-encoder model
INFO:top2vec:Downloading universal-sentence-encoder model
2022-05-18 14:46:58,575 - top2vec - INFO - Creating joint document/word embedding
INFO:top2vec:Creating joint document/word embedding
2022-05-18 14:47:20,021 - top2vec - INFO - Creating lower dimension embedding of documents
INFO:top2vec:Creating lower dimension embedding of documents
2022-05-18 14:47:37,413 - top2vec - INFO - Finding dense areas of documents
INFO:top2vec:Finding dense areas of documents
2022-05-18 14:47:37,588 - top2vec - INFO - Finding topics
INFO:top2vec:Finding topics
2022-05-18 14:48:03,179 - top2vec - INFO - Pre-processing documents for training
INFO:top2vec:Pre-processing documents for training


# Min Count: 20 & Number of Topics: 2 & Coherence:0.7320953433270296


2022-05-18 14:48:12,465 - top2vec - INFO - Downloading universal-sentence-encoder model
INFO:top2vec:Downloading universal-sentence-encoder model
2022-05-18 14:48:26,275 - top2vec - INFO - Creating joint document/word embedding
INFO:top2vec:Creating joint document/word embedding
2022-05-18 14:48:46,798 - top2vec - INFO - Creating lower dimension embedding of documents
INFO:top2vec:Creating lower dimension embedding of documents
2022-05-18 14:49:02,363 - top2vec - INFO - Finding dense areas of documents
INFO:top2vec:Finding dense areas of documents
2022-05-18 14:49:02,521 - top2vec - INFO - Finding topics
INFO:top2vec:Finding topics
2022-05-18 14:49:31,138 - top2vec - INFO - Pre-processing documents for training
INFO:top2vec:Pre-processing documents for training


# Min Count: 25 & Number of Topics: 2 & Coherence:0.6931288998039375


2022-05-18 14:49:40,512 - top2vec - INFO - Downloading universal-sentence-encoder model
INFO:top2vec:Downloading universal-sentence-encoder model
2022-05-18 14:49:51,530 - top2vec - INFO - Creating joint document/word embedding
INFO:top2vec:Creating joint document/word embedding
2022-05-18 14:50:09,364 - top2vec - INFO - Creating lower dimension embedding of documents
INFO:top2vec:Creating lower dimension embedding of documents
2022-05-18 14:50:24,400 - top2vec - INFO - Finding dense areas of documents
INFO:top2vec:Finding dense areas of documents
2022-05-18 14:50:24,557 - top2vec - INFO - Finding topics
INFO:top2vec:Finding topics
2022-05-18 14:50:50,726 - top2vec - INFO - Pre-processing documents for training
INFO:top2vec:Pre-processing documents for training


# Min Count: 30 & Number of Topics: 2 & Coherence:0.6434535771890731


2022-05-18 14:50:59,305 - top2vec - INFO - Downloading universal-sentence-encoder model
INFO:top2vec:Downloading universal-sentence-encoder model
2022-05-18 14:51:12,821 - top2vec - INFO - Creating joint document/word embedding
INFO:top2vec:Creating joint document/word embedding
2022-05-18 14:51:32,622 - top2vec - INFO - Creating lower dimension embedding of documents
INFO:top2vec:Creating lower dimension embedding of documents
