In [1]:
import re
import numpy as np
import pandas as pd
from copy import deepcopy
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
nltk.download('punkt')
cached_stopwords = stopwords.words("english")
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
!pip install bertopic[visualization] --quiet

[?25l[K     |█████▊                          | 10 kB 24.2 MB/s eta 0:00:01[K     |███████████▌                    | 20 kB 25.6 MB/s eta 0:00:01[K     |█████████████████▏              | 30 kB 18.1 MB/s eta 0:00:01[K     |███████████████████████         | 40 kB 15.0 MB/s eta 0:00:01[K     |████████████████████████████▊   | 51 kB 5.7 MB/s eta 0:00:01[K     |████████████████████████████████| 57 kB 2.7 MB/s 
[K     |████████████████████████████████| 13.2 MB 150 kB/s 
[K     |████████████████████████████████| 6.4 MB 36.4 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[K     |████████████████████████████████| 78 kB 6.9 MB/s 
[K     |████████████████████████████████| 86 kB 5.5 MB/s 
[K     |████████████████████████████████| 3.1 MB 34.7 MB/s 
[K     |████████████████████████████████| 3.3 MB 31.5 MB/s 
[K     |████████████████████████████████| 1.2 MB 

In [2]:
from bertopic import BERTopic

In [3]:
def read_data(file_name):
    with open(file_name , 'r' , encoding="utf-8") as f:
        clean_data = []
        content = f.readline()
        while content:
            content = "".join([ch for ch in content if ord(ch)<= 128]) # remove unicode characters
            remove_date_and_name = re.sub(r"^\[\d{2}\/\d{2}\/\d{2},\s*\d+:\d{2}:\d{2}\s*\w+\]\s*([+\w\d ]+):","",content)
            if(len(remove_date_and_name) == len(content)):
                remove_date = re.sub(r"^\[\d{2}\/\d{2}\/\d{2},\s*\d+:\d{2}:\d{2}\s*\w+\]\s*","",content)
                remove_phone = re.sub(r"\+[\d\s\(\)‑ ]+","",remove_date)
                remove_phone = re.sub(r"@[\d]+[.,]*","",remove_phone)
                remove_phone = re.sub(r"@16825516642","",remove_phone)
                clean_data.append(remove_phone)
            else:
                clean_data.append(remove_date_and_name)
            content = f.readline()
        return clean_data

In [4]:
def remove_links(data_list):
    all_links = []
    for content in data_list:
        links_replaced = re.sub(r'((?:http|https)://[\d\w.\/\-@\?%=&]+)','link_given',content)
        all_links.append(links_replaced)
    return all_links

In [5]:
# def tokenize_clean(text):
#   tokens_without_sw = [word for word in word_tokenize(text) if (not word in cached_stopwords and len(word) > 3)]
#   filtered_sentence = (" ").join(tokens_without_sw)
#   filtered_sentence.replace(',','')
#   filtered_sentence.replace(':','')

#   return filtered_sentence

def tokenize_clean(text):
    text.replace(',','')
    text.replace(':','')
    tokens_without_sw = [word for word in word_tokenize(text) if (not word in cached_stopwords and len(word) > 3)]
    lemmatizer = WordNetLemmatizer()
    stems = [lemmatizer.lemmatize(item) for item in tokens_without_sw]
    return (" ").join(stems) 

In [6]:
def create_dataframe(data_list):
    return pd.DataFrame(data_list , columns=['chat_message'])

In [7]:
data_list = remove_links(read_data('_chat.txt'))

In [8]:
chat_df = create_dataframe(data_list)
chat_df['chat_message'] = chat_df['chat_message'].apply(tokenize_clean)

In [9]:
docs = list(chat_df.loc[:, "chat_message"].values)

In [10]:
model = BERTopic(language="english" , min_topic_size=40)
#model = BERTopic(language="english" , nr_topics=20)

In [11]:
topics, probs = model.fit_transform(docs)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]



In [12]:
model.get_topic_freq()

Unnamed: 0,Topic,Count
0,-1,5302
1,0,886
2,1,452
3,2,349
4,3,342
...,...,...
59,58,42
60,59,41
61,60,41
62,61,41


In [14]:
#new_topics, new_probs = model.reduce_topics(docs, topics, probabilities=probs, nr_topics=20)

In [28]:
model.get_topic(56)[:5]

[('congratulations', 1.1959201321499262),
 ('congrats', 0.7201893503822927),
 ('congratulation', 0.1560409679100237),
 ('proud', 0.09286659596404036),
 ('participated', 0.08349974217268974)]

In [29]:
model.get_topic(9)[:5]

[('thanks', 0.39519732774286853),
 ('atmosphere', 0.03949498908815509),
 ('thankyou', 0.03949498908815509),
 ('captured', 0.0355113846021784),
 ('thnx', 0.0355113846021784)]

In [30]:
model.get_topic(18)[:5]

[('invite', 0.3894250391267575),
 ('joined', 0.3761959272289742),
 ('group', 0.18212957527937707),
 ('groupme', 0.01978454958234919),
 ('invited', 0.01978454958234919)]

In [31]:
model.get_topic(26)[:5]

[('vaccine', 0.08975366127295045),
 ('vaccination', 0.06881916002432918),
 ('immunization', 0.0675635881392541),
 ('quarantine', 0.054998864988267844),
 ('covid', 0.04426740676312394)]

In [32]:
model.get_topic(36)[:5]

[('hiking', 0.2636375608499091),
 ('mountain', 0.11367546562924662),
 ('hike', 0.1087959350405013),
 ('climbing', 0.07720924972812965),
 ('climb', 0.0500228000627881)]

In [33]:
model.get_topic(39)[:5]

[('session', 0.31173688690590146),
 ('recording', 0.08233410825846739),
 ('recorded', 0.07948228088848255),
 ('online', 0.043472265905393846),
 ('attend', 0.041298989751998076)]

In [34]:
model.get_topic(44)[:5]

[('waitlist', 0.33315896115518173),
 ('waitlisted', 0.2119566114380209),
 ('listed', 0.0543922621343674),
 ('list', 0.046870890761877046),
 ('lines', 0.02612976464387447)]

In [35]:
model.get_topic(31)[:5]

[('register', 0.15768239137238962),
 ('registration', 0.14678328397854706),
 ('registered', 0.05006150076456967),
 ('registering', 0.04544574096130613),
 ('orientation', 0.0368039006116881)]

In [36]:
model.get_topic(13)[:5]

[('permit', 0.27914621503266074),
 ('cscse', 0.03657781417860859),
 ('permits', 0.027625670282965278),
 ('register', 0.025506318581088393),
 ('approved', 0.02423933248681785)]

In [37]:
model.get_topic(2)[:5]

[('linkgiven', 0.4184410044497927),
 ('link', 0.034782859188185924),
 ('linkgivenportal', 0.02230679032772217),
 ('predict', 0.02084301737426309),
 ('embed', 0.012404563359817666)]

In [26]:
model.visualize_topics()