In [55]:
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
import gensim
from gensim.models import CoherenceModel
from gensim import corpora
import pandas as pd
from pprint import pprint
import string
import os
import re
import  sys
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis


In [56]:
# Add parent directory to path to import modules from src
rpath = os.path.abspath('..')
if rpath not in sys.path:
    sys.path.insert(0, rpath)

from src.loader import SlackDataLoader
import src.utils as utils

In [57]:
data_loader = SlackDataLoader("../data")

def get_channel_messages(channel):
    channel_messages = utils.get_messages_on_channel(f"../data/{channel}") 
    # Create an empty DataFrame
    df = pd.DataFrame(channel_messages)
    return df

def get_all_channels_message():
    dfs = []  # List to store individual DataFrames

    for channel in data_loader.channels:
        dfs.append(get_channel_messages(channel["name"]))

    # Concatenate all DataFrames into a single DataFrame
    result_df = pd.concat(dfs, ignore_index=True)

    return result_df

In [59]:
def preprocess_text(text):
    # Extract and remove URLs
    urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
    for url in urls:
        text = text.replace(url, '')

    text = re.sub(r'<@.*?>', '', text)

    # Convert to lowercase
    text = text.lower()

    # Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Perform stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    # Perform lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join the tokens back into a string
    text = ' '.join(tokens)

    return text


In [61]:

def prepare_data(df):
    df['cleaned_text'] = df['text'].apply(preprocess_text)
    sentence_list = [tweet for tweet in df['cleaned_text']]
    word_list = [sent.split() for sent in sentence_list]

    #Create dictionary which contains Id and word
    word_to_id = corpora.Dictionary(word_list) #generate unique tokens
    corpus = [word_to_id.doc2bow(tweet) for tweet in word_list]
    
    return df, word_list, word_to_id, corpus

In [63]:
def build_model(corpus, word_to_id):
    # Build LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus,
                                        id2word=word_to_id,
                                        num_topics=5,
                                        random_state=100,
                                        update_every=1,
                                        chunksize=100,
                                        passes=10,
                                        alpha='auto',
                                        per_word_topics=True)    
    return lda_model

In [65]:

def show_topics(lda_model):
    pprint(lda_model.show_topics(formatted=False))

In [66]:
def model_analysis(lda_model, corpus, word_list, word_to_id):
    print('\nPerplexity: ', lda_model.log_perplexity(corpus))
    doc_lda = lda_model[corpus]


    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=word_list, dictionary=word_to_id, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\n Lda model Coherence Score/Accuracy on Tweets: ', coherence_lda)

In [84]:
def get_top_topics(df):
    df, word_list, word_to_id, corpus = prepare_data(df)
    lda_model = build_model(corpus, word_to_id)

    # Show the top 10 topics
    show_topics(lda_model)
    

    # Visualize the top 10 topics
    pyLDAvis.enable_notebook()
    LDAvis_prepared = gensimvis.prepare(lda_model, corpus, word_to_id)
    return LDAvis_prepared

### Top 10 topics of the different channels

In [85]:
df = get_channel_messages("all-week1")
get_top_topics(df)

[(0,
  [('use', 0.04629722),
   ('user', 0.034003064),
   ('session', 0.028530736),
   ('featur', 0.018247005),
   ('score', 0.015890928),
   ('total', 0.014961363),
   ('•', 0.014906585),
   ('start', 0.014091308),
   ('engag', 0.013270916),
   ('think', 0.013084882)]),
 (1,
  [('work', 0.04845276),
   ('channel', 0.032532107),
   ('two', 0.018797847),
   ('file', 0.018182827),
   ('use', 0.01643002),
   ('data', 0.015751915),
   ('notebook', 0.012717859),
   ('incognito', 0.01247491),
   ('add', 0.011952376),
   ('expect', 0.011469893)]),
 (2,
  [('thank', 0.06351325),
   ('column', 0.024919178),
   ('id', 0.023483997),
   ('valu', 0.022867942),
   ('data', 0.019010132),
   ('use', 0.016901003),
   ('link', 0.016111318),
   ('submiss', 0.015757607),
   ('github', 0.014633137),
   ('task', 0.0133222)]),
 (3,
  [('normal', 0.02755062),
   ('use', 0.02691801),
   ('outlier', 0.02512235),
   ('report', 0.025107931),
   ('tri', 0.021214876),
   ('mean', 0.017638601),
   ('want', 0.0169880

In [86]:
df = get_channel_messages("all-community-building")
get_top_topics(df)

[(0,
  [('arun', 0.042168994),
   ('get', 0.021699265),
   ('dont', 0.020860696),
   ('first', 0.01833718),
   ('know', 0.016804324),
   ('think', 0.0153800035),
   ('joy', 0.0148091195),
   ('say', 0.014181729),
   ('go', 0.013683029),
   ('man', 0.012136477)]),
 (1,
  [('u', 0.04294473),
   ('one', 0.035910558),
   ('like', 0.03278028),
   ('thank', 0.02106849),
   ('make', 0.020496279),
   ('grin', 0.018016053),
   ('day', 0.016980072),
   ('person', 0.014324365),
   ('realli', 0.01369259),
   ('would', 0.013683403)]),
 (2,
  [('good', 0.090647586),
   ('morn', 0.07716033),
   ('time', 0.036429323),
   ('cb', 0.023522833),
   ('•', 0.022947542),
   ('today', 0.020783931),
   ('session', 0.018044068),
   ('plea', 0.017591298),
   ('happi', 0.014716192),
   ('let', 0.014710607)]),
 (3,
  [('hello', 0.10202492),
   ('week', 0.03399055),
   ('guy', 0.022563318),
   ('peopl', 0.019647341),
   ('use', 0.018128946),
   ('friend', 0.017528538),
   ('last', 0.016712157),
   ('laugh', 0.01351

### Top 10 topics of all channels

In [87]:
df = get_all_channels_message()
get_top_topics(df)

[(0,
  [('work', 0.061310686),
   ('thank', 0.035198063),
   ('think', 0.03282949),
   ('task', 0.020581743),
   ('im', 0.017542895),
   ('logo', 0.015831266),
   ('dont', 0.015652644),
   ('also', 0.015453295),
   ('code', 0.014646822),
   ('model', 0.013476593)]),
 (1,
  [('channel', 0.18876673),
   ('join', 0.18849722),
   ('use', 0.050648026),
   ('data', 0.025946053),
   ('ye', 0.022727432),
   ('connect', 0.018275846),
   ('plea', 0.017530445),
   ('time', 0.017410634),
   ('start', 0.016544862),
   ('sure', 0.0149129685)]),
 (2,
  [('let', 0.032837767),
   ('guy', 0.03207949),
   ('good', 0.030575136),
   ('u', 0.030185215),
   ('make', 0.026710935),
   ('group', 0.026556589),
   ('featur', 0.02613478),
   ('need', 0.025963098),
   ('today', 0.023980152),
   ('run', 0.023953129)]),
 (3,
  [('file', 0.04864594),
   ('tri', 0.037348438),
   ('line', 0.029828414),
   ('error', 0.02105171),
   ('instanc', 0.019424342),
   ('get', 0.01919499),
   ('imag', 0.013803347),
   ('creat', 0