In [3]:
import warnings
warnings.filterwarnings('ignore')
import gensim
from gensim.models import CoherenceModel
from gensim import corpora
import pandas as pd
from pprint import pprint
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

import mlflow
import mlflow.pyfunc
import mlflow.pyfunc.model

In [4]:
class TopicModel:
    """LDA topic-modelling pipeline built on gensim, with MLflow tracking.

    Typical use is ``TopicModel().get_top_topics(df)``, which tokenises the
    ``cleaned_text`` column, fits an LDA model, logs parameters, the model and
    its metrics to MLflow, and returns the pyLDAvis prepared visualisation.
    """

    # Defaults shared by build_model / get_top_topics so the values used for
    # training and the values logged to MLflow cannot drift apart.
    DEFAULT_NUM_TOPICS = 5
    DEFAULT_PASSES = 10

    def prepare_data(self, df):
        """Tokenise ``df['cleaned_text']`` and build the gensim dictionary and corpus.

        Args:
            df: Mapping-like object (e.g. a pandas DataFrame) exposing a
                ``'cleaned_text'`` entry that iterates over documents.

        Returns:
            Tuple ``(df, word_list, word_to_id, corpus)`` where ``df`` is the
            input object unchanged, ``word_list`` is one list of
            whitespace-split tokens per kept document, ``word_to_id`` is the
            ``corpora.Dictionary`` built from ``word_list`` and ``corpus`` is
            the ``doc2bow`` representation of each kept document.
        """
        # Keep only real strings. Missing values (float NaN, None) and any other
        # non-text cell cannot be split into tokens and are skipped.
        word_list = [
            text.split() for text in df['cleaned_text'] if isinstance(text, str)
        ]

        # Map every unique token to an integer id, then encode each document
        # as a bag of words.
        word_to_id = corpora.Dictionary(word_list)
        corpus = [word_to_id.doc2bow(tokens) for tokens in word_list]

        return df, word_list, word_to_id, corpus

    def build_model(self, corpus, word_to_id,
                    num_topics=DEFAULT_NUM_TOPICS, passes=DEFAULT_PASSES):
        """Fit and return a gensim ``LdaModel`` on ``corpus``.

        Args:
            corpus: Bag-of-words corpus as produced by ``prepare_data``.
            word_to_id: Dictionary passed to gensim as ``id2word``.
            num_topics: Number of topics to fit (default 5).
            passes: Number of training passes (default 10).

        Returns:
            The trained ``gensim.models.ldamodel.LdaModel``.
        """
        return gensim.models.ldamodel.LdaModel(
            corpus,
            id2word=word_to_id,
            num_topics=num_topics,
            random_state=100,  # fixed seed value passed on every call
            update_every=1,
            chunksize=100,
            passes=passes,
            alpha='auto',
            per_word_topics=True,
        )

    def _compute_coherence(self, lda_model, word_list, word_to_id):
        """Return the ``c_v`` coherence of ``lda_model`` over the tokenised texts."""
        coherence_model = CoherenceModel(
            model=lda_model,
            texts=word_list,
            dictionary=word_to_id,
            coherence='c_v',
        )
        return coherence_model.get_coherence()

    def log_to_mlflow(self, lda_model, corpus, word_list, word_to_id):
        """Log the model, its parameters and metrics to a new MLflow run.

        Also prints the model's topics and prepares the pyLDAvis visualisation.

        Args:
            lda_model: Trained LDA model.
            corpus: Bag-of-words corpus the model is evaluated on.
            word_list: Tokenised documents, used for the coherence score.
            word_to_id: Dictionary matching ``corpus``.

        Returns:
            The object returned by ``gensimvis.prepare``.
        """
        # The context manager closes the run on exit (including on error), so
        # no explicit mlflow.end_run() is needed.
        with mlflow.start_run():
            # Log what the model was actually trained with rather than
            # repeating literals; fall back to the class defaults if the
            # model object does not expose these attributes.
            mlflow.log_param(
                "num_topics",
                getattr(lda_model, "num_topics", self.DEFAULT_NUM_TOPICS),
            )
            mlflow.log_param(
                "passes", getattr(lda_model, "passes", self.DEFAULT_PASSES)
            )

            # NOTE(review): the model is logged through the scikit-learn
            # flavor although it is a gensim model; kept as-is -- confirm it
            # loads back as expected.
            mlflow.sklearn.log_model(lda_model, "lda_model")

            # The "perplexity" metric stores the value returned by
            # lda_model.log_perplexity(corpus).
            mlflow.log_metric("perplexity", lda_model.log_perplexity(corpus))
            mlflow.log_metric(
                "coherence",
                self._compute_coherence(lda_model, word_list, word_to_id),
            )

            # Print the model's topics with their top words.
            self.show_topics(lda_model)

            # Prepare the interactive topic visualisation for notebook display.
            pyLDAvis.enable_notebook()
            return gensimvis.prepare(lda_model, corpus, word_to_id)

    def show_topics(self, lda_model):
        """Pretty-print ``lda_model.show_topics(formatted=False)``."""
        pprint(lda_model.show_topics(formatted=False))

    def model_analysis(self, lda_model, corpus, word_list, word_to_id):
        """Print the model's log-perplexity value and ``c_v`` coherence score.

        Args:
            lda_model: Trained LDA model.
            corpus: Bag-of-words corpus the model is evaluated on.
            word_list: Tokenised documents, used for the coherence score.
            word_to_id: Dictionary matching ``corpus``.
        """
        print('\nPerplexity: ', lda_model.log_perplexity(corpus))

        coherence_lda = self._compute_coherence(lda_model, word_list, word_to_id)
        print('\n Lda model Coherence Score/Accuracy on Tweets: ', coherence_lda)

    def get_top_topics(self, df,
                       num_topics=DEFAULT_NUM_TOPICS, passes=DEFAULT_PASSES):
        """Run the whole pipeline: prepare data, fit the model, log to MLflow.

        Args:
            df: Input data exposing a ``'cleaned_text'`` column.
            num_topics: Number of topics to fit (default 5).
            passes: Number of training passes (default 10).

        Returns:
            The pyLDAvis prepared data returned by ``log_to_mlflow``.
        """
        df, word_list, word_to_id, corpus = self.prepare_data(df)
        lda_model = self.build_model(
            corpus, word_to_id, num_topics=num_topics, passes=passes
        )

        return self.log_to_mlflow(lda_model, corpus, word_list, word_to_id)


In [5]:
# Load the Slack messages CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/slack_data.csv")
# Preview the first rows; as the cell's last expression it is rendered as the cell output.
df.head()

Unnamed: 0,msg_id,text,cleaned_text,user_id,mentions,reactions,replies,ts,channel_id
0,16f68d4e-0ceb-448a-b660-d5ef2eb05305,*HOTSEAT ANNOUNCEMENT*,hotseat announc,U03V1AM5TFA,[],[],[],1662621000.0,C03T0APHX63
1,7c641275-2e52-4074-9894-744f049d5377,*<!here>* Good morning Community! We are very ...,good morn commun happi excit announc today hot...,U03V1AM5TFA,['U03U1GHT39V'],"[{'name': 'fire', 'users': ['U03U9FWPNCE'], 'c...",[],1662621000.0,C03T0APHX63
2,245ecc4d-2c1b-4bee-b280-a1fd5ab7fee3,*<!here> Community Building Session REMINDER!*...,commun build session remindertimerclock plea n...,U03V1AM5TFA,[],"[{'name': 'heart_eyes', 'users': ['U03UG4Q7V42...",[],1662638000.0,C03T0APHX63
3,fe80aff2-20f2-42ad-94a8-8b48ac63083f,Sweet music on Google meet now\n:point_right: ...,sweet music googl meet pointright meetgoogleco...,U03V1AM5TFA,[],[],[],1662638000.0,C03T0APHX63
4,2be29318-9c50-4b56-ae0b-ae8bcd4c92a3,Hellooo Helllo again my people the lovely com...,hellooo helllo peopl love commun guy ……it cb t...,U03V1AM5TFA,[],[],[],1662638000.0,C03T0APHX63


In [6]:
# Run the full pipeline on the loaded frame: prepare data, fit the LDA model, log to MLflow.
topic_model = TopicModel()
# get_top_topics returns the pyLDAvis prepared data, which is displayed as the cell output.
topic_model.get_top_topics(df)



[(0,
  [('work', 0.07143357),
   ('think', 0.037082665),
   ('instal', 0.0351738),
   ('error', 0.030519322),
   ('instanc', 0.026786616),
   ('task', 0.022259971),
   ('im', 0.019716762),
   ('go', 0.01951656),
   ('run', 0.017809704),
   ('logo', 0.017805267)]),
 (1,
  [('use', 0.075843915),
   ('extract', 0.025301505),
   ('guy', 0.023569485),
   ('link', 0.022902394),
   ('featur', 0.018135138),
   ('face', 0.016919008),
   ('key', 0.012890085),
   ('document', 0.012620783),
   ('detect', 0.012020348),
   ('yet', 0.011509425)]),
 (2,
  [('data', 0.041016508),
   ('ye', 0.036323603),
   ('plea', 0.035350125),
   ('meet', 0.034153566),
   ('connect', 0.03410955),
   ('let', 0.029789165),
   ('time', 0.028427737),
   ('week', 0.026523033),
   ('today', 0.026311453),
   ('start', 0.02576902)]),
 (3,
  [('file', 0.041779153),
   ('thank', 0.027321959),
   ('line', 0.024389803),
   ('get', 0.019460786),
   ('u', 0.013600272),
   ('channel', 0.012400379),
   ('creat', 0.012287803),
   ('i

  EPOCH = datetime.datetime.utcfromtimestamp(0)
  EPOCH = datetime.datetime.utcfromtimestamp(0)
  EPOCH = datetime.datetime.utcfromtimestamp(0)
  EPOCH = datetime.datetime.utcfromtimestamp(0)
  EPOCH = datetime.datetime.utcfromtimestamp(0)
  EPOCH = datetime.datetime.utcfromtimestamp(0)
  EPOCH = datetime.datetime.utcfromtimestamp(0)
  EPOCH = datetime.datetime.utcfromtimestamp(0)
