In [1]:
%reload_ext autoreload
%autoreload 2

In [147]:
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
import gensim
from gensim.models import CoherenceModel
from gensim import corpora
import pandas as pd
from pprint import pprint
import string
import os
import re
import  sys
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer


In [148]:
# Add parent directory to path to import modules from src
rpath = os.path.abspath('..')
if rpath not in sys.path:
    sys.path.insert(0, rpath)

from src.loader import SlackDataLoader
import src.utils as utils

In [163]:
data_loader = SlackDataLoader("../data")

def get_channel_messages(channel):
    channel_messages = utils.get_messages_on_channel(f"../data/{channel}") 
    print(channel_messages)  

    # Create an empty DataFrame
    df = pd.DataFrame(channel_messages)
    return channel_messages

def get_all_channels_message():
    dfs = []  # List to store individual DataFrames

    for channel in data_loader.channels:
        dfs.append(get_channel_messages(channel))

    # Concatenate all DataFrames into a single DataFrame
    result_df = pd.concat(dfs, ignore_index=True)

    return result_df


# all_channels_message = {}
# for channel in data_loader.channels:
#     channel_messages = utils.get_messages_on_channel(f"../data/{channel["name"]}")
#     all_channels_message[channel["name"]] = channel_messages



# data = []

# # Iterate through the channels and messages
# for channel, messages in all_channels_message.items():
#     for message in messages:
#         data.append({'channel': channel, 'ts': message['ts'], 'text': message['text']})

# # Create a Pandas DataFrame
# df = pd.DataFrame(data)
# print(df.columns)
# print(df.head(1))



In [120]:
def preprocess_text(text):
    # Extract and remove URLs
    urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
    for url in urls:
        text = text.replace(url, '')

    text = re.sub(r'<@.*?>', '', text)

    # Convert to lowercase
    text = text.lower()

    # Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Perform stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    # Perform lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join the tokens back into a string
    text = ' '.join(tokens)

    return text


In [162]:

def prepare_data(df):
    df['cleaned_text'] = df['text'].apply(preprocess_text)
    sentence_list = [tweet for tweet in df['cleaned_text']]
    word_list = [sent.split() for sent in sentence_list]

    #Create dictionary which contains Id and word
    word_to_id = corpora.Dictionary(word_list) #generate unique tokens
    corpus = [word_to_id.doc2bow(tweet) for tweet in word_list]
    
    return df, word_list, word_to_id, corpus

In [110]:
def build_model(corpus, word_to_id):
    # Build LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus,
                                        id2word=word_to_id,
                                        num_topics=5,
                                        random_state=100,
                                        update_every=1,
                                        chunksize=100,
                                        passes=10,
                                        alpha='auto',
                                        per_word_topics=True)    
    return lda_model

In [112]:

def show_topics(lda_model):
    pprint(lda_model.show_topics(formatted=False))

In [114]:
def model_analysis(lda_model, corpus, word_list, word_to_id):
    print('\nPerplexity: ', lda_model.log_perplexity(corpus))
    doc_lda = lda_model[corpus]


    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=word_list, dictionary=word_to_id, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\n Lda model Coherence Score/Accuracy on Tweets: ', coherence_lda)

In [166]:
  
def get_top_topics(df):
    df,word_list, word_to_id, corpus = prepare_data(df)
    # lda_model = build_model(corpus, word_to_id)

    # show_topics(lda_model)

    # print("Perform model analysis")
    # model_analysis(lda_model,corpus, word_list, word_to_id)
    

### Top 10 topics of the different channels

In [167]:
df = get_channel_messages("all-week1")
get_top_topics(df)

[{'text': "<!here> <@U03V8LHPDME> <@U03UKL27B0R> I appreciate if anyone can give me an idea on label encoding 'Bearer Id, IMEI, IMSI and MSISDN/Number columns'? May it have a different meaning if I encode them like Handset Manufacturer and Handset Type columns?", 'ts': '1661499578.568379'}, {'text': '<http://meet.google.com/ysb-kjdn-hpp>', 'ts': '1661500664.614519'}, {'text': 'so how could I extract a dataset from the groupby function so I could preform normalization in the data ???', 'ts': '1661503227.480699'}, {'text': 'the groupby function returns you a df and you can assign it to a variable to do what you want to do.', 'ts': '1661503335.973589'}, {'text': 'join us here <http://meet.google.com/ysb-kjdn-hpp> for the tenx demo', 'ts': '1661504654.183159'}, {'text': 'The tenx session has started <http://meet.google.com/ysb-kjdn-hpp>', 'ts': '1661504880.646299'}, {'text': 'are we supposed to store the features on a database (MySQL)? if yes all of the features or specific columns.', 'ts'

TypeError: list indices must be integers or slices, not str