In [62]:
%reload_ext autoreload
%autoreload 2

In [63]:
import os, sys
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/babi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/babi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [64]:
# Add parent directory to path to import modules from src
rpath = os.path.abspath('..')
if rpath not in sys.path:
    sys.path.insert(0, rpath)

from src.loader import SlackDataLoader
import src.utils as utils

In [65]:
data_loader = SlackDataLoader("../data")
all_channels_message = {}
for channel in data_loader.channels:
    channel_messages = utils.get_messages_on_channel(f"../data/{channel["name"]}")
    all_channels_message[channel["name"]] = channel_messages

In [66]:
def preprocess_text(text):
    # Extract and remove URLs
    urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
    for url in urls:
        text = text.replace(url, '')
    
    text = re.sub(r'<@.*?>', '', text)


    # Convert to lowercase
    text = text.lower()

    # Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Join the tokens back into a string
    text = ' '.join(tokens)

    return text

def preprocess_channel_data(channel_data):
    preprocessed_data = []
    for msg in channel_data:
        pre_procssed = preprocess_text(msg['text'])
        if pre_procssed:
            preprocessed_data.append({'text': pre_procssed, 'ts': msg['ts']})

    return preprocessed_data


# Apply preprocessing to each channel's data
preprocessed_channel_data = {}

for channel, data in all_channels_message.items():
    preprocessed_channel_data[channel] = preprocess_channel_data(data)


In [67]:
# Define the number of top words to extract for each channel
num_top_words = 10

# Define a function to get the top words from TF-IDF features
def get_top_words(channel_data):
    # Extract text from each message
    texts = [msg['text'] for msg in channel_data]

    if not texts:
        return []

    # Vectorization using TF-IDF
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(texts)

    # Get feature names (words)
    feature_names = vectorizer.get_feature_names_out()

    # Get the top words based on TF-IDF weights
    top_words = [feature_names[i] for i in X.sum(axis=0).argsort(axis=1)[0, -num_top_words:][::-1]] # type: ignore

    return top_words

# Get top words for each channel
top_words_by_channel = {channel: get_top_words(data) for channel, data in preprocessed_channel_data.items()}

# Print the top words for each channel
for channel, words in top_words_by_channel.items():
    print(f"Top words for {channel}: {words}")

Top words for all-community-building: [array([['joy', 'man', 'like', 'hi', 'one', 'rollingonthefloorlaughing',
        'morning', 'good', 'yes', 'hello']], dtype=object)]
Top words for all-technical-support: [array([['check', 'okay', 'please', 'link', 'thanks', 'thank', 'try',
        'think', 'joined', 'channel']], dtype=object)]
Top words for all-career-exercises: [array([['link', 'one', 'please', 'morning', 'thank', 'yes', 'joined',
        'good', 'thanks', 'channel']], dtype=object)]
Top words for all-resources: [array([['data', 'good', 'helpful', 'found', 'resource', 'help', 'thank',
        'thanks', 'joined', 'channel']], dtype=object)]
Top words for random: [array([['man', 'joy', 'grin', 'one', 'think', 'time', 'yes',
        'rollingonthefloorlaughing', 'joined', 'channel']], dtype=object)]
Top words for all-ideas: [array([['mean', 'hello', 'glad', 'market', 'dont', 'good', 'one',
        'think', 'joined', 'channel']], dtype=object)]
Top words for all-week1: [array([['task',