In [3]:
%reload_ext autoreload
%autoreload 2 

In [4]:
import os, sys

In [5]:
import re

In [6]:
import json

In [7]:
import glob

In [8]:
import datetime

In [9]:
from collections import Counter

In [10]:
import pandas as pd

In [11]:
from matplotlib import pyplot as plt

In [12]:
import seaborn as sns

In [13]:
from nltk.corpus import stopwords

In [14]:
from wordcloud import WordCloud # error module not found

In [15]:
# pip install wordcloud

In [16]:
from wordcloud import WordCloud #error fixed

In [17]:
# Add parent directory to path to import modules from src
rpath = os.path.abspath('..')
if rpath not in sys.path:
    sys.path.insert(0, rpath)

In [18]:
# pip install pick

In [19]:
from src.loader import SlackDataLoader

In [20]:
import src.utils as utils   

### Columns we can get from a slack message<br>

message_type, message_content, sender_id, time_sent, message_distribution, time_thread_start, reply_count, reply_user_count, time_thread_end, reply_users

From a single slack message, we can get <br>

1. The message<br>
2. Type (message, file, link, etc)<br>
3. The sender_id (assigned by slack)<br>
4. The time the message was sent<br>
5. The team (i don't know what that is now)<br>
6. The type of the message (broadcast message, inhouse, just messgae)<br>
7. The thread the message generated (from here we can go):<br>
    7.1 Text/content of the message<br>
    7.2 The thread time of the message<br>
    7.3 The thread count (reply count)<br>
    7.4 The number of user that reply the message (count of users that participated in the thread)<br>
    7.5 The time the last thread message was sent <br>
    7.6 The users that participated in the thread (their ids are stored as well)<br>

In [40]:
def slack_parser(path_channel):
    """Parse Slack data to extract useful information from the JSON files.

    Args:
        path_channel (str): Path to the directory containing the JSON files.

    Returns:
        pd.DataFrame: Extracted information combined into a DataFrame.
    """

    # Create a list to store the combined data from JSON files
    combined_data = []

    # Read all JSON files from the provided path
    json_files = glob.glob(f"{path_channel}/*.json")
    for json_file in json_files:
        try:
            with open(json_file, 'r', encoding="utf8") as slack_data:
                data = json.load(slack_data)
                combined_data.extend(data)
        except Exception as e:
            print(f"Error reading JSON file '{json_file}': {str(e)}")

    # Define lists to store the extracted information
    msg_type, msg_content, sender_id, time_msg, msg_dist, time_thread_st = [], [], [], [], [], []
    reply_count, reply_users_count, reply_users, tm_thread_end = [], [], [], []

    # Loop through the combined data and extract the required information
    for row in combined_data:
        if 'bot_id' in row.keys():
            continue
        else:
            msg_type.append(row.get('type'))
            msg_content.append(row.get('text'))
            sender = row.get('user_profile')
            if sender:
                sender_id.append(sender.get('real_name'))
            else:
                sender_id.append('Not provided')
            time_msg.append(row.get('ts'))
            blocks = row.get('blocks')
            if blocks and len(blocks[0]['elements'][0]['elements']) > 0:
                msg_dist.append(blocks[0]['elements'][0]['elements'][0]['type'])
            else:
                msg_dist.append('reshared')
            thread_ts = row.get('thread_ts')
            if thread_ts:
                time_thread_st.append(thread_ts)
            else:
                time_thread_st.append(0)
            reply_users.append(",".join(row.get('reply_users', [])))
            reply_count.append(row.get('reply_count', 0))
            reply_users_count.append(row.get('reply_users_count', 0))
            tm_thread_end.append(row.get('latest_reply', 0))

    # Create a DataFrame from the extracted information
    data = zip(msg_type, msg_content, sender_id, time_msg, msg_dist, time_thread_st,
               reply_count, reply_users_count, reply_users, tm_thread_end)
    columns = ['msg_type', 'msg_content', 'sender_name', 'msg_sent_time', 'msg_dist_type',
               'time_thread_start', 'reply_count', 'reply_users_count', 'reply_users', 'tm_thread_end']
    df = pd.DataFrame(data=data, columns=columns)

    # Filter out rows where sender_name is 'Not provided'
    df = df[df['sender_name'] != 'Not provided']

    # Add a 'channel' column to the DataFrame indicating the channel name
    channel_name = path_channel.split('/')[-1].split('.')[0]
    df['channel'] = channel_name

    # Reset the index
    df = df.reset_index(drop=True)

    return df

In [58]:
file_path ='../data/anonymized/all-community-building/'

In [71]:
parsed_slack = slack_parser(file_path)

In [72]:
parsed_slack['channel'].head(100)


0      
1      
2      
3      
4      
     ..
95     
96     
97     
98     
99     
Name: channel, Length: 100, dtype: object

In [73]:
def parse_slack_reaction(path, channel):
    """get reactions"""
    dfall_reaction = pd.DataFrame()
    combined = []
    for json_file in glob.glob(f"{path}*.json"):
        with open(json_file, 'r') as slack_data:
            combined.append(slack_data)

    reaction_name, reaction_count, reaction_users, msg, user_id = [], [], [], [], []

    for k in combined:
        slack_data = json.load(open(k.name, 'r', encoding="utf-8"))
        
        for i_count, i in enumerate(slack_data):
            if 'reactions' in i.keys():
                for j in range(len(i['reactions'])):
                    msg.append(i['text'])
                    user_id.append(i['user'])
                    reaction_name.append(i['reactions'][j]['name'])
                    reaction_count.append(i['reactions'][j]['count'])
                    reaction_users.append(",".join(i['reactions'][j]['users']))
                
    data_reaction = zip(reaction_name, reaction_count, reaction_users, msg, user_id)
    columns_reaction = ['reaction_name', 'reaction_count', 'reaction_users_count', 'message', 'user_id']
    df_reaction = pd.DataFrame(data=data_reaction, columns=columns_reaction)
    df_reaction['channel'] = channel
    return df_reaction

In [74]:
parsed_reaction = parse_slack_reaction(file_path, 'channel')

In [84]:
parsed_reaction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3191 entries, 0 to 3190
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reaction_name         3191 non-null   object
 1   reaction_count        3191 non-null   int64 
 2   reaction_users_count  3191 non-null   object
 3   message               3191 non-null   object
 4   user_id               3191 non-null   object
 5   channel               3191 non-null   object
dtypes: int64(1), object(5)
memory usage: 149.7+ KB


In [86]:
parsed_reaction.head(10)

Unnamed: 0,reaction_name,reaction_count,reaction_users_count,message,user_id,channel
0,+1,12,"U03UFV7TUTV,U03U1HAG9TR,U03UFV7HFNF,U03U9EJR36...","hi all, looking forward to starting together, ...",U03U93GNNVB,channel
1,muscle,2,"U03UG0YHAUT,U03UUR571A5","hi all, looking forward to starting together, ...",U03U93GNNVB,channel
2,heart,2,"U03UUMM7Y8H,U03UD68RQH3","hi all, looking forward to starting together, ...",U03U93GNNVB,channel
3,+1,8,"U03UFV7HFNF,U03UG4Q7V42,U03U9EJR362,U03U1J51VF...",hello everyone. it's my hope that you are doin...,U03V1AM5TFA,channel
4,clap,2,"U03U1HAG9TR,U03UG0YHAUT",hello everyone. it's my hope that you are doin...,U03V1AM5TFA,channel
5,+1,5,"U03UG4Q7V42,U03U9EJR362,U03UFV7TUTV,U03UG0YHAU...",*community building session reminder!*:timer_c...,U03V1AM5TFA,channel
6,+1,8,"U03UUMM7Y8H,U03UHB8CXDY,U03UFV7HFNF,U03UG0YHAU...","hello people, congratulations for making it to...",U03V1AM5TFA,channel
7,grinning,1,U03V1AM5TFA,why did my mind go blank at first:joy:,U03U1HAG9TR,channel
8,grinning,1,U03U1HAG9TR,mirrors - justin timberlake:hugging_face:,U03U1FNPEUX,channel
9,100,1,U03U1FNPEUX,sia :rolling_on_the_floor_laughing:- unstoppable,U03U9EJR362,channel


In [91]:
def get_community_participation(path):
    """ specify path to get json files"""
    combined = []
    comm_dict = {}
    for json_file in glob.glob(f"{path}*.json"):
        with open(json_file, 'r') as slack_data:
            combined.append(slack_data)
    print(f"Total json files is {len(combined)}")
    for i in combined:
        a = json.load(open(i.name, 'r', encoding='utf-8'))

        for msg in a:
            if 'replies' in msg.keys():
                for i in msg['replies']:
                    comm_dict[i['user']] = comm_dict.get(i['user'], 0)+1
    return comm_dict

In [92]:

get_community =get_community_participation(file_path)

Total json files is 79


In [89]:
get_community

{'U03UG0YHAUT': 112,
 'U03V1AM5TFA': 515,
 'U03V785NLSU': 118,
 'U03UUS0MZCZ': 106,
 'U03UG32J3PC': 358,
 'U03U1HAG9TR': 135,
 'U03UG1RTXAP': 14,
 'U03UUMM7Y8H': 26,
 'U03UD68RQH3': 224,
 'U03UJGFG2HJ': 7,
 'U03U1FNPEUX': 145,
 'U03UG4Q7V42': 199,
 'U03UFV7HFNF': 53,
 'U03UD5B7C3X': 66,
 'U03UFV7TUTV': 39,
 'U03UHB8CXDY': 77,
 'U03UKGSDGSG': 6,
 'U03UUR571A5': 266,
 'U03UD4FEDHB': 24,
 'U03UP7V9Q57': 2,
 'U03UJN29Y4C': 55,
 'U03UVHCV6KB': 284,
 'U03UJKJGRAQ': 132,
 'U03U1FQKEMV': 3,
 'U03U9FWPNCE': 32,
 'U03TT5KEYCF': 13,
 'U03TEPYRM2P': 14,
 'U03UCCRJME2': 1,
 'U03U93GNNVB': 46,
 'U03U9EJR362': 39,
 'U03T89ACUUW': 39,
 'U03UJGP0C68': 168,
 'U03UJH1EQQL': 24,
 'U03UJGRN5E0': 28,
 'U03V5Q9N516': 44,
 'U03V6HMRPGQ': 164,
 'U03U1J51VFZ': 19,
 'U03UAKATQ22': 34,
 'U03UG0SFHGT': 42,
 'U03UH397319': 99,
 'U03UG5VFN03': 13,
 'U03UUMR26Q1': 15,
 'U03U4GULU3Y': 2,
 'U03UG1Z21JP': 80,
 'U03U1GHT39V': 6,
 'U03UGB3T3MY': 1,
 'U03UUP56MDF': 1,
 'U03V61VGQG0': 4}

In [None]:
def convert_2_timestamp(column, data):
    """convert from unix time to readable timestamp
        args: column: columns that needs to be converted to timestamp
                data: data that has the specified column
    """
    if column in data.columns.values:
        timestamp_ = []
        for time_unix in data[column]:
            if time_unix == 0:
                timestamp_.append(0)
            else:
                a = datetime.datetime.fromtimestamp(float(time_unix))
                timestamp_.append(a.strftime('%Y-%m-%d %H:%M:%S'))
        return timestamp_
    else: 
        print(f"{column} not in data")

In [36]:
def get_tagged_users(df):
    """get all @ in the messages"""

    return df['msg_content'].map(lambda x: re.findall(r'@U\w+', x))


    
def map_userid_2_realname(user_profile: dict, comm_dict: dict, plot=False):
    """
    map slack_id to realnames
    user_profile: a dictionary that contains users info such as real_names
    comm_dict: a dictionary that contains slack_id and total_message sent by that slack_id
    """
    user_dict = {} # to store the id
    real_name = [] # to store the real name
    ac_comm_dict = {} # to store the mapping
    count = 0
    # collect all the real names
    for i in range(len(user_profile['profile'])):
        real_name.append(dict(user_profile['profile'])[i]['real_name'])

    # loop the slack ids
    for i in user_profile['id']:
        user_dict[i] = real_name[count]
        count += 1

    # to store mapping
    for i in comm_dict:
        if i in user_dict:
            ac_comm_dict[user_dict[i]] = comm_dict[i]

    ac_comm_dict = pd.DataFrame(data= zip(ac_comm_dict.keys(), ac_comm_dict.values()),
    columns=['LearnerName', '# of Msg sent in Threads']).sort_values(by='# of Msg sent in Threads', ascending=False)
    
    if plot:
        ac_comm_dict.plot.bar(figsize=(15, 7.5), x='LearnerName', y='# of Msg sent in Threads')
        plt.title('Student based on Message sent in thread', size=20)
        
    return ac_comm_dict

In [None]:
def get_top_20_user(data, channel='Random'):
    """get user with the highest number of message sent to any channel"""

    data['sender_name'].value_counts()[:20].plot.bar(figsize=(15, 7.5))
    plt.title(f'Top 20 Message Senders in #{channel} channels', size=15, fontweight='bold')
    plt.xlabel("Sender Name", size=18); plt.ylabel("Frequency", size=14);
    plt.xticks(size=12); plt.yticks(size=12);
    plt.show()

    data['sender_name'].value_counts()[-10:].plot.bar(figsize=(15, 7.5))
    plt.title(f'Bottom 10 Message Senders in #{channel} channels', size=15, fontweight='bold')
    plt.xlabel("Sender Name", size=18); plt.ylabel("Frequency", size=14);
    plt.xticks(size=12); plt.yticks(size=12);
    plt.show()

In [None]:
def draw_avg_reply_count(data, channel='Random'):
    """who commands many reply?"""

    data.groupby('sender_name')['reply_count'].mean().sort_values(ascending=False)[:20]\
        .plot(kind='bar', figsize=(15,7.5));
    plt.title(f'Average Number of reply count per Sender in #{channel}', size=20, fontweight='bold')
    plt.xlabel("Sender Name", size=18); plt.ylabel("Frequency", size=18);
    plt.xticks(size=14); plt.yticks(size=14);
    plt.show()

In [None]:
def draw_avg_reply_users_count(data, channel='Random'):
    """who commands many user reply?"""

    data.groupby('sender_name')['reply_users_count'].mean().sort_values(ascending=False)[:20].plot(kind='bar',
     figsize=(15,7.5));
    plt.title(f'Average Number of reply user count per Sender in #{channel}', size=20, fontweight='bold')
    plt.xlabel("Sender Name", size=18); plt.ylabel("Frequency", size=18);
    plt.xticks(size=14); plt.yticks(size=14);
    plt.show()

In [None]:
def draw_wordcloud(msg_content, week):    
    # word cloud visualization
    allWords = ' '.join([twts for twts in msg_content])
    wordCloud = WordCloud(background_color='#975429', width=500, height=300, random_state=21, max_words=500, mode='RGBA',
                            max_font_size=140, stopwords=stopwords.words('english')).generate(allWords)
    plt.figure(figsize=(15, 7.5))
    plt.imshow(wordCloud, interpolation="bilinear")
    plt.axis('off')
    plt.tight_layout()
    plt.title(f'WordCloud for {week}', size=30)
    plt.show()

In [81]:
def draw_user_reaction(data, channel='General'):
    data.groupby('sender_name')[['reply_count', 'reply_users_count']].sum()\
        .sort_values(by='reply_count',ascending=False)[:10].plot(kind='bar', figsize=(15, 7.5))
    plt.title(f'User with the most reaction in #{channel}', size=25);
    plt.xlabel("Sender Name", size=18); plt.ylabel("Frequency", size=18);
    plt.xticks(size=14); plt.yticks(size=14);
    plt.show()

## Insight Extraction

Below are some useful questions to answer. Feel free to explore to answer other interesting questions that may be of help to get insight about student's behaviour, need, and future performance 

In [None]:
# which user has the highest number of reply counts?


In [None]:
# Visualize reply counts per user per channel

In [None]:
# what is the time range of the day that most messages are sent?


In [None]:
# what kind of messages are replied faster than others?

In [None]:
# Relationship between # of messages and # of reactions

In [None]:
# Classify messages into different categories such as questions, answers, comments, etc.

In [None]:
# Which users got the most reactions?

In [None]:
# Model topics mentioned in the channel

In [None]:
# What are the topics that got the most reactions?

### Harder questions to look into

In [None]:
# Based on messages, reactions, references shared, and other relevant data such as classification of questions into techical question, comment, answer, aorder stu the python, statistics, and sql skill level of a user?