In [1]:
import os
import glob
import json
import pandas as pd

def slack_parser(paths):
    """Parse Slack data to extract useful information from JSON files."""
    
    combined = []
    
    # Loop through all provided paths
    for path_channel in paths:
        # Use os.path.join to create the full file path
        for json_file in glob.glob(os.path.join(path_channel, '*.json')):
            with open(json_file, 'r', encoding="utf8") as slack_data:
                data = json.load(slack_data)
                combined.extend(data)  # Use extend to append elements of the list

    msg_type, msg_content, sender_id, time_msg, msg_dist, time_thread_st, reply_users, \
    reply_count, reply_users_count, tm_thread_end = [],[],[],[],[],[],[],[],[],[]

    for row in combined:
        if 'bot_id' in row.keys():
            continue
        else:
            msg_type.append(row['type'])
            msg_content.append(row['text'])
            if 'user_profile' in row.keys(): sender_id.append(row['user_profile']['real_name'])
            else: sender_id.append('Not provided')
            time_msg.append(pd.to_datetime(row['ts'], unit='s'))  # Convert timestamp to datetime
            if 'blocks' in row.keys() and len(row['blocks'][0]['elements'][0]['elements']) != 0:
                msg_dist.append(row['blocks'][0]['elements'][0]['elements'][0]['type'])
            else: msg_dist.append('reshared')
            if 'thread_ts' in row.keys():
                time_thread_st.append(pd.to_datetime(row['thread_ts'], unit='s'))
            else:
                time_thread_st.append(pd.NaT)
            if 'reply_users' in row.keys(): reply_users.append(",".join(row['reply_users'])) 
            else: reply_users.append(0)
            if 'reply_count' in row.keys():
                reply_count.append(row['reply_count'])
                reply_users_count.append(row['reply_users_count'])
                tm_thread_end.append(pd.to_datetime(row['latest_reply'], unit='s'))
            else:
                reply_count.append(0)
                reply_users_count.append(0)
                tm_thread_end.append(pd.NaT)

    data = zip(msg_type, msg_content, sender_id, time_msg, msg_dist, time_thread_st,
               reply_count, reply_users_count, reply_users, tm_thread_end)
    columns = ['msg_type', 'msg_content', 'sender_name', 'msg_sent_time', 'msg_dist_type',
               'time_thread_start', 'reply_count', 'reply_users_count', 'reply_users', 'tm_thread_end']

    df = pd.DataFrame(data=data, columns=columns)
    df = df[df['sender_name'] != 'Not provided']

    return df


In [6]:
path = ["../data/" ]
out_put=slack_parser(path)
print(out_put)

KeyError: 'type'

In [4]:
out_put

Unnamed: 0,msg_type,msg_content,sender_name,msg_sent_time,msg_dist_type,time_thread_start,reply_count,reply_users_count,reply_users,tm_thread_end
0,message,<!channel> kindly nominate team members for 10...,Garrett Bell,2022-11-08 08:08:32,broadcast,NaT,0,0,0,NaT
2,message,Guest Talk starting now,Vanessa Norman,2022-11-08 10:57:04,text,NaT,0,0,0,NaT
3,message,<!channel> please join the guest talk if you h...,Carlos Gross,2022-11-08 11:03:28,broadcast,NaT,0,0,0,NaT
4,message,I have two question:\n\n• Regarding week 9 (Go...,Brian Odom,2022-11-08 14:02:40,text,2022-11-08 07:38:40,0,0,0,NaT
5,message,*Independent challenge work Reminder!! <!here>...,Vanessa Norman,2022-11-08 15:19:28,text,NaT,0,0,0,NaT
...,...,...,...,...,...,...,...,...,...,...
554,message,*Independent challenge work Reminder!! <!here>...,Vanessa Norman,2022-09-15 15:19:28,text,NaT,0,0,0,NaT
555,message,*DAILY STANDUP REMINDER*:timer_clock:\n*<!here...,Vanessa Norman,2022-10-25 07:49:20,text,NaT,0,0,0,NaT
556,message,*<!here> REMINDER*:timer_clock: <@U03TEPYRM2P>...,Vanessa Norman,2022-10-25 09:21:04,broadcast,NaT,0,0,0,NaT
557,message,*TUTORIAL REMINDER!!*:timer_clock:\n*<!here>* ...,Vanessa Norman,2022-10-25 12:50:08,text,NaT,0,0,0,NaT
