In [1]:
import glob
import json
import hashlib
import pandas as pd

In [2]:
# Parallel column accumulators shared by every JSON file that gets processed:
# position i in each list describes the same question. Each name is bound to
# its own fresh list (never aliased), so appends stay independent.
questions, question_askers, question_timestamps = [], [], []
ids = []
answers, answered_by, answer_timestamps = [], [], []


In [3]:
def generate_alphanumeric_id(timestamp, length=10):
    """Derive a short, deterministic hexadecimal ID from a timestamp.

    The timestamp is converted with ``str()``, hashed with MD5, and the
    leading ``length`` characters of the hex digest are returned. The same
    timestamp always maps to the same ID. Because the digest is truncated,
    IDs are not guaranteed to be unique: two different timestamps can, with
    low probability, share a prefix.

    Args:
        timestamp: Any value; its ``str()`` form is what gets hashed.
        length: Number of leading hex characters to keep (1-32, default 10).

    Returns:
        A lowercase hexadecimal string of ``length`` characters.

    Raises:
        ValueError: If ``length`` is outside 1..32 (an MD5 hex digest has
            exactly 32 characters).
    """
    if not 1 <= length <= 32:
        raise ValueError("length must be between 1 and 32")
    # Hash the textual form so str and numeric timestamps are handled alike.
    digest = hashlib.md5(str(timestamp).encode()).hexdigest()
    # Keep only the leading `length` hex characters as the ID.
    return digest[:length]

In [4]:
# Process each JSON file
#
# Every question appends exactly one entry to each of the seven accumulator
# lists, so the columns always stay aligned. A question whose answer cannot be
# resolved gets None in the three answer columns.
DATA_GLOB = '../data/sample_data/*.json'  # Adjust the path pattern to match your file naming convention
PREFERRED_ANSWERER = "Alexey Grigorev"


def _load_messages(path):
    """Read one export file and return the parsed JSON (a list of message dicts)."""
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _real_name(msg):
    """Return msg["user_profile"]["real_name"], or None if either key is absent."""
    return (msg.get("user_profile") or {}).get("real_name")


def _pick_answer(root, messages_by_ts):
    """Select the reply that answers the root message.

    Prefers the first resolvable reply written by PREFERRED_ANSWERER and
    otherwise falls back to the first listed reply. Returns None when the
    message has no replies or the chosen reply is not in ``messages_by_ts``.
    """
    replies = root.get("replies") or []
    resolved = [messages_by_ts.get(reply.get("ts")) for reply in replies]
    for candidate in resolved:
        if candidate and _real_name(candidate) == PREFERRED_ANSWERER:
            return candidate
    # Fallback: the first listed reply, if it could be resolved at all.
    return resolved[0] if resolved and resolved[0] else None


# Sorted so that row order is reproducible (glob order is otherwise unspecified).
messages_per_file = [_load_messages(path) for path in sorted(glob.glob(DATA_GLOB))]

# Index every message by timestamp across ALL files, so that a reply stored in
# a different file than its thread root can still be found.
messages_by_ts = {msg["ts"]: msg for messages in messages_per_file for msg in messages}

# Set mirror of `ids` for O(1) duplicate checks; seeded from `ids` so that
# re-running this cell does not add the same questions twice.
seen_ids = set(ids)

for messages in messages_per_file:
    # Iterate through each message in the JSON data
    for message in messages:
        # Only consider messages without a subtype and containing text
        if "subtype" in message or not message.get("text"):
            continue
        # Include standalone messages and root messages of threads; skip replies
        if "thread_ts" in message and message["ts"] != message["thread_ts"]:
            continue

        question_id = generate_alphanumeric_id(message["ts"])
        if question_id in seen_ids:  # Ensure the question is not already added
            continue
        seen_ids.add(question_id)

        questions.append(message["text"])
        question_askers.append(_real_name(message))
        question_timestamps.append(message["ts"])
        ids.append(question_id)

        # Always append to the answer columns, with None when there is no answer.
        answer = _pick_answer(message, messages_by_ts)
        answers.append(answer.get("text") if answer else None)
        answered_by.append(_real_name(answer) if answer else None)
        answer_timestamps.append(answer["ts"] if answer else None)

# Invariant: all accumulator lists must be the same length to form a table.
assert len({len(col) for col in (ids, questions, answers, question_askers,
                                 answered_by, question_timestamps,
                                 answer_timestamps)}) == 1, "column lists are misaligned"

# Assemble the accumulated lists into one table; each list becomes a column,
# in the order given below.
column_names = [
    "id",
    "question",
    "answer",
    "question_asked_by",
    "answered_by",
    "question_timestamp",
    "answer_timestamp",
]
column_values = [
    ids,
    questions,
    answers,
    question_askers,
    answered_by,
    question_timestamps,
    answer_timestamps,
]
df = pd.DataFrame(dict(zip(column_names, column_values)))

In [5]:
# Bare last expression: the notebook renders the assembled DataFrame as this cell's output.
df

Unnamed: 0,id,question,answer,question_asked_by,answered_by,question_timestamp,answer_timestamp
0,ab8fda9401,heya,,Kristian Maglasang,,1712220048.482559,
1,f12b482f76,Hello! nice to be here..:grinning:,,luca pugliese,,1712223185.959429,
2,dbc60593fb,Hey there! So this is the channel to get updat...,Yes that's right,Bhavya Gupta,Alexey Grigorev,1712226627.805659,1712227767.099769
3,ff5f0391a9,Hello everyone :wave::skin-tone-2:,,Günal Hınçal,,1712238631.304149,
4,d4dcf112a5,Hi :wave: <@U01AXE0P5M3> when the course ia go...,"TBA\n\nOnce we know the date, I'll create a ch...",Tahir,Alexey Grigorev,1712310923.174359,1712311144.243969
5,8a6e3031dd,"Hello, nice to be here :smiley:\n\nGreetings f...",,Humberto Rodriguez,,1712589025.293289,
6,90b20add2e,"Hello everyone, nice to join this channel! :gr...",,murat kahraman,,1713040186.854499,
7,ef6f3dcf5a,"Hi <@U01AXE0P5M3>, can you please let me know ...",The course page has the requirements. What do ...,Ifra Saifi,Alexey Grigorev,1713179914.616589,1713180442.404689
