# Template Jupyter Notebook

In [10]:
import json
import codecs
import unicodedata
import string
from collections import Counter
import re
import pandas as pd
import os

In [11]:
def clean_text(text):
    if not isinstance(text, str):
        return text
    try:
        text = text.encode("latin1").decode("utf-8")
    except Exception:
        pass
    text = text.replace("\u200d", "")   # remove zero-width joiner
    return text

In [12]:
def parse_json(path: str):
    with open(path, "r", encoding="utf-8") as file:
        raw_data = json.load(file)

    messages = pd.DataFrame(raw_data.get("messages", []))
    participants = pd.DataFrame(raw_data["participants"])
    thread_path = raw_data.get("thread_path", os.path.basename(path))

    messages["content"] = messages["content"].apply(clean_text)
    messages["sender_name"] = messages["sender_name"].apply(clean_text)
    participants["name"] = participants["name"].apply(clean_text)

    return messages, participants, thread_path

In [13]:
def get_conversation(messages: pd.DataFrame, totalConversation: dict):
    for index, row in messages.iterrows():
        sender = row.get("sender_name")
        content = row.get("content")
        if content == None:
            continue
        totalConversation.setdefault(sender, []).append(content)
    return totalConversation

In [14]:
def word_frequency(conversations: dict, targetWord: str):
    targetWord = targetWord.lower().strip()
    word_count = {}

    for sender, messages in conversations.items():  # ← FIXED
        count = 0
        for msg in messages:
            # Skip invalid values
            if not isinstance(msg, str) or not msg.strip():
                continue

            lower_msg = msg.lower()

            # Count if target word appears
            if targetWord in lower_msg:
                count += 1

        # Always include user even if count = 0
        word_count[sender] = count

    return word_count

In [15]:
def top_word_per_user(conversation: dict, user: str, top_n: int = 10):
    # Check if user exists
    if user not in conversation:
        print(f"User '{user}' not found.")
        return {}

    messages = conversation[user]
    words = []

    for msg in messages:
        if not isinstance(msg, str) or not msg.strip():
            continue

        lower_msg = msg.lower()

        for w in lower_msg.split():
            clean_word = w.strip(string.punctuation)
            if clean_word:
                words.append(clean_word)

    counter = Counter(words)
    return dict(counter.most_common(top_n))

In [16]:
def update_conversation_counts(messages: pd.DataFrame, participants: pd.DataFrame, totalConversation: dict):
    if len(participants) != 2:
        return totalConversation

    p1 = participants["name"].iloc[0]

    if p1 not in totalConversation:
        totalConversation[p1] = {"sent": 0, "received": 0}

    for _, row in messages.iterrows():
        sender = row.get("sender_name")
        content = row.get("content")
        if content is None:
            continue    

        if sender == p1:
            totalConversation[p1]["sent"] += 1
        else:
            totalConversation[p1]["received"] += 1

    return totalConversation


In [17]:
messages, participants, thread_path = parse_json("../sample-data/chillwalknightday_5554251577921385/message_1.json")
result = get_conversation(messages, {})
word_freq = word_frequency(result, "fuck")
word_per = top_word_per_user(result,"Austen Brown")
print(word_per)

{'to': 125, 'a': 119, 'you': 98, 'i': 91, 'the': 80, 'be': 58, 'anyone': 54, 'and': 52, 'is': 47, 'in': 41}


In [18]:
# ngan test
folder_path = r"c:\Users\nlhng\OneDrive\Desktop\UVEC2025\dev\test"
totalConversation = {}

for path in Path(folder_path).glob("*.json"):
    messages, participants, thread_path = parse_json(path)
    if len(participants) == 2:
        totalConversation = update_conversation_counts(messages, participants, totalConversation)

df = pd.DataFrame.from_dict(totalConversation, orient="index")
print(df)

## divide recieved by sent then sort


NameError: name 'Path' is not defined