In [70]:
import pandas as pd
import re

## Dataset Exploration

In [71]:
# Read the raw chat transcript into a list of lines, decoding it as latin-1.
with open('../data/human_chat.txt', mode='r', encoding='latin-1') as file:
    lines = file.readlines()

# Trim surrounding whitespace from every line, discard the lines that end up
# empty, and glue the remainder into one newline-separated string.
non_blank_lines = [text for text in map(str.strip, lines) if text != ""]
conversation_text = "\n".join(non_blank_lines)

## Regex for Q&A Segmentation

In [72]:
# Words that open the first branch of the question detector.
_question_openers = (
    "what", "where", "when", "why", "who", "how",
    "is", "are", "do", "does", "did",
    "can", "could", "will", "would", "should",
    "have", "has", "had",
)

# Two alternatives, tried in this order at each position:
#   1. a case-insensitive question word followed (lazily) by a '?'
#   2. any run of characters (lazily) followed by a '?'
# Because branch 2 accepts any text ending in '?', `search` succeeds on every
# string that contains a question mark; branch 1 only influences which span
# and capture group are reported. The pattern has no ^ or $ anchors.
question_pattern = re.compile(
    rf"(?i)\b({'|'.join(_question_openers)})\b.*?\?|.*?\?",
    flags=re.MULTILINE,
)

In [None]:
def segment_conversation_qna(conversation_text, turn_pattern=None, question_regex=None):
    """Group the turns of a transcript into question/answer segments.

    The transcript is split into turns with ``re.split``. Every turn in which
    the question regex finds a match opens a new segment; the non-question
    turns that follow it are collected as that segment's answers. Turns that
    appear before the first question are ignored.

    A question that is immediately followed by another question (i.e. it has
    no answers) is discarded, whereas a question that ends the transcript
    without answers is kept with an empty ``answers`` list.

    Args:
        conversation_text: The whole transcript as one string.
        turn_pattern: Regex (string or compiled) that separates turns. The
            turn extraction keeps every second piece of the ``re.split``
            result starting at index 2, which is the text after each
            separator when the pattern contains exactly one capturing group.
            Defaults to the notebook-level ``turn_split_pattern``.
        question_regex: Regex (string or compiled) whose ``search`` decides
            whether a turn is a question. Defaults to the notebook-level
            ``question_pattern``.

    Returns:
        A list of ``{'question': str, 'answers': list[str]}`` dicts in
        transcript order.
    """
    # Fall back to the notebook globals so existing one-argument calls behave
    # as before; passing the patterns explicitly removes the dependency on
    # which cells have been executed.
    if turn_pattern is None:
        turn_pattern = turn_split_pattern
    if question_regex is None:
        question_regex = question_pattern
    # re.compile returns an already-compiled pattern unchanged.
    question_regex = re.compile(question_regex)

    parts = re.split(turn_pattern, conversation_text)
    # Layout with one capturing group: [preamble, sep, text, sep, text, ...].
    # Indices 2, 4, 6, ... are the utterances.
    dialogue_turns = [piece.strip() for piece in parts[2::2]]

    segments = []
    current_question = None
    current_answers = []

    for turn in dialogue_turns:
        if question_regex.search(turn):
            # Close the previous segment only if it collected any answers.
            if current_question and current_answers:
                segments.append({'question': current_question, 'answers': current_answers})
            current_question = turn
            current_answers = []
        elif current_question:
            current_answers.append(turn)

    # The last open question is always emitted, with or without answers.
    if current_question:
        segments.append({'question': current_question, 'answers': current_answers})

    return segments

In [78]:
# Segment the cleaned transcript into question/answer records.
segments = segment_conversation_qna(conversation_text)

# Name the columns explicitly: with an empty `segments` list a bare
# pd.DataFrame(segments) has no columns at all, and the later
# df['answers'] lookup would raise KeyError. For non-empty input the
# resulting frame is the same as before.
df = pd.DataFrame(segments, columns=['question', 'answers'])

## Statistical Analysis

In [79]:
# Count the answers attached to each question and store the count as a column.
answers_per_question = df['answers'].apply(len)
df['num_answers'] = answers_per_question

# Report the number of segments, then summary statistics of the counts.
total_pairs = len(df)
print("Total Q&A pairs:", total_pairs)
print(df['num_answers'].describe())

Total Q&A pairs: 394
count    394.000000
mean       2.175127
std        1.670669
min        1.000000
25%        1.000000
50%        1.000000
75%        3.000000
max       11.000000
Name: num_answers, dtype: float64


## Explanation

count = 394
→ Total number of question–answer segments.

mean ≈ 2.18
→ On average, each question has about 2.18 answers.

std ≈ 1.67
→ The number of answers per question varies; not always just one.

min = 1
→ Every question in the table has at least one answer; the least-answered questions received exactly one.

25% = 1, 50% = 1, 75% = 3
→ Percentiles (Q1, median, Q3): at least half of the questions received exactly one answer, and three quarters received three or fewer.

max = 11
→ The most-answered question received 11 answers.

In [80]:
# Write the segmented Q&A table to disk without the index column.
# NOTE(review): the 'answers' column holds Python lists, which a CSV file can
# only store in text form — presumably they need parsing again when the file
# is reloaded; confirm with downstream consumers.
segmented_csv_path = "../data/segmented_chat.csv"
df.to_csv(segmented_csv_path, index=False)