In [21]:
import pandas as pd
import re
from io import StringIO
from dateutil import parser as date_parser
import uuid

In [23]:
def parse_chat_log(file_path):
    """
    Reads a chat log file and robustly parses each message line into structured data.

    A message line is expected to contain a bracketed timestamp followed by
    the speaker and the text, e.g. ``[12/01/2024, 1:05:00 PM] Alice: Hello``.
    Any non-empty line that does not match is treated as a continuation of the
    previous message and appended to it with a single space.

    Args:
        file_path: Path of the UTF-8 encoded chat log to read.

    Returns:
        A DataFrame with the columns 'Date', 'Time', 'Speaker' and 'Message',
        one row per message. 'Time' keeps the AM/PM marker when the log has
        one (normalised to upper case and separated by a plain space), so
        12-hour timestamps stay unambiguous. An empty DataFrame is returned
        when the file does not exist or contains no message lines.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            chat_content = f.read()
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found. Please ensure the file is present.")
        return pd.DataFrame()

    # Groups: date, clock time, optional AM/PM marker, speaker, message.
    # The meridiem is captured (not just skipped) because dropping it would
    # turn every afternoon timestamp of a 12-hour log into a morning one.
    pattern = re.compile(
        r'^(?:.*?)\s*\[(\d{1,2}/\d{1,2}/\d{4}),\s*(\d{1,2}:\d{2}:\d{2})\s*([AP]M)?\]\s*(.+?):\s*(.*)',
        re.IGNORECASE | re.MULTILINE
    )

    data = []

    # open() already translated every line ending to '\n'.
    for line in chat_content.split('\n'):
        line = line.strip()
        if not line:
            continue

        match = pattern.match(line)

        if match:
            date_str, clock_str, meridiem, speaker, message = match.groups()

            # Re-attach the AM/PM marker with an ordinary space; exports may
            # use other whitespace (or none) between the time and the marker.
            time_str = f"{clock_str} {meridiem.upper()}" if meridiem else clock_str

            # Drop bracketed tags, parentheses and left-to-right/right-to-left marks.
            clean_speaker = re.sub(r'\[.*?\]|\(|\)|\u200e|\u200f', '', speaker).strip()

            data.append({
                'Date': date_str,
                'Time': time_str,
                'Speaker': clean_speaker,
                'Message': message.strip()
            })
        elif data:
            # Continuation of a multi-line message.
            data[-1]['Message'] += ' ' + line
    return pd.DataFrame(data)


def transform_to_sql_schema(df, user_map=None):
    """
    Transforms the raw DataFrame into the required 'messages' table schema.

    Consecutive messages from the same sender are merged into a single row
    (texts joined with newlines, earliest `sent_at` and first `msg_id` kept).
    Each row's `answer` is the text of the row that follows it, or '<EOC>'
    for the last row.

    Behavior:
    - If `user_map` is None, builds a deterministic mapping from original names to
      internal ids `u_1, u_2, ...` used for `conv_id` and receiver pairing.
    - The final returned DataFrame uses `sender_user_id` = original sender name
      and `receiver_user_id` = original receiver name as well, while internal
      `u_*` ids are used internally for conv logic.
    - With exactly two participants each one is the other's receiver; otherwise
      the receiver is left empty.

    Args:
        df: DataFrame with 'Date', 'Time', 'Speaker' and 'Message' columns (the
            layout produced by `parse_chat_log`). It is not modified.
        user_map: Optional dict mapping speaker name -> internal id.

    Returns:
        DataFrame with the columns msg_id, conv_id, conv_turn, sender_user_id,
        receiver_user_id, sent_at, text and answer. An empty DataFrame is
        returned when `df` is empty.
    """
    if df.empty:
        return pd.DataFrame()

    # Work on a copy: the helper columns added below must not leak into the
    # caller's DataFrame.
    df = df.copy()

    # Build a user_map from speakers if not supplied.
    if user_map is None:
        # Sort for determinism across runs
        speakers_sorted = sorted(df['Speaker'].dropna().unique())
        user_map = {s: f"u_{i+1}" for i, s in enumerate(speakers_sorted)}

    # Internal mapped id used for conv_id/receiver logic
    df['sender_internal'] = df['Speaker'].map(user_map)

    # For two-person chats, map each sender_internal to the other as receiver_internal.
    unique_sender_ids = list(df['sender_internal'].dropna().unique())
    if len(unique_sender_ids) == 2:
        first_id, second_id = unique_sender_ids
        receiver_internal_map = {first_id: second_id, second_id: first_id}
    else:
        # For group chats or single-participant logs, set receiver_internal to None.
        receiver_internal_map = {uid: None for uid in unique_sender_ids}

    df['receiver_internal'] = df['sender_internal'].map(receiver_internal_map)

    def generate_id_and_timestamp(row):
        """Return a fresh message id plus the formatted and raw timestamp of a row."""
        msg_id = str(uuid.uuid4())
        try:
            dt_obj = date_parser.parse(f"{row['Date']} {row['Time']}", dayfirst=True)
            sent_at = dt_obj.strftime('%Y-%m-%d %H:%M:%S')
        except (ValueError, OverflowError):
            # Unparseable timestamp: keep the message, leave its time empty.
            dt_obj = pd.NaT
            sent_at = None
        return pd.Series([msg_id, sent_at, dt_obj])

    df[['msg_id', 'sent_at', 'dt_object']] = df.apply(generate_id_and_timestamp, axis=1)

    # Use the (sorted) internal user ids from the map to create a deterministic conv_id
    user_ids = sorted(set(user_map.values()))
    df['conv_id'] = ('chat:' + '_'.join(user_ids)) if user_ids else None

    # Stable sort: messages sharing a timestamp must keep their order from the
    # log, otherwise the block merging below could interleave senders wrongly.
    df = df.sort_values(by='dt_object', kind='mergesort').reset_index(drop=True)
    df = df.rename(columns={'Message': 'text'})
    df = df.sort_values(['conv_id', 'sent_at']).reset_index(drop=True)

    # A new block starts whenever the conversation or the sender changes.
    df['block_id'] = (
        (df['conv_id'] != df['conv_id'].shift()) |
        (df['sender_internal'] != df['sender_internal'].shift())
    ).cumsum()

    df_merged = (
        df.groupby('block_id', as_index=False)
          .agg({
              'msg_id': 'first',
              'conv_id': 'first',
              'sender_internal': 'first',
              'receiver_internal': 'first',
              'sent_at': 'min',
              'text': lambda x: "\n".join(x),
          })
    )
    df_merged = df_merged.drop(columns=['block_id'])
    df_merged = df_merged.sort_values(by='sent_at', kind='mergesort').reset_index(drop=True)

    # conv_turn calculation: increment whenever the sender_internal is same as previous receiver_internal
    df_merged['conv_turn'] = (df_merged['sender_internal'] == df_merged['receiver_internal'].shift()).cumsum() + 1

    # The answer to a row is the text of the next row; the last row has none.
    df_merged['answer'] = df_merged['text'].shift(-1).fillna('<EOC>')  # END OF CONVERSATION

    # Map internal ids back to original names for the final `sender_user_id` and `receiver_user_id` columns
    reverse_map = {v: k for k, v in user_map.items()}
    df_merged['sender_user_id'] = df_merged['sender_internal'].map(reverse_map)
    df_merged['receiver_user_id'] = df_merged['receiver_internal'].map(reverse_map)

    final_columns = [
        'msg_id',
        'conv_id',
        'conv_turn',
        'sender_user_id',
        'receiver_user_id',
        'sent_at',
        'text',
        'answer'
    ]

    # Selecting the final columns also discards the internal helper columns.
    final_df = df_merged[final_columns]
    return final_df

In [None]:
# Parse every .txt chat export in the raw data directory and stack the results
from pathlib import Path
RAW_DATA_DIR = r"C:\Users\Cyber_User\Documents\GitHub\Whatsapp_webApp_-Django-\RAG\RAG_data\raw_data"
all_txt_files = sorted(Path(RAW_DATA_DIR).glob('*.txt'))

dfs = []            # one transformed frame per chat file
mapping_rows = []   # one record per (chat file, speaker)
for p in all_txt_files:
    df_raw = parse_chat_log(str(p))
    if df_raw.empty:
        # Nothing could be parsed from this file; skip it entirely
        continue

    # Per-chat mapping: alphabetically sorted speakers become u_1, u_2, ...
    speakers_sorted = sorted(df_raw['Speaker'].dropna().unique())
    user_map_local = {
        name: f"u_{rank}" for rank, name in enumerate(speakers_sorted, start=1)
    }

    # Convert to the messages schema with this chat's own mapping
    df_trans = transform_to_sql_schema(df_raw, user_map=user_map_local)
    if not df_trans.empty:
        df_trans['source_file'] = p.name
        dfs.append(df_trans)

    # Record which original name stands behind each internal id of this chat
    user_ids_sorted = sorted(set(user_map_local.values()))
    conv_id_local = ('chat:' + '_'.join(user_ids_sorted)) if user_ids_sorted else None
    mapping_rows.extend(
        {
            'conv_id': conv_id_local,
            'chat_file': p.name,
            'original_name': orig,
            'user_id': uid
        }
        for orig, uid in user_map_local.items()
    )

# Combine all chats into one table (empty frame when nothing was processed)
df_messages = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()

# Build the de-duplicated mapping table, keeping its columns even when empty
if mapping_rows:
    df_mappings = pd.DataFrame(mapping_rows).drop_duplicates().reset_index(drop=True)
else:
    df_mappings = pd.DataFrame(columns=['conv_id', 'chat_file', 'original_name', 'user_id'])

In [25]:
# Persist the combined messages table as KB_data.csv in the RAG_data folder,
# together with the per-chat sender mappings
from pathlib import Path
output_dir = Path(RAW_DATA_DIR).parent
output_csv = output_dir / 'KB_data.csv'
mapping_csv = output_dir / 'user_mappings.csv'

# Combined messages: only written when at least one message was processed
if df_messages.empty:
    print("No messages were processed; no CSV written.")
else:
    df_messages.to_csv(str(output_csv), index=False)
    print(f"Wrote combined CSV to: {output_csv}")

# Sender name mappings for each chat
if df_mappings.empty:
    print("No sender mappings to write.")
else:
    df_mappings.to_csv(str(mapping_csv), index=False)
    print(f"Wrote sender mappings to: {mapping_csv}")

Wrote combined CSV to: C:\Users\Cyber_User\Documents\GitHub\Whatsapp_webApp_-Django-\RAG\RAG_data\KB_data.csv
Wrote sender mappings to: C:\Users\Cyber_User\Documents\GitHub\Whatsapp_webApp_-Django-\RAG\RAG_data\user_mappings.csv
