# Preprocessing

## 1. Libraries

In [31]:
# Install required packages
!pip install sentence-transformers chromadb langchain pandas numpy tqdm



In [32]:
import json
import os
import pandas as pd
import numpy as np
from typing import List, Dict, Any
from pathlib import Path
import re
from tqdm import tqdm

from sentence_transformers import SentenceTransformer
import chromadb
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

In [41]:
# Read the scraped JSON file as UTF-8 text and parse it into Python objects
json_data = json.loads(Path('apec2025_scraped_data.json').read_text(encoding='utf-8'))

## Data Processing 

In [None]:


def process_table_content(table_content):
    """Rewrite the raw text of one table into a plainer, line-oriented form.

    The input is scanned line by line (blank lines are ignored):

    * ``HEADERS: a | b``  becomes ``Columns: a | b``
    * ``ROW <n>: a | b``  becomes ``Row <n>: a | b``
    * lines starting with ``--`` are treated as separators and dropped
    * any other line is kept verbatim, unless it starts with ``ROW`` or
      ``HEADERS`` (i.e. a malformed marker line), in which case it is dropped

    Cells are stripped of surrounding whitespace and re-joined with `` | ``.

    Args:
        table_content: Text found between the table start/end markers.

    Returns:
        A string that starts with ``"\\n\\nTABLE:"``, followed by one line per
        kept entry, and ends with a single newline.
    """
    header_prefix = 'HEADERS:'
    processed_lines = ['\n\nTABLE:']

    for raw_line in table_content.strip().split('\n'):
        line = raw_line.strip()

        # Blank lines and separator rules carry no content
        if not line or line.startswith('--'):
            continue

        if line.startswith(header_prefix):
            # Slice off only the leading marker; str.replace would also delete
            # any later "HEADERS:" that is part of the header text itself.
            header_content = line[len(header_prefix):].strip()
            headers = [h.strip() for h in header_content.split('|')]
            processed_lines.append(f"Columns: {' | '.join(headers)}")

        elif line.startswith('ROW '):
            row_match = re.match(r'ROW (\d+):\s*(.*)', line)
            # A "ROW ..." line without a numeric index is skipped
            if row_match:
                row_num, row_content = row_match.groups()
                row_data = [cell.strip() for cell in row_content.split('|')]
                processed_lines.append(f"Row {row_num}: {' | '.join(row_data)}")

        elif not line.startswith(('ROW', 'HEADERS')):
            # Free text inside the table (captions, notes) is passed through
            processed_lines.append(line)

    return '\n'.join(processed_lines) + '\n'

In [None]:
def clean_content(content):
    """Normalise scraped text and rewrite its markers into a markdown-like form.

    Steps, applied in this order:

    1. Runs of spaces/tabs collapse to one space; a newline, optional
       whitespace, and another newline collapse to exactly one blank line.
    2. ``=== X ===`` becomes a ``## X`` heading and ``--- X ---`` a ``### X``
       heading, each surrounded by newlines.
    3. Every ``[TABLE START]...[TABLE END]`` region is replaced by the result
       of ``process_table_content`` for the text between the markers.

    Args:
        content: Raw text of one scraped page, or ``None``.

    Returns:
        The cleaned text with leading/trailing whitespace removed; an empty
        string when ``content`` is ``None``.
    """
    # A JSON null for "content" arrives here as None; treat it as empty text
    # instead of letting re.sub raise TypeError.
    if content is None:
        return ''

    content = re.sub(r'[ \t]+', ' ', content)
    content = re.sub(r'\n\s*\n', '\n\n', content)

    # Section markers are rewritten before tables are processed
    content = re.sub(r'=== (.*?) ===', r'\n\n## \1\n', content)
    content = re.sub(r'--- (.*?) ---', r'\n\n### \1\n', content)

    # One pass over all table regions; DOTALL lets a table span several lines.
    # A callable replacement is used so the processed text is inserted literally.
    content = re.sub(
        r'\[TABLE START\](.*?)\[TABLE END\]',
        lambda match: process_table_content(match.group(1)),
        content,
        flags=re.DOTALL,
    )

    return content.strip()

In [35]:
def process_all_data(json_data):
    """Convert scraped JSON records into cleaned ``Document`` objects.

    Args:
        json_data: Iterable of dict-like records. The keys read from each
            record are ``content``, ``url``, ``title``, ``word_count`` and
            ``link_text``; missing keys fall back to ``''`` (``0`` for
            ``word_count``).

    Returns:
        A list with one ``Document`` per record, whose ``page_content`` is the
        record's content passed through ``clean_content`` and whose metadata
        holds the remaining fields plus ``'source': 'json'``. An empty list is
        returned when ``json_data`` is falsy.
    """
    # Nothing to do (and no progress bar) for empty or missing input
    if not json_data:
        return []

    def _to_document(record):
        # Content is cleaned first; the other fields are copied as metadata
        text = clean_content(record.get('content', ''))
        meta = {
            'source': 'json',
            'url': record.get('url', ''),
            'title': record.get('title', ''),
            'word_count': record.get('word_count', 0),
            'link_text': record.get('link_text', ''),
        }
        return Document(page_content=text, metadata=meta)

    return [
        _to_document(record)
        for record in tqdm(json_data, desc="Processing JSON documents")
    ]

In [None]:
def _merge_short_chunks(chunks, min_chunk_size, max_merged_length):
    """Fold runs of undersized chunks into the chunk that follows them.

    Walks ``chunks`` in order. A chunk whose stripped text is at least
    ``min_chunk_size`` characters is kept unchanged. A run of consecutive
    shorter chunks is joined with blank lines and then:

    * merged with the next (long) chunk when the combined text is no longer
      than ``max_merged_length``; that long chunk is consumed by the merge;
    * otherwise emitted on its own, and the long chunk is handled normally.

    New chunks take a copy of the metadata of the first chunk in the run.

    Returns:
        A new list of chunks; the input list itself is not modified.
    """
    merged_chunks = []
    i = 0

    while i < len(chunks):
        current_content = chunks[i].page_content.strip()

        # Long chunk - keep as is
        if len(current_content) >= min_chunk_size:
            merged_chunks.append(chunks[i])
            i += 1
            continue

        # Collect this chunk and every consecutive short chunk after it
        buffer_parts = [current_content]
        j = i + 1
        while j < len(chunks) and len(chunks[j].page_content.strip()) < min_chunk_size:
            buffer_parts.append(chunks[j].page_content.strip())
            j += 1

        buffer_content = "\n\n".join(buffer_parts)

        # Try to attach the run to the long chunk that follows it
        if j < len(chunks):
            final_content = buffer_content + "\n\n" + chunks[j].page_content.strip()
            if len(final_content) <= max_merged_length:
                merged_chunks.append(
                    Document(page_content=final_content, metadata=chunks[i].metadata.copy())
                )
                i = j + 1  # the long chunk was consumed by the merge
                continue

        # No following long chunk, or the merge would be too big
        merged_chunks.append(
            Document(page_content=buffer_content, metadata=chunks[i].metadata.copy())
        )
        i = j  # skip the short chunks that were buffered

    return merged_chunks


def chunk_documents(documents, chunk_size=1000, chunk_overlap=200, min_chunk_size=200):
    """Split documents into chunks and annotate every chunk with position metadata.

    A document whose text contains ``'TABLE:'`` is never split: it becomes a
    single chunk. Every other document is split with
    ``RecursiveCharacterTextSplitter`` and chunks shorter than
    ``min_chunk_size`` are merged forward (see ``_merge_short_chunks``); a
    merged chunk may grow up to ``chunk_size * 1.2`` characters.

    The input documents are not modified: the table branch builds a fresh
    ``Document`` with a copy of the metadata instead of updating the caller's
    object in place. (The split branch presumably already works on copies made
    by the splitter - confirm against the installed langchain version.)

    Args:
        documents: Iterable of ``Document`` objects with ``page_content`` and
            ``metadata``.
        chunk_size: Target chunk length passed to the splitter.
        chunk_overlap: Overlap passed to the splitter.
        min_chunk_size: Chunks with fewer stripped characters than this are
            considered short and are merged.

    Returns:
        A flat list of ``Document`` chunks. Each chunk's metadata additionally
        carries ``chunk_id``, ``chunk_index``, ``total_chunks``,
        ``original_doc_length``, ``chunk_length`` and ``contains_table``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " ", ""]
    )

    chunked_docs = []

    for doc in tqdm(documents, desc="chunking documents"):
        content = doc.page_content
        contains_table = 'TABLE:' in content

        if contains_table:
            # Keep the whole document as one piece, but on a separate object so
            # the caller's document does not pick up chunk metadata.
            pieces = [Document(page_content=content, metadata=doc.metadata.copy())]
        else:
            pieces = _merge_short_chunks(
                text_splitter.split_documents([doc]),
                min_chunk_size,
                chunk_size * 1.2,
            )

        # NOTE(review): chunk_id is built from the title only, so two documents
        # with the same (or empty) title yield identical ids - confirm that
        # downstream consumers do not require unique ids.
        title = doc.metadata.get('title', 'unknown')
        for idx, piece in enumerate(pieces):
            piece.metadata.update({
                'chunk_id': f"{title}_{idx}",
                'chunk_index': idx,
                'total_chunks': len(pieces),
                'original_doc_length': len(content),
                'chunk_length': len(piece.page_content),
                'contains_table': contains_table
            })

        chunked_docs.extend(pieces)

    return chunked_docs

In [None]:
# Turn every scraped record into a cleaned Document ...
all_documents = process_all_data(json_data)
# ... then chunk them. chunk_size and chunk_overlap are passed explicitly;
# min_chunk_size is not passed, so the function default (200) applies.
chunked_documents = chunk_documents(all_documents, chunk_size=1200, chunk_overlap=200)

Processing JSON documents: 100%|██████████| 24/24 [00:00<00:00, 2284.12it/s]


chunking documents: 100%|██████████| 24/24 [00:00<00:00, 2934.62it/s]


## Save Processed Data

In [None]:
import pickle

# The output folder must exist before either file is opened for writing
os.makedirs('data/processed', exist_ok=True)

# 1) Pickle: stores the list of chunk objects exactly as they are in memory
with open('data/processed/chunked_documents.pkl', 'wb') as f:
    pickle.dump(chunked_documents, f)

print(f"Saved {len(chunked_documents)} chunked documents to data/processed/chunked_documents.pkl")

# 2) JSON: a plain content + metadata record per chunk, readable without pickle
documents_json = [
    {'content': doc.page_content, 'metadata': doc.metadata}
    for doc in chunked_documents
]

# ensure_ascii=False keeps non-ASCII characters as-is in the UTF-8 file
with open('data/processed/chunked_documents.json', 'w', encoding='utf-8') as f:
    json.dump(documents_json, f, ensure_ascii=False, indent=2)

