# **IMPORTS & PATHS** 

In [43]:
import os
import zipfile
import subprocess
import json
import pandas as pd
from langchain_ollama.llms import OllamaLLM
import shutil


# Paths and model settings\ nZIP_PATH = 'I-Soon-data.zip'      # Path to your downloaded zip file
ZIP_PATH = '0.zip'    # Directory to extract contents
EXTRACT_DIR = 'I-Soon-data'        # Directory to extract contents
ORGANIZED_DIR = 'organized_data'  # Directory to group files by extension
MODEL_NAME = 'llama3.2'             # Local Ollama model identifier
OUTPUT_JSON = 'parsed_md.json'    # Aggregated JSON output
OUTPUT_CSV = 'parsed_md.csv'      # CSV output for DataFrame

# Initialize Ollama LLM via LangChain
model = OllamaLLM(model=MODEL_NAME)

# **ZIP FILE EXTRACTION**

In [44]:
if not os.path.isdir(EXTRACT_DIR):
    with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
        zip_ref.extractall(EXTRACT_DIR)
    print(f"Extracted archive to '{EXTRACT_DIR}'")
else:
    print(f"Extraction directory '{EXTRACT_DIR}' already exists")

Extraction directory 'I-Soon-data' already exists


# **DATA TYPE CATEGORIZATION**

In [45]:
import os
import shutil
import re

# regex to detect any HTML tag
html_re = re.compile(r'<[A-Za-z/][^>]*>')

for root, _, files in os.walk(EXTRACT_DIR):
    for fname in files:
        ext = os.path.splitext(fname)[1].lower().lstrip('.') or 'no_extension'
        src_path = os.path.join(root, fname)

        if ext == 'md':
            # classify Markdown by content into md/html or md/non-html
            with open(src_path, 'r', encoding='utf-8', errors='ignore') as f:
                text = f.read()
            subfolder = 'html' if html_re.search(text) else 'non-html'
            dest_folder = os.path.join(ORGANIZED_DIR, 'md', subfolder)
        else:
            # preserve original extension grouping for non-MD
            dest_folder = os.path.join(ORGANIZED_DIR, ext)

        os.makedirs(dest_folder, exist_ok=True)
        dst_path = os.path.join(dest_folder, fname)
        shutil.copy2(src_path, dst_path)

print(f"Organized files under '{ORGANIZED_DIR}':\n"
      " • Markdown with HTML → md/html/\n"
      " • Markdown without HTML → md/non-html/\n"
      " • Other extensions → <extension>/")

Organized files under 'organized_data':
 • Markdown with HTML → md/html/
 • Markdown without HTML → md/non-html/
 • Other extensions → <extension>/


# **MD DATA TRANSFORMATIO TO JSON**

In [46]:
import os
import json
import shutil
from docling.document_converter import DocumentConverter

# Initialize converter
converter = DocumentConverter()

# Define input/output directories
input_dir = "organized_data/md/html"
output_dir = "html_markdown_to_json"
failed_dir = "failed_parsing"
os.makedirs(output_dir, exist_ok=True)
os.makedirs(failed_dir, exist_ok=True)

# Process each Markdown file
for fname in os.listdir(input_dir):
    if fname.endswith(".md"):
        input_path = os.path.join(input_dir, fname)
        
        try:
            result = converter.convert(input_path)
            doc_dict = result.document.model_dump()
            
            # Output filename
            base = os.path.splitext(fname)[0]
            output_path = os.path.join(output_dir, f"{base}_docling.json")
            
            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(doc_dict, f, indent=2)
            
            print(f"✅ Converted: {fname} → {output_path}")
        
        except Exception as e:
            # Move to failed folder
            shutil.move(input_path, os.path.join(failed_dir, fname))
            print(f"❌ Failed to convert {fname}: {e} (moved to {failed_dir})")

✅ Converted: dbc9c90e-a3e6-4d71-bb93-5fb8394095ac.md → html_markdown_to_json/dbc9c90e-a3e6-4d71-bb93-5fb8394095ac_docling.json
✅ Converted: 28.md → html_markdown_to_json/28_docling.json
✅ Converted: 5.md → html_markdown_to_json/5_docling.json
✅ Converted: 38.md → html_markdown_to_json/38_docling.json
✅ Converted: 9d7bc879-3250-4013-ac04-5ff9bd6dff40.md → html_markdown_to_json/9d7bc879-3250-4013-ac04-5ff9bd6dff40_docling.json
✅ Converted: 18.md → html_markdown_to_json/18_docling.json
❌ Failed to convert ._3.md: 'utf-8' codec can't decode byte 0xfb in position 37: invalid start byte (moved to failed_parsing)
✅ Converted: 9fd06037-11f1-4ad5-9a7d-cbfb3fa4193b.md → html_markdown_to_json/9fd06037-11f1-4ad5-9a7d-cbfb3fa4193b_docling.json
✅ Converted: 3348953d-66e9-4cac-8675-65bb5f2ef929.md → html_markdown_to_json/3348953d-66e9-4cac-8675-65bb5f2ef929_docling.json
✅ Converted: 1.md → html_markdown_to_json/1_docling.json
✅ Converted: 19.md → html_markdown_to_json/19_docling.json
✅ Converted: 07f

# **JSON TO DATAFRAME WITH AI**
# *Here we use AI to analyze all of the JSON files and identify which headers are usefull to keep. We store all the identified headers in a dictionary with the name of the json file as key and value the list contaning the headers*

In [50]:
import os
import json
import ast  # To safely evaluate header string to list

# Folder path
folder_path = "html_markdown_to_json"

# Store headers proposed for each file
file_headers = {}

# Loop through each file in the directory
for filename in os.listdir(folder_path):
    if filename.endswith(".json"):
        file_path = os.path.join(folder_path, filename)
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                doc_json = json.load(f)

            # Safely navigate the structure
            tables = doc_json.get("tables", [])
            if not tables:
                print(f"Skipped {filename}: no tables found.")
                continue

            cells = tables[0].get("data", {}).get("table_cells", [])
            sample_cells = cells[:20]  # Use a subset to limit input size

            # Format as JSON string
            cell_str = json.dumps(sample_cells, indent=2, ensure_ascii=False)

            # Prepare the prompt
            prompt = f"""
You are a law enforcement officer that specializes in investigating cybersecurity data leaks. Analyze the following JSON table cell data and infer what column headers would be appropriate
if this were converted into a pandas DataFrame. 

Only return a list of column names that are relevant to the data. Do not include any other text or explanation.

JSON data:
{cell_str}
"""

            # Get headers from the model
            headers_response = model(prompt)

            # Convert stringified list to actual Python list safely
            try:
                headers_list = ast.literal_eval(headers_response.strip())
                if isinstance(headers_list, list):
                    file_headers[filename] = headers_list
                else:
                    print(f"Warning: Invalid header format in {filename}")
            except Exception as eval_err:
                print(f"Header parsing error in {filename}: {eval_err}")

        except Exception as e:
            print(f"Error processing {filename}: {e}")

# Print the headers dictionary
for fname, headers in file_headers.items():
    print(f"{fname}: {headers}")

# Print the headers dictionary
print("\\n --- Headers Dicitonary --- \\n")
print(file_headers)

Skipped 9fe6b262-9944-417d-a0c4-9f2de1de2994_docling.json: no tables found.
Skipped 6d7fc7b3-c892-4cb5-bd4b-a5713c089d88_docling.json: no tables found.
Skipped 912204cb-8ab7-48b8-9abf-d803f3804d08_docling.json: no tables found.
Skipped b3031e66-40b6-45e8-9bcd-891dc1a280da_docling.json: no tables found.
Skipped dbc9c90e-a3e6-4d71-bb93-5fb8394095ac_docling.json: no tables found.
Skipped eda5b003-9250-4913-b724-74cca86240af_docling.json: no tables found.
Skipped 12756724-394c-4576-b373-7c53f1abbd94_docling.json: no tables found.
Skipped 5a6b122c-39c1-4581-8c1f-2d6f36a9f8a0_docling.json: no tables found.
Skipped 547aba02-6757-49c1-acb5-6df217cebfc7_docling.json: no tables found.
Skipped 54990932-71af-48dd-9a7a-2617b1407c54_docling.json: no tables found.
Skipped 585875ff-f8c5-4a02-acd7-fef37dc9ff11_docling.json: no tables found.
Skipped fe245192-1f9c-4f28-9b32-046fb7ce7e1e_docling.json: no tables found.
Skipped 3f451a52-d210-48d9-b56e-d28b9570bdc4_docling.json: no tables found.
Skipped 2db2

In [None]:
import ast
import pandas as pd
import json
from typing import Dict

def load_docling_tables_with_llm_headers(json_path: str, headers_response: str) -> Dict[str, pd.DataFrame]:
    """
    Reads a Docling JSON export and returns a dict mapping
    "table_1", "table_2", … → pandas.DataFrame for each table found.
    Uses LLM-generated headers instead of inferring from content.
    """
    # Parse LLM headers from string
    try:
        headers = ast.literal_eval(headers_response.strip())
        if not isinstance(headers, list):
            raise ValueError("LLM response did not evaluate to a list")
    except Exception as e:
        raise ValueError(f"Failed to parse headers from LLM output: {e}")

    # Load the JSON file
    with open(json_path, 'r', encoding='utf-8') as f:
        doc = json.load(f)

    tables = doc.get("tables", [])
    dfs: Dict[str, pd.DataFrame] = {}

    for idx, tbl in enumerate(tables, start=1):
        # Flatten to DataFrame
        cells = tbl["data"]["table_cells"]
        df_cells = pd.DataFrame(cells)

        # Pivot into grid
        grid = df_cells.pivot(
            index="start_row_offset_idx",
            columns="start_col_offset_idx",
            values="text"
        )

        # Remove the first row (assumed to be header row in JSON, already handled by LLM)
        body = grid.iloc[1:].reset_index(drop=True)

        # Apply LLM-inferred headers
        body.columns = headers

        # Store result
        dfs[f"table_{idx}"] = body

    return dfs

In [37]:
JSON_PATH = "34_docling.json"

# Example LLM output from earlier cell
headers_response = '["Time", "From", "To", "Message"]'

# Load and display
dataframes = load_docling_tables_with_llm_headers(JSON_PATH, headers_response)

for name, df in dataframes.items():
    print(f"\n=== {name} (shape: {df.shape}) ===")
    display(df)  # Displayed properly in Jupyter if this is the last line


=== table_1 (shape: (426, 4)) ===


Unnamed: 0,Time,From,To,Message
0,2018-11-08 01:36:58,qq78263462,wxid_5390224027312,我昨天一天都在忙....
1,2018-11-08 01:37:12,qq78263462,wxid_5390224027312,你好久回呢？
2,2018-11-08 01:37:14,wxid_5390224027312,qq78263462,没事
3,2018-11-08 01:37:19,wxid_5390224027312,qq78263462,我可能要下周
4,2018-11-08 01:37:23,qq78263462,wxid_5390224027312,我擦
...,...,...,...,...
421,2018-11-08 16:22:57,qq78263462,wxid_5390224027312,你不要跟别的人说起这些哈
422,2018-11-08 16:23:06,wxid_5390224027312,qq78263462,嗯嗯不得
423,2018-11-08 16:23:19,qq78263462,wxid_5390224027312,尤其是一楼的女的......
424,2018-11-08 16:23:21,qq78263462,wxid_5390224027312,哈哈哈哈哈


# **DATAFRAME TO CSV**

In [39]:
import os

# Define output directory
OUTPUT_DIR = "parsed_html_to_csv"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Save each table to a CSV inside the directory
for name, df in dataframes.items():
    csv_path = os.path.join(OUTPUT_DIR, f"{name}.csv")
    df.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")

Saved: parsed_html_to_csv/table_1.csv
