# **IMPORTS & PATHS** 

In [27]:
import os
import zipfile
import subprocess
import json
import pandas as pd
from langchain_ollama.llms import OllamaLLM
import shutil


# Paths and model settings
# NOTE(review): an earlier value, ZIP_PATH = 'I-Soon-data.zip', was left behind
# in a garbled comment on this line; confirm '0.zip' is the intended archive.
ZIP_PATH = '0.zip'    # Path to the downloaded zip archive
EXTRACT_DIR = 'I-Soon-data'        # Directory to extract contents
ORGANIZED_DIR = 'organized_data'  # Directory to group files by extension
MODEL_NAME = 'llama3.2'             # Local Ollama model identifier
OUTPUT_JSON = 'parsed_md.json'    # Aggregated JSON output (not referenced by the cells visible here)
OUTPUT_CSV = 'parsed_md.csv'      # CSV output for DataFrame (not referenced by the cells visible here)

# Initialize Ollama LLM via LangChain; later cells send their prompts through `model`
model = OllamaLLM(model=MODEL_NAME)

# **ZIP FILE EXTRACTION**

In [28]:
# Unpack the archive only once: an existing extraction directory is treated as
# proof that a previous run already extracted it, so the zip is left untouched.
if os.path.isdir(EXTRACT_DIR):
    print(f"Extraction directory '{EXTRACT_DIR}' already exists")
else:
    with zipfile.ZipFile(ZIP_PATH, 'r') as archive:
        archive.extractall(EXTRACT_DIR)
    print(f"Extracted archive to '{EXTRACT_DIR}'")

Extraction directory 'I-Soon-data' already exists


# **DATA TYPE CATEGORIZATION**

In [29]:
import os
import shutil
import re

# Regex used to classify a Markdown file as "contains HTML": a "<" followed by
# a letter or "/" and closed by ">" is treated as a tag.
html_re = re.compile(r'<[A-Za-z/][^>]*>')

# Destination paths already written during this run. The extracted tree is
# flattened into one folder per extension, so two files with the same name in
# different sub-directories would otherwise overwrite each other silently
# (shutil.copy2 replaces an existing destination).
claimed_paths = set()
renamed_count = 0

for root, dirs, files in os.walk(EXTRACT_DIR):
    # Sorting in place fixes the traversal order, so which colliding file keeps
    # its plain name is the same on every run.
    dirs.sort()
    for fname in sorted(files):
        stem, raw_ext = os.path.splitext(fname)
        ext = raw_ext.lower().lstrip('.') or 'no_extension'
        src_path = os.path.join(root, fname)

        if ext == 'md':
            # classify Markdown by content into md/html or md/non-html
            with open(src_path, 'r', encoding='utf-8', errors='ignore') as f:
                text = f.read()
            subfolder = 'html' if html_re.search(text) else 'non-html'
            dest_folder = os.path.join(ORGANIZED_DIR, 'md', subfolder)
        else:
            # preserve original extension grouping for non-MD
            dest_folder = os.path.join(ORGANIZED_DIR, ext)

        os.makedirs(dest_folder, exist_ok=True)
        dst_path = os.path.join(dest_folder, fname)

        # First file with a given name keeps it; later ones get "__<n>" before
        # the extension so nothing from the archive is lost.
        suffix = 1
        while dst_path in claimed_paths:
            dst_path = os.path.join(dest_folder, f"{stem}__{suffix}{raw_ext}")
            suffix += 1
        if suffix > 1:
            renamed_count += 1
        claimed_paths.add(dst_path)

        shutil.copy2(src_path, dst_path)

print(f"Organized files under '{ORGANIZED_DIR}':\n"
      " • Markdown with HTML → md/html/\n"
      " • Markdown without HTML → md/non-html/\n"
      " • Other extensions → <extension>/")
if renamed_count:
    print(f"Renamed {renamed_count} file(s) to avoid name collisions")

Organized files under 'organized_data':
 • Markdown with HTML → md/html/
 • Markdown without HTML → md/non-html/
 • Other extensions → <extension>/


# **MD DATA TRANSFORMATION TO JSON**

In [31]:
import json
from docling.document_converter import DocumentConverter

# Convert one HTML-flavoured Markdown file produced by the categorization step.
# The path is built from ORGANIZED_DIR so it follows the configured output
# folder instead of a hard-coded copy of its name.
converter = DocumentConverter()
result    = converter.convert(os.path.join(ORGANIZED_DIR, "md", "html", "34.md"))

# In Pydantic v2 the public API is .model_dump(). mode="json" restricts the
# output to JSON-serializable values, so json.dump below cannot fail on
# non-primitive field types the default Python mode may return.
doc_dict  = result.document.model_dump(mode="json")

# ensure_ascii=False keeps non-ASCII text (e.g. Chinese messages) readable in
# the UTF-8 file, matching how the prompt cell serializes cells.
with open("34_docling.json", "w", encoding="utf-8") as f:
    json.dump(doc_dict, f, indent=2, ensure_ascii=False)

# **JSON TO DATAFRAME WITH AI**

In [32]:
# Load the Docling JSON export
with open("34_docling.json", "r", encoding="utf-8") as f:
    doc_json = json.load(f)

# Fail with a readable message instead of a bare IndexError/KeyError when the
# converted document contains no tables.
tables = doc_json.get("tables") or []
if not tables:
    raise ValueError("No tables found in '34_docling.json'; nothing to infer headers from")

# Extract a manageable portion of the first table's cells
cells = tables[0]["data"]["table_cells"]
sample_cells = cells[:20]  # reduce size for prompt

# Format as JSON string for input to LLM
cell_str = json.dumps(sample_cells, indent=2, ensure_ascii=False)

# Prompt for LLM: only ask for DataFrame headers
prompt = f"""
You are a law inforcement officer that specializez in investigating cybersecurity data leaks. Analyze the following JSON table cell data and infer what column headers would be appropriate
if this were converted into a pandas DataFrame. 

Only return a list of column names that are relevant to the data. Do not include any other text or explanation.

JSON data:
{cell_str}
"""

# Run the prompt through Ollama. `.invoke()` is the supported Runnable entry
# point; calling the model object directly is deprecated in LangChain.
headers_response = model.invoke(prompt)

# Output the suggested headers
print("Proposed DataFrame headers:")
print(headers_response.strip())

Proposed DataFrame headers:
["Time", "From", "To", "Message"]


In [33]:
import ast
import pandas as pd
import json
from typing import Dict

def _parse_llm_headers(headers_response: str) -> list:
    """Turn the raw LLM reply into a Python list of column names."""
    text = headers_response.strip()
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        # Local models often wrap the list in a Markdown code fence or add a
        # sentence around it; retry on the outermost "[ ... ]" span only.
        start, end = text.find("["), text.rfind("]")
        if start == -1 or end <= start:
            raise
        parsed = ast.literal_eval(text[start:end + 1])
    if not isinstance(parsed, list):
        raise ValueError("LLM response did not evaluate to a list")
    return parsed


def load_docling_tables_with_llm_headers(json_path: str, headers_response: str) -> Dict[str, pd.DataFrame]:
    """
    Read a Docling JSON export and build one DataFrame per table.

    Parameters
    ----------
    json_path : str
        Path to the JSON file; its top-level "tables" list is read, and each
        table's cells are taken from ``tbl["data"]["table_cells"]``.
    headers_response : str
        Raw LLM reply containing a Python/JSON-style list literal of column
        names. A bare list, or a list surrounded by code fences or prose, is
        accepted.

    Returns
    -------
    Dict[str, pd.DataFrame]
        Maps "table_1", "table_2", … to the table body (first grid row dropped,
        since it is assumed to be the header row) with the LLM headers applied.
        A table without cells yields an empty DataFrame with those headers.

    Raises
    ------
    ValueError
        If the headers cannot be parsed into a list, or if a table's column
        count differs from the number of headers.
    """
    # Parse LLM headers from string
    try:
        headers = _parse_llm_headers(headers_response)
    except Exception as e:
        raise ValueError(f"Failed to parse headers from LLM output: {e}") from e

    # Load the JSON file
    with open(json_path, 'r', encoding='utf-8') as f:
        doc = json.load(f)

    tables = doc.get("tables", [])
    dfs: Dict[str, pd.DataFrame] = {}

    for idx, tbl in enumerate(tables, start=1):
        name = f"table_{idx}"
        cells = tbl["data"]["table_cells"]

        # An empty cell list has no offset columns to pivot on; keep the key
        # in the result so table numbering stays aligned with the document.
        if not cells:
            dfs[name] = pd.DataFrame(columns=headers)
            continue

        # Flatten to DataFrame, then pivot into a row x column grid of text
        grid = pd.DataFrame(cells).pivot(
            index="start_row_offset_idx",
            columns="start_col_offset_idx",
            values="text"
        )

        # Remove the first row (assumed to be header row in JSON, already handled by LLM)
        body = grid.iloc[1:].reset_index(drop=True)

        # Report a mismatch in terms of the table, not pandas' axis lengths
        if body.shape[1] != len(headers):
            raise ValueError(
                f"{name} has {body.shape[1]} columns but the LLM proposed "
                f"{len(headers)} headers: {headers}"
            )

        # Apply LLM-inferred headers
        body.columns = headers

        # Store result
        dfs[name] = body

    return dfs


In [34]:
# Docling JSON export to turn into DataFrames
JSON_PATH = "34_docling.json"

# Hard-coded copy of the LLM reply shown earlier. This rebinds `headers_response`,
# so the value produced by the LLM cell is not what gets parsed below.
headers_response = '["Time", "From", "To", "Message"]'

# Build one DataFrame per table; `dataframes` is reused by the CSV export cell
dataframes = load_docling_tables_with_llm_headers(JSON_PATH, headers_response)

for name, df in dataframes.items():
    print(f"\n=== {name} (shape: {df.shape}) ===")
    display(df)  # explicit display(): a bare expression inside a loop is not auto-rendered


=== table_1 (shape: (426, 4)) ===


Unnamed: 0,Time,From,To,Message
0,2018-11-08 01:36:58,qq78263462,wxid_5390224027312,我昨天一天都在忙....
1,2018-11-08 01:37:12,qq78263462,wxid_5390224027312,你好久回呢？
2,2018-11-08 01:37:14,wxid_5390224027312,qq78263462,没事
3,2018-11-08 01:37:19,wxid_5390224027312,qq78263462,我可能要下周
4,2018-11-08 01:37:23,qq78263462,wxid_5390224027312,我擦
...,...,...,...,...
421,2018-11-08 16:22:57,qq78263462,wxid_5390224027312,你不要跟别的人说起这些哈
422,2018-11-08 16:23:06,wxid_5390224027312,qq78263462,嗯嗯不得
423,2018-11-08 16:23:19,qq78263462,wxid_5390224027312,尤其是一楼的女的......
424,2018-11-08 16:23:21,qq78263462,wxid_5390224027312,哈哈哈哈哈


# **DATAFRAME TO CSV**

In [37]:
import os

# Folder that receives one CSV file per parsed table (created on demand)
OUTPUT_DIR = "parsed_html_to_csv"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Write every DataFrame to <OUTPUT_DIR>/<table name>.csv without the row index
for name in dataframes:
    df = dataframes[name]
    csv_path = os.path.join(OUTPUT_DIR, f"{name}.csv")
    df.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")

Saved: parsed_html_to_csv/table_1.csv
