# Polis Data Project

# 0. Setup

In [2]:
%pip install docling
%pip install onnxruntime rapidocr-onnxruntime
%pip install easyocr
%pip install opencv-python-headless

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
import cv2
import numpy as np
import torch
import re
import os
import pandas as pd
import csv
import json

from pathlib import Path

from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions

  from .autonotebook import tqdm as notebook_tqdm


# 1. Pre-processing and conversion export
## SKIP PDF CONVERSION AND USE HN_section_cleaned.md

#### Load source document and specify conversion format

In [3]:
# source = "[corpus file].pdf" #
#source = "HN_sample.pdf" # 3 page sample
source = "HN_section.pdf" # Full inventory

converter = DocumentConverter()
doc = converter.convert(source)

# Choose conversion format (markdown default) #
raw_text = doc.document.export_to_markdown()
#raw_text = doc.document.export_to_text()

# raw text check #
print("     === Raw text from pdf ===")
print(raw_text[:1000])

2026-01-19 15:36:01,386 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2026-01-19 15:36:15,570 - INFO - Going to convert document batch...
2026-01-19 15:36:15,573 - INFO - Initializing pipeline for StandardPdfPipeline with options hash e15bc6f248154cc62f8db15ef18a8ab7
2026-01-19 15:36:15,641 - INFO - Loading plugin 'docling_defaults'
2026-01-19 15:36:15,731 - INFO - Registered picture descriptions: ['vlm', 'api']
2026-01-19 15:36:15,757 - INFO - Loading plugin 'docling_defaults'
2026-01-19 15:36:16,004 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2026-01-19 15:37:52,885 - INFO - Accelerator device: 'cuda:0'
[32m[INFO] 2026-01-19 15:37:52,962 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2026-01-19 15:37:53,301 [RapidOCR] download_file.py:60: File exists and is valid: /work/PDP/.venv/lib/python3.12/site-packages/rapidocr/models/ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2026-01-19 15:37:53,303 [Rapid

     === Raw text from pdf ===
## I. The Region

The region under consideration here is made up of at least three different areas, partially recognised as such in antiquity: Iberia, Southern Gaul and the island of Corsica. The only common elements characterising these different areas are colonisation by Phokaia (no.  ) and the fact that both Iberia and southern Gaul were,at least to some extent, within the area dominated commercially and possibly politically by Massalia (no.  ). Moreover, these areas mark the westernmost limit of the Greek presence in the Mediterranean. ¹ Although Greek contacts with Tartessos in Iberia are attested  for  C  l  by  Herodotos' mention  of Kolaios' voyage (  .  .  ) and for C  l/C  e by his mention of Phokaian travellers (  .  .  ), Greek poleis do not seem to have existed in Tartessian  territory, despite  the  offer  made  by  the  native king,Arganthonios,to the Phokaians,that they could establish themselves in his territory wherever 

#### Fix numerical and greek character errors

In [4]:
def clean_text(raw_text):
    # Fix digits
    digit_map = str.maketrans({
        '': '0', '': '1', '': '2', '': '3', '': '4',
        '': '5', '': '6', '': '7', '': '8', '': '9'
    })
    raw_text = raw_text.translate(digit_map)
    
    # Fix lenis corruption
    raw_text = re.sub(r'lenis', '', raw_text)
    raw_text = re.sub(r'/([Α-Ωα-ω])', r'\1', raw_text)
    
    # Fix Greek letters
    greek_map = {
        '/Alpha': 'Α', '/alpha': 'α', '/alphaacute': 'ά',
        '/Beta': 'Β', '/beta': 'β',
        '/Gamma': 'Γ', '/gamma': 'γ',
        '/Delta': 'Δ', '/delta': 'δ',
        '/Epsilon': 'Ε', '/epsilon': 'ε', '/epsilonacute': 'έ',
        '/Zeta': 'Ζ', '/zeta': 'ζ',
        '/Eta': 'Η', '/eta': 'η', '/etaacute': 'ή',
        '/Theta': 'Θ', '/theta': 'θ',
        '/Iota': 'Ι', '/iota': 'ι', '/iotaacute': 'ί', '/iotatilde': 'ῖ',
        '/Kappa': 'Κ', '/kappa': 'κ',
        '/Lambda': 'Λ', '/lambda': 'λ',
        '/Mu': 'Μ', '/mu': 'μ',
        '/Nu': 'Ν', '/nu': 'ν',
        '/Xi': 'Ξ', '/xi': 'ξ',
        '/Omicron': 'Ο', '/omicron': 'ο', '/omicronacute': 'ό',
        '/Pi': 'Π', '/pi': 'π',
        '/Rho': 'Ρ', '/rho': 'ρ',
        '/Sigma': 'Σ', '/sigma': 'σ', '/sigmaf': 'ς',
        '/Tau': 'Τ', '/tau': 'τ',
        '/Upsilon': 'Υ', '/upsilon': 'υ',
        '/Phi': 'Φ', '/phi': 'φ',
        '/Chi': 'Χ', '/chi': 'χ',
        '/Psi': 'Ψ', '/psi': 'ψ',
        '/Omega': 'Ω', '/omega': 'ω', '/omegatilde': 'ῶ', '/omegaacute': 'ώ',
    }
    
    for greek_code, greek_char in sorted(greek_map.items(), key=lambda x: len(x[0]), reverse=True):
        raw_text = raw_text.replace(greek_code, greek_char)
    
    return raw_text

# Clean the text
cleaned_text = clean_text(raw_text)

# Check text #
print(f"Document character length: {len(cleaned_text)}")
print(f"Word count (approximate): {len(cleaned_text.split())}")
print(f"Number of lines: {len(cleaned_text.splitlines())}")
print("     === Cleaned Text ===")
print(cleaned_text[:1000])

Document character length: 6038515
Word count (approximate): 1195072
Number of lines: 25569
     === Cleaned Text ===
## I. The Region

The region under consideration here is made up of at least three different areas, partially recognised as such in antiquity: Iberia, Southern Gaul and the island of Corsica. The only common elements characterising these different areas are colonisation by Phokaia (no. 859 ) and the fact that both Iberia and southern Gaul were,at least to some extent, within the area dominated commercially and possibly politically by Massalia (no. 3 ). Moreover, these areas mark the westernmost limit of the Greek presence in the Mediterranean. ¹ Although Greek contacts with Tartessos in Iberia are attested  for  C 7 l  by  Herodotos' mention  of Kolaios' voyage ( 4 . 152 . 2 ) and for C 7 l/C 6 e by his mention of Phokaian travellers ( 1 . 163 . 1 ), Greek poleis do not seem to have existed in Tartessian  territory, despite  the  offer  made  by  the  native king,Argant

#### Export cleaned text as markdown file and set as new source

In [5]:
# MAKE SURE THIS IS OVERWRITING THE CORRECT FILE

with open('HN_section_cleaned.md', 'w', encoding='utf-8') as f:
    f.write(cleaned_text)

print("cleaned text saved as markdown")



cleaned text saved as markdown


In [5]:
# Set cleaned md as cleaned variable
with open('HN_section_cleaned.md', 'r', encoding='utf-8') as f:
    cleaned_text = f.read()

# Check text #
print(f"Document character length: {len(cleaned_text)}")
print(f"Word count (approximate): {len(cleaned_text.split())}")
print(f"Number of lines: {len(cleaned_text.splitlines())}")
print("     === Cleaned Text ===")
print(cleaned_text[:1000])

Document character length: 6038515
Word count (approximate): 1195072
Number of lines: 25569
     === Cleaned Text ===
## I. The Region

The region under consideration here is made up of at least three different areas, partially recognised as such in antiquity: Iberia, Southern Gaul and the island of Corsica. The only common elements characterising these different areas are colonisation by Phokaia (no. 859 ) and the fact that both Iberia and southern Gaul were,at least to some extent, within the area dominated commercially and possibly politically by Massalia (no. 3 ). Moreover, these areas mark the westernmost limit of the Greek presence in the Mediterranean. ¹ Although Greek contacts with Tartessos in Iberia are attested  for  C 7 l  by  Herodotos' mention  of Kolaios' voyage ( 4 . 152 . 2 ) and for C 7 l/C 6 e by his mention of Phokaian travellers ( 1 . 163 . 1 ), Greek poleis do not seem to have existed in Tartessian  territory, despite  the  offer  made  by  the  native king,Argant

## Filtering for the Poleis sections

In [None]:
region_heading = r'^##\s+([A-Z\s\(\)]+)\s*$'
poleis_texts = r'(II\. The Poleis.*?)(?=\n##|\Z)'

region_names = re.findall(region_heading, cleaned_text)
sections = re.findall(poleis_texts, cleaned_text, re.DOTALL)

print(f"Found {len(region_names)} regions with {len(sections)} total poleis groups")

# Now you can work with each section
for i, section in enumerate(sections, 1):
    print(f"\n--- Section {i} ---")
    print(section[:200])  # Print first 200 chars of each section
    
# Or save all sections to a new file
with open('filtered_sections.md', 'w', encoding='utf-8') as f:
    f.write('\n\n'.join(sections))

Found 0 regions with 45 total poleis

--- Section 1 ---
II. The Poleis

1 . Alalie Map 48 b. Lat. 42 . 10 , long. 9 . 70 . Size of territory: 3 (?). Type: A: α . The toponym is Αλαλίη , η (Hdt. 1 . 165 . 1 ,

166 . 3 ). A city-ethnic is not attested; Herod

--- Section 2 ---
II. The Poleis

5 . Abakainon (Abakaininos) Map 47 . Lat. 38 . 05 , long. 15 . 05 . Size  of territory: ?  Type: B: β . The  toponym  is Αβακαίνη , ηasper (Diod. 14 . 90 . 3 )  or Αβάκαινον , τό (Diod

--- Section 3 ---
II. The Poleis

52 . Herakleia (Herakleios) Map 45 . Lat. 40 . 15 , long. 16 . 40 . Size of territory: 4 . Type: A. The city may initially have been called by another name (Antiochos ( FGrHist 555 ) f

--- Section 4 ---
II. The Poleis

75 . Adria Map. 40 . Lat. 45 . 05 , long. 12 . 05 . Size of territory: ? Type: C: β . The toponym is Αδρία , ηasper (Hecat. fr. 90 ; Strabo 5 . 1 . 8 , MSS) or Ατρία (Strabo 5 . 1 . 8 

--- Section 5 ---
II. The Poleis

86 . Amantia (Amantieus) Map 49 . Lat. 40 . 25 ,

In [None]:
import re

with open('HN_section.md', 'r', encoding='utf-8') as f:
    content = f.read()

region_heading = r'^##\s+([A-Z\s\(\)]+)\s*$'
poleis_texts = r'(II\. The Poleis.*?)(?=\n##|\Z)'

region_names = re.findall(region_heading, content, re.MULTILINE)
sections = re.findall(poleis_texts, content, re.DOTALL)

print(f"Found {len(region_names)} regions with {len(sections)} total poleis sections")

for i, section in enumerate(sections, 1):
    print(f"\n--- Section {i} ---")
    print(section[:200])

with open('filtered_sections.md', 'w', encoding='utf-8') as f:
    f.write('\n\n'.join(sections))

Found 26 regions with 45 total poleis sections

--- Section 1 ---
II. The Poleis

1 . Alalie Map 48 b. Lat. 42 . 10 , long. 9 . 70 . Size of territory: 3 (?). Type: A: α . The toponym is Αλαλίη , η (Hdt. 1 . 165 . 1 ,

166 . 3 ). A city-ethnic is not attested; Herod

--- Section 2 ---
II. The Poleis

5 . Abakainon (Abakaininos) Map 47 . Lat. 38 . 05 , long. 15 . 05 . Size  of territory: ?  Type: B: β . The  toponym  is Αβακαίνη , ηasper (Diod. 14 . 90 . 3 )  or Αβάκαινον , τό (Diod

--- Section 3 ---
II. The Poleis

52 . Herakleia (Herakleios) Map 45 . Lat. 40 . 15 , long. 16 . 40 . Size of territory: 4 . Type: A. The city may initially have been called by another name (Antiochos ( FGrHist 555 ) f

--- Section 4 ---
II. The Poleis

75 . Adria Map. 40 . Lat. 45 . 05 , long. 12 . 05 . Size of territory: ? Type: C: β . The toponym is Αδρία , ηasper (Hecat. fr. 90 ; Strabo 5 . 1 . 8 , MSS) or Ατρία (Strabo 5 . 1 . 8 

--- Section 5 ---
II. The Poleis

86 . Amantia (Amantieus) Map 49 . Lat.

## VVVVV FIX SECTIONING FILTER TO INCLUDE 45TH REGION VVVVV

In [None]:
# Filter sections to only Poleis lists

region_heading = r'##\s+([A-Z\s\(\)]+)\s*(?=\n## II\. The Poleis)'
poleis_texts = r'(II\. The Poleis.*?)(?=\n##|\Z)'

# Find all sections and region names
sections = re.findall(poleis_texts, cleaned_text, re.DOTALL)
region_names = re.findall(region_heading, content)

print(f"Found {len(sections)} sections")
print(f"Found {len(region_names)} region names")

# Combine them
combined_sections = []
for i, (region, section) in enumerate(zip(region_names, sections), 1):
    combined = f"{region.strip()}\n{section}"
    combined_sections.append(combined)
    print(f"\n--- Section {i}: {region.strip()} ---")
    print(combined[:200])  # Print first 200 chars of each section

# Save all sections to a new file
with open('filtered_sections.md', 'w', encoding='utf-8') as f:
    f.write('\n\n'.join(combined_sections))

Found 45 sections
Found 0 region names


```

**How it works:**
- `##\s+[A-Z\s\(\)]+\s*\n` matches the all-caps region heading (including spaces and parentheses)
- `## II\. The Poleis.*?` captures the Poleis section
- The lookahead stops at the next region header or end of file

This will give you sections like:
```
##  SPAIN AND FRANCE (INCLUDING CORSICA)
## II. The Poleis
[content...]

In [11]:


# Pattern to match region name + "## II. The Poleis" sections
pattern = r'(##\s+[A-Z\s\(\)]+\s*\n## II\. The Poleis.*?)(?=\n##\s+[A-Z\s\(\)]+\s*\n## II\. The Poleis|\Z)'

sections = re.findall(pattern, cleaned_text, re.DOTALL)

print(f"Found {len(sections)} sections")

# Preview each section
for i, section in enumerate(sections, 1):
    # Extract just the region name for display
    region_match = re.search(r'##\s+([A-Z\s\(\)]+)', section)
    region_name = region_match.group(1).strip() if region_match else "Unknown"
    print(f"\n--- Section {i}: {region_name} ---")
    print(section[:300])  # Print first 300 chars

# If you want all sections combined as a single string:
all_sections = '\n\n'.join(sections)

# Or work with the list of sections:
# sections[0] = first region section
# sections[1] = second region section
# etc.

Found 0 sections


# 2. Sectioning by polis

In [None]:
# TEST - detect index, city name, and export to csv for all poleis

# NOT ALL HAVE ""MAP""

# Detect enumerated poleis
pattern = r'^(\d+)\s*\.\s+(.+?)\s+Map\b'

cities = []
for line in cleaned_text.split('\n'):
    match = re.match(pattern, line.strip())
    if match:
        index = match.group(1)
        city_name = match.group(2).strip()
        cities.append([index, city_name])

print(f"Found {len(cities)} cities")

# Preview first 5
for city in cities[:5]:
    print(f"{city[0]}. {city[1]}")

# Write to CSV
with open('PDP_inventory.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Index', 'City'])
    writer.writerows(cities)

print(f"\n✓ Saved {len(cities)} cities to PDP_inventory.csv")

Found 968 cities
1. Alalie
2. Emporion (Emporites)
3. Massalia (Massaliotes)
4. Rhode (Rhodetes)
5. Abakainon (Abakaininos)

✓ Saved 968 cities to PDP_inventory.csv


In [9]:
pattern = re.compile(
    r'(?m)^(\d+)\s*\.\s+'        # city index
    r'(.+?)\s+Map\b'             # city name (must be before "Map")
    r'(.*?)'                     # rest of entry
    r'(?=^\d+\s*\.\s+.+?\s+Map\b|\Z)',  # next city header or EOF
    re.DOTALL
)


cities = []

for m in pattern.finditer(cleaned_text):
    index = int(m.group(1))
    header = m.group(2).strip()
    body = m.group(3).strip()
    cities.append((index, header, body))


print(f"Found {len(cities)} cities\n")

for index, header, _ in cities[:5]:
    print(f"{index}. {header}")



Found 970 cities

1. Alalie
166. 3 ). A city-ethnic is not attested; Herodotos refers to the community as οι Φωκαιεῖς ( 1 . 166 . 2 bis ).

Alalie is called a polis in the urban sense at Hdt. 1 . 165 . 1 ( ανεστήσαντο πόλιν ). The  name  of the  territory is unknown, but its extent, after C 6 m when the refugees from Phokaia arrived,has been calculated at 200 km ² (Gras ( 1985 ) 404 ), and it may have been divided into lots (Jehasse and Jehasse ( 1987 ) 380 ). The 'epoecised' city put a fleet of sixty ships to sea (Hdt. 1 . 166 . 2 ), which indicates a population of c. 20 , 000 inhabitants  (Gras  ( 1985 ) 400 -6 )  or  even  more (Domínguez ( 1985 ) 375 -76 ).

Alalie was founded by Phokaia (no. 859 ), in accordance with an oracle, 20 years before the fall of Phokaia, i.e. c. 560 (Hdt. 1 . 165 . 1 ). In c. 546 the  Phokaians  decided  to  relocate their city in reaction to the Persian threat, and almost half of the  Phokaians  were  received  at  Alalie. From  the  Alalian point of vi

In [11]:
import re

pattern = re.compile(
    r'(?m)'                         # multiline mode
    r'^\s*(\d+)\s*\.\s*'            # (1) city number
    r'([^\n(]+?)'                   # (2) city name (before parentheses)
    r'(?:\s*\([^)]+\))?'            # optional parentheses
    r'\s*'                          # spacing
    r'(.*?)'                        # (3) city text
    r'(?=^\s*\d+\s*\.|\Z)',         # until next city or EOF
    re.DOTALL
)

cities = []

for m in pattern.finditer(cleaned_text):
    number = int(m.group(1))
    name = m.group(2).strip()
    text = m.group(3).strip()
    cities.append((number, name, text))


In [None]:
# THIS ONE WORKS

def chunk_cities(cleaned_text):  # One chunk per city
    chunks = []
    lines = cleaned_text.split('\n')

    city_pattern = r'^(\d+)\s*\.\s+(.+?)\s+Map\b'

    current_index = None
    current_city = None
    current_lines = []

    for line in lines:
        match = re.match(city_pattern, line)

        if match:
            # Save previous chunk
            if current_index is not None:
                chunks.append({
                    'index': current_index,
                    'city': current_city.replace('*', '').strip(),
                    'text': '\n'.join(current_lines).strip()
                })

            # Start new chunk
            current_index = match.group(1)
            current_city = match.group(2)
            current_lines = [line]

        elif current_index is not None:
            current_lines.append(line)

    # Save last chunk
    if current_index is not None:
        chunks.append({
            'index': current_index,
            'city': current_city.replace('*', '').strip(),
            'text': '\n'.join(current_lines).strip()
        })

    return chunks

# Use it
city_chunks = chunk_cities(cleaned_text)

print(f"Found {len(city_chunks)} cities\n")

# Show first 2 as examples
for chunk in city_chunks:
    print(f"Index: {chunk['index']}")
    print(f"City: {chunk['city']}")
    print(f"Text preview: {chunk['text'][:150]}...")
    print("\n" + "-" * 80 + "\n")


Found 968 cities

Index: 1
City: Alalie
Text preview: 1 . Alalie Map 48 b. Lat. 42 . 10 , long. 9 . 70 . Size of territory: 3 (?). Type: A: α . The toponym is Αλαλίη , η (Hdt. 1 . 165 . 1 ,

166 . 3 ). A ...

--------------------------------------------------------------------------------

Index: 2
City: Emporion (Emporites)
Text preview: 2 . Emporion (Emporites) Map 25 . Lat. 42 . 10 , long. 3 . 10 . Size of territory: 2 . Type: A: β . The toponym is ' Ε µ πόριον , τό (Ps.-Skylax 2 ; P...

--------------------------------------------------------------------------------

Index: 3
City: Massalia (Massaliotes)
Text preview: 3 . Massalia (Massaliotes) Map 15 . Lat. 43 . 28 , long. 5 . 22 . Size of territory: 2 . Type: A: α . The toponym is Μασσαλία , ηasper (Hecat. fr. 55 ...

--------------------------------------------------------------------------------

Index: 4
City: Rhode (Rhodetes)
Text preview: 4 . Rhode (Rhodetes) Map 25 . Lat. 42 . 15 , long. 23 . 20 . Size of territory: ?  Type

In [6]:
def chunk_cities(cleaned_text): # One chunk per city
   
    
    chunks = []
    lines = cleaned_text.split('\n')
    
    city_pattern = r'^(\*?[A-Z][a-zA-Z\s]+?)\s+\(([^)]+)\)'
    
    current_city = None
    current_greek = None
    current_lines = []
    
    for line in lines:
        match = re.match(city_pattern, line)
        
        if match:
            # Save previous chunk
            if current_city and current_lines:
                chunks.append({
                    'city': current_city.replace('*', '').strip(),
                    'greek_name': current_greek.replace(' ', '').strip(),
                    'text': '\n'.join(current_lines).strip()
                })
            
            # Start new chunk
            current_city = match.group(1)
            current_greek = match.group(2)
            current_lines = [line]
        
        elif current_city:
            current_lines.append(line)
    
    # Save last chunk
    if current_city and current_lines:
        chunks.append({
            'city': current_city.replace('*', '').strip(),
            'greek_name': current_greek.replace(' ', '').strip(),
            'text': '\n'.join(current_lines).strip()
        })
    
    return chunks

# Use it
city_chunks = chunk_cities(cleaned_text)

print(f"Found {len(city_chunks)} cities\n")

# Show first 2 as examples
for chunk in city_chunks:
    print(f"City: {chunk['city']}")
    print(f"Greek: {chunk['greek_name']}")
    print(f"Text preview: {chunk['text'][:150]}...")
    print("\n" + "-" * 80 + "\n")

NameError: name 'cleaned_text' is not defined

#### Save chunks to individual files

In [39]:
md_folder = "sections_md"
json_folder = "sections_json"

In [None]:
#OLD, with greek names
def make_filenames(name: str) -> str:
    name = name.lower().strip()
    name = re.sub(r"\s+", "_", name)
    name = re.sub(r"[^a-z0-9_]", "", name)
    return name

def chunks_to_md(chunks, output_dir="sections_md"):
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    for i, chunk in enumerate(city_chunks, start=1):
        city_slug = make_filenames(chunk["city"])
        filename = f"{i:03}_{city_slug}.md"
        file_path = output_path / filename

        markdown = f"""---
city: "{chunk['city']}"
greek name: "{chunk['greek_name']}"
inventory index: {i}
---

{chunk['text']}
"""

        file_path.write_text(markdown, encoding="utf-8")

    print(f"Saved {len(chunks)} markdown files to '{output_path}'")

chunks_to_md(city_chunks)

KeyError: 'greek_name'

In [17]:
import re
from pathlib import Path

def make_filenames(name: str) -> str:
    name = name.lower().strip()
    name = re.sub(r"\s+", "_", name)
    name = re.sub(r"[^a-z0-9_]", "", name)
    return name


def chunks_to_md(chunks, output_dir="sections_md"):
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    for chunk in chunks:
        city_slug = make_filenames(chunk["city"])
        index = int(chunk["index"])

        filename = f"{index:03}_{city_slug}.md"
        file_path = output_path / filename

        markdown = f"""---
city: "{chunk['city']}"
inventory_index: {index}
---

{chunk['text']}
"""

        file_path.write_text(markdown, encoding="utf-8")

    print(f"Saved {len(chunks)} markdown files to '{output_path}'")


chunks_to_md(city_chunks)


Saved 968 markdown files to 'sections_md'


#### Save chunk metadata, from md to json

In [49]:
def parse_frontmatter(content):
    """Manually parse YAML frontmatter from markdown"""
    
    # Check if content starts with frontmatter
    if not content.startswith('---'):
        return {}, content
    
    # Split frontmatter and content
    parts = content.split('---', 2)
    if len(parts) < 3:
        return {}, content
    
    frontmatter_text = parts[1].strip()
    body = parts[2].strip()
    
    # Parse frontmatter fields
    metadata = {}
    for line in frontmatter_text.split('\n'):
        if ':' in line:
            key, value = line.split(':', 1)
            key = key.strip()
            value = value.strip().strip('"')
            
            # Convert chunk_index to int
            if key == 'chunk_index':
                value = int(value)
            
            metadata[key] = value
    
    return metadata, body

def md_to_json(markdown_dir, json_dir):
    """Convert markdown files to JSON"""
    
    markdown_path = Path(markdown_dir)
    json_path = Path(json_dir)
    json_path.mkdir(parents=True, exist_ok=True)
    
    md_files = sorted(markdown_path.glob("*.md"))
    
    if not md_files:
        print(f"No markdown files found in {markdown_dir}")
        return
    
    for md_file in md_files:
        try:
            # Read markdown file
            content = md_file.read_text(encoding='utf-8')
            
            # Parse frontmatter and content
            metadata, body = parse_frontmatter(content)
            
            # Create JSON structure
            data = {
                #"filename": md_file.name,
                "city": metadata.get("city", ""),
                "greek name": metadata.get("greek name", ""),
                "inventory index": metadata.get("inventory index", 0),
                "content": body,
                "word count": len(body.split()),
                "character count": len(body)
            }
            
            # Write to JSON
            json_filename = md_file.stem + ".json"
            json_file_path = json_path / json_filename
            
            with open(json_file_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            
            print(f"✓ {md_file.name} -> {json_filename}")
            
        except Exception as e:
            print(f"✗ Error processing {md_file.name}: {str(e)}")
    
    print(f"\nConverted {len(md_files)} files to JSON")

# Run conversion
md_to_json(md_folder, json_folder)

✓ 001_massalia.md -> 001_massalia.json
✓ 002_agathe.md -> 002_agathe.json
✓ 003_antipolis.md -> 003_antipolis.json
✓ 004_athenopolis.md -> 004_athenopolis.json
✓ 005_avenion.md -> 005_avenion.json
✓ 006_azania.md -> 006_azania.json
✓ 007_kabellion.md -> 007_kabellion.json
✓ 008_kyrene.md -> 008_kyrene.json
✓ 009_monoikos.md -> 009_monoikos.json
✓ 010_nikaia.md -> 010_nikaia.json
✓ 011_olbia.md -> 011_olbia.json
✓ 012_rhodanousia.md -> 012_rhodanousia.json
✓ 013_sekoanos.md -> 013_sekoanos.json
✓ 014_stoichades_islands.md -> 014_stoichades_islands.json
✓ 015_tauroeis.md -> 015_tauroeis.json
✓ 016_theline.md -> 016_theline.json
✓ 017_troizen.md -> 017_troizen.json
✓ 018_if_the_late_and_hardly_reliable_references_to_rhodian_colonisation_in_iberia.md -> 018_if_the_late_and_hardly_reliable_references_to_rhodian_colonisation_in_iberia.json
✓ 019_alonis.md -> 019_alonis.json
✓ 020_hemeroskopeion.md -> 020_hemeroskopeion.json

Converted 20 files to JSON


# 3. Extract and update .csv

#### City names from whole text *OUTDATED*

In [None]:
# OUTDATED?
city_pattern = r'^([A-Z][^\n(]+?)\s+\('

cities = re.findall(city_pattern, cleaned_text, re.MULTILINE)

# Clean up the city names (remove extra whitespace)
cities = [city.strip() for city in cities]

print("     === CITIES FOUND ===")
for i, city in enumerate(cities, 1):
    print(f"{i}. {city}")

# Write to CSV
with open('PDP.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['City'])
    for city in cities:
        writer.writerow([city])

print(f"\n✓ Extracted {len(cities)} cities to PDP.csv")

=== CITIES FOUND ===
1. Massalia
2. However, the  foundation  of Massalia  marked  the  real beginning of Greek presence and expansion in the region, an  expansion  that  was  basically  a  coastal  phenomenon; besides, the expansion of Massalia was a long-term process running from C 6 to the beginnings of the Roman occupation  of the  region. Massaliote  activity, moreover, varied from period to period: during C 6 and C 5 e it was directed towards  the  development  of commercial  interests
3. Agathe
4. Antipolis
5. Avenion
6. Azania
7. Kabellion
8. Kyrene
9. Monoikos
10. Nikaia
11. Olbia
12. Rhodanousia
13. Sekoanos
14. Stoichades Islands
15. Tauroeis
16. Troizen
17. The Greeks referred to the coast of the Iberian peninsula by the generic name ' Ιβηρία , a term which, as time went on, ended up as the name of the whole peninsula
18. If the  late  and  hardly  reliable  references  to  Rhodian colonisation in Iberia
19. Alonis
20. Hemeroskopeion

✓ Extracted 20 cities to PDP.csv


In [None]:
# TEST - detect index, city name, and export to csv for all poleis

# Read cleaned md file
with open('HN_section.md', 'r', encoding='utf-8') as f:
    text = f.read()

# Detect enumerated poleis
pattern = r'^(\d+)\.\s+(.+?)$'

cities = []
for line in text.split('\n'):
    match = re.match(pattern, line.strip())
    if match:
        index = match.group(1)
        city_name = match.group(2).strip()
        cities.append([index, city_name])

print(f"Found {len(cities)} cities")

# Preview first 5
for city in cities[:5]:
    print(f"{city[0]}. {city[1]}")

# Write to CSV
with open('PDP_inventory.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Index', 'City'])
    writer.writerows(cities)

print(f"\n✓ Saved {len(cities)} cities to PDP_inventory.csv")

#### City chunks to spreadsheet

In [None]:
def chunks_to_csv(chunks, output_file="PDP.csv"):
    output_path = Path(output_file)

    with output_path.open("w", encoding="utf-8-sig", newline="") as f:
        writer = csv.writer(f)

        writer.writerow([
            "index",
            "city",
            "greek_name",
            "section_text"
        ])

        for i, chunk in enumerate(city_chunks, start=1):
            writer.writerow([
                i,
                chunk["city"],
                chunk["greek_name"],
                chunk["text"]
            ])

    #print(f"Saved {len(chunks)} rows to '{output_path}' (UTF-8 with BOM)")
    print(f"\n✓ Extracted {len(cities)} cities to PDP.csv")

chunks_to_csv(city_chunks)

Saved 20 rows to 'PDP.csv' (UTF-8 with BOM)

✓ Extracted 20 cities to PDP.csv


#### Metadata to csv

In [None]:
def json_to_csv(json_dir="sections_json", output_file="PDP_from_json.csv"):
    
    json_path = Path(json_dir)
    json_files = sorted(json_path.glob("*.json"))
    
    if not json_files:
        print(f"No JSON files found in {json_dir}")
        return
    
    output_path = Path(output_file)
    
    with output_path.open("w", encoding="utf-8-sig", newline="") as f:
        writer = csv.writer(f)
        
        # Write header
        writer.writerow([
            "index",
            "city",
            "greek_name",
            "section_text",
            "word_count",
            "character_count",
            "filename"
        ])
        
        # Read each JSON and write row
        for json_file in json_files:
            try:
                with open(json_file, 'r', encoding='utf-8') as jf:
                    data = json.load(jf)
                
                writer.writerow([
                    data.get("chunk_index", 0),
                    data.get("city", ""),
                    data.get("greek_name", ""),
                    data.get("content", ""),
                    data.get("word_count", 0),
                    data.get("character_count", 0),
                    data.get("filename", "")
                ])
                
            except Exception as e:
                print(f"Error reading {json_file}: {e}")
    
    print(f"✓ Saved {len(json_files)} rows to '{output_path}' (UTF-8 with BOM)")

# Create CSV from JSON files
json_to_csv(json_folder, output_file="PDP.csv")

In [42]:
def json_to_csv_with_full_metadata(json_dir="sections_json", output_file="PDP.csv"):
    """Create CSV with one column containing the full JSON metadata per city"""
    
    json_path = Path(json_dir)
    json_files = sorted(json_path.glob("*.json"))
    
    if not json_files:
        print(f"No JSON files found in {json_dir}")
        return
    
    output_path = Path(output_file)
    
    with output_path.open("w", encoding="utf-8-sig", newline="") as f:
        writer = csv.writer(f)
        
        # Write header
        writer.writerow([
            "index",
            "city",
            "greek_name",
            "section_text",
            "metadata"  # Full JSON metadata column
        ])
        
        # Read each JSON and write row
        for json_file in json_files:
            try:
                with open(json_file, 'r', encoding='utf-8') as jf:
                    data = json.load(jf)
                
                # Convert entire JSON object to string
                metadata_str = json.dumps(data, ensure_ascii=False)
                
                writer.writerow([
                    data.get("chunk_index", 0),
                    data.get("city", ""),
                    data.get("greek_name", ""),
                    data.get("content", ""),
                    metadata_str  # Full JSON as string
                ])
                
            except Exception as e:
                print(f"Error reading {json_file}: {e}")
    
    print(f"✓ Saved {len(json_files)} rows to '{output_path}' (UTF-8 with BOM)")

# Create CSV from JSON files
json_to_csv_with_full_metadata(json_folder, output_file="PDP.csv")

✓ Saved 20 rows to 'PDP.csv' (UTF-8 with BOM)
