# Chemo Rota to EPMA Converter

**Queen Elizabeth Hospital Cancer Centre** — PICS EPMA import tool

This notebook converts a paper chemotherapy rota PDF into:
- **DOCX** template (4-table format)
- **TXT** PICS EPMA upload file

## How to use

1. **Run Cell 1** — installs dependencies (takes ~30 seconds)
2. **Run Cell 2** — fetches the converter code from GitHub (or shows instructions for uploading it manually)
3. **Run Cell 3** — upload your PDF
4. **Run Cell 4** — extract and review the config
5. **Edit and run Cell 5** — fill in the required fields marked `CHANGE_ME`, then run the cell to apply them
6. **Run Cell 6** — generate and download the ZIP

In [None]:
#@title **Cell 1: Install dependencies**
# Installs the system packages and Python libraries used by the rest of the
# notebook. Run once per Colab runtime.

# System packages (OCR engine and PDF utilities). All apt-get output,
# including error output, is redirected to /dev/null.
# NOTE(review): presumably these back pytesseract and pdf2image — confirm.
!apt-get install -y -qq tesseract-ocr poppler-utils > /dev/null 2>&1
# Python libraries, installed quietly (-q).
!pip install -q pdfplumber python-docx pyyaml pytesseract pdf2image beautifulsoup4
# NOTE(review): this message is printed unconditionally; neither command's
# exit status is checked, so a failed install is not reported here.
print("Dependencies installed.")

In [None]:
#@title **Cell 2: Get converter code**
#@markdown **Option A** (recommended): Clone from GitHub.
#@markdown Replace the URL below with your repo URL.

# Makes the `converter` package available in the working directory, either by
# cloning it from REPO_URL or by telling the user how to upload it manually,
# then checks that the modules the later cells import are present.

import os
import shutil
import subprocess

REPO_URL = "https://github.com/YOUR_USERNAME/chemo-rotas.git"  #@param {type:"string"}

if not os.path.exists("converter"):
    if REPO_URL and "YOUR_USERNAME" not in REPO_URL:
        # A checkout left behind by an earlier failed run would make
        # `git clone` refuse to write into `_repo`.
        shutil.rmtree("_repo", ignore_errors=True)
        try:
            # Argument list (no shell), so the URL is passed to git verbatim;
            # `--` stops a URL starting with '-' being read as an option.
            clone = subprocess.run(
                ["git", "clone", "--depth", "1", "--", REPO_URL, "_repo"],
                capture_output=True,
                text=True,
            )
            if clone.returncode != 0:
                raise RuntimeError(
                    f"git clone failed (exit code {clone.returncode}):\n{clone.stderr}"
                )
            if not os.path.isdir("_repo/converter"):
                raise FileNotFoundError(
                    f"No 'converter' folder found in {REPO_URL}"
                )
            shutil.copytree("_repo/converter", "converter")
        finally:
            # Always remove the temporary checkout, on success or failure.
            shutil.rmtree("_repo", ignore_errors=True)
        print("Converter code cloned from GitHub.")
    else:
        print("=" * 60)
        print("Option B: Manual upload")
        print("=" * 60)
        print("1. In the Colab file browser (left panel), create a")
        print("   folder called 'converter'")
        print("2. Upload these files into it:")
        print("   - __init__.py")
        print("   - models.py")
        print("   - extract_pdf.py")
        print("   - generate_txt.py")
        print("   - generate_docx.py")
        print("3. Re-run this cell to confirm.")
else:
    print("Converter code already present.")

# Verify: report every missing module at once rather than only the first.
required_modules = ("models.py", "extract_pdf.py", "generate_txt.py", "generate_docx.py")
missing_modules = [
    f"converter/{module}"
    for module in required_modules
    if not os.path.exists(os.path.join("converter", module))
]
if missing_modules:
    raise FileNotFoundError(
        ", ".join(missing_modules) + " not found — see instructions above"
    )
print("All converter modules verified.")

In [None]:
#@title **Cell 3: Upload your PDF**
# Prompts for a file upload and records the uploaded file's name in
# `pdf_filename` for the following cells.
from google.colab import files

print("Select your chemo rota PDF:")
uploaded = files.upload()

# `uploaded` is indexed by filename below; if nothing was selected there is
# no entry to pick, so fail with an actionable message instead of IndexError.
if not uploaded:
    raise ValueError("No file was uploaded — re-run this cell and select your PDF.")

# Only one PDF is processed: the first uploaded name.
pdf_filename = next(iter(uploaded))
if len(uploaded) > 1:
    print(f"Note: {len(uploaded)} files were uploaded; only the first one will be used.")
print(f"\nUploaded: {pdf_filename} ({len(uploaded[pdf_filename]):,} bytes)")

In [None]:
#@title **Cell 4: Extract config from PDF**
# Runs the extractor on the uploaded PDF, prints a summary of the resulting
# config, and lists the required fields that still need a value.
import json
from converter.extract_pdf import extract_to_yaml

# The YAML path is the PDF name up to its last '.', plus '_config.yaml'.
pdf_stem = pdf_filename.rsplit('.', 1)[0]
yaml_path = f"{pdf_stem}_config.yaml"
config = extract_to_yaml(pdf_filename, yaml_path)

# Show what was extracted
banner = "=" * 60
print(banner)
print("EXTRACTED CONFIG")
print(banner)

# Scalar fields are shown as-is ('?' when the key is absent).
scalar_rows = (
    ("Document code:  ", "document_code"),
    ("Drug name:      ", "drug_full_name"),
    ("Indication:     ", "indication"),
    ("Cycle delay:    ", "cycle_delay"),
)
for label, key in scalar_rows:
    print(f"{label}{config.get(key, '?')}")

# Collection fields are summarised by their length.
count_rows = (
    ("Templates:      ", "templates"),
    ("Blood tests:    ", "blood_tests"),
)
for label, key in count_rows:
    print(f"{label}{len(config.get(key, []))}")
print()

# Warn about fields needing human input: missing, empty, or still the
# 'CHANGE_ME' placeholder.
for required in ('drug_prefix', 'ticket_number', 'specialty_class'):
    current = config.get(required, 'CHANGE_ME')
    if current and current != 'CHANGE_ME':
        continue
    print(f"  ** {required} needs your input (currently: {current!r})")

print()
print("Edit the config in Cell 5 below, then run Cell 6 to generate.")

In [None]:
#@title **Cell 5: Edit required fields**
#@markdown Fill in the fields below. These cannot be extracted from the PDF.

drug_prefix = "CHANGE_ME"  #@param {type:"string"}
ticket_number = "CHANGE_ME"  #@param {type:"string"}
specialty_class = "CHANGE_ME"  #@param {type:"string"}
default_cycles = 12  #@param {type:"integer"}
directorate = "ONC"  #@param {type:"string"}

# Copy the form values into the config: text fields are stripped of
# surrounding whitespace, and all but the ticket number are upper-cased.
form_values = {
    'drug_prefix': drug_prefix.strip().upper(),
    'ticket_number': ticket_number.strip(),
    'specialty_class': specialty_class.strip().upper(),
    'default_cycles': default_cycles,
    'directorate': directorate.strip().upper(),
}
for key, value in form_values.items():
    config[key] = value

# Validate: a required field is unset when it is empty or still holds the
# 'CHANGE_ME' placeholder.
unset_fields = []
for required in ('drug_prefix', 'ticket_number', 'specialty_class'):
    current = config.get(required, 'CHANGE_ME')
    if current and current != 'CHANGE_ME':
        print(f"  {required} = {current}")
    else:
        print(f"ERROR: {required} is still '{current}' — please fill it in above.")
        unset_fields.append(required)

ok = not unset_fields
print(
    "\nAll required fields set. Run Cell 6 to generate."
    if ok
    else "\nFix the fields above and re-run this cell."
)

In [None]:
#@title **Cell 6: Generate and download**
# Builds the TXT and DOCX outputs from the edited config, bundles them into a
# ZIP in the working directory, and triggers a browser download of the ZIP.
import io
import re
import zipfile
from google.colab import files as colab_files
from converter.generate_txt import generate_txt
from converter.generate_docx import generate_docx
from converter.models import RotaConfig


def _safe_filename_part(value):
    """Return *value* as text with path separators replaced by '-'.

    Filenames below are built from config values; a value containing '/'
    (e.g. a combination regimen name) would otherwise be read as a
    directory path and the write would fail.
    """
    return re.sub(r"[\\/]", "-", str(value))


# Validate required fields are filled (empty or 'CHANGE_ME' counts as unset).
for field in ['drug_prefix', 'ticket_number', 'specialty_class']:
    val = config.get(field, 'CHANGE_ME')
    if not val or val == 'CHANGE_ME':
        raise ValueError(f"{field} is not set — go back to Cell 5")

# Build outputs
rota = RotaConfig.from_dict(config)

ticket_part = _safe_filename_part(rota.ticket_number)
prefix_part = _safe_filename_part(rota.drug_prefix)
code_part = _safe_filename_part(rota.document_code)
drug_part = _safe_filename_part(rota.drug_full_name)

txt_content = generate_txt(rota)
txt_filename = f"#{ticket_part}{prefix_part}.txt"

docx_filename = f"{code_part} {drug_part}.docx"
generate_docx(rota, docx_filename)

# Write TXT file. newline='' turns off newline translation, so the line
# endings in txt_content are written exactly as generated.
# NOTE(review): no explicit encoding is given, so the platform default is
# used — confirm which encoding the PICS EPMA import expects.
with open(txt_filename, 'w', newline='') as f:
    f.write(txt_content)

# Bundle into ZIP
zip_name = f"{code_part}_{drug_part}.zip"
with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zf:
    zf.write(txt_filename)
    zf.write(docx_filename)

print("Generated:")
print(f"  - {txt_filename}")
print(f"  - {docx_filename}")
print(f"  - {zip_name}")
print()
print("Downloading ZIP...")

colab_files.download(zip_name)