In [1]:
#!/usr/bin/env python3
import sys
import re
from pathlib import Path

import pandas as pd

def read_text(path: str) -> str:
    return Path(path).read_text(encoding="utf-8", errors="ignore")

def extract_with_bs4(html: str):
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        return None  # signal fallback

    soup = BeautifulSoup(html, "html.parser")
    boxes = soup.select("div.memberbox")  # matches <div class="... memberbox ...">

    rows = []
    for box in boxes:
        box_html = str(box)

        # --- image src ---
        img_src = None
        # Try to find any tag with a src attribute (img OR malformed tags that still keep attrs)
        tag_with_src = box.find(attrs={"src": True})
        if tag_with_src and tag_with_src.get("src"):
            img_src = tag_with_src.get("src")

        # Regex fallback (more reliable for malformed HTML)
        if not img_src:
            m = re.search(r'src\s*=\s*"([^"]+)"', box_html)
            if m:
                img_src = m.group(1)

        # --- title ---
        title = None
        off_tag = box.select_one(".off")
        if off_tag:
            title = off_tag.get_text(strip=True)

        # Fallback: sometimes name is in .head with no <a class="off">
        if not title:
            head = box.select_one(".head")
            if head:
                title = head.get_text(" ", strip=True)

        # --- position ---
        position = None
        note = box.select_one("p.note")
        if note:
            position = note.get_text(" ", strip=True)

        # Skip completely empty rows
        if any([img_src, title, position]):
            rows.append(
                {"image": img_src, "title": title, "position": position}
            )

    return rows

def extract_with_regex(html: str):
    # Split by memberbox blocks (works even if HTML is messy)
    parts = re.split(r'<div\s+class="[^"]*\bmemberbox\b[^"]*"\s*>', html, flags=re.I)
    rows = []
    for part in parts[1:]:
        # Try to limit to roughly this box: cut at next memberbox start if present
        part = re.split(r'<div\s+class="[^"]*\bmemberbox\b[^"]*"\s*>', part, maxsplit=1, flags=re.I)[0]

        img = None
        m = re.search(r'src\s*=\s*"([^"]+/images/team/[^"]+)"', part, flags=re.I)
        if m:
            img = m.group(1)
        else:
            m = re.search(r'src\s*=\s*"([^"]+)"', part, flags=re.I)
            if m:
                img = m.group(1)

        title = None
        m = re.search(r'class\s*=\s*"off"\s*>\s*([^<]+)', part, flags=re.I)
        if m:
            title = m.group(1).strip()

        position = None
        m = re.search(r'<p\s+class\s*=\s*"note"\s*>\s*([^<]+)\s*</p>', part, flags=re.I)
        if m:
            position = m.group(1).strip()

        if any([img, title, position]):
            rows.append({"image": img, "title": title, "position": position})

    return rows

def main():
    if len(sys.argv) < 2:
        print("Usage: python extract_team.py <input.html> [output.csv]")
        sys.exit(1)

    in_path = '/Users/fantastic-lin/Documents/Andrew/Lab website/aet21.github.io-master/team/index.html'
    out_path = '/Users/fantastic-lin/Documents/Andrew/Lab website/aet21.github.io-master/Lin/team_members.csv'

    html = read_text(in_path)

    rows = extract_with_bs4(html)
    if rows is None:  # bs4 not installed
        rows = extract_with_regex(html)

    df = pd.DataFrame(rows, columns=["image", "title", "position"])

    # Optional: drop duplicates / empty titles
    df = df.dropna(how="all")
    df = df.drop_duplicates()
    print(df.head)
    #df.to_csv(out_path, index=False)
    #print(f"Extracted {len(df)} rows -> {out_path}")

if __name__ == "__main__":
    main()


<bound method NDFrame.head of                              image                title  \
0          /images/team/Andrew.jpg  Andrew Teschendorff   
1           /images/team/luoqi.jpg               Qi Luo   
2       /images/team/tonghuige.jpg           Huige Tong   
3     /images/team/guoxiaolong.jpg         Xiaolong Guo   
4      /images/team/duzhaozhen.jpg          Zhaozhen Du   
5           /images/team/jason.jpg  Jason tham han kiat   
6          /images/team/naveed.jpg          Naveed Alam   
7       /images/team/liangyuhu.jpg           Yuhu Liang   
8    /images/team/wangkangying.jpg        Kangying Wang   
9         /images/team/jinghan.jpg             Han Jing   
10      /images/team/zhutianyu.jpg           Tianyu Zhu   
11          /images/team/huxue.jpg               Xue Hu   
12   /images/team/youchenglong.jpg        Chenglong You   
13           /images/team/alok.jpg     Alok Kumar Maity   
14    /images/team/zhengshijie.JPG         Shijie Zheng   
15     /images/team/dongda

In [None]:
#!/usr/bin/env python3
import math
import argparse
from pathlib import Path
from PIL import Image

SUPPORTED = {".jpg", ".jpeg", ".png", ".webp", ".tif", ".tiff", ".bmp"}

def infer_ratio(image_paths):
    """Infer a common aspect ratio r = W/H that minimizes total crop loss (median in log-space)."""
    logs = []
    for p in image_paths:
        with Image.open(p) as im:
            w, h = im.size
        if w > 0 and h > 0:
            logs.append(math.log(w / h))
    if not logs:
        raise ValueError("No valid images found to infer ratio.")
    logs.sort()
    mid = len(logs) // 2
    median_log = logs[mid] if len(logs) % 2 == 1 else (logs[mid - 1] + logs[mid]) / 2
    return math.exp(median_log)

def center_crop_to_ratio(im, ratio):
    """
    Center-crop image to target ratio (W/H) without resizing.
    Keeps the largest possible crop that matches the ratio.
    """
    w, h = im.size
    current = w / h

    if abs(current - ratio) < 1e-9:
        return im  # already matches

    if current > ratio:
        # Image is too wide -> crop width
        new_w = int(round(ratio * h))
        new_h = h
    else:
        # Image is too tall -> crop height
        new_w = w
        new_h = int(round(w / ratio))

    left = (w - new_w) // 2
    top = (h - new_h) // 2
    right = left + new_w
    bottom = top + new_h

    return im.crop((left, top, right, bottom))

def main():
    ap = argparse.ArgumentParser(description="Infer a common aspect ratio and center-crop all images in a directory.")
    ap.add_argument("input_dir", help="Directory containing images")
    ap.add_argument("-o", "--output_dir", default="cropped", help="Output directory (default: cropped)")
    ap.add_argume_


In [None]:
# ============================
# Batch center-crop images in Jupyter
# - infers a common aspect ratio (W/H) that minimizes overall cropping
# - center-crops each image to that ratio (no resizing)
# - saves outputs + writes a CSV summary
# - FIXED: safe saving for JPEG (no RGBA)
# ============================

#Inferred target ratio (W/H): 0.770489
#Approx ratio ≈ 37:48 (err=0.00034403)

#!pip -q install pillow pandas

from pathlib import Path
import math
from PIL import Image
import pandas as pd

# ---- 1) CONFIG: set your paths here ----
INPUT_DIR = Path("/Users/fantastic-lin/Documents/Andrew/Lab website/aet21.github.io-master/images/team")   # <-- CHANGE THIS
OUTPUT_DIR = Path("/Users/fantastic-lin/Documents/Andrew/Lab website/aet21.github.io-master/Lin/cropped_imagegs")
RECURSIVE = False
SUPPORTED = {".jpg", ".jpeg", ".png", ".webp", ".tif", ".tiff", ".bmp"}

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# ---- 2) Collect image files ----
if RECURSIVE:
    paths = [p for p in INPUT_DIR.rglob("*") if p.is_file() and p.suffix.lower() in SUPPORTED]
else:
    paths = [p for p in INPUT_DIR.iterdir() if p.is_file() and p.suffix.lower() in SUPPORTED]

if not paths:
    raise ValueError(f"No supported images found in: {INPUT_DIR}")

print(f"Found {len(paths)} images.")

# ---- 3) Infer a common ratio (W/H) ----
def infer_ratio(image_paths):
    logs = []
    for p in image_paths:
        with Image.open(p) as im:
            w, h = im.size
        if w > 0 and h > 0:
            logs.append(math.log(w / h))
    if not logs:
        raise ValueError("No valid images found to infer ratio.")
    logs.sort()
    mid = len(logs) // 2
    median_log = logs[mid] if len(logs) % 2 == 1 else (logs[mid - 1] + logs[mid]) / 2
    return math.exp(median_log)

ratio = infer_ratio(paths)
print(f"Inferred target ratio (W/H): {ratio:.6f}")

# Optional: pretty W:H approximation
def ratio_to_wh(r, max_den=60):
    best_w, best_h, best_err = 1, 1, float("inf")
    for h in range(1, max_den + 1):
        w = round(r * h)
        err = abs(r - (w / h))
        if err < best_err:
            best_w, best_h, best_err = w, h, err
    return best_w, best_h, best_err

w_approx, h_approx, err = ratio_to_wh(ratio, max_den=60)
print(f"Approx ratio ≈ {w_approx}:{h_approx} (err={err:.6g})")

# ---- 4) Center-crop function ----
def center_crop_to_ratio(im, ratio):
    w, h = im.size
    cur = w / h

    if abs(cur - ratio) < 1e-9:
        return im.copy(), (w, h)

    if cur > ratio:
        # too wide -> crop width
        new_w = int(round(ratio * h))
        new_h = h
    else:
        # too tall -> crop height
        new_w = w
        new_h = int(round(w / ratio))

    left = (w - new_w) // 2
    top = (h - new_h) // 2
    right = left + new_w
    bottom = top + new_h

    return im.crop((left, top, right, bottom)), (new_w, new_h)

# ---- 5) Safe save helper (fixes RGBA->JPEG issue) ----
def prepare_for_saving(im, out_path: Path, background=(255, 255, 255)):
    """
    Ensure image mode is compatible with the target format based on extension.
    - JPEG: must be RGB (no alpha). If has alpha, composite on background.
    - PNG/WebP: can keep RGBA.
    """
    ext = out_path.suffix.lower()

    # If target is JPEG, force RGB
    if ext in {".jpg", ".jpeg"}:
        if im.mode in ("RGBA", "LA") or ("transparency" in im.info):
            # Composite onto a solid background
            rgba = im.convert("RGBA")
            bg = Image.new("RGBA", rgba.size, background + (255,))
            im = Image.alpha_composite(bg, rgba).convert("RGB")
        else:
            im = im.convert("RGB")
        return im

    # Non-JPEG targets: generally safe, but normalize some modes
    if im.mode == "P":
        # palette -> RGBA to preserve transparency if any
        im = im.convert("RGBA")
    elif im.mode == "CMYK":
        im = im.convert("RGB")

    return im

# ---- 6) Crop all images + save + summary ----
summary = []
failures = []

for p in paths:
    try:
        with Image.open(p) as im:
            orig_w, orig_h = im.size
            cropped, (new_w, new_h) = center_crop_to_ratio(im, ratio)

            out_path = OUTPUT_DIR / p.name
            cropped_to_save = prepare_for_saving(cropped, out_path)

            save_kwargs = {}
            if out_path.suffix.lower() in {".jpg", ".jpeg"}:
                save_kwargs = {"quality": 95, "optimize": True}

            cropped_to_save.save(out_path, **save_kwargs)

            summary.append({
                "filename": p.name,
                "input_path": str(p),
                "output_path": str(out_path),
                "orig_w": orig_w,
                "orig_h": orig_h,
                "orig_ratio": (orig_w / orig_h) if orig_h else None,
                "crop_w": new_w,
                "crop_h": new_h,
                "target_ratio": ratio,
            })
    except Exception as e:
        failures.append({"filename": p.name, "input_path": str(p), "error": repr(e)})

print(f"Done. Cropped {len(summary)} images -> {OUTPUT_DIR.resolve()}")
if failures:
    print(f"⚠️ Failed on {len(failures)} images. See failures dataframe below.")

df = pd.DataFrame(summary)
csv_path = OUTPUT_DIR / "crop_summary.csv"
df.to_csv(csv_path, index=False)

fail_df = pd.DataFrame(failures)
df.head(), f"Saved CSV: {csv_path}", fail_df.head()


Found 32 images.
Inferred target ratio (W/H): 0.770489
Approx ratio ≈ 37:48 (err=0.00034403)
Done. Cropped 32 images -> /Users/fantastic-lin/Documents/Andrew/Lab website/aet21.github.io-master/Lin/cropped_imagegs


(        filename                                         input_path  \
 0     linlin.jpg  /Users/fantastic-lin/Documents/Andrew/Lab webs...   
 1  zhutianyu.jpg  /Users/fantastic-lin/Documents/Andrew/Lab webs...   
 2     Andrew.jpg  /Users/fantastic-lin/Documents/Andrew/Lab webs...   
 3  liangyuhu.jpg  /Users/fantastic-lin/Documents/Andrew/Lab webs...   
 4  tonghuige.jpg  /Users/fantastic-lin/Documents/Andrew/Lab webs...   
 
                                          output_path  orig_w  orig_h  \
 0  /Users/fantastic-lin/Documents/Andrew/Lab webs...    3072    3950   
 1  /Users/fantastic-lin/Documents/Andrew/Lab webs...     960    1280   
 2  /Users/fantastic-lin/Documents/Andrew/Lab webs...     720    1080   
 3  /Users/fantastic-lin/Documents/Andrew/Lab webs...     668     950   
 4  /Users/fantastic-lin/Documents/Andrew/Lab webs...     831    1080   
 
    orig_ratio  crop_w  crop_h  target_ratio  
 0    0.777722    3043    3950      0.770489  
 1    0.750000     960    1246  

In [1]:
import csv, re, textwrap
from pathlib import Path

INPUT_CSV = "/Users/fantastic-lin/Documents/Andrew/Lab_website/aet21.github.io-master/Lin/team_members.csv"          # <-- your CSV filename/path
OUTPUT_DIR = "/Users/fantastic-lin/Documents/Andrew/Lab_website/aet21.github.io-master/_people"        # <-- folder to create

FIXED_CATEGORY = "people"

def slugify(s: str) -> str:
    s = s.strip().lower()
    s = re.sub(r"[’'`]", "", s)
    s = re.sub(r"[^a-z0-9]+", "-", s)
    s = re.sub(r"-{2,}", "-", s).strip("-")
    return s or "person"

out_dir = Path(OUTPUT_DIR)
out_dir.mkdir(parents=True, exist_ok=True)

used = {}  # handle duplicate names like "Lin Lin" / "Linlin Li"
created = []

with open(INPUT_CSV, "r", encoding="utf-8-sig", newline="") as f:
    reader = csv.DictReader(f, delimiter=",")
    print("Detected headers:", reader.fieldnames)

    for row in reader:
        # normalize keys (avoid issues like " title " vs "title")
        row = {(k or "").strip().lower(): (v or "").strip() for k, v in row.items()}

        title = row["title"]
        position = row["position"]
        image = "/people"+row["image"]
        alumni = row["alumni"]
        role_key = row["role_key"]

        base = slugify(title)
        used[base] = used.get(base, 0) + 1
        slug = base if used[base] == 1 else f"{base}-{used[base]}"

        filename = f"{slug}.md"  # or f"2026-01-09-{slug}.md" if you want date prefix
        content = textwrap.dedent(f"""\
        ---
        title: {title}
        categories:
          - {FIXED_CATEGORY}
        position: {position}
        image: {image}
        alumni: {alumni}
        role_key: {role_key}
        ---
        """)

        (out_dir / filename).write_text(content, encoding="utf-8")
        created.append(filename)

print(f"Done. Wrote {len(created)} files to: {out_dir.resolve()}")


Detected headers: ['image', 'title', 'position', 'alumni', 'role_key']
Done. Wrote 31 files to: /Users/fantastic-lin/Documents/Andrew/Lab_website/aet21.github.io-master/_people
