In [6]:
import os
import fitz       # PyMuPDF
import pandas as pd
from PIL import Image
from io import BytesIO
from tqdm import tqdm
from docx2pdf import convert

def preprocess_docs(input_dir: str):
    """
    Convert every Word document in ``input_dir`` to a PDF beside it.

    Each ``.docx``/``.doc`` file (case-insensitive extension) is passed to
    ``convert`` with a target path that has the same base filename and a
    ``.pdf`` extension inside ``input_dir``. Conversion failures are
    reported on stdout and do not stop the remaining files.

    Entries that cannot be real documents are skipped without a message:
    names starting with ``~$`` (the lock files Word leaves next to an open
    document) and anything that is not a regular file.

    :param input_dir: Folder to scan (not recursive).
    """
    # Sorted so the conversion order (and the log) is reproducible.
    for filename in sorted(os.listdir(input_dir)):
        if filename.startswith("~$"):
            continue
        if not filename.lower().endswith((".docx", ".doc")):
            continue

        doc_path = os.path.join(input_dir, filename)
        if not os.path.isfile(doc_path):
            continue

        pdf_path = os.path.join(
            input_dir,
            os.path.splitext(filename)[0] + ".pdf"
        )
        try:
            print(f"🔧 Converting {filename} → {os.path.basename(pdf_path)} …")
            convert(doc_path, pdf_path)
        except Exception as e:
            # Best effort: one broken document must not abort the batch.
            print(f"⚠️ Failed to convert {filename}: {e}")

def extract_images_from_pdfs(
    input_dir: str,
    output_dir: str,
    min_width: int = 1200,
    metadata_filename: str = "image_metadata.csv"
) -> pd.DataFrame:
    """
    Extract every embedded image from the PDFs in ``input_dir``.

    Steps:
      1. Convert any .doc/.docx in ``input_dir`` to PDF (``preprocess_docs``).
      2. For each PDF, walk its pages and extract the images each page lists.
      3. Save every image as PNG under ``output_dir/<pdf_basename>/``.
      4. Write a CSV of metadata to ``output_dir/<metadata_filename>``.

    A PDF that cannot be opened, or an image that cannot be decoded or
    saved, is reported on stdout and skipped; the rest of the batch continues.

    :param input_dir: Folder containing the submissions (not recursive).
    :param output_dir: Folder receiving the image sub-folders and the CSV.
    :param min_width: Pixel width at or above which ``is_high_res`` is True.
    :param metadata_filename: Name of the CSV written inside ``output_dir``.
    :return: DataFrame with one row per saved image (columns ``pdf_file``,
        ``page``, ``image_name``, ``image_path``, ``width``, ``height``,
        ``resolution``, ``is_high_res``), or an empty DataFrame when nothing
        was extracted (no CSV is written in that case).
    """
    # Step 1: convert Word docs
    preprocess_docs(input_dir)

    os.makedirs(output_dir, exist_ok=True)
    records = []

    # Modes Pillow's PNG writer accepts; others (e.g. CMYK) are converted to
    # RGB before saving instead of raising.
    png_modes = {"1", "L", "LA", "I", "I;16", "P", "RGB", "RGBA"}

    for filename in os.listdir(input_dir):
        if not filename.lower().endswith(".pdf"):
            continue

        pdf_path  = os.path.join(input_dir, filename)
        base_name = os.path.splitext(filename)[0]

        try:
            doc = fitz.open(pdf_path)
        except Exception as e:
            print(f"⚠️ Failed to open {filename}: {e}")
            continue

        try:
            # Create subfolder for this PDF’s images
            pdf_img_dir = os.path.join(output_dir, base_name)
            os.makedirs(pdf_img_dir, exist_ok=True)

            print(f"\n📄 Extracting images from {filename} ({len(doc)} pages)…")
            for page_idx in tqdm(range(len(doc)), desc="Pages", leave=False):
                page   = doc[page_idx]
                images = page.get_images(full=True)
                if not images:
                    continue

                for img_idx, img_info in enumerate(images, start=1):
                    xref = img_info[0]

                    try:
                        img_data = doc.extract_image(xref)
                        img      = Image.open(BytesIO(img_data["image"]))
                    except Exception as e:
                        print(f" ⚠️ Couldn’t open image on page {page_idx+1}, idx {img_idx}: {e}")
                        continue

                    w, h     = img.size
                    img_name = f"{base_name}_p{page_idx+1}_img{img_idx}.png"
                    img_path = os.path.join(pdf_img_dir, img_name)

                    try:
                        if img.mode not in png_modes:
                            img = img.convert("RGB")
                        img.save(img_path)
                    except Exception as e:
                        print(f" ⚠️ Couldn’t save image on page {page_idx+1}, idx {img_idx}: {e}")
                        continue

                    records.append({
                        "pdf_file":    filename,
                        "page":        page_idx + 1,
                        "image_name":  img_name,
                        "image_path":  img_path,
                        "width":       w,
                        "height":      h,
                        "resolution":  f"{w}x{h}",
                        "is_high_res": w >= min_width
                    })
        finally:
            # Release the document even if a page or image raised.
            doc.close()

    # Write out metadata CSV
    if not records:
        print("⚠️ No images found in any PDF.")
        return pd.DataFrame()

    df       = pd.DataFrame(records)
    csv_path = os.path.join(output_dir, metadata_filename)
    df.to_csv(csv_path, index=False)
    print(f"\n✅ Extracted {len(records)} images; metadata saved to {csv_path}")
    return df

# ── Example usage ────────────────────────────────────────────────────
if __name__ == "__main__":
    # Run the full pipeline on the "submissions" folder: Word files are
    # converted first, then every PDF's images land in "Extracted_images".
    # The returned metadata table is kept for interactive inspection.
    metadata_df = extract_images_from_pdfs("submissions", "Extracted_images", min_width=1200)


🔧 Converting vidyalatanvi_LATE_218146_14937211_COGS 160 A1-1.docx → vidyalatanvi_LATE_218146_14937211_COGS 160 A1-1.pdf …


100%|██████████| 1/1 [00:28<00:00, 28.76s/it]



📄 Extracting images from mainayardaniel_127050_14924649_COGS 160 Le Corbusier Doc.pdf (56 pages)…


                                                      


📄 Extracting images from emralinolalaine_LATE_162831_14938886_Glenn Murcutt_ “Touching the Earth Lightly”.pdf (14 pages)…


                                                      


📄 Extracting images from krukjulia_LATE_198551_15046680_Eero Saarinen_ Cogs 160 Research Document.pdf (49 pages)…


                                                      


📄 Extracting images from spavenchristine_LATE_96300_14929508_COGS 160_1 (1).pdf (42 pages)…


                                                      


📄 Extracting images from khirwadkarisha_166304_14925482_Report.pdf (36 pages)…


                                                      


📄 Extracting images from liangmichael_188529_14924464_COGS 160 - Docs.pdf (58 pages)…


                                                      


📄 Extracting images from yangheiman_LATE_190478_14963531_COGS160.pdf (43 pages)…


                                                      


📄 Extracting images from wucynthia_LATE_167097_15019933_COGS 160.pdf (1 pages)…


                                            


📄 Extracting images from khirwadkarisha_166304_14925456_ken yeang cogs 160 in class.pdf (34 pages)…


                                                      


📄 Extracting images from hsucalvin_166834_14967682_Cogs 160 Slide and Doc Links for Turn-in.pdf (1 pages)…


                                            


📄 Extracting images from dasilvatheo_LATE_171930_14930244_HW A1.pdf (46 pages)…


                                                      


📄 Extracting images from davidmatthew_LATE_134808_14949557_COGS 160_ A1.pdf (43 pages)…


                                                      


📄 Extracting images from marvanalicia_212624_14925657_A.Marvan_LuisBarragan.pdf (3 pages)…


                                                    


📄 Extracting images from delacruzrenier_LATE_226065_14930691_Paul Rudolph_ Life, Work, and Enduring Influence.pdf (6 pages)…


                                            


📄 Extracting images from vidyalatanvi_LATE_218146_14937211_COGS 160 A1-1.pdf (61 pages)…


                                                      


📄 Extracting images from liangmichael_188529_14924465_Michael - COGS 160.pdf (103 pages)…


                                                        


✅ Extracted 913 images; metadata saved to Extracted_images/image_metadata.csv




In [4]:
import sys
!{sys.executable} -m pip install docx2pdf

Collecting docx2pdf
  Using cached docx2pdf-0.1.8-py3-none-any.whl.metadata (3.3 kB)
Collecting appscript>=1.1.0 (from docx2pdf)
  Downloading appscript-1.3.0-cp311-cp311-macosx_10_9_universal2.whl.metadata (417 bytes)
Collecting lxml>=4.7.1 (from appscript>=1.1.0->docx2pdf)
  Downloading lxml-5.4.0-cp311-cp311-macosx_10_9_universal2.whl.metadata (3.5 kB)
Using cached docx2pdf-0.1.8-py3-none-any.whl (6.7 kB)
Downloading appscript-1.3.0-cp311-cp311-macosx_10_9_universal2.whl (99 kB)
Downloading lxml-5.4.0-cp311-cp311-macosx_10_9_universal2.whl (8.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.1/8.1 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: lxml, appscript, docx2pdf
Successfully installed appscript-1.3.0 docx2pdf-0.1.8 lxml-5.4.0


In [7]:
import os
import fitz      # PyMuPDF
from PIL import Image
import io

def compress_all_pdfs(input_dir, output_dir, dpi=100, downscale_factor=2):
    """
    Compress all PDF files in `input_dir` by rendering each page to an image,
    downscaling it, and reassembling the images into a new PDF in `output_dir`.

    The output keeps the input filename. Each page becomes a raster image, so
    text in the result is no longer selectable. A file that fails is reported
    on stdout and skipped; the remaining files are still processed.

    :param input_dir: Folder containing original PDF files (not recursive).
    :param output_dir: Folder to write compressed PDFs (created if missing).
    :param dpi: DPI for rendering pages.
    :param downscale_factor: Factor by which to downscale rendered images.
    """
    os.makedirs(output_dir, exist_ok=True)

    for filename in os.listdir(input_dir):
        if not filename.lower().endswith(".pdf"):
            continue

        input_path = os.path.join(input_dir, filename)
        output_path = os.path.join(output_dir, filename)

        print(f"🔄 Compressing {filename} ...")
        doc = None
        new_pdf = None
        try:
            doc = fitz.open(input_path)
            new_pdf = fitz.open()

            for page in doc:
                pix = page.get_pixmap(dpi=dpi)
                # create PIL image from raw samples
                # NOTE(review): assumes the pixmap is 3-channel RGB without
                # alpha (get_pixmap defaults) — confirm if defaults change.
                img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

                # downscale using LANCZOS filter; never below one pixel and
                # always integer sizes, even for a fractional factor
                new_size = (
                    max(1, int(pix.width // downscale_factor)),
                    max(1, int(pix.height // downscale_factor)),
                )
                img = img.resize(new_size, Image.LANCZOS)

                # save into a one-page PDF in memory; the effective resolution
                # is dpi / downscale_factor, which keeps the physical page
                # size equal to the source page instead of shrinking it
                buffer = io.BytesIO()
                img.save(buffer, format="PDF", resolution=dpi / downscale_factor)

                img_pdf = fitz.open(stream=buffer.getvalue(), filetype="pdf")
                try:
                    new_pdf.insert_pdf(img_pdf)
                finally:
                    img_pdf.close()

            new_pdf.save(output_path)
            print(f"✅ Saved compressed PDF: {output_path}")

        except Exception as e:
            print(f"❌ Failed to compress {filename}: {e}")

        finally:
            # Always release both documents, including after a failure.
            for handle in (new_pdf, doc):
                if handle is not None:
                    handle.close()


# Source and destination folders for the compression run.
INPUT_DIR = "submissions"
OUTPUT_DIR = "compressed_files"

# Render at 100 DPI, then halve each page image before rebuilding the PDF.
compress_all_pdfs(INPUT_DIR, OUTPUT_DIR, dpi=100, downscale_factor=2)


🔄 Compressing mainayardaniel_127050_14924649_COGS 160 Le Corbusier Doc.pdf ...
✅ Saved compressed PDF: compressed_files/mainayardaniel_127050_14924649_COGS 160 Le Corbusier Doc.pdf
🔄 Compressing emralinolalaine_LATE_162831_14938886_Glenn Murcutt_ “Touching the Earth Lightly”.pdf ...
✅ Saved compressed PDF: compressed_files/emralinolalaine_LATE_162831_14938886_Glenn Murcutt_ “Touching the Earth Lightly”.pdf
🔄 Compressing krukjulia_LATE_198551_15046680_Eero Saarinen_ Cogs 160 Research Document.pdf ...
✅ Saved compressed PDF: compressed_files/krukjulia_LATE_198551_15046680_Eero Saarinen_ Cogs 160 Research Document.pdf
🔄 Compressing spavenchristine_LATE_96300_14929508_COGS 160_1 (1).pdf ...
✅ Saved compressed PDF: compressed_files/spavenchristine_LATE_96300_14929508_COGS 160_1 (1).pdf
🔄 Compressing khirwadkarisha_166304_14925482_Report.pdf ...
✅ Saved compressed PDF: compressed_files/khirwadkarisha_166304_14925482_Report.pdf
🔄 Compressing liangmichael_188529_14924464_COGS 160 - Docs.pdf ..

In [13]:
import pandas as pd
from difflib import get_close_matches

# ── 1) Load CSVs ──────────────────────────────────────────────────────
#    Both files are read from the current working directory.
meta_df   = pd.read_csv("image_metadata.csv")
roster_df = pd.read_csv("student_info.csv")

# ── 2) Extract login_id from your PDF filenames ──────────────────────
#    e.g. "mainayardaniel_127050_14924649_COGS160…" → "mainayardaniel"
meta_df["login_id"] = meta_df["pdf_file"].str.split("_").str[0]

# ── 3) Tidy roster columns ────────────────────────────────────────────
roster_df = roster_df.rename(columns={
    "SIS Login ID": "login_id",
    "Student":      "student_name",
    "SIS User ID":  "pid"
})
# ensure no NaNs and strip whitespace
roster_df["student_name"] = roster_df["student_name"].fillna("").astype(str).str.strip()

# ── 4) Exact‐match merge ───────────────────────────────────────────────
merged = pd.merge(
    meta_df,
    roster_df[["login_id","student_name","pid"]],
    on="login_id",
    how="left"
)

# ── 5) Which login_ids still have no PID? ─────────────────────────────
unmatched = merged.loc[merged["pid"].isna(), "login_id"].unique()
print("🔍 Unmatched login_ids:", unmatched)

# ── 6) Prepare for fuzzy matching ────────────────────────────────────
#    create a normalized key (lowercase, no punctuation/space)
roster_df["norm_name"] = (
    roster_df["student_name"]
      .str.lower()
      .str.replace(r"[^a-z0-9]", "", regex=True)
)

# map norm_name → (login_id, student_name, pid)
# Rows whose name normalizes to "" (blank or all-punctuation names) are left
# out: "" is a substring of every key, so such a row would "match" every
# unmatched login in step 7b.
roster_map = {
    row.norm_name: (row.login_id, row.student_name, row.pid)
    for row in roster_df.itertuples()
    if row.norm_name
}

# ── 7) Build fuzzy‐match suggestions ──────────────────────────────────
suggestions = {}
for uid in unmatched:
    key = str(uid).lower()
    # An empty key is a substring of every roster name; treat it as "no match".
    if not key:
        suggestions[uid] = []
        continue
    # 7a) exact normalized-name
    if key in roster_map:
        suggestions[uid] = [roster_map[key]]
        continue
    # 7b) substring match (either direction)
    hits = [entry for norm, entry in roster_map.items() if key in norm or norm in key]
    if hits:
        suggestions[uid] = hits
        continue
    # 7c) difflib fallback
    best = get_close_matches(key, roster_map.keys(), n=1, cutoff=0.6)
    suggestions[uid] = [roster_map[best[0]]] if best else []

# ── 8) Auto-fill those with exactly one candidate ─────────────────────
for uid, matches in suggestions.items():
    if len(matches) == 1:
        _, name, pid = matches[0]
        rows = merged["login_id"] == uid   # compute the row mask once
        merged.loc[rows, "student_name"] = name
        merged.loc[rows, "pid"]          = pid

# ── 9) Print any truly ambiguous or missing cases ────────────────────
for uid, matches in suggestions.items():
    if len(matches) > 1:
        print(f"\n⚠️ {uid!r} HAS MULTIPLE MATCHES:")
        for _, name, pid in matches:
            print(f"    {name} → {pid}")
    elif not matches:
        print(f"\n❌ {uid!r} HAS NO CLOSE MATCH")

# ──10) Save the finished CSV ─────────────────────────────────────────
out_path = "image_metadata_with_name_pid.csv"
merged.to_csv(out_path, index=False)
print(f"\n✅ Done! Check → {out_path}")


🔍 Unmatched login_ids: ['mainayardaniel' 'emralinolalaine' 'krukjulia' 'spavenchristine'
 'khirwadkarisha' 'liangmichael' 'yangheiman' 'dasilvatheo' 'davidmatthew'
 'marvanalicia' 'vidyalatanvi']

✅ Done! Check → image_metadata_with_name_pid.csv
