Stage 1 – Setup & Imports

In [None]:
!pip -q install openai pdfplumber pandas

from pathlib import Path
import pdfplumber, re, json, os
import pandas as pd
from openai import OpenAI
from google.colab import userdata

# Fetch the value stored under the name "sandra" in Colab's `userdata` store and
# use it as the API key for the OpenAI client that the extraction cell calls.
api_key = userdata.get("sandra")
client = OpenAI(api_key=api_key)


# Path of the PDF to process; stop right away if it is not present so that the
# later cells do not fail with a less obvious error.
pdf_path = Path("/content/2007_Tshibubudze_THE MARKOYE FAULT_2007.pdf")
assert pdf_path.exists(), " PDF file not found. Upload it again to /content."
print(" Key loaded & PDF found!")


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m84.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m74.5 MB/s[0m eta [36m0:00:00[0m
[?25h✅ Key loaded & PDF found!


In [None]:
# Stage 2: Page-Aware Text Extraction
def extract_clean_text_pages(pdf_path):
    """Extract and lightly clean the text of every page of a PDF.

    Each page is read with pdfplumber, then:
      * words hyphenated across a line break ("Intro-\\nduction") are re-joined,
      * runs of blank lines are collapsed to a single newline,
      * leading/trailing whitespace is trimmed.
    A page with no extractable text yields an empty string, so the returned
    page numbers stay aligned with the PDF.

    Note: front/back matter (abstract, references, ...) is NOT removed.  An
    earlier version built a trimmed copy of the joined text but never
    returned it; that dead code has been dropped and the return value is
    unchanged.

    Args:
        pdf_path: Location of the PDF (str or path-like).

    Returns:
        list[tuple[int, str]]: ``(page_number, cleaned_text)`` pairs in
        document order, page numbers starting at 1.
    """
    pages = []
    with pdfplumber.open(str(pdf_path)) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            # extract_text() can return None for pages without a text layer.
            text = page.extract_text() or ""
            # Re-join words split by end-of-line hyphenation.
            text = re.sub(r"(\w)-\n(\w)", r"\1\2", text)
            # Collapse blank-line runs so each page is one compact block.
            text = re.sub(r"\n{2,}", "\n", text)
            pages.append((page_number, text.strip()))
    return pages

# Run the page-level extraction on the uploaded PDF and report the page count.
page_texts = extract_clean_text_pages(pdf_path=pdf_path)
print(f"Extracted {len(page_texts)} pages of clean text")


✅ Extracted 78 pages of clean text


In [None]:
# Stage 3: Page-Referenced AI Extraction (Fixed Import)
from tqdm import tqdm
import textwrap, json

# Template the model is asked to fill in for each page.  Leaf strings are
# type hints shown to the model, not enforced types.  The dict is serialised
# with json.dumps() into the prompt, so the key order below is what the model sees.
schema = {
    "page_number": "integer",
    # Bibliographic details of the document.
    "metadata": {
        "title": "string",
        "author": "string",
        "year": "integer",
        "supervisor": "string",
        "institution": "string",
        "location": "string",
    },
    # Regional / lithological / structural description.
    "geology": {
        "region": "string",
        "formation": "string",
        "rock_types": ["list of strings"],
        "minerals": ["list of strings"],
        "structures": ["list of strings"],
        "tectonic_setting": "string",
    },
    # Dated samples.
    "geochronology": {
        "sample_id": "string",
        "method": "string",
        "age_Ma": "float",
        "error_Ma": "float",
        "rock_unit": "string",
        "evidence": "string",
    },
    # Individual analytical measurements.
    "geochemistry": {
        "sample_id": "string",
        "analyte": "string",
        "value": "float",
        "unit": "string",
        "method": "string",
        "context": "string",
    },
    # Mineralisation style and its controls.
    "metallogeny": {
        "mineralisation_type": "string",
        "associated_structures": ["list of strings"],
        "host_rocks": ["list of strings"],
        "ore_minerals": ["list of strings"],
        "alteration": "string",
    },
}

# Upper bound on completion tokens per page.  The previous limit of 1200 cut
# dense pages off mid-JSON, which showed up as "Unterminated string" /
# "Expecting value" parse errors and dropped those pages from the results.
MAX_OUTPUT_TOKENS = 4096


def _strip_code_fence(raw):
    """Return ``raw`` without a surrounding Markdown code fence.

    Only the fence markers (and an optional language tag such as ``json`` on
    the opening fence) are removed; the payload itself is left untouched, so
    values that happen to contain the text "json" are not altered.
    """
    text = raw.strip()
    if text.startswith("```"):
        text = re.sub(r"^```[A-Za-z]*\s*", "", text)
        text = re.sub(r"\s*```$", "", text)
    return text.strip()


def _stamp_page_number(data, pnum):
    """Set "page_number" on every record to the page that was actually sent.

    The model is asked to echo the page number, but the loop variable is the
    authoritative value, so it overrides whatever the model returned.
    """
    for item in (data if isinstance(data, list) else [data]):
        if isinstance(item, dict):
            item["page_number"] = pnum


# One entry per page that produced a non-empty result (a dict or a list of dicts).
records = []

for pnum, text in tqdm(page_texts, desc="Extracting"):
    if not text.strip():
        # Nothing to extract from a blank page; skip the API call.
        continue

    messages = [
        {"role": "system", "content": (
            "You are a geology data extraction AI. "
            "Always respond in pure JSON only (no text or explanations).")},
        {"role": "user", "content": f"""
Extract all information strictly according to this schema:
{json.dumps(schema, indent=2)}

Rules:
- Include "page_number": {pnum} for every record.
- If multiple related terms (e.g., many rock types) appear, include them as lists.
- If nothing relevant, return [].

TEXT (page {pnum}):
{text[:5500]}
"""}
    ]

    try:
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages,
            temperature=0.1,
            max_tokens=MAX_OUTPUT_TOKENS
        )
        choice = resp.choices[0]
        if choice.finish_reason == "length":
            # The output hit the token limit, so the JSON is incomplete and
            # parsing it would only produce a misleading syntax error.
            print(f" Page {pnum}: response truncated at {MAX_OUTPUT_TOKENS} tokens, page skipped")
            continue
        # message.content may be None; treat that as an empty (unparseable) reply.
        raw = _strip_code_fence(choice.message.content or "")
        data = json.loads(raw)
        if data:
            _stamp_page_number(data, pnum)
            records.append(data)
    except Exception as e:
        # Best-effort run: report the failing page and move on to the next one.
        print(f" Page {pnum}: {e}")

# Persist the per-page results so the flattening stage can reload them.
extracted_json = json.dumps(records, ensure_ascii=False, indent=2)
with open("/content/extracted.json", mode="w", encoding="utf-8") as out_file:
    out_file.write(extracted_json)

print(f"\n Extraction done! Saved {len(records)} pages → /content/extracted.json")


Extracting:  68%|██████▊   | 53/78 [05:39<05:18, 12.75s/it]

⚠️ Page 53: Unterminated string starting at: line 191 column 7 (char 4285)


Extracting:  71%|███████   | 55/78 [06:11<05:43, 14.93s/it]

⚠️ Page 55: Unterminated string starting at: line 177 column 16 (char 4055)


Extracting:  72%|███████▏  | 56/78 [06:33<06:20, 17.28s/it]

⚠️ Page 56: Expecting property name enclosed in double quotes: line 150 column 23 (char 3999)


Extracting:  73%|███████▎  | 57/78 [06:52<06:14, 17.86s/it]

⚠️ Page 57: Unterminated string starting at: line 173 column 16 (char 4155)


Extracting:  74%|███████▍  | 58/78 [07:21<07:02, 21.14s/it]

⚠️ Page 58: Expecting value: line 164 column 31 (char 4046)


Extracting:  76%|███████▌  | 59/78 [07:46<07:00, 22.15s/it]

⚠️ Page 59: Expecting property name enclosed in double quotes: line 158 column 18 (char 4024)


Extracting: 100%|██████████| 78/78 [09:59<00:00,  7.68s/it]


✅ Extraction done! Saved 59 pages → /content/extracted.json





📊 Stage 4 – Flatten & Merge with Page References

In [None]:
import json, pandas as pd
from collections import defaultdict

#  Stage 4: Reference-Aware Flatten (handles nested lists)
# Load the per-page extraction results saved by the extraction stage.
with open("/content/extracted.json", "r", encoding="utf-8") as f:
    raw_data = json.load(f)

# Each page's output is either a single record (dict) or a list of records.
# Unpack everything into one flat list of dict records.  Non-dict entries
# (including non-dict members of a list) are dropped, because the merge step
# calls .get()/.items() on every record and would crash on them.
data = []
for item in raw_data:
    candidates = item if isinstance(item, list) else [item]
    data.extend(entry for entry in candidates if isinstance(entry, dict))

def _page_sort_key(page):
    """Sort key that orders page labels numerically (6 before 21).

    Labels that are not integers sort after all numeric ones, alphabetically.
    """
    try:
        return (0, int(page), "")
    except (TypeError, ValueError):
        return (1, 0, str(page))


# One entry per distinct (section, field, value); "pages" collects every page
# on which that value was seen.
combined = defaultdict(lambda: {"section": None, "field": None, "value": None, "pages": set()})


def _remember(section, field, value, page):
    """Register one observed value and the page it came from in ``combined``."""
    entry = combined[(section, field, str(value))]
    entry.update(section=section, field=field, value=value)
    entry["pages"].add(str(page))


for rec in data:
    if not isinstance(rec, dict):
        # Only dict records carry sections; skip anything else.
        continue
    # Records without a page number are attributed to page 1 (existing behaviour).
    page = rec.get("page_number", 1)
    for section, fields in rec.items():
        if section == "page_number":
            continue
        if isinstance(fields, dict):
            for k, v in fields.items():
                if v:
                    _remember(section, k, v, page)
        elif isinstance(fields, list):
            for item in fields:
                if item:
                    _remember(section, "list_item", item, page)

flat = [
    {
        "section": info["section"],
        "field": info["field"],
        "value": info["value"],
        # Numeric sort: a plain string sort produced "21, 22, ..., 6, 7, 73, 8".
        "pages": ", ".join(sorted(info["pages"], key=_page_sort_key)),
    }
    for info in combined.values()
]

# Tabulate the merged rows and write them out.  "utf-8-sig" is the BOM-prefixed
# UTF-8 codec; the row index is not written to the file.
df = pd.DataFrame(data=flat)
df.to_csv(
    "/content/extracted_flat_ref.csv",
    encoding="utf-8-sig",
    index=False,
)

print(" Reference-aware CSV saved to /content/extracted_flat_ref.csv")
# Show the first rows as the cell's rich output.
df.head(n=10)


✅ Reference-aware CSV saved to /content/extracted_flat_ref.csv


Unnamed: 0,section,field,value,pages
0,metadata,title,RELATIVE TIMING OF STRUCTURAL EVENTS: THE MARK...,1
1,metadata,author,ASINNE TSHIBUBUDZE,1
2,metadata,year,2007,"1, 17"
3,metadata,supervisor,Prof. Kim A.A Hein,1
4,metadata,institution,University of the Witwatersrand,1
5,metadata,location,"Johannesburg, South Africa",1
6,geology,rock_types,"[Greywacke, Siltstone, Volcanoclastic Greywack...",5
7,geology,structures,"[Markoye Fault, Markoye Shear Zone]",5
8,metadata,title,Regional Lithological and Structural Mapping o...,6
9,metadata,author,Unknown,"21, 22, 33, 35, 47, 50, 51, 54, 6, 7, 73, 8"
