In [1]:
!pip install google-generativeai
!pip install -q google-generativeai pypdf pdf2image pillow python-dotenv


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m305.5/305.5 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!apt-get install -y poppler-utils

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 35 not upgraded.
Need to get 186 kB of archives.
After this operation, 697 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.8 [186 kB]
Fetched 186 kB in 1s (271 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 126281 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.8_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.8) ...
Setting up poppler-utils (22.02.0-2ubuntu0.8) ...
Processing triggers for man-db (2.10.2-1) ...


Code for 1st page, page112

In [3]:

# 🔐 Setup Gemini API
from google.colab import userdata
import google.generativeai as genai
import os
import json
from PIL import Image
from pdf2image import convert_from_path
import re

# ✅ Load API key
api_key = userdata.get("GOOGLE_API_KEY")
if not api_key:
    raise ValueError("❌ GOOGLE_API_KEY not found. Make sure it's stored in Colab secrets.")
genai.configure(api_key=api_key)

# 🎯 Initialize Gemini model
model = genai.GenerativeModel("gemini-2.0-flash")

# 📄 Convert PDF to images
def convert_pdf_to_images(pdf_path, output_folder, dpi=300):
    os.makedirs(output_folder, exist_ok=True)
    images = convert_from_path(pdf_path, dpi=dpi)
    image_paths = []
    for i, image in enumerate(images):
        path = os.path.join(output_folder, f"page_{len(image_paths)+1}.jpg")
        image.save(path, "JPEG")
        image_paths.append(path)
    return image_paths

In [5]:
# Step 1: Extract abbreviation dictionary from page 111
abbrev_pdf_path = "/content/1900_page111.pdf"
abbrev_img_paths = convert_pdf_to_images(abbrev_pdf_path, "abbrev_imgs")
abbrev_image = Image.open(abbrev_img_paths[0])

abbrev_prompt = """
You are reading a historical city directory page of abbreviations.
Extract all abbreviations into a flat JSON dictionary like:

{
  "acct": "accountant",
  "adv": "advertisement",
  "slsmn": "salesman",
  ...
}

- Keep punctuation/case in abbreviations (e.g., "h.", "res.")
- Split forms like "e or E" into two separate keys
- Only return valid abbreviation dictionary (no extra formatting)
"""

abbrev_response = model.generate_content([abbrev_prompt, abbrev_image])
abbrev_text = abbrev_response.text.strip()
if abbrev_text.startswith("```json"):
    abbrev_text = abbrev_text[7:].strip("` \n")
abbrev_dict = json.loads(abbrev_text)

# Save abbreviation dictionary
with open("abbreviations.json", "w") as f:
    json.dump(abbrev_dict, f, indent=2)

print(f"✅ Extracted {len(abbrev_dict)} abbreviations.")


✅ Extracted 98 abbreviations.


In [6]:

# 🔍 Load abbreviations from page 111
def extract_abbreviations(pdf_path):
    image_path = convert_pdf_to_images(pdf_path, "abbrev_imgs")[0]
    image = Image.open(image_path)

    prompt = """
    Extract all abbreviations and full forms as a JSON dictionary. Format:
    {
      "h.": "home",
      "res.": "residence",
      "slsmn": "salesman"
    }
    Split forms like "e or E - East" as two keys. Do not return markdown or explanation.
    """
    response = model.generate_content([prompt, image])
    text = response.text.strip()
    if text.startswith("```json"):
        text = text[7:].strip("` \n")
    return json.loads(text)

# 🧠 Gemini Prompt + Post-Processing Cleanup
def split_image_vertically(image_path, parts=3):
    image = Image.open(image_path)
    width, height = image.size
    segment_height = height // parts
    return [image.crop((0, i * segment_height, width, (i + 1) * segment_height)) for i in range(parts)]

def expand_abbreviations(text, abbr_dict):
    if not text:
        return text
    words = text.split()
    return " ".join([abbr_dict.get(word.strip(".,").lower(), word) for word in words])

def clean_company_name(name):
    if not name:
        return None
    # Remove trailing business descriptions
    exclusions = ["full line of", "canned goods", "manufacturers of", "and bakers", "grocers"]
    name_clean = name.lower()
    for excl in exclusions:
        if excl in name_clean:
            name_clean = name_clean[:name_clean.find(excl)]
            break
    return name_clean.strip(", ").title()

def extract_structured_info(image_paths, abbrev_dict, directory_name="Minneapolis 1900", page_number=104, parts=3):
    all_entries = []

    for image_path in image_paths:
        slices = split_image_vertically(image_path, parts=parts)

        for idx, img_slice in enumerate(slices):
            prompt = f"""
You are analyzing a 1900 Minneapolis city directory page column. Extract each entry as JSON using this format:

{{
  "FirstName": "Peter D",
  "LastName": "Aadland",
  "Spouse": "Pearl R",
  "Occupation": "Salesman",
  "CompanyName": "Lifetime Sls",
  "HomeAddress": {{
    "StreetNumber": "2103",
    "StreetName": "Bryant av S",
    "ApartmentOrUnit": "apt 1",
    "ResidenceIndicator": "h"
  }},
  "WorkAddress": null,
  "Telephone": null,
  "DirectoryName": "{directory_name}",
  "PageNumber": {page_number}
}}

Guidelines:
- Do NOT carry forward last names between entries. If no last name is visible, use null.
- Allow middle initials in first name like "Anna B."
- Do not include phrases like "Full Line of..." in company names.
- Output a clean JSON array.
"""
            try:
                response = model.generate_content([prompt, img_slice])
                raw_output = response.text.strip()
                if raw_output.startswith("```json"):
                    raw_output = raw_output[7:].strip("` \n")
                parsed = json.loads(raw_output)

                last_valid_lastname = None  # track independently for reset logic

                for entry in parsed:
                    # ✅ Expand abbreviations
                    if entry.get("Occupation"):
                        entry["Occupation"] = expand_abbreviations(entry["Occupation"], abbrev_dict)
                    if entry.get("CompanyName"):
                        entry["CompanyName"] = expand_abbreviations(clean_company_name(entry["CompanyName"]), abbrev_dict)
                    if entry.get("HomeAddress") and entry["HomeAddress"].get("StreetName"):
                        entry["HomeAddress"]["StreetName"] = expand_abbreviations(entry["HomeAddress"]["StreetName"], abbrev_dict)

                    # ✅ FIX: Remove carried-over last names
                    fname = entry.get("FirstName", "")
                    lname = entry.get("LastName", "")
                    if lname and lname == last_valid_lastname:
                        # Heuristics: likely a carryover if first name has middle initial and no new last name found
                        if re.match(r'^[A-Z][a-z]+\s+[A-Z]\.?$', fname):  # e.g., "Oscar S." or "Anna B"
                            entry["LastName"] = None
                    elif lname and re.match(r"^[A-Z][a-zA-Z]+$", lname):  # Accept valid proper last name
                        last_valid_lastname = lname  # Update tracker

                all_entries.extend(parsed)
            except Exception as e:
                print(f"❌ Error in slice {idx}: {e}")

    return all_entries


In [7]:
# Run on 1900_page112.pdf
structured_pdf_path = "/content/1900_page112.pdf"
output_dir = "sliced_output"
image_paths = convert_pdf_to_images(structured_pdf_path, output_dir)

# Extract and expand entries
entries = extract_structured_info(image_paths, abbrev_dict, directory_name="Minneapolis 1900", page_number=104)

# Save final structured entries
with open("structured_directory_output1.json", "w") as f:
    json.dump(entries, f, indent=2)

print(f"✅ Done. Extracted {len(entries)} structured entries and expanded abbreviations.")


✅ Done. Extracted 56 structured entries and expanded abbreviations.


Code For all 5 pages

In [9]:
# 🔐 Setup Gemini API
!pip install -q google-generativeai
import google.generativeai as genai
import os
import json
from PIL import Image
from pdf2image import convert_from_path
import re

# ✅ Load API key
from google.colab import userdata
api_key = userdata.get("GOOGLE_API_KEY")
if not api_key:
    raise ValueError("❌ GOOGLE_API_KEY not found. Make sure it's stored in Colab secrets.")
genai.configure(api_key=api_key)

# 🎯 Initialize Gemini model
model = genai.GenerativeModel("gemini-2.0-flash")

# 📄 Convert PDF to images
def convert_pdf_to_images(pdf_path, output_folder, dpi=300):
    os.makedirs(output_folder, exist_ok=True)
    images = convert_from_path(pdf_path, dpi=dpi)
    image_paths = []
    for i, image in enumerate(images):
        path = os.path.join(output_folder, f"{os.path.splitext(os.path.basename(pdf_path))[0]}_page_{i+1}.jpg")
        image.save(path, "JPEG")
        image_paths.append(path)
    return image_paths

# 🧠 Abbreviation extraction (ONCE, from page 111)
def extract_abbreviations(pdf_path):
    image_path = convert_pdf_to_images(pdf_path, "abbrev_imgs")[0]
    image = Image.open(image_path)

    prompt = """
    Extract all abbreviations and full forms as a JSON dictionary. Format:
    {
      "h.": "home",
      "res.": "residence",
      "slsmn": "salesman"
    }
    Split forms like "e or E - East" as two keys. Do not return markdown or explanation.
    """
    response = model.generate_content([prompt, image])
    text = response.text.strip()
    if text.startswith("```json"):
        text = text[7:].strip("` \n")
    return json.loads(text)

def split_image_vertically(image_path, parts=3):
    image = Image.open(image_path)
    width, height = image.size
    segment_height = height // parts
    return [image.crop((0, i * segment_height, width, (i + 1) * segment_height)) for i in range(parts)]

def expand_abbreviations(text, abbr_dict):
    if not text:
        return text
    words = text.split()
    return " ".join([abbr_dict.get(word.strip(".,").lower(), word) for word in words])

def clean_company_name(name):
    if not name:
        return None
    exclusions = ["full line of", "canned goods", "manufacturers of", "and bakers", "grocers"]
    name_clean = name.lower()
    for excl in exclusions:
        if excl in name_clean:
            name_clean = name_clean[:name_clean.find(excl)]
            break
    return name_clean.strip(", ").title()

# 🧠 Main Extraction Function
def extract_structured_info(image_paths, abbrev_dict, directory_name, page_number, parts=3):
    all_entries = []
    for image_path in image_paths:
        slices = split_image_vertically(image_path, parts=parts)
        for idx, img_slice in enumerate(slices):
            prompt = f"""
You are analyzing a 1900 Minneapolis city directory page column. Extract each entry as JSON using this format:

{{
  "FirstName": "Peter D",
  "LastName": "Aadland",
  "Spouse": "Pearl R",
  "Occupation": "Salesman",
  "CompanyName": "Lifetime Sls",
  "HomeAddress": {{
    "StreetNumber": "2103",
    "StreetName": "Bryant av S",
    "ApartmentOrUnit": "apt 1",
    "ResidenceIndicator": "h"
  }},
  "WorkAddress": null,
  "Telephone": null,
  "DirectoryName": "{directory_name}",
  "PageNumber": {page_number}
}}

Guidelines:
- Do NOT carry forward last names between entries. If no last name is visible, use null.
- Allow middle initials in first name like "Anna B."
- Do not include phrases like "Full Line of..." in company names.
- Output a clean JSON array.
"""
            try:
                response = model.generate_content([prompt, img_slice])
                raw_output = response.text.strip()
                if raw_output.startswith("```json"):
                    raw_output = raw_output[7:].strip("` \n")
                parsed = json.loads(raw_output)

                last_valid_lastname = None

                for entry in parsed:
                    if entry.get("Occupation"):
                        entry["Occupation"] = expand_abbreviations(entry["Occupation"], abbrev_dict)
                    if entry.get("CompanyName"):
                        entry["CompanyName"] = expand_abbreviations(clean_company_name(entry["CompanyName"]), abbrev_dict)
                    if entry.get("HomeAddress") and entry["HomeAddress"].get("StreetName"):
                        entry["HomeAddress"]["StreetName"] = expand_abbreviations(entry["HomeAddress"]["StreetName"], abbrev_dict)

                    fname = entry.get("FirstName", "")
                    lname = entry.get("LastName", "")
                    if lname and lname == last_valid_lastname:
                        if re.match(r'^[A-Z][a-z]+\s+[A-Z]\.?$', fname):
                            entry["LastName"] = None
                    elif lname and re.match(r"^[A-Z][a-zA-Z]+$", lname):
                        last_valid_lastname = lname

                all_entries.extend(parsed)

            except Exception as e:
                print(f"❌ Error in slice {idx}: {e}")
    return all_entries

# 📁 Run full pipeline on pages 112–116
pdf_files = [
    "/content/1900_page112.pdf",
    "/content/1900_page113.pdf",
    "/content/1900_page114.pdf",
    "/content/1900_page115.pdf",
    "/content/1900_page116.pdf"
]

# ✅ Step 1: Extract abbreviations ONCE from page 111
abbrev_dict = extract_abbreviations("/content/1900_page111.pdf")
# Save abbreviation dictionary
with open("abbreviations.json", "w") as f:
    json.dump(abbrev_dict, f, indent=2)
print(f"✅ Extracted {len(abbrev_dict)} abbreviations.")

# ✅ Step 2: Process all PDFs and combine results
final_entries = []
for i, pdf_path in enumerate(pdf_files, start=112):
    print(f"🔍 Processing {pdf_path}")
    image_paths = convert_pdf_to_images(pdf_path, f"images_page{i}")
    entries = extract_structured_info(image_paths, abbrev_dict, directory_name="Minneapolis 1900", page_number=i)
    final_entries.extend(entries)

# ✅ Step 3: Save final output
with open("structured_directory_output.json", "w") as f:
    json.dump(final_entries, f, indent=2)

print(f"✅ Done. Extracted {len(final_entries)} total structured entries.")


✅ Extracted 98 abbreviations.
🔍 Processing /content/1900_page112.pdf
🔍 Processing /content/1900_page113.pdf
🔍 Processing /content/1900_page114.pdf
🔍 Processing /content/1900_page115.pdf
❌ Error in slice 2: expected string or bytes-like object, got 'NoneType'
🔍 Processing /content/1900_page116.pdf
✅ Done. Extracted 334 total structured entries.
