In [1]:
! pip install torch transformers pillow numpy einops pulp shapely timm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
from PIL import Image
import numpy as np
from transformers import AutoModel
import torch
from datasets import load_dataset
import os
from collections import defaultdict


# Instantiate the MagiV2 model. NOTE: trust_remote_code=True allows the
# model repository's own Python code to be executed locally.
model = AutoModel.from_pretrained("ragavsachdeva/magiv2", trust_remote_code=True)

# Load the PopManga dataset through its local loading script.
print("Loading PopManga dataset...")
dataset = load_dataset("../custom-dataset/popmanga_test.py")
seen_split = dataset["seen"]  # or use "unseen"

# character_dict collects, per character name, the image crops found later on.
print("Building character bank...")
character_dict = defaultdict(list)

# Gather every distinct character name mentioned by any example of the split.
all_characters = {
    name
    for page_example in seen_split
    for name in page_example["character_names"]
}

print(f"Found {len(all_characters)} unique characters in the dataset")

# Create a mapping from character cluster ID to character name
# This will help us map bounding boxes to character names
def create_char_cluster_mapping(example):
    """Map each cluster id of ``example`` to the name at the same position.

    The n-th entry of ``example["character_clusters"]`` is paired with the
    n-th entry of ``example["character_names"]``. Cluster positions that have
    no corresponding name are ignored, and when a cluster id occurs more than
    once the name from its last occurrence is kept.

    Returns:
        dict: cluster id -> character name.
    """
    return {
        cluster_id: example["character_names"][position]
        for position, cluster_id in enumerate(example["character_clusters"])
        if position < len(example["character_names"])
    }

# Iterate through the dataset to build character references.
# For every character box (label 0) that can be tied to a named character,
# the corresponding region of the page is stored in character_dict[name].
total_examples = len(seen_split)  # hoisted: constant for the whole loop

for i, example in enumerate(seen_split):
    if i % 100 == 0:
        print(f"Processing example {i}/{total_examples}")

    image_path = example["image_path"]

    # Create mapping from cluster ID to character name
    char_cluster_map = create_char_cluster_mapping(example)

    annotations = example["magi_annotations"]
    bboxes = annotations["bboxes_as_x1y1x2y2"]

    # Skip if no annotations
    if not bboxes:
        continue

    try:
        # Read the image. The context manager closes the underlying file
        # as soon as the pixel data has been converted to an array.
        with Image.open(image_path) as page_image:
            img_array = np.array(page_image.convert("L").convert("RGB"))
        page_height, page_width = img_array.shape[:2]

        # Extract character crops using bounding boxes
        for j, (bbox, label) in enumerate(zip(bboxes, annotations["labels"])):
            # Only character boxes (label 0) are of interest here
            if label != 0:
                continue

            # Look up which character cluster this belongs to.
            # NOTE(review): the enumeration index of "text_char_matches" is
            # used as the cluster id and the box index `j` is searched in each
            # entry -- confirm this matches the dataset schema.
            for cluster_idx, char_indices in enumerate(example["text_char_matches"]):
                if j not in char_indices or cluster_idx not in char_cluster_map:
                    continue

                char_name = char_cluster_map[cluster_idx]

                # Extract the bounding box
                x1, y1, x2, y2 = [int(coord) for coord in bbox]

                # Ensure coordinates are within image bounds
                x1, y1 = max(0, x1), max(0, y1)
                x2, y2 = min(page_width, x2), min(page_height, y2)

                # Keep only non-degenerate crops; the first usable match wins
                if x2 > x1 and y2 > y1:
                    # Slicing yields a view onto img_array; storing the view
                    # would keep the whole page alive for as long as the crop
                    # exists. Copy so that only the crop's pixels are retained.
                    char_crop = img_array[y1:y2, x1:x2].copy()
                    character_dict[char_name].append(char_crop)
                    break
    except Exception as e:
        # Best effort: report the failing example and carry on with the rest
        print(f"Error processing image {i}: {e}")

# Check how many characters we have reference images for
characters_with_images = sum(1 for crops in character_dict.values() if crops)
print(f"Collected reference images for {characters_with_images}/{len(all_characters)} characters")

# Prepare the MagiV2 model (instantiated earlier in this cell) for inference:
# move it to the GPU when one is available and switch to evaluation mode.
print("Loading MagiV2 model...")

model = model.cuda() if torch.cuda.is_available() else model
model = model.eval()

# Directory that receives the generated files; created on demand
output_dir = "output"
os.makedirs(output_dir, exist_ok=True)

# Process a subset of manga pages: each page is treated as a one-page
# "chapter", predicted with a character bank restricted to the characters
# listed for that page, then visualised and transcribed into output_dir.
max_pages = 10  # Limit for testing
print(f"Processing up to {max_pages} manga pages...")

for i, example in enumerate(seen_split):
    if i >= max_pages:
        break

    image_path = example["image_path"]

    try:
        # Read manga page
        with open(image_path, "rb") as file:
            img = Image.open(file).convert("L").convert("RGB")
            chapter_page = np.array(img)

        chapter_pages = [chapter_page]

        # Get unique characters in this page. dict.fromkeys de-duplicates
        # while keeping first-appearance order; iterating a plain set of
        # strings has no stable order across interpreter runs, which made the
        # character bank order (and thus the run) non-reproducible.
        page_characters = list(dict.fromkeys(example["character_names"]))

        # Create character bank for this page.
        # NOTE(review): character_dict was built from this same split, so the
        # reference crop may come from the very page being processed --
        # confirm that this is acceptable for the intended evaluation.
        char_images = []
        char_names = []

        for char_name in page_characters:
            # .get() does not insert empty entries into the defaultdict
            crops = character_dict.get(char_name)
            if crops:
                char_images.append(crops[0])  # Use the first crop for each character
                char_names.append(char_name)

        if not char_images:
            print(f"Skipping page {i} - no character references available")
            continue

        character_bank = {
            "images": char_images,
            "names": char_names
        }

        print(f"Processing page {i} with {len(char_names)} characters")

        # Run prediction
        with torch.no_grad():
            per_page_results = model.do_chapter_wide_prediction(
                chapter_pages,
                character_bank,
                use_tqdm=True,
                do_ocr=True
            )

        # Generate transcript and visualizations
        transcript = []

        for image, page_result in zip(chapter_pages, per_page_results):
            output_path = os.path.join(output_dir, f"page_{i}.png")
            model.visualise_single_image_prediction(image, page_result, output_path)

            # text index -> predicted speaker name
            speaker_name = {
                text_idx: page_result["character_names"][char_idx]
                for text_idx, char_idx in page_result["text_character_associations"]
            }

            for k, ocr_text in enumerate(page_result["ocr"]):
                # Only essential text goes into the transcript
                if not page_result["is_essential_text"][k]:
                    continue
                name = speaker_name.get(k, "unsure")
                transcript.append(f"<{name}>: {ocr_text}")

        # Save transcript. The encoding is explicit so that non-ASCII OCR
        # output is written identically regardless of the platform default.
        transcript_path = os.path.join(output_dir, f"page_{i}_transcript.txt")
        with open(transcript_path, "w", encoding="utf-8") as fh:
            for line in transcript:
                fh.write(line + "\n")

        print(f"Results for page {i} saved to {output_dir}")

    except Exception as e:
        # Best effort: report the failing page and continue with the next one
        print(f"Error processing page {i}: {e}")

print("Processing complete. Check the output directory for results.")

  from .autonotebook import tqdm as notebook_tqdm


Loading PopManga dataset...
Building character bank...
Found 196 unique characters in the dataset
Processing example 0/1136
Processing example 100/1136
Processing example 200/1136
Processing example 300/1136
Processing example 400/1136
Processing example 500/1136
Processing example 600/1136
Processing example 700/1136
Processing example 800/1136
Processing example 900/1136
Processing example 1000/1136
Processing example 1100/1136
Collected reference images for 178/196 characters
Loading MagiV2 model...
Processing up to 10 manga pages...
Processing page 0 with 2 characters


100%|██████████| 1/1 [00:02<00:00,  3.00s/it]


Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /Users/fadhelerlanggawibawanto/.pyenv/versions/3.11.6/lib/python3.11/site-packages/pulp/apis/../solverdir/cbc/osx/i64/cbc /var/folders/0j/2ms5f4dx5rlfqts07c12w3t40000gp/T/5230070d26064679ae9b517155616cb4-pulp.mps -timeMode elapsed -branch -printingOptions all -solution /var/folders/0j/2ms5f4dx5rlfqts07c12w3t40000gp/T/5230070d26064679ae9b517155616cb4-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 63 COLUMNS
At line 250 RHS
At line 309 BOUNDS
At line 331 ENDATA
Problem MODEL has 58 rows, 21 columns and 123 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 3.82723 - 0.00 seconds
Cgl0004I processed model has 4 rows, 6 columns (6 integer (6 of which binary)) and 10 elements
Cbc0038I Initial state - 0 integers unsatisfied sum - 0
Cbc0038I Solution found of 3.82723
Cbc0038I Before mini branch and b

100%|██████████| 1/1 [00:16<00:00, 16.61s/it]


Results for page 0 saved to output
Processing page 1 with 2 characters


100%|██████████| 1/1 [00:02<00:00,  2.74s/it]


Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /Users/fadhelerlanggawibawanto/.pyenv/versions/3.11.6/lib/python3.11/site-packages/pulp/apis/../solverdir/cbc/osx/i64/cbc /var/folders/0j/2ms5f4dx5rlfqts07c12w3t40000gp/T/b96567adaa714680ac3915247114cfe9-pulp.mps -timeMode elapsed -branch -printingOptions all -solution /var/folders/0j/2ms5f4dx5rlfqts07c12w3t40000gp/T/b96567adaa714680ac3915247114cfe9-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 17 COLUMNS
At line 72 RHS
At line 85 BOUNDS
At line 95 ENDATA
Problem MODEL has 12 rows, 9 columns and 27 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 2.25 - 0.00 seconds
Cgl0004I processed model has 0 rows, 0 columns (0 integer (0 of which binary)) and 0 elements
Cbc3007W No integer variables - nothing to do
Cuts at root node changed objective from 2.25 to -1.79769e+308
Probing was tried 0 time

100%|██████████| 1/1 [00:07<00:00,  7.09s/it]


Results for page 1 saved to output
Processing page 2 with 3 characters


100%|██████████| 1/1 [00:02<00:00,  2.97s/it]


Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /Users/fadhelerlanggawibawanto/.pyenv/versions/3.11.6/lib/python3.11/site-packages/pulp/apis/../solverdir/cbc/osx/i64/cbc /var/folders/0j/2ms5f4dx5rlfqts07c12w3t40000gp/T/291e0b9f07444624a06e96463859d453-pulp.mps -timeMode elapsed -branch -printingOptions all -solution /var/folders/0j/2ms5f4dx5rlfqts07c12w3t40000gp/T/291e0b9f07444624a06e96463859d453-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 236 COLUMNS
At line 867 RHS
At line 1099 BOUNDS
At line 1148 ENDATA
Problem MODEL has 231 rows, 48 columns and 486 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 8.06813 - 0.00 seconds
Cgl0003I 0 fixed, 0 tightened bounds, 9 strengthened rows, 0 substitutions
Cgl0003I 0 fixed, 0 tightened bounds, 9 strengthened rows, 0 substitutions
Cgl0003I 0 fixed, 0 tightened bounds, 6 strengthened rows, 0 subs

100%|██████████| 1/1 [00:31<00:00, 31.91s/it]


Results for page 2 saved to output
Processing page 3 with 5 characters


100%|██████████| 1/1 [00:03<00:00,  3.14s/it]


Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /Users/fadhelerlanggawibawanto/.pyenv/versions/3.11.6/lib/python3.11/site-packages/pulp/apis/../solverdir/cbc/osx/i64/cbc /var/folders/0j/2ms5f4dx5rlfqts07c12w3t40000gp/T/96d22dd97a5043aa8deab2752eb48da3-pulp.mps -timeMode elapsed -branch -printingOptions all -solution /var/folders/0j/2ms5f4dx5rlfqts07c12w3t40000gp/T/96d22dd97a5043aa8deab2752eb48da3-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 246 COLUMNS
At line 949 RHS
At line 1191 BOUNDS
At line 1252 ENDATA
Problem MODEL has 241 rows, 60 columns and 522 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 6.51454 - 0.00 seconds
Cgl0003I 0 fixed, 0 tightened bounds, 30 strengthened rows, 0 substitutions
Cgl0003I 0 fixed, 0 tightened bounds, 37 strengthened rows, 0 substitutions
Cgl0003I 0 fixed, 0 tightened bounds, 26 strengthened rows, 0 s

100%|██████████| 1/1 [00:10<00:00, 10.85s/it]


Results for page 3 saved to output
Processing page 4 with 4 characters


100%|██████████| 1/1 [00:03<00:00,  3.37s/it]


Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /Users/fadhelerlanggawibawanto/.pyenv/versions/3.11.6/lib/python3.11/site-packages/pulp/apis/../solverdir/cbc/osx/i64/cbc /var/folders/0j/2ms5f4dx5rlfqts07c12w3t40000gp/T/032b601428f14084935ddf3037cf4bba-pulp.mps -timeMode elapsed -branch -printingOptions all -solution /var/folders/0j/2ms5f4dx5rlfqts07c12w3t40000gp/T/032b601428f14084935ddf3037cf4bba-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 345 COLUMNS
At line 1260 RHS
At line 1601 BOUNDS
At line 1667 ENDATA
Problem MODEL has 340 rows, 65 columns and 719 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 8.04546 - 0.00 seconds
Cgl0003I 0 fixed, 0 tightened bounds, 24 strengthened rows, 0 substitutions
Cgl0003I 0 fixed, 0 tightened bounds, 28 strengthened rows, 0 substitutions
Cgl0003I 0 fixed, 0 tightened bounds, 20 strengthened rows, 0 

100%|██████████| 1/1 [00:36<00:00, 36.75s/it]


Results for page 4 saved to output
Processing page 5 with 5 characters


100%|██████████| 1/1 [00:03<00:00,  3.29s/it]


Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /Users/fadhelerlanggawibawanto/.pyenv/versions/3.11.6/lib/python3.11/site-packages/pulp/apis/../solverdir/cbc/osx/i64/cbc /var/folders/0j/2ms5f4dx5rlfqts07c12w3t40000gp/T/9fb8a0e1a1064066986bd19697664e44-pulp.mps -timeMode elapsed -branch -printingOptions all -solution /var/folders/0j/2ms5f4dx5rlfqts07c12w3t40000gp/T/9fb8a0e1a1064066986bd19697664e44-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 478 COLUMNS
At line 1733 RHS
At line 2207 BOUNDS
At line 2292 ENDATA
Problem MODEL has 473 rows, 84 columns and 1002 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 9.09463 - 0.00 seconds
Cgl0003I 0 fixed, 0 tightened bounds, 211 strengthened rows, 0 substitutions
Cgl0003I 0 fixed, 0 tightened bounds, 193 strengthened rows, 0 substitutions
Cgl0003I 0 fixed, 0 tightened bounds, 181 strengthened rows

100%|██████████| 1/1 [00:23<00:00, 23.98s/it]


Results for page 5 saved to output
Processing page 6 with 2 characters


100%|██████████| 1/1 [00:04<00:00,  4.40s/it]


Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /Users/fadhelerlanggawibawanto/.pyenv/versions/3.11.6/lib/python3.11/site-packages/pulp/apis/../solverdir/cbc/osx/i64/cbc /var/folders/0j/2ms5f4dx5rlfqts07c12w3t40000gp/T/1fc93ad88d334955ab57394d195971af-pulp.mps -timeMode elapsed -branch -printingOptions all -solution /var/folders/0j/2ms5f4dx5rlfqts07c12w3t40000gp/T/1fc93ad88d334955ab57394d195971af-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 76 COLUMNS
At line 299 RHS
At line 371 BOUNDS
At line 396 ENDATA
Problem MODEL has 71 rows, 24 columns and 150 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 5.61343 - 0.00 seconds
Cgl0003I 0 fixed, 0 tightened bounds, 6 strengthened rows, 0 substitutions
Cgl0003I 0 fixed, 0 tightened bounds, 6 strengthened rows, 0 substitutions
Cgl0003I 0 fixed, 0 tightened bounds, 4 strengthened rows, 0 substitu

100%|██████████| 1/1 [00:19<00:00, 19.55s/it]


Results for page 6 saved to output
Processing page 7 with 4 characters


100%|██████████| 1/1 [00:03<00:00,  3.15s/it]


Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /Users/fadhelerlanggawibawanto/.pyenv/versions/3.11.6/lib/python3.11/site-packages/pulp/apis/../solverdir/cbc/osx/i64/cbc /var/folders/0j/2ms5f4dx5rlfqts07c12w3t40000gp/T/ca34cac9b3c7436983f866ac4764551f-pulp.mps -timeMode elapsed -branch -printingOptions all -solution /var/folders/0j/2ms5f4dx5rlfqts07c12w3t40000gp/T/ca34cac9b3c7436983f866ac4764551f-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 129 COLUMNS
At line 522 RHS
At line 647 BOUNDS
At line 688 ENDATA
Problem MODEL has 124 rows, 40 columns and 272 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 5.33604 - 0.00 seconds
Cgl0003I 0 fixed, 0 tightened bounds, 24 strengthened rows, 0 substitutions
Cgl0003I 0 fixed, 0 tightened bounds, 28 strengthened rows, 0 substitutions
Cgl0003I 0 fixed, 0 tightened bounds, 20 strengthened rows, 0 sub

100%|██████████| 1/1 [00:24<00:00, 24.20s/it]


Results for page 7 saved to output
Processing page 8 with 3 characters


100%|██████████| 1/1 [00:02<00:00,  2.73s/it]


Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /Users/fadhelerlanggawibawanto/.pyenv/versions/3.11.6/lib/python3.11/site-packages/pulp/apis/../solverdir/cbc/osx/i64/cbc /var/folders/0j/2ms5f4dx5rlfqts07c12w3t40000gp/T/348eaad4dfb3496a89ece4edfae2f67e-pulp.mps -timeMode elapsed -branch -printingOptions all -solution /var/folders/0j/2ms5f4dx5rlfqts07c12w3t40000gp/T/348eaad4dfb3496a89ece4edfae2f67e-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 28 COLUMNS
At line 131 RHS
At line 155 BOUNDS
At line 172 ENDATA
Problem MODEL has 23 rows, 16 columns and 54 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 2.39594 - 0.00 seconds
Cgl0003I 0 fixed, 0 tightened bounds, 3 strengthened rows, 0 substitutions
Cgl0003I 0 fixed, 0 tightened bounds, 3 strengthened rows, 0 substitutions
Cgl0003I 0 fixed, 0 tightened bounds, 3 strengthened rows, 0 substitut

100%|██████████| 1/1 [00:22<00:00, 22.15s/it]


Results for page 8 saved to output
Processing page 9 with 3 characters


100%|██████████| 1/1 [00:02<00:00,  2.92s/it]


Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /Users/fadhelerlanggawibawanto/.pyenv/versions/3.11.6/lib/python3.11/site-packages/pulp/apis/../solverdir/cbc/osx/i64/cbc /var/folders/0j/2ms5f4dx5rlfqts07c12w3t40000gp/T/593e173a7c174a35b6d7f15fdcc2b98d-pulp.mps -timeMode elapsed -branch -printingOptions all -solution /var/folders/0j/2ms5f4dx5rlfqts07c12w3t40000gp/T/593e173a7c174a35b6d7f15fdcc2b98d-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 106 COLUMNS
At line 421 RHS
At line 523 BOUNDS
At line 556 ENDATA
Problem MODEL has 101 rows, 32 columns and 218 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 5.32414 - 0.00 seconds
Cgl0003I 0 fixed, 0 tightened bounds, 3 strengthened rows, 0 substitutions
Cgl0003I 0 fixed, 0 tightened bounds, 3 strengthened rows, 0 substitutions
Cgl0003I 0 fixed, 0 tightened bounds, 3 strengthened rows, 0 substi

100%|██████████| 1/1 [00:14<00:00, 14.96s/it]


Results for page 9 saved to output
Processing complete. Check the output directory for results.


In [4]:
def extract_panels(example, panel_label=2):
    """Collect the panel bounding boxes annotated on one dataset example.

    Args:
        example: mapping with a ``"magi_annotations"`` entry that holds the
            parallel sequences ``"bboxes_as_x1y1x2y2"`` and ``"labels"``.
        panel_label: label value that marks a box as a panel. The default of
            2 is an assumption about the annotation scheme -- TODO confirm
            against the dataset definition.

    Returns:
        list of ``(x1, y1, x2, y2)`` tuples, coordinates truncated to ``int``,
        in annotation order.
    """
    annotations = example["magi_annotations"]
    panels = []
    for bbox, label in zip(annotations["bboxes_as_x1y1x2y2"], annotations["labels"]):
        if label != panel_label:
            continue
        # Unpacking enforces that every box has exactly four coordinates
        x1, y1, x2, y2 = (int(coord) for coord in bbox)
        panels.append((x1, y1, x2, y2))
    return panels

# Example usage: crop every annotated panel of every page in the split and
# save each crop as its own PNG file.
panel_output_dir = "panel_outputs"
os.makedirs(panel_output_dir, exist_ok=True)  # loop-invariant, so done once

for i, example in enumerate(seen_split):
    panels = extract_panels(example)
    if not panels:
        # Nothing to crop; skip decoding the page image altogether
        continue

    # Decode the page once; the context manager closes the file handle
    with Image.open(example["image_path"]) as page_image:
        img = page_image.convert("RGB")

    # Now each panel can be processed separately
    for j, (x1, y1, x2, y2) in enumerate(panels):
        panel_img = img.crop((x1, y1, x2, y2))
        panel_img.save(os.path.join(panel_output_dir, f"page_{i}_panel_{j}.png"))