In [1]:
# Imports
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
import os
import glob
import shutil

In [2]:
# Configuration: load the source CSV and define the image directories
df = pd.read_csv('data/final_data.csv')
output_dir, overwrite_dir = 'data/images', 'data/image overwrites'
os.makedirs(output_dir, exist_ok=True)  # only the output directory is created here

# Echo the setup so the cell output records which inputs were used
print(
    f"Loaded {len(df)} entries from CSV",
    f"Output directory: {output_dir}",
    f"Overwrite directory: {overwrite_dir}",
    sep="\n",
)


Loaded 906 entries from CSV
Output directory: data/images
Overwrite directory: data/image overwrites


In [3]:
# Function: Convert Wikipedia thumbnail URL to original image URL
def get_original_wikimedia_url(thumbnail_url):
    """Convert a Wikimedia thumbnail URL to the URL of the original image.

    Thumbnail format:
        https://upload.wikimedia.org/wikipedia/commons/thumb/a/ab/Example.jpg/198px-Example.jpg
    Original format:
        https://upload.wikimedia.org/wikipedia/commons/a/ab/Example.jpg

    Args:
        thumbnail_url: URL to convert. Anything that is not a string
            (e.g. None or a NaN read from a CSV) is returned untouched.

    Returns:
        The original-image URL (query string removed) when the input is a
        Wikimedia thumbnail URL; otherwise the input, unchanged.
    """
    # Missing values are passed through so the caller decides how to handle them.
    if not isinstance(thumbnail_url, str):
        return thumbnail_url
    if 'upload.wikimedia.org' not in thumbnail_url:
        return thumbnail_url

    # Drop query parameters before inspecting the path.
    url = thumbnail_url.split('?')[0]
    if '/thumb/' not in url:
        return thumbnail_url

    base, path_with_size = url.split('/thumb/', 1)

    # The last path segment is the sized rendition ("198px-Example.jpg");
    # everything before it ("a/ab/Example.jpg") is the original file's path.
    full_path, slash, sized_filename = path_with_size.rpartition('/')
    if not slash:
        return thumbnail_url

    # Only drop the last segment when it really is a sized rendition, i.e. it
    # contains "<digits>px-" (also matches names like "page1-220px-Doc.pdf.jpg").
    # Otherwise the URL has no thumbnail suffix and stripping would break it.
    has_size_marker = any(
        piece[-1:].isdigit() for piece in sized_filename.split('px-')[:-1]
    )
    if not has_size_marker:
        return thumbnail_url

    return f"{base}/{full_path}"

# Function: Download and process image
def download_and_process_image(image_url, output_path):
    """Download an image, crop it to a square, resize to 128x128 and save as PNG.

    Args:
        image_url: URL of the image to fetch. URLs on upload.wikimedia.org are
            first passed through get_original_wikimedia_url.
        output_path: Destination file path; the image is always written in
            PNG format regardless of the source format.

    Returns:
        True if the image was downloaded, processed and saved.
        False if any step failed; the reason is printed rather than raised so
        a batch run can carry on with the remaining images.
    """
    try:
        # Convert Wikipedia thumbnail URLs to original image URLs to avoid 429 errors
        if 'upload.wikimedia.org' in image_url:
            image_url = get_original_wikimedia_url(image_url)

        # Download the image with User-Agent header (required for Wikipedia)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(image_url, headers=headers, timeout=10)
        response.raise_for_status()  # turn HTTP 4xx/5xx into an exception

        # Open image from the downloaded bytes
        img = Image.open(BytesIO(response.content))

        # Convert to RGB if necessary (handles RGBA, P, etc.)
        if img.mode != 'RGB':
            img = img.convert('RGB')

        width, height = img.size

        # Crop to square (centered horizontally, top-aligned vertically)
        if width > height:
            # Landscape: trim equal amounts from the left and right
            left = (width - height) // 2
            right = left + height
            img = img.crop((left, 0, right, height))
        elif height > width:
            # Portrait: keep the top square, discard the bottom
            img = img.crop((0, 0, width, width))
        # If already square, no cropping needed

        # Resize to 128x128
        img = img.resize((128, 128), Image.Resampling.LANCZOS)

        # Save as PNG
        img.save(output_path, 'PNG')
        return True
    except Exception as e:
        # Best-effort by design: report why this image failed instead of
        # silently discarding the error, then signal failure to the caller.
        print(f"Failed to process {image_url!r} -> {output_path}: {type(e).__name__}: {e}")
        return False


In [4]:
# Main processing loop
# For every CSV row: skip images that already exist, otherwise prefer a
# manually supplied overwrite image, otherwise download from the row's URL.
successful = 0
failed = 0
skipped = 0
overwrite_used = 0

for _idx, row in df.iterrows():
    image_url = row['Image']
    image_id = str(row['ID'])
    output_path = os.path.join(output_dir, f"{image_id}.png")

    # Skip if already exists (makes re-running the cell cheap)
    if os.path.exists(output_path):
        skipped += 1
        continue

    # Check if image exists in overwrites directory (any extension).
    # glob order depends on the file system, so sort to make the pick
    # deterministic when several candidates exist for the same ID.
    overwrite_pattern = os.path.join(overwrite_dir, f"{image_id}.*")
    overwrite_files = sorted(glob.glob(overwrite_pattern))

    if overwrite_files:
        # Use the overwrite image as-is (copy directly, no crop/resize)
        overwrite_path = overwrite_files[0]
        try:
            shutil.copy2(overwrite_path, output_path)
        except Exception as e:
            # Report the problem, then fall through to download from URL
            print(f"Could not copy overwrite {overwrite_path} -> {output_path}: {type(e).__name__}: {e}")
        else:
            successful += 1
            overwrite_used += 1
            continue

    # Download and process from URL if no overwrite was used
    if download_and_process_image(image_url, output_path):
        successful += 1
    else:
        failed += 1

print(f"\nCompleted!")
print(f"Successful: {successful}")
print(f"Failed: {failed}")
print(f"Skipped: {skipped}")
print(f"Overwrites used: {overwrite_used}")
print(f"Total: {len(df)}")



Completed!
Successful: 43
Failed: 0
Skipped: 863
Overwrites used: 1
Total: 906


In [5]:
# Publish the processed images into the Next.js public folder
public_images_dir = '../src/talldle/public/images'
os.makedirs(public_images_dir, exist_ok=True)

# Only .png files from the output directory are published; other files are ignored
png_names = [name for name in os.listdir(output_dir) if name.endswith('.png')]
copied_count = 0
for png_name in png_names:
    shutil.copy2(
        os.path.join(output_dir, png_name),
        os.path.join(public_images_dir, png_name),
    )
    copied_count += 1

print(f"Copied {copied_count} images to {public_images_dir}")
print(f"Images will be accessible at /images/{{id}}.png in your Next.js app")


Copied 906 images to ../src/talldle/public/images
Images will be accessible at /images/{id}.png in your Next.js app
