# Dura Bulk Detector
Scrape images from an Instagram profile, detect boats with YOLOv8, run OCR on each detected boat to look for "Dura Bulk" text, and sort the results into Google Drive.

**Output folders in Google Drive:**
- `Dura Bulk/Boat/` — images with "Dura Bulk" text on boats
- `Dura Bulk/Other/` — all other images

## 1. Install Dependencies

In [None]:
!pip install -q instaloader ultralytics easyocr pillow

## 2. Mount Google Drive

In [None]:
# Mount Google Drive, then make sure both output folders exist before
# anything is copied into them.
from google.colab import drive
import os

drive.mount('/content/drive')

BOAT_DIR = '/content/drive/MyDrive/Dura Bulk/Boat'
OTHER_DIR = '/content/drive/MyDrive/Dura Bulk/Other'
for _out_dir in (BOAT_DIR, OTHER_DIR):
    # exist_ok=True keeps re-running this cell harmless.
    os.makedirs(_out_dir, exist_ok=True)

print(f'Boat folder:  {BOAT_DIR}')
print(f'Other folder: {OTHER_DIR}')

## 3. Settings
Enter the Instagram profile name and date range below.

In [None]:
#@title Pipeline Settings { run: "auto" }
# Colab form cell: each `#@param` annotation must stay on the same line as
# its assignment, so do not reformat or wrap these lines.
#   PROFILE     - Instagram username whose posts are scraped.
#   START_DATE  - first day to include, as a YYYY-MM-DD string.
#   END_DATE    - last day to include, as a YYYY-MM-DD string.
#   MAX_POSTS   - upper bound used by the scraping cell.
PROFILE = "durabulk" #@param {type:"string"}
START_DATE = "2025-01-01" #@param {type:"date"}
END_DATE = "2025-12-31" #@param {type:"date"}
MAX_POSTS = 100 #@param {type:"slider", min:10, max:500, step:10}

# Echo the chosen settings so the user can confirm them before scraping.
print(f'Will scrape @{PROFILE} from {START_DATE} to {END_DATE} (max {MAX_POSTS} posts)')

## 4. Scrape Instagram Profile Images
Uses `Profile.from_username` + `download_pic` — no login required for public profiles.

In [None]:
import tempfile
import shutil
from datetime import datetime
from pathlib import Path
import instaloader

# Scratch directory that receives the downloaded images.
TMP_DIR = tempfile.mkdtemp(prefix='dura_bulk_')

# Instaloader configured to skip videos, thumbnails, geotags, comments and
# metadata files.
L = instaloader.Instaloader(
    download_videos=False,
    download_video_thumbnails=False,
    download_geotags=False,
    download_comments=False,
    save_metadata=False,
    compress_json=False,
    post_metadata_txt_pattern='',
)

start_dt = datetime.strptime(START_DATE, '%Y-%m-%d')
end_dt = datetime.strptime(END_DATE, '%Y-%m-%d')
if start_dt > end_dt:
    # An inverted range would silently download nothing; fail loudly instead.
    raise ValueError(f'START_DATE ({START_DATE}) is after END_DATE ({END_DATE})')

print(f'Fetching posts from @{PROFILE}...')
profile = instaloader.Profile.from_username(L.context, PROFILE)

image_paths = []
count = 0

# The feed is walked newest-first, but pinned posts are listed ahead of the
# rest and may be arbitrarily old. Stopping at the very first too-old post
# would therefore end the scan before any regular post is seen, so a short
# run of too-old posts is tolerated before giving up.
# NOTE(review): 3 assumes Instagram's pinned-post limit - confirm.
PINNED_POST_ALLOWANCE = 3
too_old_streak = 0

for post in profile.get_posts():
    if count >= MAX_POSTS:
        break
    post_date = post.date_utc
    if post_date.date() < start_dt.date():
        too_old_streak += 1
        if too_old_streak > PINNED_POST_ALLOWANCE:
            break
        continue
    too_old_streak = 0
    if post_date.date() > end_dt.date():
        continue
    if post.is_video:
        continue

    # download_pic appends the file extension itself and it is not always
    # ".jpg", so hand it the path without an extension and look up whatever
    # file it produced afterwards.
    stem = f"{post_date.strftime('%Y%m%d_%H%M%S')}_{post.shortcode}"
    try:
        L.download_pic(os.path.join(TMP_DIR, stem), post.url, post.date_utc)
    except Exception as e:
        # Best effort: one failed download must not abort the whole scrape.
        print(f'  Skipped: {e}')
        continue

    saved = sorted(Path(TMP_DIR).glob(f'{stem}.*'))
    if not saved:
        print(f'  Skipped: no file was saved for post {post.shortcode}')
        continue

    image_paths.append(saved[0])
    # Count only images that really exist on disk, so MAX_POSTS and the
    # progress message reflect actual downloads.
    count += 1
    if count % 5 == 0:
        print(f'  Downloaded {count} images...')

print(f'\nDone! Downloaded {len(image_paths)} images from @{PROFILE}')

### 4b. (Alternative) Upload images manually
If scraping doesn't work or you already have images, upload them here instead.

In [None]:
# Uncomment and run this cell to upload images manually instead of scraping

# from google.colab import files
# import tempfile
# from pathlib import Path
#
# TMP_DIR = tempfile.mkdtemp(prefix='dura_bulk_upload_')
# uploaded = files.upload()
# for name, data in uploaded.items():
#     with open(os.path.join(TMP_DIR, name), 'wb') as f:
#         f.write(data)
#
# image_paths = [
#     f for f in Path(TMP_DIR).rglob('*')
#     if f.suffix.lower() in ('.jpg', '.jpeg', '.png', '.webp')
# ]
# print(f'Uploaded {len(image_paths)} images')

## 5. Detect Boats + OCR for "Dura Bulk"

In [None]:
import re
from PIL import Image
from ultralytics import YOLO
import easyocr
from IPython.display import display, HTML

# Load models
# Both models are created once here and reused for every image processed
# later in this cell.
print('Loading YOLOv8 model...')
model = YOLO('yolov8n.pt')  # object detector used to find boats
print('Loading EasyOCR reader...')
reader = easyocr.Reader(['en'], gpu=True)  # English-only OCR reader, GPU requested
print('Models ready!\n')


def fuzzy_match_dura_bulk(text):
    """Return True when ``text`` appears to mention "Dura Bulk".

    The comparison is case-insensitive. A match is reported when both
    "dura" and "bulk" occur anywhere in the text, or when "durabulk" shows
    up after every character other than a-z and 0-9 has been removed
    (which covers spellings such as "DURA-BULK" or "D U R A B U L K").
    """
    lowered = text.lower().strip()
    both_words_present = 'dura' in lowered and 'bulk' in lowered
    if both_words_present:
        return True
    # Drop spaces and punctuation so split-up spellings still line up.
    squashed = re.sub(r'[^a-z0-9]', '', lowered)
    return 'durabulk' in squashed


# Imported here as well so this cell also works when the images came from the
# manual-upload cell (4b) and the scraping cell, which imports shutil, was skipped.
import os
import shutil

BOAT_CLASS_ID = 8  # 8 = boat in COCO


def _ocr_boat_crop(img, box, crop_path):
    """Return the OCR text found inside one detected box, or '' if there is none.

    The box region is written to ``crop_path``, read back by the OCR reader,
    and the temporary file is always removed afterwards. OCR failures are
    reported on stdout and treated as "no text" so one bad crop cannot abort
    the whole run.
    """
    x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
    if x2 <= x1 or y2 <= y1:
        # Truncating to int can collapse a tiny box; an empty crop has nothing to read.
        return ''
    try:
        img.crop((x1, y1, x2, y2)).save(crop_path)
        ocr_results = reader.readtext(crop_path)
        # The second element of each OCR result is used as the recognised text.
        return ' '.join(r[1] for r in ocr_results)
    except Exception as exc:
        print(f'[OCR failed: {exc}]', end=' ')
        return ''
    finally:
        if os.path.exists(crop_path):
            os.remove(crop_path)


boat_images = []
other_images = []

for i, img_path in enumerate(image_paths):
    print(f'[{i+1}/{len(image_paths)}] {img_path.name}', end=' ')

    try:
        # The context manager closes the file handle once the RGB copy exists.
        with Image.open(img_path) as src:
            img = src.convert('RGB')
    except Exception:
        print('- skipped (can\'t open)')
        continue

    results = model(img, verbose=False)
    is_dura = False
    boat_count = 0
    crop_path = str(img_path) + '_crop.jpg'

    for result in results:
        for box in result.boxes:
            if int(box.cls[0]) != BOAT_CLASS_ID:
                continue
            boat_count += 1

            all_text = _ocr_boat_crop(img, box, crop_path)
            if all_text.strip():
                print(f'[OCR: "{all_text.strip()}"]', end=' ')
            if fuzzy_match_dura_bulk(all_text):
                is_dura = True
                break

        # One matching boat is enough; skip the remaining results.
        if is_dura:
            break

    # Use original filename from Instagram
    dest_name = img_path.name
    if is_dura:
        shutil.copy2(img_path, os.path.join(BOAT_DIR, dest_name))
        boat_images.append(dest_name)
        print('-> DURA BULK (boat detected, text matched)')
    else:
        shutil.copy2(img_path, os.path.join(OTHER_DIR, dest_name))
        other_images.append(dest_name)
        if boat_count > 0:
            print(f'-> Other ({boat_count} boat(s), no match)')
        else:
            print('-> Other (no boats detected)')

# Cleanup: the scratch copies are no longer needed once everything has been
# copied to Drive. Re-running this cell afterwards requires fetching the
# images again first.
shutil.rmtree(TMP_DIR, ignore_errors=True)

print(f'\n{"="*50}')
print('RESULTS')
print(f'{"="*50}')
print(f'Dura Bulk (Boat folder): {len(boat_images)} images')
print(f'Other:                   {len(other_images)} images')
print(f'Total processed:         {len(boat_images) + len(other_images)} images')
print('\nFiles saved to Google Drive:')
print(f'  {BOAT_DIR}')
print(f'  {OTHER_DIR}')

## 6. Preview Results

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

def show_gallery(folder, title, file_list, max_show=12):
    """Show up to ``max_show`` images from ``folder`` in a grid of at most 4 columns.

    Args:
        folder: Directory that contains the image files.
        title: Figure heading; the total number of files is appended to it.
        file_list: File names (relative to ``folder``) to preview.
        max_show: Maximum number of images to draw.

    Returns:
        None. When there is nothing to show, a short message is printed and
        no figure is created. Images that cannot be read leave their cell
        empty and are reported on stdout.
    """
    files = file_list[:max_show]
    if not files:
        print(f'{title}: no images')
        return

    cols = min(4, len(files))
    rows = (len(files) + cols - 1) // cols
    # squeeze=False always returns a 2-D array of axes, so single-row and
    # single-cell grids need no special handling.
    fig, axes = plt.subplots(rows, cols, figsize=(4*cols, 4*rows), squeeze=False)
    fig.suptitle(f'{title} ({len(file_list)} total)', fontsize=16, fontweight='bold')

    # Row-major order, i.e. left to right, then top to bottom.
    cells = list(axes.flat)

    for ax, name in zip(cells, files):
        try:
            ax.imshow(mpimg.imread(os.path.join(folder, name)))
        except Exception as exc:
            # Best effort: keep the rest of the gallery, but say what went wrong.
            print(f'{title}: could not read {name}: {exc}')
        ax.set_title(name, fontsize=8)
        ax.axis('off')

    # Hide empty subplots
    for ax in cells[len(files):]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# Preview both output folders, matched images first.
for _folder, _heading, _names in (
    (BOAT_DIR, 'Dura Bulk (Boat)', boat_images),
    (OTHER_DIR, 'Other', other_images),
):
    show_gallery(_folder, _heading, _names)

## 7. (Optional) Run as Backend for GitHub Pages UI
Run this cell to expose the Colab as an API backend. Paste the ngrok URL into the GitHub Pages frontend.

In [None]:
# Uncomment to run as a web backend (requires ngrok auth token)
# Get your free token at https://dashboard.ngrok.com/get-started/your-authtoken

# NGROK_TOKEN = "your_ngrok_token_here"  #@param {type:"string"}
#
# !pip install -q flask flask-cors pyngrok
#
# import threading
# from pyngrok import ngrok
#
# ngrok.set_auth_token(NGROK_TOKEN)
# public_url = ngrok.connect(5001)
# print(f'\n*** Public URL: {public_url} ***')
# print('Paste this URL into the GitHub Pages frontend to connect.\n')
#
# # Run the Flask app from the repo
# !git clone https://github.com/ahnjili/artificialnouveauworkshops.git /tmp/anw 2>/dev/null; \
#  cd /tmp/anw/dura_bulk && python app.py