# IMPORTANT: In the top right corner, click “Connect” and wait until it shows that it is connected with a green checkmark. Once connected, run each code cell by clicking the Play button in the top left of the cell or by pressing Shift + Enter. Be sure to run the cells in order from top to bottom so that all variables and libraries are properly loaded before later code executes.


# Dura Bulk Detector
Scrape Instagram images by hashtag, detect boats with YOLOv8, OCR for "Dura Bulk" text, and sort results into Google Drive.

**Output folders in Google Drive:**
- `Dura Bulk/Boat/` — images with "Dura Bulk" text on boats
- `Dura Bulk/Other/` — all other images

## 1. Install Dependencies

In [None]:
!pip install -q apify-client ultralytics easyocr pillow

## 2. Mount Google Drive

In [2]:
import os

from google.colab import drive

# Attach Google Drive to the runtime so the sorted images can be written to it.
drive.mount('/content/drive')

# Destination folders: matches go to Boat, everything else to Other.
BOAT_DIR = '/content/drive/MyDrive/Dura Bulk/Boat'
OTHER_DIR = '/content/drive/MyDrive/Dura Bulk/Other'

# Create both folders up front; exist_ok keeps re-runs of this cell harmless.
for _folder in (BOAT_DIR, OTHER_DIR):
    os.makedirs(_folder, exist_ok=True)

print(f'Boat folder:  {BOAT_DIR}')
print(f'Other folder: {OTHER_DIR}')

Mounted at /content/drive
Boat folder:  /content/drive/MyDrive/Dura Bulk/Boat
Other folder: /content/drive/MyDrive/Dura Bulk/Other


## 3. Settings
Enter your hashtag and date range below!

### Instagram Login (Session File)
Instagram blocks direct username/password login from scripts. You must create a **session file** first:

1. **On your local computer** (not in Colab), open a terminal and run:
   ```
   pip install instaloader
   instaloader --login YOUR_USERNAME
   ```
   It will ask for your password (and possibly a 2FA code). This creates a session file at:
   - **Mac/Linux:** `~/.config/instaloader/session-YOUR_USERNAME`
   - **Windows:** `%LOCALAPPDATA%\instaloader\session-YOUR_USERNAME`

2. **Upload the session file** to Colab when prompted in the next cell.

**Tip:** Use a throwaway Instagram account for this.

In [None]:
#@title Pipeline Settings { run: "auto" }
HASHTAG = "santacruzdetenerife" #@param {type:"string"}
START_DATE = "2025-08-20" #@param {type:"date"}
END_DATE = "2025-10-01" #@param {type:"date"}
MAX_POSTS = 100 #@param {type:"slider", min:10, max:500, step:10}

# Instagram username (must match the session file you created locally)
INSTA_USERNAME = "" #@param {type:"string"}

from datetime import date

# Accept "#tag" as well as "tag"; the scraper expects the bare hashtag name.
HASHTAG = HASHTAG.strip().lstrip('#')

print(f'Will scrape #{HASHTAG} from {START_DATE} to {END_DATE} (max {MAX_POSTS} posts)')

# The scraping cells parse both dates as YYYY-MM-DD, so flag problems here
# instead of letting the scrape fail or silently return nothing.
try:
    _start_day = date.fromisoformat(START_DATE)
    _end_day = date.fromisoformat(END_DATE)
except ValueError:
    print('⚠️  START_DATE and END_DATE must be in YYYY-MM-DD format')
else:
    if _start_day > _end_day:
        print('⚠️  START_DATE is after END_DATE — no posts will match')

if INSTA_USERNAME:
    print(f'Will load session for @{INSTA_USERNAME}')
else:
    print('⚠️  No username set — fill in INSTA_USERNAME above')

## 3b. Upload Session File
Run this cell to load the session file you created on your local machine and download the posts for the hashtag.

In [None]:
import tempfile
from datetime import datetime
from pathlib import Path
import instaloader

# Scratch directory for this run's downloads. The detection cell deletes it
# once the images have been copied to Google Drive.
TMP_DIR = tempfile.mkdtemp(prefix='dura_bulk_')

# Images only: videos, thumbnails, geotags, comments and metadata are skipped.
L = instaloader.Instaloader(
    dirname_pattern=TMP_DIR,
    download_videos=False,
    download_video_thumbnails=False,
    download_geotags=False,
    download_comments=False,
    save_metadata=False,
    post_metadata_txt_pattern='',
)

# Hashtag feeds are only roughly newest-first, so one post older than
# START_DATE does not prove that everything after it is older as well.
# Stop only after this many too-old posts in a row.
MAX_OLDER_STREAK = 50

# Load session file (created locally with: instaloader --login USERNAME)
if INSTA_USERNAME:
    session_name = f'session-{INSTA_USERNAME}'
    try:
        # First try instaloader's default location, in case the file was
        # placed there by hand.
        L.load_session_from_file(INSTA_USERNAME)
    except FileNotFoundError:
        session_path = Path.cwd() / session_name
        if not session_path.exists():
            # Nothing on disk yet: ask for the file through Colab's upload dialog.
            from google.colab import files
            print(f'Select your session file ("{session_name}") in the upload dialog...')
            uploaded = files.upload()
            if uploaded:
                # Store it under the expected name even if the browser
                # renamed the upload (e.g. "session-name (1)").
                session_path.write_bytes(next(iter(uploaded.values())))
        if not session_path.exists():
            print('Session file not found! Re-run this cell and upload the file '
                  'created with: instaloader --login YOUR_USERNAME')
            raise
        L.load_session_from_file(INSTA_USERNAME, str(session_path))
    print(f'Loaded session for @{INSTA_USERNAME}')
else:
    print('No username set — scraping without login (will likely fail)')

start_dt = datetime.strptime(START_DATE, '%Y-%m-%d')
end_dt = datetime.strptime(END_DATE, '%Y-%m-%d')

print(f'Scraping #{HASHTAG}...')
count = 0
older_streak = 0
try:
    posts = instaloader.Hashtag.from_name(L.context, HASHTAG).get_posts()
    for post in posts:
        if count >= MAX_POSTS:
            break
        post_day = post.date_local.date()
        if post_day > end_dt.date():
            # Newer than the requested window: keep walking back in time.
            continue
        if post_day < start_dt.date():
            # Older than the window: tolerate a few stragglers before stopping.
            older_streak += 1
            if older_streak > MAX_OLDER_STREAK:
                break
            continue
        older_streak = 0
        try:
            L.download_post(post, target='')
            count += 1
            if count % 5 == 0:
                print(f'  Downloaded {count} posts...')
        except Exception as e:
            # One broken post must not end the whole scrape.
            print(f'  Skipped post: {e}')
            continue
except Exception as e:
    # Notebook boundary: report the problem and carry on with what was downloaded.
    print(f'Scrape error: {e}')

# Every image file instaloader wrote into the scratch directory.
image_paths = [
    f for f in Path(TMP_DIR).rglob('*')
    if f.suffix.lower() in ('.jpg', '.jpeg', '.png', '.webp')
]
print(f'\nDone! Downloaded {len(image_paths)} images from {count} posts')

In [5]:
import tempfile
from datetime import datetime
from pathlib import Path
import instaloader

# Legacy fallback: scrape with username/password login.
# It only runs when no images have been collected yet, so it never discards
# the downloads of an earlier scraping cell.
if globals().get('image_paths'):
    print(f'Skipping password-login scrape: {len(image_paths)} images already downloaded.')
else:
    TMP_DIR = tempfile.mkdtemp(prefix='dura_bulk_')

    # Images only: videos, thumbnails, geotags, comments and metadata are skipped.
    L = instaloader.Instaloader(
        dirname_pattern=TMP_DIR,
        download_videos=False,
        download_video_thumbnails=False,
        download_geotags=False,
        download_comments=False,
        save_metadata=False,
        post_metadata_txt_pattern='',
    )

    # INSTA_PASSWORD is optional: the settings cell does not define it, so
    # read it defensively instead of raising NameError.
    insta_password = globals().get('INSTA_PASSWORD', '')
    if INSTA_USERNAME and insta_password:
        try:
            L.login(INSTA_USERNAME, insta_password)
            print('Logged in to Instagram')
        except Exception as e:
            print(f'Login failed (continuing without login): {e}')

    start_dt = datetime.strptime(START_DATE, '%Y-%m-%d')
    end_dt = datetime.strptime(END_DATE, '%Y-%m-%d')

    # Hashtag feeds are only roughly newest-first; stop only after this many
    # too-old posts in a row rather than at the first one.
    MAX_OLDER_STREAK = 50

    print(f'Scraping #{HASHTAG}...')
    count = 0
    older_streak = 0
    try:
        posts = instaloader.Hashtag.from_name(L.context, HASHTAG).get_posts()
        for post in posts:
            if count >= MAX_POSTS:
                break
            post_day = post.date_local.date()
            if post_day > end_dt.date():
                # Newer than the requested window: keep walking back in time.
                continue
            if post_day < start_dt.date():
                older_streak += 1
                if older_streak > MAX_OLDER_STREAK:
                    break
                continue
            older_streak = 0
            try:
                L.download_post(post, target='')
                count += 1
                if count % 5 == 0:
                    print(f'  Downloaded {count} posts...')
            except Exception as e:
                # One broken post must not end the whole scrape.
                print(f'  Skipped post: {e}')
                continue
    except Exception as e:
        # Notebook boundary: report the problem and carry on.
        print(f'Scrape error: {e}')

    # Every image file instaloader wrote into the scratch directory.
    image_paths = [
        f for f in Path(TMP_DIR).rglob('*')
        if f.suffix.lower() in ('.jpg', '.jpeg', '.png', '.webp')
    ]
    print(f'\nDone! Downloaded {len(image_paths)} images from {count} posts')

Login failed (continuing without login): Login error: "fail" status, message "CSRF token missing or incorrect".
Scraping #santacruzdetenerife...


JSON Query to api/v1/tags/web_info/: 403 Forbidden - "fail" status, message "login_required" when accessing https://i.instagram.com/api/v1/tags/web_info/?__a=1&__d=dis&tag_name=santacruzdetenerife [retrying; skip with ^C]
JSON Query to api/v1/tags/web_info/: 403 Forbidden - "fail" status, message "login_required" when accessing https://i.instagram.com/api/v1/tags/web_info/?__a=1&__d=dis&tag_name=santacruzdetenerife [retrying; skip with ^C]


Scrape error: JSON Query to api/v1/tags/web_info/: 403 Forbidden - "fail" status, message "login_required" when accessing https://i.instagram.com/api/v1/tags/web_info/?__a=1&__d=dis&tag_name=santacruzdetenerife

Done! Downloaded 0 images from 0 posts


### 4b. (Alternative) Upload images manually
If scraping doesn't work or you already have images, upload them here instead.

In [6]:
# Uncomment and run this cell to upload images manually instead of scraping

# from google.colab import files
# import tempfile
# from pathlib import Path
#
# TMP_DIR = tempfile.mkdtemp(prefix='dura_bulk_upload_')
# uploaded = files.upload()
# for name, data in uploaded.items():
#     with open(os.path.join(TMP_DIR, name), 'wb') as f:
#         f.write(data)
#
# image_paths = [
#     f for f in Path(TMP_DIR).rglob('*')
#     if f.suffix.lower() in ('.jpg', '.jpeg', '.png', '.webp')
# ]
# print(f'Uploaded {len(image_paths)} images')

## 5. Detect Boats + OCR for "Dura Bulk"

In [None]:
import re
import shutil
from PIL import Image
from ultralytics import YOLO
import easyocr
from IPython.display import display, HTML

# Load models
# Both objects are created once here and reused for every image in the loop below.
print('Loading YOLOv8 model...')
# NOTE(review): presumably Ultralytics fetches 'yolov8n.pt' when it is not on disk — confirm.
model = YOLO('yolov8n.pt')
print('Loading EasyOCR reader...')
# English-only OCR reader; gpu=True asks EasyOCR to use a GPU.
reader = easyocr.Reader(['en'], gpu=True)
print('Models ready!\n')


def fuzzy_match_dura_bulk(text):
    """Return True when OCR text appears to contain the name "Dura Bulk".

    Two checks are applied to the lower-cased, stripped text:

    1. both words "dura" and "bulk" occur somewhere in it, or
    2. after dropping every character other than a-z and 0-9, the run
       "durabulk" occurs (covers "D-U-R-A B.U.L.K" style readings).
    """
    lowered = text.lower().strip()
    if all(word in lowered for word in ('dura', 'bulk')):
        return True

    # Collapse separators and punctuation that OCR tends to insert.
    squashed = re.sub(r'[^a-z0-9]', '', lowered)
    return 'durabulk' in squashed


# COCO class index for "boat" (the label set of the pretrained YOLOv8 weights).
BOAT_CLASS_ID = 8


def _scan_boats_for_dura_bulk(img, img_path):
    """Look for "Dura Bulk" lettering on the boats detected in one image.

    Args:
        img: RGB PIL image to analyse.
        img_path: Path of the source file; a temporary crop is written next to it.

    Returns:
        Tuple ``(is_dura, boat_count)``: whether any boat crop matched, and
        how many boats were examined before the scan stopped.
    """
    boat_count = 0
    crop_path = str(img_path) + '_crop.jpg'

    for result in model(img, verbose=False):
        for box in result.boxes:
            if int(box.cls[0]) != BOAT_CLASS_ID:
                continue
            boat_count += 1

            x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
            try:
                # Saving sits inside the try so that a degenerate (empty) box
                # skips this boat instead of aborting the whole run.
                img.crop((x1, y1, x2, y2)).save(crop_path)
                ocr_results = reader.readtext(crop_path)
            except Exception as e:
                print(f'[OCR failed: {e}]', end=' ')
                continue
            finally:
                # Never leave crops behind; they would otherwise sit next to
                # the downloaded images.
                if os.path.exists(crop_path):
                    os.remove(crop_path)

            all_text = ' '.join(r[1] for r in ocr_results)
            if all_text.strip():
                print(f'[OCR: "{all_text.strip()}"]', end=' ')
            if fuzzy_match_dura_bulk(all_text):
                # One matching boat is enough; skip the remaining boxes.
                return True, boat_count

    return False, boat_count


boat_images = []
other_images = []

if not image_paths:
    print('No images to process — run the scraping or manual-upload cell first.')

for i, img_path in enumerate(image_paths):
    print(f'[{i+1}/{len(image_paths)}] {img_path.name}', end=' ')

    try:
        img = Image.open(img_path).convert('RGB')
    except Exception:
        print('- skipped (can\'t open)')
        continue

    is_dura, boat_count = _scan_boats_for_dura_bulk(img, img_path)

    # Sort into Google Drive folders
    dest_name = f'{HASHTAG}_{i:04d}{img_path.suffix}'
    if is_dura:
        shutil.copy2(img_path, os.path.join(BOAT_DIR, dest_name))
        boat_images.append(dest_name)
        print('-> DURA BULK (boat detected, text matched)')
    else:
        shutil.copy2(img_path, os.path.join(OTHER_DIR, dest_name))
        other_images.append(dest_name)
        if boat_count > 0:
            print(f'-> Other ({boat_count} boat(s), no match)')
        else:
            print('-> Other (no boats detected)')

# Cleanup: the originals have been copied to Drive, so drop the scratch directory.
shutil.rmtree(TMP_DIR, ignore_errors=True)

print(f'\n{"="*50}')
print('RESULTS')
print(f'{"="*50}')
print(f'Dura Bulk (Boat folder): {len(boat_images)} images')
print(f'Other:                   {len(other_images)} images')
print(f'Total processed:         {len(boat_images) + len(other_images)} images')
print('\nFiles saved to Google Drive:')
print(f'  {BOAT_DIR}')
print(f'  {OTHER_DIR}')

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


## 6. Preview Results

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

def show_gallery(folder, title, file_list, max_show=12):
    """Show the first ``max_show`` images of ``file_list`` in a grid.

    Args:
        folder: Directory that contains the image files.
        title: Figure heading; also used in the "no images" message.
        file_list: File names inside ``folder``. The full length is shown in
            the heading even when only part of the list is drawn.
        max_show: Maximum number of images to draw.

    Returns:
        None. Prints a message instead of drawing when there is nothing to show.
    """
    files = file_list[:max_show]
    if not files:
        print(f'{title}: no images')
        return

    # At most four columns; as many rows as needed to fit every image.
    cols = min(4, len(files))
    rows = (len(files) + cols - 1) // cols
    # squeeze=False always returns a 2-D array of axes, so the 1x1 and
    # single-row layouts need no special handling.
    fig, axes = plt.subplots(rows, cols, figsize=(4*cols, 4*rows), squeeze=False)
    fig.suptitle(f'{title} ({len(file_list)} total)', fontsize=16, fontweight='bold')

    # Row-major flattening: cell k is row k // cols, column k % cols.
    cells = axes.ravel()

    for ax, name in zip(cells, files):
        try:
            ax.imshow(mpimg.imread(os.path.join(folder, name)))
        except Exception as e:
            # Keep the (empty) tile and say why it is empty.
            print(f'Could not display {name}: {e}')
        ax.set_title(name, fontsize=8)
        ax.axis('off')

    # Hide empty subplots
    for ax in cells[len(files):]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# Preview both result folders, matches first.
for _gallery_dir, _gallery_title, _gallery_files in (
    (BOAT_DIR, 'Dura Bulk (Boat)', boat_images),
    (OTHER_DIR, 'Other', other_images),
):
    show_gallery(_gallery_dir, _gallery_title, _gallery_files)