# Jan-Sunwai AI — Dataset Re-Sorter (Kaggle Edition)

Re-sorts the civic complaint image dataset using the **two-step Ollama AI pipeline** — upgraded for Kaggle's T4 GPU (16 GB VRAM):

| Step | Model | Size | vs local |
|------|-------|------|----------|
| Vision (eyes) | `qwen2.5vl:7b` | 4.9 GB | 2× larger than local 3b |
| Reasoning (brain) | `llama3.1:8b` | 4.7 GB | 8× larger than local 1b |

Both models run sequentially (~9.6 GB peak VRAM), well within T4 16 GB.

## Before running
1. Upload your `sorted_dataset/` folder as a **Kaggle Dataset**:  
   - Kaggle → Datasets → New Dataset → upload the zip of `backend/sorted_dataset/`  
   - Name it e.g. `jan-sunwai-sorted-dataset`
2. Add that dataset to this notebook: **Add Data → Your Datasets → select it**
3. Enable **GPU accelerator** (Settings → Accelerator → GPU T4 x1)

4. Enable **Internet access** (Settings → Internet → On)
5. Run All

## Output
- `/kaggle/working/ai_sorted_dataset/<Category>/` — correctly sorted images  
- `/kaggle/working/ai_resort_report.csv` — full classification history  
- Download from the Output tab after the run completes

## 1 — Install Ollama and pull models

In [None]:
import subprocess, time, os, sys

# Install Ollama by piping the official install script through the shell.
print('Installing Ollama...')
result = subprocess.run(
    'curl -fsSL https://ollama.com/install.sh | sh',
    shell=True,
    capture_output=True,
    text=True,
)

# Show only the tail of the installer's stdout; print a short marker when
# the script wrote nothing there.
if result.stdout:
    print(result.stdout[-500:])
else:
    print('done')

# A non-zero exit is reported (tail of stderr) but does not stop the notebook.
if result.returncode != 0:
    print('STDERR:', result.stderr[-300:])

In [None]:
# Start Ollama server in background
server = subprocess.Popen(
    ['ollama', 'serve'],
    stdout=subprocess.DEVNULL,
    stderr=subprocess.DEVNULL
)
print(f'Ollama server PID: {server.pid}')

# Poll until the server answers instead of trusting a fixed sleep: start-up
# time varies between sessions, and a server that crashed on launch would
# otherwise only surface later as a confusing `ollama pull` failure.
deadline = time.monotonic() + 60  # seconds allowed for start-up
while True:
    r = subprocess.run(['ollama', 'list'], capture_output=True, text=True)
    if r.returncode == 0:
        break  # the CLI reached the server
    if server.poll() is not None:
        raise RuntimeError(
            f'Ollama server exited during start-up (exit code {server.returncode}).'
        )
    if time.monotonic() >= deadline:
        raise RuntimeError(
            'Ollama server did not become ready within 60 s: '
            + (r.stderr.strip() or 'no error output')
        )
    time.sleep(1)

# Verify it's up
print(r.stdout or 'Server ready')

In [None]:
# Download both models through the Ollama CLI (~9.6 GB in total, so expect
# roughly 10-20 min depending on Kaggle bandwidth).  A failed pull raises
# CalledProcessError because of check=True.
for banner, model_tag in (
    ('Pulling qwen2.5vl:7b (vision model, 4.9 GB)...', 'qwen2.5vl:7b'),
    ('\nPulling llama3.1:8b (reasoning model, 4.7 GB)...', 'llama3.1:8b'),
):
    print(banner)
    subprocess.run(['ollama', 'pull', model_tag], check=True)

# List what the server now has available.
print('\nLoaded models:')
subprocess.run(['ollama', 'list'])

## 2 — Locate dataset

In [None]:
from pathlib import Path

# Kaggle mounts datasets under /kaggle/input/<dataset-slug>/
# Adjust DATASET_SLUG to match what you named your Kaggle dataset
DATASET_SLUG = 'jan-sunwai-sorted-dataset'  # ← change if you used a different name

INPUT_ROOT = Path('/kaggle/input') / DATASET_SLUG

# Auto-fallback: find the folder that directly contains the category sub-dirs.
# Besides a wrong slug, this also covers a dataset whose zip unpacked into an
# extra nesting level (e.g. <slug>/sorted_dataset/<Category>/): there
# INPUT_ROOT exists but holds no category folders, so nothing would be found.
if not (INPUT_ROOT / 'Municipal_-_Sanitation').is_dir():
    search_base = INPUT_ROOT if INPUT_ROOT.exists() else Path('/kaggle/input')
    # sorted() makes the pick deterministic; glob order depends on the filesystem.
    candidates = sorted(
        p for p in search_base.glob('**/Municipal_-_Sanitation') if p.is_dir()
    )
    if candidates:
        INPUT_ROOT = candidates[0].parent
        print(f'Auto-detected dataset at: {INPUT_ROOT}')
    elif not INPUT_ROOT.exists():
        print('Available inputs:')
        for p in Path('/kaggle/input').iterdir():
            print(' ', p)
        raise FileNotFoundError(
            'Dataset not found. Update DATASET_SLUG to match your Kaggle dataset name.'
        )
    # Otherwise INPUT_ROOT exists but has no marker folder anywhere below it:
    # keep it unchanged and let the per-folder listing below show what is there.

OUTPUT_ROOT = Path('/kaggle/working/ai_sorted_dataset')
REPORT_CSV  = Path('/kaggle/working/ai_resort_report.csv')

print(f'Source : {INPUT_ROOT}')
print(f'Output : {OUTPUT_ROOT}')

# Count images per folder
IMAGE_EXTS = {'.jpg', '.jpeg', '.png', '.webp', '.bmp', '.tiff'}
total = 0
for folder in sorted(INPUT_ROOT.iterdir()):
    if folder.is_dir():
        n = sum(1 for p in folder.iterdir() if p.suffix.lower() in IMAGE_EXTS)
        total += n
        print(f'  {folder.name:<45} {n:>5} images')
print(f'\nTotal: {total} images')
print(f'Estimated time at ~30 s/img on T4: ~{total*30/3600:.1f} h')

## 3 — Classifier (self-contained, no local imports)

In [None]:
import io
import ollama
from PIL import Image

# Each canonical department label mapped to the kinds of problems it covers.
# The descriptions are inserted verbatim into the reasoning prompt.
CATEGORY_DEFINITIONS = {
    'Municipal - PWD (Roads)':
        'broken roads, potholes, cracked pavement, damaged footpaths, bridge damage',
    'Municipal - Sanitation':
        'garbage dumps, overflowing trash bins, dirty public toilets, waste piles on streets',
    'Municipal - Horticulture':
        'fallen or uprooted trees, overgrown vegetation, unmaintained parks, dead/dry plants',
    'Municipal - Street Lighting':
        'broken street lights, non-functional lamp posts, dark or unlit public roads',
    'Municipal - Water & Sewerage':
        'waterlogging, flooded streets, blocked drains, sewer overflow, water pipe leaks',
    'Utility - Power (DISCOM)':
        'dangling electrical wires, open or damaged transformers, hazardous power cables',
    'State Transport':
        'damaged bus shelters, broken state buses, transport terminal damage',
    'Pollution Control Board':
        'air pollution, thick smoke, industrial waste dumping, open burning of garbage',
    'Police - Local Law Enforcement':
        'illegal parking, footpath encroachment, public nuisance, fights or brawls',
    'Police - Traffic':
        'failed traffic signals, road blockages, severe traffic congestion',
    'Uncategorized':
        'does not clearly match any of the above civic categories',
}

# Ordered list of the labels above (dicts keep insertion order, so this is
# exactly the order in which the definitions are written).
CANONICAL_CATEGORIES = list(CATEGORY_DEFINITIONS)

# Ollama model tags used by the two pipeline steps on the Kaggle T4.
VISION_MODEL = 'qwen2.5vl:7b'     # image description step, 4.9 GB
REASONING_MODEL = 'llama3.1:8b'   # category selection step, 4.7 GB

# Lower-cased spellings a model may return, mapped to the canonical label.
_ALIAS_MAP = {
    'municipal - pwd (roads)':         'Municipal - PWD (Roads)',
    'municipal - pwd roads':           'Municipal - PWD (Roads)',
    'municipal - sanitation':          'Municipal - Sanitation',
    'municipal - horticulture':        'Municipal - Horticulture',
    'municipal - street lighting':     'Municipal - Street Lighting',
    'municipal - water & sewerage':    'Municipal - Water & Sewerage',
    'municipal - water and sewerage':  'Municipal - Water & Sewerage',
    'utility - power (discom)':        'Utility - Power (DISCOM)',
    'utility - power discom':          'Utility - Power (DISCOM)',
    'state transport':                 'State Transport',
    'pollution control board':         'Pollution Control Board',
    'police - local law enforcement':  'Police - Local Law Enforcement',
    'police - traffic':                'Police - Traffic',
    'uncategorized':                   'Uncategorized',
}

# Characters that may wrap an otherwise exact answer: quotes, markdown
# emphasis and sentence punctuation.  No alias key starts or ends with any of
# them, so stripping them cannot break a label that already matched.
_LABEL_WRAPPERS = ' "\'`*.,;:!“”‘’'

def canonicalize(label: str) -> str:
    """Map a free-form label to its canonical category name.

    Matching is case-insensitive.  Runs of whitespace are collapsed and
    wrapping quotes, markdown emphasis and trailing punctuation are ignored,
    so replies such as '"Municipal - Sanitation".' still resolve.

    Args:
        label: Category text, e.g. a model reply.

    Returns:
        The canonical label from _ALIAS_MAP, or 'Uncategorized' when the
        cleaned text is not a known alias.
    """
    collapsed = ' '.join(label.split())
    cleaned = collapsed.strip(_LABEL_WRAPPERS)
    return _ALIAS_MAP.get(cleaned.lower(), 'Uncategorized')

def safe_dirname(label: str) -> str:
    """Turn a category label into a folder name.

    Spaces become underscores, parentheses are dropped and '&' is spelled
    out as 'and'; every other character is kept as is.
    """
    substitutions = str.maketrans({' ': '_', '(': None, ')': None, '&': 'and'})
    return label.translate(substitutions)

def load_as_jpeg_bytes(path: Path) -> bytes:
    """Read the image at *path* and return it re-encoded as JPEG bytes.

    Images that are not already in RGB mode are converted to RGB before
    saving; the JPEG is written at quality 90.
    """
    jpeg_buffer = io.BytesIO()
    with Image.open(path) as source:
        rgb = source if source.mode == 'RGB' else source.convert('RGB')
        rgb.save(jpeg_buffer, format='JPEG', quality=90)
    return jpeg_buffer.getvalue()

def _build_reasoning_prompt(description: str) -> str:
    """Assemble the prompt that asks the reasoning model for one category."""
    categories_block = '\n'.join(
        f'- {cat}: {CATEGORY_DEFINITIONS[cat]}' for cat in CANONICAL_CATEGORIES
    )
    return (
        'You are a civic complaint classifier for Indian municipal authorities.\n\n'
        f'Image description: "{description}"\n\n'
        'Choose the SINGLE best matching category. Read ALL options before deciding.\n\n'
        f'Categories:\n{categories_block}\n\n'
        'Decision rules (apply in order):\n'
        '1. Dangling wires, power cables, open transformer, fallen electric pole → Utility - Power (DISCOM)\n'
        '2. Waterlogging, flooded street, drain overflow, sewer, pipe leak → Municipal - Water & Sewerage\n'
        '3. Garbage, trash, waste, dump, litter, bins, debris on ground → Municipal - Sanitation\n'
        '4. Potholes, road cracks, broken road, damaged pavement, manhole cover damage → Municipal - PWD (Roads)\n'
        '5. Broken street lights, non-functional lamp posts, unlit road → Municipal - Street Lighting\n'
        '6. Fallen/uprooted trees, overgrown parks, dead plants, branches on road → Municipal - Horticulture\n'
        '7. Smoke, burning, industrial pollution, waste in water → Pollution Control Board\n'
        '8. Illegal parking, footpath encroachment, shops blocking path → Police - Local Law Enforcement\n'
        '9. Traffic signal failure, road blockage, traffic jam → Police - Traffic\n'
        '10. Damaged bus shelter, broken state bus, bus terminal → State Transport\n'
        '11. Black/blurry/unrecognisable image or selfie/food → Uncategorized\n'
        '12. Nothing matches clearly → Uncategorized\n\n'
        'IMPORTANT: Do NOT default to Municipal - Water & Sewerage unless water/flooding/drain is explicitly described.\n'
        'IMPORTANT: Litter and waste on a road = Municipal - Sanitation (not Transport or Roads).\n'
        'IMPORTANT: Pole with hanging wires = Utility - Power (DISCOM).\n'
        'Reply with ONLY the exact category name. No explanation.\n\nCategory:'
    )

def _extract_label(raw: str) -> str:
    """Return the category text from a raw model reply.

    Uses the first line that still has content after dropping a leading
    'Category:'-style prefix, so both 'Category: X' and a reply that adds an
    explanation on later lines yield 'X'.  Returns '' for an empty reply.
    """
    for line in raw.splitlines():
        candidate = line.split(':', 1)[-1].strip()
        if candidate:
            return candidate
    return ''

def classify(image_path: Path) -> dict:
    """Classify one complaint photo with the two-step Ollama pipeline.

    Step 1 sends the image to VISION_MODEL for a short factual description;
    step 2 gives that description to REASONING_MODEL, which picks a category.

    Args:
        image_path: Image file to classify.

    Returns:
        A dict with keys 'department' (canonical label), 'vision_description'
        (text from step 1) and 'confidence' (0.9 for a recognised category,
        0.4 when the reply resolves to 'Uncategorized').  Any failure is
        reported, not raised: 'department' is 'Uncategorized', 'confidence'
        is 0.0 and 'vision_description' is 'ERROR: <message>'.
    """
    try:
        image_bytes = load_as_jpeg_bytes(image_path)

        # Step 1 — Vision
        vision_resp = ollama.generate(
            model=VISION_MODEL,
            prompt=(
                'You are analyzing a civic complaint photo from India. '
                'Describe what you see in 2-3 factual sentences. '
                'Focus on: what is visibly damaged or problematic, '
                'the setting (road, park, building, drain, etc.), '
                'and any visible hazards or health/safety risks. '
                'Be specific and objective. Do not greet or explain yourself.'
            ),
            # Without the image the model would describe nothing real.
            images=[image_bytes],
            options={'num_ctx': 4096},  # T4 16GB can afford larger context
        )
        description = vision_resp['response'].strip()

        # Step 2 — Reasoning
        reasoning_resp = ollama.generate(
            model=REASONING_MODEL,
            prompt=_build_reasoning_prompt(description),
            options={'num_ctx': 2048},  # T4 16GB can afford larger context
        )
        canonical = canonicalize(_extract_label(reasoning_resp['response']))
        is_valid = canonical != 'Uncategorized'
        return {
            'department': canonical,
            'vision_description': description,
            'confidence': 0.9 if is_valid else 0.4,
        }

    # Deliberately broad: one unreadable image or failed model call must not
    # abort a multi-hour batch; the error text is kept for the report.
    except Exception as e:
        return {
            'department': 'Uncategorized',
            'vision_description': f'ERROR: {e}',
            'confidence': 0.0,
        }

print('Classifier ready.')

## 4 — Run re-sort

This reads images from the Kaggle input dataset and writes them (copied — Kaggle input is read-only) to `/kaggle/working/ai_sorted_dataset/`.

> **Note:** Kaggle input datasets are read-only, so images are **copied** here (not moved). The original dataset is untouched. You can delete the Kaggle dataset after downloading the sorted output.

In [None]:
import csv
import random
import shutil
from tqdm.notebook import tqdm

# ── Optional: set SAMPLE_PER_FOLDER = N to test with N images per folder first
# ── Set to 0 to process everything
SAMPLE_PER_FOLDER = 0

# Seed the global RNG; collect_images() draws its per-folder sample from it.
random.seed(42)

# Collect images
def collect_images(source: Path, sample: int, exts=None) -> list:
    """List the image files found in each sub-folder of *source*.

    Args:
        source: Directory whose immediate sub-directories hold the images.
        sample: Maximum number of images to draw at random from each folder;
            0 (or a negative value) means take every image.
        exts: Lower-case file suffixes (with the dot) that count as images.
            Defaults to the module-level IMAGE_EXTS.

    Returns:
        Image paths, grouped by folder in sorted folder order.  Without
        sampling the files of each folder are in sorted order as well.
    """
    allowed = IMAGE_EXTS if exts is None else exts
    images = []
    for folder in sorted(source.iterdir()):
        if not folder.is_dir():
            continue
        # Sort before sampling: iterdir() order depends on the filesystem, so
        # a seeded random.sample() is only reproducible on a stable input.
        # is_file() keeps out directories that happen to carry an image suffix.
        found = sorted(
            p for p in folder.iterdir()
            if p.is_file() and p.suffix.lower() in allowed
        )
        if not found:
            continue
        picked = random.sample(found, min(sample, len(found))) if sample > 0 else found
        images.extend(picked)
    return images

def safe_copy(src: Path, dest_dir: Path) -> Path:
    """Copy *src* into *dest_dir* without overwriting an existing file.

    The destination directory is created when missing.  If the file name is
    already taken, '_1', '_2', … is appended to the stem until a free name is
    found.  Returns the path the file was copied to.
    """
    dest_dir.mkdir(parents=True, exist_ok=True)
    target = dest_dir / src.name
    attempt = 0
    while target.exists():
        attempt += 1
        target = dest_dir / f'{src.stem}_{attempt}{src.suffix}'
    shutil.copy2(src, target)
    return target

# Build the work list and make sure the output root exists.
images = collect_images(INPUT_ROOT, SAMPLE_PER_FOLDER)
print(f'Images to process: {len(images)}')

OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)

# Column order of the CSV report.
headers = [
    'filename',
    'source_path',
    'source_folder',
    'original_label',
    'ai_label',
    'confidence',
    'vision_description',
    'dest_path',
]

# Run counters, updated by the re-sort loop.
moved = 0
same = 0
errors = 0

# Source folder name → canonical label it stands for.
FOLDER_TO_LABEL = {
    'Municipal_-_PWD_Roads':          'Municipal - PWD (Roads)',
    'Municipal_-_Sanitation':         'Municipal - Sanitation',
    'Municipal_-_Horticulture':       'Municipal - Horticulture',
    'Municipal_-_Street_Lighting':    'Municipal - Street Lighting',
    'Municipal_-_Water_and_Sewerage': 'Municipal - Water & Sewerage',
    'Utility_-_Power_DISCOM':         'Utility - Power (DISCOM)',
    'State_Transport':                'State Transport',
    'Pollution_Control_Board':        'Pollution Control Board',
    'Police_-_Local_Law_Enforcement': 'Police - Local Law Enforcement',
    'Police_-_Traffic':               'Police - Traffic',
    'Uncategorized':                  'Uncategorized',
}

# Classify every image, copy it into the folder of its AI label and record
# one CSV row per image.
with open(REPORT_CSV, 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=headers)
    writer.writeheader()

    for img_path in tqdm(images, desc='Re-sorting', unit='img'):
        folder_name    = img_path.parent.name
        # Unknown folder names fall back to the name with underscores as spaces.
        original_label = FOLDER_TO_LABEL.get(folder_name, folder_name.replace('_', ' '))

        result      = classify(img_path)
        ai_label    = result['department']
        confidence  = result['confidence']
        vision_desc = result['vision_description']

        if confidence <= 0.4 or ai_label in ('Unknown', ''):
            ai_label = 'Uncategorized'

        dest_dir  = OUTPUT_ROOT / safe_dirname(ai_label)
        dest_path = safe_copy(img_path, dest_dir)

        # classify() reports a failure as confidence 0.0 plus an 'ERROR: …'
        # description.  Count those separately so a failed model call is not
        # mistaken for a genuine re-label.
        if confidence == 0.0 and str(vision_desc).startswith('ERROR:'):
            errors += 1
        elif ai_label != original_label:
            moved += 1
        else:
            same += 1

        writer.writerow({
            'filename':           img_path.name,
            'source_path':        str(img_path),
            'source_folder':      folder_name,
            'original_label':     original_label,
            'ai_label':           ai_label,
            'confidence':         confidence,
            'vision_description': vision_desc,
            'dest_path':          str(dest_path),
        })
        # Each image takes far longer than a flush; writing through keeps the
        # report usable if the session dies mid-run.
        f.flush()

processed = len(images)
denom = max(processed, 1)  # avoids ZeroDivisionError when nothing was found

print('\n✅ Done!')
print(f'   Processed  : {processed}')
print(f'   Re-labelled: {moved}  ({moved/denom*100:.1f}%)')
print(f'   Confirmed  : {same}  ({same/denom*100:.1f}%)')
print(f'   Errors     : {errors}  ({errors/denom*100:.1f}%)')
print(f'   Report CSV : {REPORT_CSV}')

## 5 — Output summary

In [None]:
import pandas as pd

# Load the report written by the re-sort step and summarise it.
df = pd.read_csv(REPORT_CSV)

print('=== Per-category output counts ===')
print(df['ai_label'].value_counts().to_string())
print()

# Rows whose AI label differs from the label implied by the source folder,
# counted per (from, to) pair and shown most frequent first.
print('=== Re-labelled from → to (top 20) ===')
relabelled = df[df['original_label'] != df['ai_label']]
transition_counts = relabelled.groupby(['original_label', 'ai_label']).size()
print(transition_counts.sort_values(ascending=False).head(20).to_string())
print()

# Number of entries in each output category folder.
print('Output folder sizes:')
for cat_dir in sorted(OUTPUT_ROOT.iterdir()):
    if not cat_dir.is_dir():
        continue
    n = len(list(cat_dir.iterdir()))
    print(f'  {cat_dir.name:<45} {n:>5}')