<a href="https://colab.research.google.com/github/ashwin-yedte/visual-intelligence-travel-finance/blob/main/Data_Preparation_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# =================================================================
 COMPLETE VISUAL INTELLIGENCE DATA COLLECTION PIPELINE for Indian Destination Dataset

 Phase 1: Identify the Curated Destinations, Pre-Organized by state and theme

 Phase 2: Downloads Images from Unsplash

 Phase 3: For each destination, searches for matching images

 Phase 4: Validates URLs before downloading

 Phase 5: Organizes: theme/state/destination/

#
#=================================================================


# =================================================================
 STEP 1: SETUP AND MOUNT GOOGLE DRIVE
# =================================================================


In [30]:
from google.colab import drive
import os

drive.mount('/content/drive')

BASE_PATH = '/content/drive/MyDrive/visual-intelligence-travel-finance'
LANDMARKS_PATH = f'{BASE_PATH}/data/landmarks'
CACHE_PATH = f'{BASE_PATH}/cache'

os.makedirs(LANDMARKS_PATH, exist_ok=True)
os.makedirs(CACHE_PATH, exist_ok=True)

print("="*80)
print("SETUP COMPLETE")
print("="*80)
print(f"Base path: {BASE_PATH}")
print(f"Landmarks path: {LANDMARKS_PATH}")
print(f"Cache path: {CACHE_PATH}")
print("="*80)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
SETUP COMPLETE
Base path: /content/drive/MyDrive/visual-intelligence-travel-finance
Landmarks path: /content/drive/MyDrive/visual-intelligence-travel-finance/data/landmarks
Cache path: /content/drive/MyDrive/visual-intelligence-travel-finance/cache


# =================================================================
 STEP 2: INSTALL PACKAGES
# =================================================================


In [31]:
!pip install -q requests pillow tqdm

import requests
from PIL import Image
import io
import time
from tqdm import tqdm
import json
import re

print("="*80)
print("PACKAGES INSTALLED")
print("="*80)

PACKAGES INSTALLED


# =================================================================
 STEP 3: UNSPLASH API CONFIGURATION
# =================================================================


In [32]:
UNSPLASH_ACCESS_KEY = "63qO1DVp5p0_xaGj4bmRmtjAfF4Z1E1yDUs1B1SD1B4"

# =================================================================
 STEP 4: LOAD CURATED INDIAN DESTINATIONS DATASET
# =================================================================


In [33]:
# ============================================================================
# CELL 4: CURATED INDIAN DESTINATIONS DATASET (UPDATED FOR CLIP)
# ============================================================================

INDIAN_DESTINATIONS = {
  "Beach": {
    "goa": [
      "Calangute Beach","Baga Beach","Anjuna Beach",
      "Vagator Beach","Palolem Beach","Colva Beach",
      "Morjim Beach","Arambol Beach","Candolim Beach",
      "Benaulim Beach","Agonda Beach","Ashwem Beach",
      "Mandrem Beach","Sinquerim Beach","Miramar Beach",
      "Dona Paula Beach","Bogmalo Beach","Butterfly Beach",
      "Kakolem Beach","Betalbatim Beach"
    ],
    "kerala": [
      "Kovalam Beach", "Varkala Beach", "Cherai Beach", "Marari Beach",
      "Bekal Beach", "Muzhappilangad Beach", "Kappad Beach",
      "Shanghumukham Beach", "Alappuzha Beach", "Payyambalam Beach",
      "Kollam Beach", "Kozhikode Beach", "Thirumullavaram Beach",
      "Kappil Beach", "Ezhimala Beach"
    ],
    "maharashtra": [
      "Alibaug Beach", "Kashid Beach", "Ganpatipule Beach", "Tarkarli Beach",
      "Juhu Beach", "Girgaon Chowpatty", "Aksa Beach", "Murud Beach",
      "Diveagar Beach", "Harihareshwar Beach", "Ratnagiri Beach",
      "Revdanda Beach", "Kelshi Beach", "Bordi Beach", "Dahanu Beach",
      "Nagaon Beach", "Versova Beach", "Manori Beach"
    ],
    "tamil_nadu": [
      "Marina Beach", "Mahabalipuram Beach", "Kanyakumari Beach",
      "Rameswaram Beach", "Covelong Beach", "Elliots Beach",
      "Besant Nagar Beach", "Dhanushkodi Beach", "Thiruvanmiyur Beach",
      "Kadalur Beach", "Pondicherry Beach", "Tuticorin Beach"
    ],
    "karnataka": [
      "Gokarna Beach", "Malpe Beach", "Karwar Beach", "Murudeshwar Beach",
      "Kaup Beach", "Panambur Beach", "Om Beach", "Kudle Beach",
      "Half Moon Beach", "Paradise Beach", "Tannirbhavi Beach",
      "Surathkal Beach", "Ullal Beach", "Someshwara Beach"
    ],
    "andhra_pradesh": [
      "Rishikonda Beach", "Ramakrishna Beach", "Yarada Beach",
      "Bheemunipatnam Beach", "Vodarevu Beach", "Suryalanka Beach",
      "Mypadu Beach", "Manginapudi Beach"
    ],
    "odisha": [
      "Puri Beach", "Chandrabhaga Beach", "Gopalpur Beach",
      "Astaranga Beach", "Paradeep Beach", "Balasore Beach",
      "Rushikulya Beach", "Satapada Beach"
    ],
    "west_bengal": [
      "Digha Beach", "Mandarmani Beach", "Bakkhali Beach",
      "Shankarpur Beach", "Tajpur Beach", "Henry Island Beach",
      "Talsari Beach", "Sagar Island Beach"
    ],
    "gujarat": [
      "Mandvi Beach", "Diu Beach", "Dwarka Beach", "Chorwad Beach",
      "Ahmedpur Mandvi Beach", "Tithal Beach", "Ubharat Beach",
      "Gopnath Beach", "Veraval Beach"
    ],
    "puducherry": [
      "Paradise Beach", "Auroville Beach", "Promenade Beach",
      "Serenity Beach", "Karaikal Beach", "Mahe Beach"
    ],
    "lakshadweep": [
      "Agatti Beach", "Kadmat Beach", "Bangaram Beach",
      "Kavaratti Beach", "Minicoy Beach", "Kalpeni Beach"
    ],
    "andaman_and_nicobar": [
      "Radhanagar Beach", "Corbyn's Cove Beach", "Elephant Beach",
      "Vijaynagar Beach", "Bharatpur Beach", "Kalapathar Beach",
      "Wandoor Beach", "Sitapur Beach", "Laxmanpur Beach"
    ]
  },
  "Temple": {
    "tamil_nadu": [
      "Meenakshi Temple Madurai", "Brihadeeswarar Temple Thanjavur",
      "Kapaleeshwarar Temple Chennai", "Ramanathaswamy Temple Rameswaram",
      "Ekambareswarar Temple Kanchipuram", "Kailasanathar Temple Kanchipuram",
      "Airavatesvara Temple Darasuram", "Thillai Nataraja Temple Chidambaram",
      "Ranganathaswamy Temple Srirangam", "Tirupati Balaji Temple"
    ],
    "kerala": [
      "Padmanabhaswamy Temple Thiruvananthapuram", "Guruvayur Temple",
      "Sabarimala Temple", "Ambalapuzha Temple", "Ettumanoor Temple",
      "Vadakkunnathan Temple Thrissur", "Attukal Temple"
    ],
    "karnataka": [
      "Virupaksha Temple Hampi", "Chennakesava Temple Belur",
      "Hoysaleswara Temple Halebidu", "Murudeshwar Temple",
      "Chamundeshwari Temple Mysore", "Udupi Krishna Temple",
      "Gokarneshwara Temple"
    ],
    "andhra_pradesh": [
      "Tirumala Venkateswara Temple", "Simhachalam Temple Visakhapatnam",
      "Srikalahasti Temple", "Kanaka Durga Temple Vijayawada",
      "Bhadrachalam Temple"
    ],
    "rajasthan": [
      "Brahma Temple Pushkar", "Dilwara Temples Mount Abu",
      "Karni Mata Temple Bikaner", "Eklingji Temple Udaipur"
    ],
    "maharashtra": [
      "Siddhivinayak Temple Mumbai", "Mahalakshmi Temple Mumbai",
      "Trimbakeshwar Temple Nashik", "Bhimashankar Temple",
      "Aundha Nagnath Temple"
    ],
    "madhya_pradesh": [
      "Khajuraho Temples", "Mahakaleshwar Temple Ujjain",
      "Omkareshwar Temple", "Bhojpur Temple"
    ],
    "uttar_pradesh": [
      "Kashi Vishwanath Temple Varanasi", "Krishna Janmabhoomi Mathura",
      "Banke Bihari Temple Vrindavan", "Hanuman Temple Allahabad"
    ],
    "uttarakhand": [
      "Kedarnath Temple", "Badrinath Temple", "Gangotri Temple",
      "Yamunotri Temple", "Neelkanth Mahadev Temple"
    ]
  },
  "Fort": {
    "rajasthan": [
      "Amber Fort Jaipur", "Mehrangarh Fort Jodhpur", "Jaisalmer Fort",
      "Chittorgarh Fort", "Kumbhalgarh Fort", "Ranthambore Fort",
      "Nahargarh Fort Jaipur", "Jaigarh Fort", "Gagron Fort"
    ],
    "maharashtra": [
      "Raigad Fort", "Pratapgad Fort", "Sinhagad Fort", "Shivneri Fort",
      "Rajgad Fort", "Panhala Fort", "Daulatabad Fort", "Torna Fort",
      "Lohagad Fort", "Vijaydurg Fort"
    ],
    "karnataka": [
      "Chitradurga Fort", "Bangalore Fort", "Belgaum Fort", "Bidar Fort",
      "Mirjan Fort", "Nandi Hills Fort"
    ],
    "madhya_pradesh": [
      "Gwalior Fort", "Mandu Fort", "Asirgarh Fort", "Raisen Fort"
    ],
    "telangana": [
      "Golconda Fort Hyderabad", "Warangal Fort", "Bhongir Fort"
    ],
    "delhi": [
      "Red Fort", "Purana Qila", "Tughlaqabad Fort"
    ],
    "gujarat": [
      "Diu Fort", "Champaner Fort", "Uparkot Fort Junagadh"
    ],
    "tamil_nadu": [
      "Gingee Fort", "Vellore Fort", "Dindigul Fort"
    ]
  },
  "Waterfall": {
    "karnataka": [
      "Jog Falls", "Abbey Falls", "Shivanasamudra Falls", "Kunchikal Falls",
      "Iruppu Falls", "Hebbe Falls", "Gokak Falls", "Unchalli Falls",
      "Magod Falls"
    ],
    "kerala": [
      "Athirappilly Falls", "Vazhachal Falls", "Meenmutty Falls",
      "Soochipara Falls", "Palaruvi Falls", "Thommankuthu Falls",
      "Aruvikkuzhi Falls", "Lakkom Falls"
    ],
    "maharashtra": [
      "Dudhsagar Falls", "Lingmala Falls", "Kune Falls", "Thoseghar Falls",
      "Pandavkada Falls", "Vajrai Falls"
    ],
    "madhya_pradesh": [
      "Dhuandhar Falls", "Bahuti Falls", "Patalpani Falls",
      "Rajat Prapat Falls"
    ],
    "meghalaya": [
      "Nohkalikai Falls", "Seven Sisters Falls", "Elephant Falls",
      "Kynrem Falls", "Wei Sawdong Falls"
    ],
    "jharkhand": [
      "Hundru Falls", "Jonha Falls", "Dassam Falls", "Hirni Falls"
    ],
    "chhattisgarh": [
      "Chitrakote Falls", "Tirathgarh Falls", "Amrit Dhara Falls"
    ],
    "uttarakhand": [
      "Kempty Falls", "Tiger Falls", "Bhatta Falls", "Vasudhara Falls"
    ]
  },
  "HillStation": {
    "himachal_pradesh": [
      "Shimla", "Manali", "Dharamshala", "Dalhousie", "Kasauli",
      "Kullu", "Chamba", "Kufri", "Palampur"
    ],
    "uttarakhand": [
      "Mussoorie", "Nainital", "Rishikesh", "Almora", "Ranikhet",
      "Lansdowne", "Kausani", "Auli"
    ],
    "tamil_nadu": [
      "Ooty", "Kodaikanal", "Coonoor", "Yercaud", "Yelagiri"
    ],
    "kerala": [
      "Munnar", "Wayanad", "Thekkady", "Ponmudi", "Vagamon",
      "Idukki", "Vythiri"
    ],
    "karnataka": [
      "Coorg", "Chikmagalur", "Sakleshpur", "Agumbe", "Kudremukh"
    ],
    "maharashtra": [
      "Mahabaleshwar", "Lonavala", "Matheran", "Panchgani",
      "Khandala", "Amboli"
    ],
    "west_bengal": [
      "Darjeeling", "Kalimpong", "Kurseong", "Mirik"
    ],
    "sikkim": [
      "Gangtok", "Pelling", "Lachung", "Yumthang Valley"
    ]
  },
  "Lakes": {
    "kashmir": [
      "Dal Lake", "Pangong Lake", "Wular Lake", "Manasbal Lake"
    ],
    "rajasthan": [
      "Pichola Lake Udaipur", "Fateh Sagar Lake", "Pushkar Lake",
      "Sambhar Lake", "Nakki Lake Mount Abu"
    ],
    "uttarakhand": [
      "Nainital Lake", "Bhimtal Lake", "Sattal Lake", "Roopkund Lake"
    ],
    "madhya_pradesh": [
      "Bhojtal Bhopal", "Upper Lake", "Lower Lake"
    ],
    "karnataka": [
      "Ulsoor Lake Bangalore", "Pampa Sarovar Hampi", "Kaveri River Lakes"
    ],
    "tamil_nadu": [
      "Ooty Lake", "Kodaikanal Lake", "Yercaud Lake"
    ],
    "kerala": [
      "Vembanad Lake", "Periyar Lake", "Ashtamudi Lake"
    ]
  },
  "National_parks": {
    "madhya_pradesh": [
      "Kanha National Park", "Bandhavgarh National Park",
      "Pench National Park", "Satpura National Park"
    ],
    "rajasthan": [
      "Ranthambore National Park", "Sariska National Park",
      "Keoladeo National Park"
    ],
    "uttarakhand": [
      "Jim Corbett National Park", "Rajaji National Park",
      "Nanda Devi National Park", "Valley of Flowers"
    ],
    "karnataka": [
      "Bandipur National Park", "Nagarhole National Park",
      "Bannerghatta National Park"
    ],
    "assam": [
      "Kaziranga National Park", "Manas National Park"
    ],
    "west_bengal": [
      "Sundarbans National Park", "Gorumara National Park"
    ],
    "kerala": [
      "Periyar National Park", "Silent Valley National Park",
      "Eravikulam National Park"
    ]
  },
  "Palaces": {
    "rajasthan": [
      "City Palace Udaipur", "Hawa Mahal Jaipur",
      "Umaid Bhawan Palace Jodhpur", "Lake Palace Udaipur",
      "Jal Mahal Jaipur", "City Palace Jaipur"
    ],
    "karnataka": [
      "Mysore Palace", "Bangalore Palace", "Tipu Sultan's Palace"
    ],
    "maharashtra": [
      "Aga Khan Palace Pune", "Shaniwar Wada Pune"
    ],
    "west_bengal": [
      "Victoria Memorial Kolkata", "Marble Palace Kolkata"
    ],
    "hyderabad": [
      "Chowmahalla Palace", "Falaknuma Palace", "Chiran Palace"
    ]
  }
}

# Save dataset
dataset_file = f"{CACHE_PATH}/indian_destinations_curated.json"
with open(dataset_file, 'w') as f:
    json.dump(INDIAN_DESTINATIONS, f, indent=2)

print("="*80)
print("CURATED DATASET LOADED")
print("="*80)

# Print summary
total_destinations = 0
for theme, states in INDIAN_DESTINATIONS.items():
    theme_total = sum(len(dests) for dests in states.values())
    total_destinations += theme_total
    print(f"{theme}: {theme_total} destinations across {len(states)} states")

print(f"\nTotal destinations: {total_destinations}")
print("="*80)

CURATED DATASET LOADED
Beach: 125 destinations across 12 states
Temple: 51 destinations across 9 states
Fort: 41 destinations across 8 states
Waterfall: 43 destinations across 8 states
HillStation: 48 destinations across 8 states
Lakes: 25 destinations across 7 states
National_parks: 21 destinations across 7 states
Palaces: 16 destinations across 5 states

Total destinations: 370


# =================================================================
 STEP 5: SEARCH FUNCTION USING CURATED DATASET
# =================================================================



In [34]:
def search_unsplash(query, per_page=10):
    """
    Search Unsplash for images

    Args:
        query: Search query (e.g., "Calangute Beach Goa India")
        per_page: Number of results (max 30)

    Returns:
        List of image data dicts
    """

    if UNSPLASH_ACCESS_KEY == "YOUR_ACCESS_KEY_HERE":
        print("ERROR: Please add your Unsplash API key first")
        return []

    url = "https://api.unsplash.com/search/photos"

    headers = {
        "Authorization": f"Client-ID {UNSPLASH_ACCESS_KEY}"
    }

    params = {
        "query": query,
        "per_page": min(per_page, 30),  # Max 30 per request
        "orientation": "landscape"
    }

    try:
        print(f"  Searching Unsplash for: '{query}'")
        response = requests.get(url, headers=headers, params=params, timeout=10)

        print(f"  API Status: {response.status_code}")

        if response.status_code == 200:
            data = response.json()
            results = data.get('results', [])

            print(f"Found {len(results)} images")

            image_data = []
            for result in results:
                image_data.append({
                    'url': result['urls']['regular'],  # Good quality, reasonable size
                    'download_url': result['links']['download'],
                    'photographer': result['user']['name'],
                    'photographer_url': result['user']['links']['html'],
                    'description': result.get('description', query),
                    'width': result['width'],
                    'height': result['height']
                })

            return image_data

        elif response.status_code == 401:
            print(" ERROR: Invalid API key")
            return []
        elif response.status_code == 403:
            print("  ERROR: Rate limit exceeded (50/hour)")
            return []
        else:
            print(f" ERROR: Status {response.status_code}")
            return []

    except Exception as e:
        print(f" Error: {e}")
        return []


print("="*80)
print("UNSPLASH SEARCH FUNCTION READY")
print("="*80)


UNSPLASH SEARCH FUNCTION READY


# =================================================================
 STEP 6: DOWNLOAD IMAGE FUNCTION
# =================================================================

In [35]:
def download_image_from_url(url, save_path, max_retries=2):
    """Download and save image from URL"""

    for attempt in range(max_retries):
        try:
            print(f"    Downloading...", end=" ")
            response = requests.get(url, timeout=15)

            if response.status_code == 200:
                img = Image.open(io.BytesIO(response.content))

                # Convert if needed
                if img.mode in ('RGBA', 'LA', 'P'):
                    img = img.convert('RGB')

                # Resize if too large
                max_size = (1920, 1920)
                if img.width > max_size[0] or img.height > max_size[1]:
                    img.thumbnail(max_size, Image.Resampling.LANCZOS)

                # Save
                img.save(save_path, 'JPEG', quality=90)
                print(f"Saved ({img.size})")
                return True
            else:
                print(f"Failed ({response.status_code})")

        except Exception as e:
            print(f"Error: {str(e)[:40]}")
            if attempt < max_retries - 1:
                time.sleep(1)

    return False


print("="*80)
print("DOWNLOAD FUNCTION READY")
print("="*80)




DOWNLOAD FUNCTION READY


# =================================================================
 STEP 7: DOWNLOAD BY THEME AND STATE
# =================================================================

In [36]:
def download_by_theme_and_state_unsplash(theme, state, images_per_destination=5,
                                         base_path=LANDMARKS_PATH):
    """
    Download destinations using Unsplash API

    Args:
        theme: Theme name (beaches, temples, etc.)
        state: State name
        images_per_destination: Images to download per destination
        base_path: Base path to save images
    """

    if theme not in INDIAN_DESTINATIONS:
        print(f"ERROR: Theme '{theme}' not found")
        print(f"Available: {', '.join(INDIAN_DESTINATIONS.keys())}")
        return

    if state not in INDIAN_DESTINATIONS[theme]:
        print(f"ERROR: State '{state}' not found")
        print(f"Available: {', '.join(INDIAN_DESTINATIONS[theme].keys())}")
        return

    destinations = INDIAN_DESTINATIONS[theme][state]

    print("\n" + "="*80)
    print(f"DOWNLOADING FROM UNSPLASH: {theme.upper()} - {state.upper()}")
    print("="*80)
    print(f"Destinations: {len(destinations)}")
    print(f"Images per destination: {images_per_destination}")
    print("="*80)

    total_downloaded = 0
    total_failed = 0
    total_no_results = 0

    for dest_idx, destination in enumerate(destinations, 1):

        print(f"\n[{dest_idx}/{len(destinations)}] {destination}")
        print("-" * 80)

        # Create search queries
        queries = [
            f"{destination} {state} India",
            f"{destination} India",
            destination
        ]

        all_images = []

        # Try different queries
        for query in queries:
            if len(all_images) >= images_per_destination:
                break

            results = search_unsplash(query, per_page=images_per_destination)
            all_images.extend(results)

            # Rate limiting
            time.sleep(1)

        if not all_images:
            print(f" No images found on Unsplash")
            total_no_results += 1
            continue

        # Remove duplicates
        seen_urls = set()
        unique_images = []
        for img in all_images:
            if img['url'] not in seen_urls:
                seen_urls.add(img['url'])
                unique_images.append(img)

        print(f" Found {len(unique_images)} unique images")

        # Clean folder name
        folder_name = destination.lower()
        folder_name = re.sub(r'[^\w\s-]', '', folder_name)
        folder_name = folder_name.replace(' ', '_')[:50]

        # Create folder
        dest_folder = f"{base_path}/{theme}/{state}/{folder_name}"
        os.makedirs(dest_folder, exist_ok=True)
        print(f"Folder: {dest_folder}")

        # Download images
        downloaded_count = 0

        for idx, img_data in enumerate(unique_images[:images_per_destination], 1):
            save_path = f"{dest_folder}/{folder_name}_{idx:03d}.jpg"

            if os.path.exists(save_path):
                print(f"  [{idx}] âœ“ Already exists")
                downloaded_count += 1
                total_downloaded += 1
                continue

            print(f"  [{idx}] {img_data['photographer']}")
            print(f"    URL: {img_data['url'][:80]}...")

            if download_image_from_url(img_data['url'], save_path):
                downloaded_count += 1
                total_downloaded += 1
            else:
                total_failed += 1

            # Rate limiting
            time.sleep(1)

        print(f"\n  Downloaded: {downloaded_count}/{len(unique_images)}")

    # Final summary
    print("\n" + "="*80)
    print(f"SUMMARY: {theme.upper()} - {state.upper()}")
    print("="*80)
    print(f"Downloaded: {total_downloaded} images")
    print(f"Failed: {total_failed}")
    print(f"No results: {total_no_results} destinations")
    print(f" Saved to: {base_path}/{theme}/{state}/")
    print("="*80)


print("="*80)
print("UNSPLASH DOWNLOAD FUNCTION READY")
print("="*80)

UNSPLASH DOWNLOAD FUNCTION READY


# =================================================================
  STEP 8: BATCH DOWNLOAD FUNCTION
# =================================================================

In [37]:
def batch_download_unsplash(theme_state_list, images_per_destination=10):
    """
    Download multiple theme-state combinations

    Args:
        theme_state_list: List of (theme, state) tuples
        images_per_destination: Images per destination

    Example:
        batch_download_unsplash([
            ('beaches', 'goa'),
            ('temples', 'tamil_nadu'),
            ('forts', 'rajasthan')
        ], images_per_destination=5)
    """

    print("="*80)
    print("BATCH DOWNLOAD FROM UNSPLASH")
    print("="*80)
    print(f"Combinations: {len(theme_state_list)}")
    print("="*80)

    for idx, (theme, state) in enumerate(theme_state_list, 1):
        print(f"\n[{idx}/{len(theme_state_list)}] Processing: {theme} - {state}")

        download_by_theme_and_state_unsplash(
            theme=theme,
            state=state,
            images_per_destination=images_per_destination
        )

        # Delay between states to respect rate limits
        if idx < len(theme_state_list):
            print("\nWaiting 5 seconds before next batch...")
            time.sleep(5)

    print("\n" + "="*80)
    print("BATCH DOWNLOAD COMPLETE")
    print("="*80)


print("="*80)
print("BATCH DOWNLOAD FUNCTION READY")
print("="*80)

BATCH DOWNLOAD FUNCTION READY


#  =================================================================
 STEP 9: CREATE METADATA (UPDATED FOR CLIP INTEGRATION)
#  =================================================================


In [38]:
# ============================================================================
# CELL 9: CREATE METADATA (UPDATED FOR CLIP INTEGRATION)
# ============================================================================

def create_metadata(base_path=LANDMARKS_PATH):
    """Generate metadata.json compatible with CLIP prompt library"""

    metadata = {
        "version": "1.0",
        "created_date": "2026-02-14",
        "source": "Unsplash API",
        "license": "Unsplash License (Free to use)",
        "dataset": "Indian Travel Destinations",
        "clip_prompt_library": "clip_prompts_india_themes_semantic.json",
        "prompt_categories": [
            "LandscapeType", "RegionalStyle", "Atmosphere", "VisualQuality",
            "CrowdDensity", "NaturalVsCultural", "WaterFeatures",
            "VegetationType", "Accessibility", "Activities", "EconomyBudget"
        ],
        "themes": []
    }

    total_images = 0
    total_destinations = 0

    for theme in sorted(os.listdir(base_path)):
        theme_path = os.path.join(base_path, theme)

        if not os.path.isdir(theme_path) or theme.startswith('.'):
            continue

        theme_data = {
            "theme_name": theme,
            "theme_type": theme,  # Beach, Temple, Waterfall, HillStation
            "states": []
        }

        for state in sorted(os.listdir(theme_path)):
            state_path = os.path.join(theme_path, state)

            if not os.path.isdir(state_path) or state.startswith('.'):
                continue

            state_data = {
                "state_name": state,
                "destinations": []
            }

            for destination in sorted(os.listdir(state_path)):
                dest_path = os.path.join(state_path, destination)

                if not os.path.isdir(dest_path) or destination.startswith('.'):
                    continue

                images = sorted([f for f in os.listdir(dest_path)
                               if f.endswith(('.jpg', '.jpeg', '.png'))])

                if images:
                    # Create destination ID
                    dest_id = f"{theme.upper()}_{state.upper()}_{destination.upper()}"
                    dest_id = dest_id.replace(' ', '_').replace('-', '_')

                    dest_data = {
                        "destination_id": dest_id,
                        "destination_name": destination.replace('_', ' ').title(),
                        "theme": theme,
                        "state": state,
                        "folder": f"{theme}/{state}/{destination}",
                        "folder_path": f"{base_path}/{theme}/{state}/{destination}",
                        "images": images,
                        "image_count": len(images),

                        # Placeholders for VL Encoding Layer
                        "embeddings_computed": False,
                        "prompts_extracted": False,
                        "geo_tagged": False,

                        # Placeholder for geo-location
                        "geo_location": {
                            "latitude": None,
                            "longitude": None,
                            "city": None,
                            "region": None
                        },

                        # Placeholder for offers
                        "offers": {
                            "hotels": [],
                            "activities": [],
                            "flights": [],
                            "packages": []
                        }
                    }

                    state_data["destinations"].append(dest_data)
                    total_images += len(images)
                    total_destinations += 1

            if state_data["destinations"]:
                theme_data["states"].append(state_data)

        if theme_data["states"]:
            metadata["themes"].append(theme_data)

    metadata["total_themes"] = len(metadata["themes"])
    metadata["total_destinations"] = total_destinations
    metadata["total_images"] = total_images

    # Save
    metadata_path = f"{base_path}/metadata.json"
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=2)

    print("="*80)
    print("METADATA CREATED")
    print("="*80)
    print(f"Themes: {metadata['total_themes']}")
    print(f"Destinations: {metadata['total_destinations']}")
    print(f"Total images: {metadata['total_images']}")
    print(f"Saved to: {metadata_path}")
    print("="*80)

    return metadata

=================================================================

STEP 11: DOWNLOAD IMAGES  FOR THEMES: BEACHES, WATERFALLS,
   HILLSTATIONS AND TEMPLES

=================================================================

In [39]:

# MINIMAL DOWNLOAD (Quick test - ~40 destinations, ~200 images)
minimal_download = [
    ('Beach', 'goa'),              # 8 beaches
    ('Temple', 'tamil_nadu'),      # 4 temples
    ('Waterfall', 'karnataka'),    # 3 waterfalls
    ('HillStation', 'himachal_pradesh')  # 4 hill stations
]

# RECOMMENDED DOWNLOAD (~60 destinations, ~300 images)
recommended_download = [
    # Beaches (20 destinations)
    ('Beach', 'goa'),
    ('Beach', 'kerala'),
    ('Beach', 'maharashtra'),

    # Temples (13 destinations)
    ('Temple', 'tamil_nadu'),
    ('Temple', 'kerala'),
    ('Temple', 'karnataka'),

    # Waterfalls (11 destinations)
    ('Waterfall', 'karnataka'),
    ('Waterfall', 'kerala'),
    ('Waterfall', 'maharashtra'),

    # Hill Stations (14 destinations)
    ('HillStation', 'himachal_pradesh'),
    ('HillStation', 'uttarakhand'),
    ('HillStation', 'tamil_nadu')
]

# FULL DOWNLOAD (All states, ~70 destinations, ~350 images)
full_download = [
    # All Beaches
    ('Beach', 'goa'),
    ('Beach', 'kerala'),
    ('Beach', 'maharashtra'),
    ('Beach', 'tamil_nadu'),
    ('Beach', 'karnataka'),

    # All Temples
    ('Temple', 'tamil_nadu'),
    ('Temple', 'kerala'),
    ('Temple', 'karnataka'),
    ('Temple', 'andhra_pradesh'),

    # All Waterfalls
    ('Waterfall', 'karnataka'),
    ('Waterfall', 'kerala'),
    ('Waterfall', 'maharashtra'),
    ('Waterfall', 'meghalaya'),

    # All Hill Stations
    ('HillStation', 'himachal_pradesh'),
    ('HillStation', 'uttarakhand'),
    ('HillStation', 'tamil_nadu'),
    ('HillStation', 'kerala')
]

all_beaches = [
    ('Beach', 'goa'),
    ('Beach', 'kerala'),
    ('Beach', 'maharashtra'),
    ('Beach', 'tamil_nadu'),
    ('Beach', 'karnataka')
]
print("="*80)
print("="*80)

print("\n3. ALL BEACHES")
print(" batch_download_unsplash(all_beaches, images_per_destination=5)")

print("\n" + "="*80)

batch_download_unsplash(all_beaches, images_per_destination=5)

create_metadata()


3. ALL BEACHES
 batch_download_unsplash(all_beaches, images_per_destination=5)

BATCH DOWNLOAD FROM UNSPLASH
Combinations: 5

[1/5] Processing: Beach - goa

DOWNLOADING FROM UNSPLASH: BEACH - GOA
Destinations: 12
Images per destination: 5

[1/12] Candolim Beach
--------------------------------------------------------------------------------
  Searching Unsplash for: 'Candolim Beach goa India'
  API Status: 200
Found 5 images
 Found 5 unique images
Folder: /content/drive/MyDrive/visual-intelligence-travel-finance/data/landmarks/Beach/goa/candolim_beach
  [1] Joydeep Sensarma
    URL: https://images.unsplash.com/photo-1653975725948-0983735da034?crop=entropy&cs=tin...
    Downloading... Saved ((1080, 718))
  [2] Dipti Goyal
    URL: https://images.unsplash.com/photo-1663659763040-e9774bec2285?crop=entropy&cs=tin...
    Downloading... Saved ((1080, 810))
  [3] Hamza Shaikh
    URL: https://images.unsplash.com/photo-1584111728495-7d9599602355?crop=entropy&cs=tin...
    Downloading... Saved

KeyboardInterrupt: 

In [40]:
create_metadata()

METADATA CREATED
Themes: 1
Destinations: 28
Total images: 130
Saved to: /content/drive/MyDrive/visual-intelligence-travel-finance/data/landmarks/metadata.json


{'version': '1.0',
 'created_date': '2026-02-14',
 'source': 'Unsplash API',
 'license': 'Unsplash License (Free to use)',
 'dataset': 'Indian Travel Destinations',
 'clip_prompt_library': 'clip_prompts_india_themes_semantic.json',
 'prompt_categories': ['LandscapeType',
  'RegionalStyle',
  'Atmosphere',
  'VisualQuality',
  'CrowdDensity',
  'NaturalVsCultural',
  'WaterFeatures',
  'VegetationType',
  'Accessibility',
  'Activities',
  'EconomyBudget'],
 'themes': [{'theme_name': 'Beach',
   'theme_type': 'Beach',
   'states': [{'state_name': 'goa',
     'destinations': [{'destination_id': 'BEACH_GOA_AGONDA_BEACH',
       'destination_name': 'Agonda Beach',
       'theme': 'Beach',
       'state': 'goa',
       'folder': 'Beach/goa/agonda_beach',
       'folder_path': '/content/drive/MyDrive/visual-intelligence-travel-finance/data/landmarks/Beach/goa/agonda_beach',
       'images': ['agonda_beach_001.jpg'],
       'image_count': 1,
       'embeddings_computed': False,
       'prompt