# 🌍 Tourism Data Collection Pipeline - Google Colab Setup

This notebook sets up and runs the complete tourism data collection pipeline with AI enrichment.

## Features:
- ✅ Gemma 3 12B model initialization
- ✅ OSM data extraction
- ✅ AI-powered POI enrichment (Price, Tips, Best Time)
- ✅ Destination profiling
- ✅ Supabase database loading

## Requirements:
- GPU runtime (T4 recommended)
- Supabase credentials
- Hugging Face Token (for Gemma model access)
- ~2-3 hours for full pipeline

## 📦 Step 1: Install Dependencies

In [None]:
%%capture
# Install required packages
# The %%capture magic above silences this cell's output, so pip errors are
# not shown; remove it temporarily when debugging a failed install.
# Model loading stack used in Step 4 (transformers + accelerate + bitsandbytes).
!pip install transformers accelerate bitsandbytes
# PyTorch packages pulled from the PyTorch cu118 wheel index.
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# Remaining pipeline dependencies (Supabase client, HTTP/OSM access, progress bars, env handling, Streamlit).
!pip install supabase requests overpy tqdm python-dotenv streamlit

## 🔐 Step 2: Configure Credentials

In [None]:
import os
from google.colab import userdata

# Set up Supabase / Hugging Face credentials as environment variables.
# Add these as Colab secrets: SUPABASE_URL, SUPABASE_KEY, HF_TOKEN
from getpass import getpass

REQUIRED_SECRETS = ("SUPABASE_URL", "SUPABASE_KEY", "HF_TOKEN")

try:
    # Read every secret before touching os.environ so a missing one cannot
    # leave the environment half-populated.
    secrets = {name: userdata.get(name) for name in REQUIRED_SECRETS}
    missing = [name for name, value in secrets.items() if not value]
    if missing:
        raise KeyError(f"Missing Colab secrets: {', '.join(missing)}")
    os.environ.update(secrets)
    print("✅ Credentials loaded from Colab secrets")
except Exception:
    # `Exception` (not a bare except) so that interrupting the cell still works.
    print("⚠️ Credentials not found in secrets. Please add them manually:")
    os.environ['SUPABASE_URL'] = input("Enter SUPABASE_URL: ").strip()
    # getpass keeps the key and token out of the saved notebook output.
    os.environ['SUPABASE_KEY'] = getpass("Enter SUPABASE_KEY: ").strip()
    os.environ['HF_TOKEN'] = getpass("Enter Hugging Face Token: ").strip()
    print("✅ Credentials set")

# Login to Hugging Face
from huggingface_hub import login
login(token=os.environ['HF_TOKEN'])

## 📥 Step 3: Clone Repository

In [None]:
# Clone the repository (or upload your code)
!git clone https://github.com/alokanand1official/data-collector-be.git
%cd data-collector-be/data_collector

# Alternative: Upload from local
# from google.colab import files
# uploaded = files.upload()  # Upload your zipped codebase

## 💎 Step 4: Initialize Gemma 3 Model

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import json

# Check GPU availability and report the device that will host the model.
print(f"GPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

# Configure 4-bit quantization for memory efficiency
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load Gemma 3 12B model
model_name = "google/gemma-3-12b-it"

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name)

print("Loading model (this may take 2-3 minutes)...")
# trust_remote_code is intentionally not enabled: the model is loaded through
# the stock transformers classes, so there is no reason to allow code shipped
# in the model repository to execute in this runtime.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

print("✅ Gemma 3 12B model loaded successfully!")

# Test the model
def generate_text(prompt, max_length=512):
    """Sample text from the notebook-level model for a prompt.

    Args:
        prompt: Text passed to the notebook-level ``tokenizer``.
        max_length: Forwarded to ``model.generate`` as ``max_new_tokens``.

    Returns:
        The first sequence returned by ``model.generate``, decoded with
        special tokens skipped.
    """
    encoded = tokenizer(prompt, return_tensors="pt")
    encoded = encoded.to(model.device)

    # Sampling settings are kept together so they are easy to scan.
    sampling_options = {
        "max_new_tokens": max_length,
        "temperature": 0.7,
        "do_sample": True,
        "pad_token_id": tokenizer.eos_token_id,
    }
    sequences = model.generate(**encoded, **sampling_options)

    first_sequence = sequences[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Smoke test: run one short generation to confirm the model responds.
test_prompt = "Generate a JSON object with a description of Paris: "
result = generate_text(test_prompt, max_length=100)
print("\nTest generation:", result, sep="\n")

## 🔧 Step 5: Create Gemma-based Enricher

In [None]:
%%writefile etl/enrich/colab_ai_enricher.py
import json
import logging
from pathlib import Path
from typing import Dict, Any

logger = logging.getLogger("ColabAIEnricher")


class ColabAIEnricher:
    """
    AI Enricher using Hugging Face Transformers (for Colab).
    Uses the loaded Gemma model directly instead of Ollama API.

    Reads POIs from ``<silver_dir>/<city_key>/pois.json``, asks the model for
    enrichment fields, and writes the merged list to
    ``<gold_dir>/<city_key>/pois.json``.
    """

    def __init__(self, model, tokenizer, silver_dir: Path, gold_dir: Path):
        """Stores the already-loaded model/tokenizer and the layer directories.

        Args:
            model: Object exposing ``generate(...)`` and a ``device`` attribute.
            tokenizer: Callable tokenizer exposing ``decode`` and ``eos_token_id``.
            silver_dir: Root directory holding per-city Silver ``pois.json`` files.
            gold_dir: Root directory where per-city Gold ``pois.json`` files are written.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.silver_dir = silver_dir
        self.gold_dir = gold_dir

    def process_city(self, city_name: str, limit: int = None) -> bool:
        """Enriches POIs for a city using the Gemma model.

        POIs whose ``osm_id`` already appears in the city's Gold file are
        skipped, so the method can be called repeatedly to continue a run.

        Args:
            city_name: Display name; lower-cased with spaces replaced by
                underscores to form the directory key.
            limit: Maximum number of not-yet-enriched POIs to process in this
                call. ``None`` (or 0) means no cap.

        Returns:
            False when the Silver file for the city does not exist, True
            otherwise (including when there was nothing new to enrich).
        """
        city_key = city_name.lower().replace(" ", "_")

        # Load Silver data
        silver_file = self.silver_dir / city_key / "pois.json"
        if not silver_file.exists():
            logger.error(f"Silver file not found: {silver_file}")
            return False

        # Explicit UTF-8: the Gold file is written with ensure_ascii=False, so
        # reads must not depend on the platform's default encoding.
        with open(silver_file, 'r', encoding='utf-8') as f:
            pois = json.load(f)

        # Load existing Gold data
        gold_file = self.gold_dir / city_key / "pois.json"
        existing_pois = []
        if gold_file.exists():
            with open(gold_file, 'r', encoding='utf-8') as f:
                existing_pois = json.load(f)

        existing_ids = {poi.get('osm_id') for poi in existing_pois}

        # Filter POIs to enrich
        to_enrich = [poi for poi in pois if poi.get('osm_id') not in existing_ids]

        if limit:
            to_enrich = to_enrich[:limit]

        logger.info(f"Enriching {len(to_enrich)} POIs for {city_name}...")

        # Enrich POIs; a failure on one POI falls back to mock data instead of
        # aborting the whole city.
        enriched_pois = []
        for i, poi in enumerate(to_enrich):
            try:
                enriched = self._enrich_poi(poi)
                enriched_pois.append(enriched)
                if (i + 1) % 10 == 0:
                    logger.info(f"Enriched {i + 1}/{len(to_enrich)} POIs")
            except Exception as e:
                logger.error(f"Failed to enrich {poi.get('name')}: {e}")
                enriched_pois.append(self._mock_enrich(poi))

        # Merge with existing
        all_pois = existing_pois + enriched_pois

        # Save to Gold
        gold_file.parent.mkdir(parents=True, exist_ok=True)
        with open(gold_file, 'w', encoding='utf-8') as f:
            json.dump(all_pois, f, indent=2, ensure_ascii=False)

        logger.info(f"✅ Enriched {len(enriched_pois)} POIs. Total: {len(all_pois)}")
        return True

    @staticmethod
    def _extract_json(text: str):
        """Returns the dict parsed from the outermost ``{...}`` span in ``text``, or None."""
        start = text.find('{')
        end = text.rfind('}') + 1
        if start == -1 or end <= start:
            return None
        try:
            parsed = json.loads(text[start:end])
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass.
            return None
        return parsed if isinstance(parsed, dict) else None

    def _enrich_poi(self, poi: Dict) -> Dict:
        """Enriches a single POI using the Gemma model.

        The POI dict is updated in place and returned. When the model output
        contains no parseable JSON object, the fallback values from
        ``_mock_enrich`` are applied instead.
        """
        prompt = f"""Analyze this tourism POI and provide enrichment data in JSON format:
Name: {poi.get('name')}
Type: {poi.get('poi_type')}
Tags: {poi.get('tags')}

Return ONLY valid JSON with:
1. "description": Engaging 2-3 sentence description
2. "duration_min": Recommended visit time (minutes)
3. "best_time": Best time to visit (Morning/Afternoon/Evening/Anytime)
4. "best_time_reason": Why this time is best
5. "price_level": 0=Free, 1=Cheap, 2=Moderate, 3=Expensive
6. "tips": Array of 2-3 practical tips
7. "what_to_expect": One sentence summary
8. "personas": Score 0-100 for {{Culture, Adventure, Food, Relax}}
"""

        # Generate using Gemma
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        outputs = self.model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.7,
            do_sample=True,
            pad_token_id=self.tokenizer.eos_token_id
        )

        # The generated sequence starts with the prompt tokens. The prompt
        # itself contains braces ("{Culture, ...}" and possibly a tags dict),
        # so decoding the full sequence would make the JSON search start inside
        # the echoed prompt and never parse. Decode only the new tokens.
        prompt_len = inputs["input_ids"].shape[-1]
        response = self.tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

        # Extract JSON from response
        enrichment = self._extract_json(response)
        if enrichment is None:
            logger.warning(
                f"No valid JSON in model response for {poi.get('name')}; using fallback enrichment"
            )
            return self._mock_enrich(poi)

        # Merge enrichment with original POI
        poi['description'] = enrichment.get('description', poi.get('description', ''))
        poi['duration_min'] = enrichment.get('duration_min', 60)
        poi['best_time'] = enrichment.get('best_time', 'Anytime')
        poi['best_time_reason'] = enrichment.get('best_time_reason', 'Good time to visit')
        poi['price_level'] = enrichment.get('price_level', 2)
        poi['tips'] = enrichment.get('tips', [])
        poi['what_to_expect'] = enrichment.get('what_to_expect', '')
        poi['personas'] = enrichment.get('personas', {"Culture": 50, "Relax": 50})
        # The prompt does not request is_popular, so keep any value the POI
        # already carries rather than resetting it to False.
        poi['is_popular'] = enrichment.get('is_popular', poi.get('is_popular', False))

        return poi

    def _mock_enrich(self, poi: Dict) -> Dict:
        """Fallback enrichment: fills the enrichment fields with fixed placeholder values."""
        poi['description'] = poi.get('description', f"A wonderful place in {poi.get('city_name', 'the city')}.")
        poi['duration_min'] = 60
        poi['best_time'] = "Morning"
        poi['best_time_reason'] = "Good lighting"
        poi['price_level'] = 2
        poi['tips'] = ["Check opening hours"]
        poi['what_to_expect'] = "Interesting experience"
        poi['personas'] = {"Culture": 80, "Relax": 50}
        # Keep the record shape identical to the model-enriched path.
        poi.setdefault('is_popular', False)
        return poi


## 🏃 Step 6: Run Bronze Layer (Data Extraction)

In [None]:
from orchestrator import Orchestrator

# Pipeline driver shared by the Bronze, Silver and Load steps below.
orch = Orchestrator()

# Destinations to process (Azerbaijan): edit this list to change the run.
CITIES = [
    "Baku",
    "Gabala",
    "Sheki",
    "Ganja",
    "Quba",
    "Lahij",
    "Gobustan",
]

# Bronze layer: OSM data extraction, one city at a time.
for city in CITIES:
    print(
        f"\n{'='*50}",
        f"Extracting data for {city}...",
        f"{'='*50}",
        sep="\n",
    )
    orch.run_bronze_layer(city)
    print(f"‚úÖ Bronze layer complete for {city}")

## 🔄 Step 7: Run Silver Layer (Data Transformation)

In [None]:
# Run Silver Layer (data standardization)
# Relies on `orch` and `CITIES` defined in the Bronze-layer cell, so that
# cell must have been executed first.
for city in CITIES:
    print(f"\nTransforming data for {city}...")
    orch.run_silver_layer(city)
    print(f"‚úÖ Silver layer complete for {city}")

## ✨ Step 8: Run Gold Layer (AI Enrichment)

In [None]:
from pathlib import Path
from etl.enrich.colab_ai_enricher import ColabAIEnricher

# Initialize Colab enricher with the loaded model
enricher = ColabAIEnricher(
    model=model,
    tokenizer=tokenizer,
    silver_dir=Path("layers/silver"),
    gold_dir=Path("layers/gold")
)

# Per-city cap on newly enriched POIs; kept small for faster testing.
# Set to None to enrich everything that is not yet in the Gold layer.
POIS_PER_CITY = 50

# Enrich POIs for each city
for city in CITIES:
    print(f"\n{'='*50}")
    print(f"Enriching POIs for {city}...")
    print(f"{'='*50}")
    # process_city returns False when the city's Silver file is missing, so
    # only report success when enrichment actually ran.
    if enricher.process_city(city, limit=POIS_PER_CITY):
        print(f"✅ Gold layer complete for {city}")
    else:
        print(f"⚠️ Gold layer skipped for {city}: Silver data not found (run Steps 6-7 first)")

## 📤 Step 9: Load to Supabase

In [None]:
# Load enriched data to Supabase
# Uses `orch` and `CITIES` from the Bronze-layer cell; each city is loaded
# with a separate run_load_layer call.
for city in CITIES:
    print(f"\nLoading {city} to Supabase...")
    orch.run_load_layer(city)
    print(f"‚úÖ Data loaded for {city}")

# Reached only if every city above loaded without raising.
print("\nüéâ Pipeline complete! All data loaded to Supabase.")

## 📊 Step 10: Verify Results

In [None]:
import json
from pathlib import Path

# Check enriched data: print the POI count and one sample record per city.
for city in CITIES:
    city_key = city.lower().replace(" ", "_")
    gold_file = Path(f"layers/gold/{city_key}/pois.json")

    if not gold_file.exists():
        # Say so explicitly; a silently skipped city looks like success.
        print(f"\n{city}: no Gold file found at {gold_file}")
        continue

    # Gold files are written with ensure_ascii=False, so read them as UTF-8.
    with open(gold_file, 'r', encoding='utf-8') as f:
        pois = json.load(f)

    print(f"\n{city}:")
    print(f"  Total POIs: {len(pois)}")
    if not pois:
        continue

    sample = pois[0]
    # A description can be present but null/empty; slicing None would raise.
    description = str(sample.get('description') or 'N/A')
    preview = description[:100] + ("..." if len(description) > 100 else "")

    print(f"  Sample POI:")
    print(f"    Name: {sample.get('name')}")
    print(f"    Description: {preview}")
    print(f"    Duration: {sample.get('duration_min', 'N/A')} min")
    print(f"    Best Time: {sample.get('best_time', 'N/A')}")

## 💾 Step 11: Download Results (Optional)

In [None]:
# Zip and download the enriched data
# Archives the whole Gold layer directory (relative to the current working
# directory) into enriched_data.zip.
!zip -r enriched_data.zip layers/gold/

# Colab helper used to send the archive to the browser.
from google.colab import files
files.download('enriched_data.zip')

print("‚úÖ Enriched data downloaded!")