# RAG Trip Recommendations

Recommend trips based on GEO entities extracted from customer reviews using Vector Search.

In [1]:
# pip install chromadb openai

import json
import chromadb
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv(dotenv_path='../.env')
client = OpenAI()

## 1. Load trips data

In [2]:
with open('../data/trips_data.json', 'r', encoding='utf-8') as f:
    trips = json.load(f)

print(f"Loaded {len(trips)} trips")
print(f"Example: {trips[0]}")

Loaded 154 trips
Example: {'Country': 'Greece', 'City': 'Athens', 'Start date': '2025-04-01', 'Count of days': 5, 'Cost in EUR': 1200, 'Extra activities': ['Acropolis tour', 'Food tasting'], 'Trip details': 'Experience the rich history of Athens by visiting the Acropolis and its surroundings. Enjoy traditional Greek dishes at local tavernas. Explore the vibrant city center for a memorable cultural immersion.'}


## 2. Prepare texts for embedding

Combine trip fields into a single searchable text.

In [3]:
def trip_to_text(trip):
    """Convert trip dict to text for embedding."""
    activities = ", ".join(trip["Extra activities"])
    return f"{trip['Country']}, {trip['City']}. {trip['Trip details']} Activities: {activities}"

# Prepare all texts
trip_texts = [trip_to_text(trip) for trip in trips]
trip_ids = [f"trip_{i}" for i in range(len(trips))]

print("Example text for embedding:")
print(trip_texts[1])

Example text for embedding:
Spain, Barcelona. Discover Barcelona’s famous architecture, including Gaudí’s Sagrada Família. Indulge in tapas at bustling local bars. Take time to explore the city’s vibrant nightlife and Mediterranean beaches. Activities: City tour, Nightlife tour, Food tasting


## 3. Create Vector Store with ChromaDB

ChromaDB will handle embeddings automatically using OpenAI.

In [4]:
# Initialize ChromaDB (in-memory)
chroma_client = chromadb.Client()

# Create embedding function using OpenAI
# ChromaDB has built-in OpenAI integration
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

embedding_fn = OpenAIEmbeddingFunction(
    model_name="text-embedding-3-small"
)

# Create or get collection
collection = chroma_client.get_or_create_collection(
    name="trips",
    embedding_function=embedding_fn
)

print(f"Collection created: {collection.name}")

Collection created: trips


## 4. Index trips (add to vector store)

In [5]:
# Prepare metadata for each trip (for filtering and display)
trip_metadatas = [
    {
        "country": trip["Country"],
        "city": trip["City"],
        "cost": trip["Cost in EUR"],
        "days": trip["Count of days"],
        "start_date": trip["Start date"]
    }
    for trip in trips
]

# Add all trips to collection (this creates embeddings automatically)
collection.add(
    documents=trip_texts,
    ids=trip_ids,
    metadatas=trip_metadatas
)

print(f"Indexed {collection.count()} trips")

Indexed 154 trips


## 5. Search function

In [6]:
def search_trips_by_entities(geo_entities, n_results=3):
    """
    Search for trips based on GEO entities from customer review.
    
    Args:
        geo_entities: list of GEO entity strings, e.g. ["spain", "las ramblas"]
        n_results: number of trips to return
    
    Returns:
        list of matching trips with scores
    """
    # Convert entities to natural query
    query = f"Trip visiting: {', '.join(geo_entities)}"
    
    # Search in ChromaDB
    results = collection.query(
        query_texts=[query],
        n_results=n_results
    )
    
    # Format results
    recommendations = []
    for i in range(len(results['ids'][0])):
        trip_id = results['ids'][0][i]
        trip_idx = int(trip_id.split('_')[1])
        
        recommendations.append({
            "rank": i + 1,
            "country": results['metadatas'][0][i]['country'],
            "city": results['metadatas'][0][i]['city'],
            "cost": results['metadatas'][0][i]['cost'],
            "days": results['metadatas'][0][i]['days'],
            "distance": results['distances'][0][i],  # lower = more similar
            "details": trips[trip_idx]['Trip details']
        })
    
    return recommendations


def print_recommendations(recommendations, entities):
    """Pretty print recommendations."""
    print(f"\n{'='*60}")
    print(f"GEO Entities: {entities}")
    print(f"{'='*60}")
    
    for rec in recommendations:
        print(f"\n#{rec['rank']}: {rec['city']}, {rec['country']}")
        print(f"   Cost: {rec['cost']} EUR | Days: {rec['days']}")
        print(f"   Score: {1 - rec['distance']:.3f}")  # Convert distance to similarity
        print(f"   {rec['details'][:200]}...")

## 6. Test with example entities

In [15]:
# Test 1: Spanish entities
entities_1 = ["spain", "las ramblas", "gothic district", "sagrada familia"]
results_1 = search_trips_by_entities(entities_1)
print_recommendations(results_1, entities_1)


GEO Entities: ['spain', 'las ramblas', 'gothic district', 'sagrada familia']

#1: Barcelona, Spain
   Cost: 950 EUR | Days: 4
   Score: 0.651
   Discover Barcelona’s famous architecture, including Gaudí’s Sagrada Família. Indulge in tapas at bustling local bars. Take time to explore the city’s vibrant nightlife and Mediterranean beaches....

#2: Seville, Spain
   Cost: 950 EUR | Days: 5
   Score: 0.610
   Admire the world’s largest Gothic cathedral and climb the Giralda tower. Immerse yourself in the rhythms of flamenco dancing. Savor local tapas in atmospheric squares and taverns....

#3: Bilbao, Spain
   Cost: 950 EUR | Days: 4
   Score: 0.569
   Marvel at cutting-edge architecture in the iconic Guggenheim Museum. Indulge in Basque pintxos and unique regional wines. Enjoy riverside walks and the city’s vibrant cultural scene....


In [8]:
# Test 2: Italian entities
entities_2 = ["rome", "colosseum", "pasta", "italian"]
results_2 = search_trips_by_entities(entities_2)
print_recommendations(results_2, entities_2)


GEO Entities: ['rome', 'colosseum', 'pasta', 'italian']

#1: Rome, Italy
   Cost: 1350 EUR | Days: 6
   Score: 0.675
   Immerse yourself in Rome’s ancient history with a visit to the Colosseum and the Roman Forum. Learn to make authentic pasta in a hands-on cooking class. Stroll through charming piazzas and discover hi...

#2: Rome, Italy
   Cost: 1400 EUR | Days: 5
   Score: 0.618
   Explore the spiritual heart of Catholicism at St. Peter’s Basilica and the Vatican Museums. Wander Rome’s ancient ruins and Renaissance piazzas. Savor authentic gelato as you soak up la dolce vita....

#3: Naples, Italy
   Cost: 1000 EUR | Days: 5
   Score: 0.607
   Sample legendary pizza from generations-old pizzerias. Take a day trip to the ruins of Pompeii for ancient Roman insights. Revel in southern Italy’s vibrant street life and Mediterranean flair....


In [9]:
# Test 3: Beach/tropical entities
entities_3 = ["beach", "coral reef", "snorkeling", "red sea"]
results_3 = search_trips_by_entities(entities_3)
print_recommendations(results_3, entities_3)


GEO Entities: ['beach', 'coral reef', 'snorkeling', 'red sea']

#1: Sharm El Sheikh, Egypt
   Cost: 1050 EUR | Days: 5
   Score: 0.717
   Discover vibrant coral reefs on a snorkeling adventure in the Red Sea. Cruise along the coast for breathtaking marine vistas. Relax at seaside cafes and savor local seafood delicacies....

#2: Hurghada, Egypt
   Cost: 1100 EUR | Days: 5
   Score: 0.685
   Dive into the Red Sea’s vibrant underwater world. Head inland for a desert safari showcasing unique landscapes. Enjoy sun-soaked beaches and a variety of water sports....

#3: Aqaba, Jordan
   Cost: 1000 EUR | Days: 4
   Score: 0.676
   Dive into the Red Sea’s vibrant coral reefs. Experience Wadi Rum’s dramatic desert scenery on a guided tour. Enjoy the warm hospitality and flavors of southern Jordan....


## 7. Integration with NER results

Load NER results from customer reviews and recommend trips.

In [10]:
# Load NER results from previous processing
with open('../data/output/NER_customer_surveys_hotels_1k.json', 'r', encoding='utf-8') as f:
    ner_results = json.load(f)

print(f"Loaded NER results for {len(ner_results)} reviews")
print(f"\nExample NER result:")
print(f"Text: {ner_results[0]['text'][:100]}...")
print(f"Entities: {ner_results[0]['entities']}")

Loaded NER results for 1000 reviews

Example NER result:
Text: hotel america nice hotel good location stayed 3 nights hotel america late december, rooms modern nic...
Entities: [{'label': 'org', 'text': 'hotel america'}, {'label': 'org', 'text': 'hotel america'}, {'label': 'tim', 'text': 'december'}, {'label': 'gpe', 'text': 'las ramblas'}, {'label': 'gpe', 'text': 'spain'}, {'label': 'org', 'text': 'hotel america'}]


In [16]:
def recommend_trips_for_review(ner_result, n_results=3):
    """
    Recommend trips based on NER result from a customer review.
    
    Args:
        ner_result: dict with 'text' and 'entities' fields
        n_results: number of trips to recommend
    """
    # Extract GEO-related entities (geo, gpe, nat labels)
    geo_entities = [
        ent['text'] 
        for ent in ner_result['entities'] 
        if ent['label'] in ['geo', 'gpe', 'nat']
    ]
    
    if not geo_entities:
        print("No GEO entities found in this review.")
        return []
    
    return search_trips_by_entities(geo_entities, n_results)


# Test with first few reviews that have GEO entities
for i, ner_result in enumerate(ner_results[:10]):
    geo_ents = [e['text'] for e in ner_result['entities'] if e['label'] in ['geo', 'gpe', 'nat']]
    if geo_ents:
        print(f"\n\nReview #{i}:")
        print(f"Review text: {ner_result['text'][:150]}...")
        recommendations = recommend_trips_for_review(ner_result)
        print_recommendations(recommendations, geo_ents)
        break  # Show just first match for demo



Review #0:
Review text: hotel america nice hotel good location stayed 3 nights hotel america late december, rooms modern nice, really liked location hotel, located 3 blocks m...

GEO Entities: ['las ramblas', 'spain']

#1: Barcelona, Spain
   Cost: 950 EUR | Days: 4
   Score: 0.561
   Discover Barcelona’s famous architecture, including Gaudí’s Sagrada Família. Indulge in tapas at bustling local bars. Take time to explore the city’s vibrant nightlife and Mediterranean beaches....

#2: Granada, Spain
   Cost: 850 EUR | Days: 3
   Score: 0.544
   Explore the captivating Moorish palace of the Alhambra. Wander the winding lanes of the Albaicín. Feel the pulse of flamenco in authentic tablaos across the city....

#3: Granada, Spain
   Cost: 900 EUR | Days: 3
   Score: 0.541
   Explore the ornate palaces and gardens of the Alhambra. Enjoy a tapas crawl through narrow lanes of the Albaicín. Immerse yourself in the fusion of Moorish and Spanish cultures....


## 8. Batch processing - recommend trips for all reviews

In [12]:
def process_all_reviews(ner_results, n_results=3):
    """
    Process all reviews and generate trip recommendations.
    """
    all_recommendations = []
    
    for ner_result in ner_results:
        geo_entities = [
            ent['text'] 
            for ent in ner_result['entities'] 
            if ent['label'] in ['geo', 'gpe', 'nat']
        ]
        
        if geo_entities:
            recommendations = search_trips_by_entities(geo_entities, n_results)
        else:
            recommendations = []
        
        all_recommendations.append({
            "review_id": ner_result.get('id', ''),
            "geo_entities": geo_entities,
            "recommendations": recommendations
        })
    
    return all_recommendations

# Process all 1000 reviews
demo_recommendations = process_all_reviews(ner_results[:1000])

# Show statistics
with_recs = sum(1 for r in demo_recommendations if r['recommendations'])
print(f"Processed {len(demo_recommendations)} reviews")
print(f"Reviews with GEO entities and recommendations: {with_recs}")

Processed 1000 reviews
Reviews with GEO entities and recommendations: 660


In [13]:
# Save recommendations to file
output_path = '../data/output/RAG_trip_recommendations.json'

with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(demo_recommendations, f, ensure_ascii=False, indent=2)

print(f"Saved recommendations to {output_path}")

Saved recommendations to ../data/output/RAG_trip_recommendations.json
