### Prepare data for LangGraph examples

In [1]:
!pip install faker

Collecting faker
  Downloading Faker-30.1.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-30.1.0-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-30.1.0


In [2]:
import csv
import random
from faker import Faker
from datetime import timedelta

# Initialize Faker for generating fake data.
# This single instance is shared by the generators below (names and flight dates).
fake = Faker()

# 50 popular US travel destinations (the European list follows below).
# Entries must be unique: random.choice() samples uniformly over list
# positions, so a repeated city would be picked twice as often as the others.
us_cities = [
    "New York",
    "Los Angeles",
    "Chicago",
    "Las Vegas",
    "San Francisco",
    "Orlando",
    "Miami",
    "Washington",
    "New Orleans",
    "Boston",
    "Seattle",
    "San Diego",
    "Honolulu",
    "Nashville",
    "Denver",
    "Austin",
    "Philadelphia",
    "Atlanta",
    "San Antonio",
    "Portland",
    "Charleston",
    "Phoenix",
    "Savannah",
    "Dallas",
    "Minneapolis",
    "Houston",
    "Santa Fe",
    "Sedona",
    "Key West",
    "Salt Lake City",
    "Baltimore",
    "Charlotte",
    "Newport",
    "Scottsdale",
    "Jacksonville",
    "Tampa",
    "St. Louis",
    "Indianapolis",
    "Asheville",
    "San Jose",
    "Kansas City",
    "Milwaukee",
    "Pittsburgh",
    "Cincinnati",
    "Providence",
    "Fort Lauderdale",
    "Memphis",  # replaces a second, accidental "Savannah" entry
    "Richmond",
    "Sacramento",
    "Oklahoma City",
]

# 50 European destinations, five per row; order matters only for readability.
european_cities = [
    "Paris", "London", "Rome", "Barcelona", "Amsterdam",
    "Berlin", "Vienna", "Prague", "Venice", "Florence",
    "Istanbul", "Madrid", "Lisbon", "Dublin", "Budapest",
    "Athens", "Edinburgh", "Copenhagen", "Stockholm", "Brussels",
    "Zurich", "Milan", "Munich", "Seville", "Porto",
    "Dubrovnik", "Krakow", "Reykjavik", "Nice", "Warsaw",
    "Helsinki", "Oslo", "Geneva", "Lyon", "St. Petersburg",
    "Moscow", "Tallinn", "Bergen", "Bratislava", "Ljubljana",
    "Valencia", "Malaga", "Naples", "Cologne", "Salzburg",
    "Bordeaux", "Marseille", "Luxembourg", "Gothenburg", "Bologna",
]

# Single pool holding every US city followed by every European city
all_cities = [*us_cities, *european_cities]


# Function to generate synthetic data for one person
def generate_person_data():
    """Build one synthetic traveller record.

    The person is assigned a fake name, an age in 18-75, and a home city drawn
    with equal probability from the US or the European list. Between 1 and 10
    flights are generated, each departing from the home city; about 90% of
    them land in the home region and the rest in the other region.

    Returns:
        dict with keys "Name", "Current_Location", "Age",
        "Past_Travel_Destinations" (list of arrival cities, one per flight),
        "Number_of_Trips" and "Flights" (list of per-flight dicts with
        "Flight_Number", "Departure_City", "Arrival_City", "Flight_Date").
    """
    name = fake.name()
    age = random.randint(18, 75)

    # Coin flip for the home region; the other region is the rare-destination pool.
    if random.random() < 0.5:
        home_pool, away_pool = us_cities, european_cities
    else:
        home_pool, away_pool = european_cities, us_cities
    current_location = random.choice(home_pool)

    num_trips = random.randint(1, 10)
    flights = []
    for _ in range(num_trips):
        # Nine times out of ten the trip stays within the home region.
        destination_pool = home_pool if random.random() < 0.9 else away_pool
        arrival_city = random.choice(destination_pool)

        # Flight number: one uppercase letter followed by three digits.
        carrier_letter = random.choice("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
        serial = random.randint(100, 999)
        flown_on = fake.date_between(start_date="-2y", end_date="today")

        flights.append(
            {
                "Flight_Number": f"{carrier_letter}{serial}",
                "Departure_City": current_location,
                "Arrival_City": arrival_city,
                "Flight_Date": flown_on,
            }
        )

    return {
        "Name": name,
        "Current_Location": current_location,
        "Age": age,
        "Past_Travel_Destinations": [leg["Arrival_City"] for leg in flights],
        "Number_of_Trips": num_trips,
        "Flights": flights,
    }


# Generate data for multiple people and save to CSV
def generate_csv(num_rows=200, filename="data/person_details.csv"):
    """Write `num_rows` synthetic person records to a UTF-8 CSV file.

    Each row comes from generate_person_data(); the list of past destinations
    is flattened to a single ", "-separated string. Per-flight details are not
    written to this file.

    Args:
        num_rows: Number of person rows to generate (0 writes only the header).
        filename: Output path. Missing parent directories are created so the
            function works on a fresh checkout where "data/" does not exist yet.
    """
    # Local import so this block does not rely on a new top-of-notebook import.
    from pathlib import Path

    Path(filename).parent.mkdir(parents=True, exist_ok=True)

    fieldnames = [
        "Name",
        "Current_Location",
        "Age",
        "Past_Travel_Destinations",
        "Number_of_Trips",
    ]

    with open(filename, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for _ in range(num_rows):
            person_data = generate_person_data()
            writer.writerow(
                {
                    "Name": person_data["Name"],
                    "Current_Location": person_data["Current_Location"],
                    "Age": person_data["Age"],
                    "Past_Travel_Destinations": ", ".join(
                        person_data["Past_Travel_Destinations"]
                    ),
                    "Number_of_Trips": person_data["Number_of_Trips"],
                }
            )

    print(f"Data has been written to {filename}")


# Call the function to generate the CSV file
generate_csv()

Data has been written to data/person_details.csv


In [4]:

# Expand every person in person_details.csv into one CSV row per flight.
input_csv = "data/person_details.csv"
output_csv = "data/synthetic_travel_data.csv"

# Columns this cell (re)generates for each person.
generated_fields = [
    "Past_Travel_Destinations",
    "Number_of_Trips",
    "Flight_Number",
    "Departure_City",
    "Arrival_City",
    "Flight_Date",
]

# Read and write as UTF-8 so the encoding matches how the input file was written.
with open(input_csv, mode="r", newline="", encoding="utf-8") as infile, open(
    output_csv, mode="w", newline="", encoding="utf-8"
) as outfile:
    reader = csv.DictReader(infile)

    # The input already has Past_Travel_Destinations and Number_of_Trips, so
    # append only the columns that are genuinely new; otherwise the output
    # header would list those two columns twice.
    fieldnames = list(reader.fieldnames or [])
    fieldnames += [name for name in generated_fields if name not in fieldnames]

    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    writer.writeheader()

    # Pool for the occasional trip that may land in either region.
    any_region_cities = us_cities + european_cities

    for row in reader:
        # Extract basic details
        current_location = row["Current_Location"]
        region_cities = us_cities if current_location in us_cities else european_cities

        # Generate number of trips (replaces the value read from the input file)
        number_of_trips = random.randint(1, 5)

        # Generate travel and flight details
        past_travel_destinations = []
        flight_details = []

        for _ in range(number_of_trips):
            # 90% of trips stay in the person's own region
            if random.random() < 0.9:
                arrival_city = random.choice(region_cities)
            else:
                arrival_city = random.choice(any_region_cities)

            flight_number = f"{random.choice('ABCDEFGHIJKLMNOPQRSTUVWXYZ')}{random.randint(1000, 9999)}"
            flight_date = fake.date_between(start_date="-2y", end_date="today")

            past_travel_destinations.append(arrival_city)
            flight_details.append(
                {
                    "Flight_Number": flight_number,
                    "Departure_City": current_location,
                    "Arrival_City": arrival_city,
                    "Flight_Date": flight_date,
                }
            )

        # Flatten travel and flight details for the CSV
        row["Past_Travel_Destinations"] = ", ".join(past_travel_destinations)
        row["Number_of_Trips"] = number_of_trips

        # The first flight shares a row with the person's details; every
        # further flight gets its own row with the person columns left blank.
        for i, flight in enumerate(flight_details):
            if i == 0:
                row.update(flight)
            else:
                writer.writerow(row)
                row = {
                    k: "" for k in row
                }  # Clear all other fields except the flight details
                row.update(flight)

        writer.writerow(row)

print(f"Synthetic travel data has been generated and saved to {output_csv}")

Synthetic travel data has been generated and saved to data/synthetic_travel_data.csv


In [5]:
import random
from datetime import datetime, timedelta
from faker import Faker
import sqlite3
import math

# Initialize Faker (rebinds the `fake` name for the generators defined in this cell)
fake = Faker()

# Get today's date (local calendar date); generate_flight_booking offsets
# booking dates from this value.
today = datetime.now().date()

# City data with coordinates: maps city name -> (latitude, longitude).
# Values are in decimal degrees (calculate_distance converts them to radians).
# 44 entries: the first 20 are US cities, the remaining 24 are European.
city_data = {
    "New York": (40.7128, -74.0060),
    "Los Angeles": (34.0522, -118.2437),
    "Chicago": (41.8781, -87.6298),
    "Las Vegas": (36.1699, -115.1398),
    "San Francisco": (37.7749, -122.4194),
    "Orlando": (28.5383, -81.3792),
    "Miami": (25.7617, -80.1918),
    "Washington": (38.9072, -77.0369),
    "New Orleans": (29.9511, -90.0715),
    "Boston": (42.3601, -71.0589),
    "Seattle": (47.6062, -122.3321),
    "San Diego": (32.7157, -117.1611),
    "Honolulu": (21.3069, -157.8583),
    "Nashville": (36.1627, -86.7816),
    "Denver": (39.7392, -104.9903),
    "Austin": (30.2672, -97.7431),
    "Philadelphia": (39.9526, -75.1652),
    "Atlanta": (33.7490, -84.3880),
    "San Antonio": (29.4241, -98.4936),
    "Portland": (45.5155, -122.6789),
    "Paris": (48.8566, 2.3522),
    "London": (51.5074, -0.1278),
    "Rome": (41.9028, 12.4964),
    "Barcelona": (41.3851, 2.1734),
    "Amsterdam": (52.3676, 4.9041),
    "Berlin": (52.5200, 13.4050),
    "Vienna": (48.2082, 16.3738),
    "Prague": (50.0755, 14.4378),
    "Venice": (45.4408, 12.3155),
    "Florence": (43.7696, 11.2558),
    "Istanbul": (41.0082, 28.9784),
    "Madrid": (40.4168, -3.7038),
    "Lisbon": (38.7223, -9.1393),
    "Dublin": (53.3498, -6.2603),
    "Budapest": (47.4979, 19.0402),
    "Athens": (37.9838, 23.7275),
    "Edinburgh": (55.9533, -3.1883),
    "Copenhagen": (55.6761, 12.5683),
    "Stockholm": (59.3293, 18.0686),
    "Brussels": (50.8503, 4.3517),
    "Zurich": (47.3769, 8.5417),
    "Milan": (45.4642, 9.1900),
    "Munich": (48.1351, 11.5820),
    "Seville": (37.3891, -5.9845),
}


def calculate_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance between two points, in kilometers.

    Uses the haversine formula on a sphere of radius 6371 km.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.

    Returns:
        Distance in kilometers as a float (0.0 for identical points).
    """
    R = 6371  # Earth's radius in kilometers

    lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = (
        math.sin(dlat / 2) ** 2
        + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    )
    # Mathematically 0 <= a <= 1, but floating-point round-off can push it a
    # hair outside that range for (near-)antipodal points, which would make
    # math.asin raise ValueError. Clamp it back into the valid domain.
    a = min(1.0, max(0.0, a))

    c = 2 * math.asin(math.sqrt(a))
    return R * c


def calculate_flight_duration(distance, cruise_speed_kmh=800, overhead_minutes=30):
    """Estimate the gate-to-gate flight time for a given distance.

    Args:
        distance: Flight distance in kilometers.
        cruise_speed_kmh: Assumed average speed in km/h (default 800).
        overhead_minutes: Fixed allowance for takeoff and landing (default 30).

    Returns:
        datetime.timedelta of whole minutes: airborne time (truncated to the
        minute) plus the fixed overhead.

    Raises:
        ValueError: If `cruise_speed_kmh` is not positive.
    """
    if cruise_speed_kmh <= 0:
        raise ValueError("cruise_speed_kmh must be positive")

    duration_hours = distance / cruise_speed_kmh
    # int() truncates the fractional minute before the overhead is added.
    duration_minutes = int(duration_hours * 60) + overhead_minutes
    return timedelta(minutes=duration_minutes)


def calculate_flight_price(distance, base_price=50, price_per_km=0.1):
    """Compute a ticket price that grows linearly with distance.

    Args:
        distance: Flight distance in kilometers.
        base_price: Flat fare component in dollars (default 50).
        price_per_km: Dollars added per kilometer flown (default 0.1).

    Returns:
        The price rounded to 2 decimal places.
    """
    return round(base_price + (distance * price_per_km), 2)


def generate_user(user_id):
    """Create one synthetic user row.

    Args:
        user_id: Identifier to store in the first tuple position.

    Returns:
        Tuple (user_id, name, age, home_location): a fake name, an age in
        18-80, and a home city picked from the keys of `city_data`.
    """
    full_name = fake.name()
    years_old = random.randint(18, 80)
    home_city = random.choice(list(city_data))
    return user_id, full_name, years_old, home_city


def generate_flight_booking(booking_id, user_id, user_name):
    """Create one synthetic flight-booking row between two distinct cities.

    Origin and destination are drawn from `city_data`; distance, duration and
    price are derived from their coordinates. The booking date falls 0-14 days
    after `today`, and departure is 1-30 days after the booking date at a
    random minute of the day.

    Args:
        booking_id: Identifier to store in the first tuple position.
        user_id: Id of the user the booking belongs to.
        user_name: Name of that user, stored alongside the id.

    Returns:
        Tuple (booking_id, user_id, user_name, origin, destination, price,
        duration in minutes, departure date, departure time, arrival date,
        arrival time, distance rounded to 2 decimals, booking date). Dates are
        "YYYY-MM-DD" strings and times are "HH:MM" strings.
    """
    city_names = list(city_data.keys())
    origin = random.choice(city_names)
    destination = random.choice([name for name in city_names if name != origin])

    origin_lat, origin_lon = city_data[origin]
    dest_lat, dest_lon = city_data[destination]
    distance = calculate_distance(origin_lat, origin_lon, dest_lat, dest_lon)
    flight_duration = calculate_flight_duration(distance)
    price = calculate_flight_price(distance)

    # Booking happens within the next 15 days; departure 1-30 days after that.
    booking_date = today + timedelta(days=random.randint(0, 14))
    departure_date = booking_date + timedelta(days=random.randint(1, 30))

    # Draw hour then minute, and use the same values for both the formatted
    # string and the datetime used to compute the arrival.
    dep_hour = random.randint(0, 23)
    dep_minute = random.randint(0, 59)
    departure_time = f"{dep_hour:02d}:{dep_minute:02d}"

    departure_datetime = datetime(
        departure_date.year,
        departure_date.month,
        departure_date.day,
        dep_hour,
        dep_minute,
    )
    arrival_datetime = departure_datetime + flight_duration

    day_format = "%Y-%m-%d"
    duration_minutes = int(flight_duration.total_seconds() // 60)

    return (
        booking_id,
        user_id,
        user_name,
        origin,
        destination,
        price,
        duration_minutes,
        departure_date.strftime(day_format),
        departure_time,
        arrival_datetime.strftime(day_format),
        arrival_datetime.strftime("%H:%M"),
        round(distance, 2),
        booking_date.strftime(day_format),
    )


def generate_hotel_booking(booking_id, user_id, user_name):
    """Create one synthetic hotel-booking row.

    The city comes from `city_data`, the hotel name is a fake company name
    with " Hotel" appended, and the stay lasts 1-14 nights starting on a date
    Faker picks between "+1d" and "+60d".

    Args:
        booking_id: Identifier to store in the first tuple position.
        user_id: Id of the user the booking belongs to.
        user_name: Name of that user, stored alongside the id.

    Returns:
        Tuple (booking_id, user_id, user_name, city, hotel_name, check-in
        date, check-out date, nights, price_per_night, total_price,
        num_guests, room_type). Dates are "YYYY-MM-DD" strings; prices are
        rounded to 2 decimals.
    """
    day_format = "%Y-%m-%d"

    city = random.choice(list(city_data))
    hotel_name = f"{fake.company()} Hotel"

    check_in_date = fake.date_between(start_date="+1d", end_date="+60d")
    nights = random.randint(1, 14)
    check_out_date = check_in_date + timedelta(days=nights)

    # The nightly rate is rounded first, so the total is based on the stored rate.
    price_per_night = round(random.uniform(50, 500), 2)
    total_price = round(price_per_night * nights, 2)

    num_guests = random.randint(1, 4)
    room_type = random.choice(["Single", "Double", "Suite", "Deluxe"])

    return (
        booking_id,
        user_id,
        user_name,
        city,
        hotel_name,
        check_in_date.strftime(day_format),
        check_out_date.strftime(day_format),
        nights,
        price_per_night,
        total_price,
        num_guests,
        room_type,
    )


def create_database(db_path="data/travel_bookings.db"):
    """Open the bookings SQLite database and make sure its tables exist.

    Creates the `users`, `flight_bookings` and `hotel_bookings` tables if they
    are missing; existing tables and their rows are left untouched.

    Args:
        db_path: Path of the SQLite file (default "data/travel_bookings.db").
            Missing parent directories are created. Pass ":memory:" for an
            in-memory database.

    Returns:
        Tuple (conn, cursor): the open connection and a cursor on it. The
        caller is responsible for closing the connection.
    """
    if db_path != ":memory:":
        # Local import so this block does not rely on a new top-of-cell import.
        from pathlib import Path

        # sqlite3 creates the file but not its directory; without this a fresh
        # checkout with no "data/" folder fails to open the database.
        Path(db_path).parent.mkdir(parents=True, exist_ok=True)

    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    cursor.execute(
        """
    CREATE TABLE IF NOT EXISTS users (
        user_id INTEGER PRIMARY KEY,
        name TEXT,
        age INTEGER,
        home_location TEXT
    )
    """
    )

    cursor.execute(
        """
    CREATE TABLE IF NOT EXISTS flight_bookings (
        booking_id INTEGER PRIMARY KEY,
        user_id INTEGER,
        user_name TEXT,
        origin TEXT,
        destination TEXT,
        price REAL,
        flight_duration INTEGER,
        departure_date TEXT,
        departure_time TEXT,
        arrival_date TEXT,
        arrival_time TEXT,
        distance REAL,
        booking_date TEXT,
        FOREIGN KEY (user_id) REFERENCES users (user_id)
    )
    """
    )

    cursor.execute(
        """
    CREATE TABLE IF NOT EXISTS hotel_bookings (
        booking_id INTEGER PRIMARY KEY,
        user_id INTEGER,
        user_name TEXT,
        city TEXT,
        hotel_name TEXT,
        check_in_date TEXT,
        check_out_date TEXT,
        nights INTEGER,
        price_per_night REAL,
        total_price REAL,
        num_guests INTEGER,
        room_type TEXT,
        FOREIGN KEY (user_id) REFERENCES users (user_id)
    )
    """
    )

    conn.commit()
    return conn, cursor


def insert_sample_data(
    conn, cursor, num_users, num_flight_bookings, num_hotel_bookings
):
    """Replace the contents of the three tables with freshly generated rows.

    Ids are always numbered from 1, and the tables persist on disk between
    runs, so inserting on top of earlier data would fail with
    sqlite3.IntegrityError (duplicate primary key). Existing rows are
    therefore deleted first, which makes re-running the notebook safe.
    Everything happens in one transaction: on any error the changes are
    rolled back and the previous contents are kept.

    Args:
        conn: Open sqlite3 connection (used for commit/rollback).
        cursor: Cursor on `conn` used to run the statements.
        num_users: Number of users to generate (ids 1..num_users).
        num_flight_bookings: Number of flight bookings to generate.
        num_hotel_bookings: Number of hotel bookings to generate.

    Raises:
        IndexError: If bookings are requested while `num_users` is 0, because
            there is no user to pick for them.
    """
    try:
        # Remove rows from the referencing tables before the referenced one.
        cursor.execute("DELETE FROM hotel_bookings")
        cursor.execute("DELETE FROM flight_bookings")
        cursor.execute("DELETE FROM users")

        # Generate and insert user data
        users = [generate_user(i + 1) for i in range(num_users)]
        cursor.executemany(
            """
    INSERT INTO users (user_id, name, age, home_location)
    VALUES (?, ?, ?, ?)
    """,
            users,
        )

        # Generate and insert flight bookings, each owned by a random user
        for i in range(num_flight_bookings):
            user = random.choice(users)
            flight_booking = generate_flight_booking(i + 1, user[0], user[1])
            cursor.execute(
                """
        INSERT INTO flight_bookings (booking_id, user_id, user_name, origin, destination, price, flight_duration, departure_date, departure_time, arrival_date, arrival_time, distance, booking_date)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """,
                flight_booking,
            )

        # Generate and insert hotel bookings, each owned by a random user
        for i in range(num_hotel_bookings):
            user = random.choice(users)
            hotel_booking = generate_hotel_booking(i + 1, user[0], user[1])
            cursor.execute(
                """
        INSERT INTO hotel_bookings (booking_id, user_id, user_name, city, hotel_name, check_in_date, check_out_date, nights, price_per_night, total_price, num_guests, room_type)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """,
                hotel_booking,
            )
    except Exception:
        # Undo the deletes and any partial inserts before propagating.
        conn.rollback()
        raise

    conn.commit()


# Build the database and populate it with sample data.
if __name__ == "__main__":
    conn, cursor = create_database()
    try:
        insert_sample_data(
            conn, cursor, num_users=500, num_flight_bookings=1000, num_hotel_bookings=1000
        )
    finally:
        # Close the connection even if data generation fails, so the
        # database file is not left open by the kernel.
        conn.close()
    print("Sample data has been generated and inserted into the SQLite database.")

Sample data has been generated and inserted into the SQLite database.
