In [2]:
# Install Faker into the running kernel's environment (%pip, not !pip) and pin
# the version so a fresh run installs the same release every time.
%pip install faker==32.1.0


Collecting faker
  Downloading Faker-32.1.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-32.1.0-py3-none-any.whl (1.9 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.9/1.9 MB 38.2 MB/s eta 0:00:00
Installing collected packages: faker
Successfully installed faker-32.1.0


In [10]:
import pandas as pd
import random
from faker import Faker

# Initialize Faker for realistic data generation
faker = Faker()

# Seed both random sources so that re-running the notebook draws the same
# sequence of values: the stdlib `random` module backs every
# random.choice / random.uniform / random.randint call below, and Faker.seed()
# seeds the generator Faker uses for the booking dates.
# NOTE: booking dates are generated relative to "today", so the absolute dates
# still depend on the day the notebook is executed.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

# Dataset configuration: every tunable value used by the generators lives here.

# --- Dataset size ---
NUM_ROWS = 5_000       # number of booking rows to generate
NUM_CUSTOMERS = 1_000  # number of distinct customer profiles

# --- Customer demographics ---
AGE_GROUPS = ["18-24", "25-34", "35-44", "45-54", "55-64", "65+"]
GENDERS = ["Male", "Female"]

# --- Customer home locations: state -> cities a customer can be assigned ---
CITIES = {
    "California": ["Los Angeles", "San Francisco", "San Diego", "San Jose"],
    "Texas": ["Houston", "Dallas", "Austin", "San Antonio"],
    "Florida": ["Miami", "Orlando", "Tampa", "Jacksonville"],
    "New York": ["New York City", "Buffalo", "Rochester", "Albany"],
    "Georgia": ["Atlanta", "Savannah", "Augusta", "Columbus"],
    "Illinois": ["Chicago", "Aurora", "Naperville", "Springfield"],
    "Arizona": ["Phoenix", "Tucson", "Scottsdale", "Flagstaff"],
    "Oregon": ["Portland", "Salem", "Eugene", "Bend"],
}
# The selectable states are exactly the keys of CITIES, in the same order.
STATES = list(CITIES)

# --- Destinations ---
# Destinations served by every airline; a booking to one of these is "Domestic".
DOMESTIC_DESTINATIONS = [
    "Atlanta, GA",
    "Miami, FL",
    "Seattle, WA",
    "Chicago, IL",
    "Las Vegas, NV",
]

# Airline-specific destinations (everything outside DOMESTIC_DESTINATIONS).
_INTERNATIONAL_ROUTES = {
    "Delta Airlines": [
        "London, UK", "Paris, France", "Toronto, Canada", "Tokyo, Japan",
        "Amsterdam, Netherlands",
    ],
    "American Airlines": [
        "London, UK", "Paris, France", "Toronto, Canada", "Tokyo, Japan",
        "Mexico City, Mexico",
    ],
    "United Airlines": [
        "London, UK", "Paris, France", "Toronto, Canada", "Tokyo, Japan",
        "Frankfurt, Germany",
    ],
    "Southwest Airlines": [
        "Montego Bay, Jamaica", "Nassau, Bahamas", "Grand Cayman, Cayman Islands",
        "San Juan, Puerto Rico", "Cancun, Mexico", "Cozumel, Mexico",
        "Belize City, Belize", "Liberia, Costa Rica", "Havana, Cuba",
    ],
    "Alaska Airlines": [
        "Toronto, Canada", "Vancouver, Canada",
    ],
}

# Full route list per airline: its own destinations followed by the shared
# domestic ones.
AIRLINE_DESTINATION_MAPPING = {
    carrier: routes + DOMESTIC_DESTINATIONS
    for carrier, routes in _INTERNATIONAL_ROUTES.items()
}

# --- Booking attributes ---
CLASSES = ["Economy", "Premium Economy", "Business", "First"]
BOOKING_CHANNELS = ["Website", "Mobile App", "Travel Agency"]

# --- Pricing: (min, max) bounds per flight type, listed in CLASSES order ---
PRICE_RANGES = {
    "Domestic": dict(zip(
        CLASSES, [(100, 300), (250, 500), (400, 800), (700, 1500)]
    )),
    "International": dict(zip(
        CLASSES, [(400, 800), (700, 1200), (1000, 2500), (2000, 5000)]
    )),
}

# Generate Customer Profiles
def _build_customer_profile(index):
    """Return one randomly generated customer profile as a dict.

    `index` is the zero-based position of the customer; the CustomerID is
    "C" followed by index + 1, zero-padded to five digits.
    """
    # Location and frequent-flyer status are drawn before the demographic
    # fields, and the city is always picked from the chosen state's list.
    home_state = random.choice(STATES)
    home_city = random.choice(CITIES[home_state])
    flyer_status = random.choice(["Yes", "No"])
    return {
        "CustomerID": f"C{index + 1:05d}",
        "AgeGroup": random.choice(AGE_GROUPS),
        "Gender": random.choice(GENDERS),
        "State": home_state,
        "City": home_city,
        "FrequentFlyer": flyer_status,
    }


# One profile per customer; bookings later sample from this list so that a
# customer's attributes are the same on every booking they appear in.
customer_profiles = [_build_customer_profile(n) for n in range(NUM_CUSTOMERS)]

# Tabular copy of the profiles (one row per customer), written to CSV later.
customer_df = pd.DataFrame(customer_profiles)

# Helper: random ticket price for a flight type / cabin class combination
def generate_ticket_price(flight_type, travel_class):
    """Draw a random ticket price for the given flight type and cabin class.

    The price bounds are read from PRICE_RANGES[flight_type][travel_class];
    a value is sampled uniformly between them and rounded to two decimals.

    Raises:
        KeyError: if `flight_type` or `travel_class` is not in PRICE_RANGES.
    """
    bounds = PRICE_RANGES[flight_type][travel_class]
    raw_price = random.uniform(*bounds)
    return round(raw_price, 2)

# Generate Bookings
# Each booking pairs a randomly chosen customer profile with a random airline,
# a destination that airline serves, and a price drawn from the range for the
# resulting flight type and cabin class.

# The airline list never changes while bookings are generated, so build it
# once instead of on every iteration.
airlines = list(AIRLINE_DESTINATION_MAPPING)

bookings = []
for i in range(NUM_ROWS):
    customer = random.choice(customer_profiles)  # Select a customer profile
    travel_class = random.choice(CLASSES)
    airline = random.choice(airlines)  # Select an airline
    destination = random.choice(AIRLINE_DESTINATION_MAPPING[airline])  # Only destinations this airline serves
    # Any destination outside the shared domestic list counts as international.
    flight_type = "Domestic" if destination in DOMESTIC_DESTINATIONS else "International"
    ticket_price = generate_ticket_price(flight_type, travel_class)  # Price depends on flight type and class
    bookings.append({
        "BookingID": f"A{i+1:05d}",  # Unique per row: "A" + 1-based, zero-padded index
        # Customer attributes are copied from the profile so they are identical
        # on every booking made by the same customer.
        "CustomerID": customer["CustomerID"],
        "AgeGroup": customer["AgeGroup"],
        "Gender": customer["Gender"],
        "State": customer["State"],
        "City": customer["City"],
        "Airline": airline,
        "Destination": destination,
        # Random date between six months ago and today, formatted YYYY-MM-DD.
        "BookingDate": faker.date_between(start_date="-6M", end_date="today").strftime("%Y-%m-%d"),
        "FlightType": flight_type,
        "TicketPrice": ticket_price,
        "Class": travel_class,
        "BookingChannel": random.choice(BOOKING_CHANNELS),
        "FrequentFlyer": customer["FrequentFlyer"],
        "AffinityIndex": random.randint(100, 200),  # Integer in [100, 200], both ends inclusive
    })

# Convert bookings to DataFrame (one row per booking, columns in the key order above)
bookings_df = pd.DataFrame(bookings)

# Write both datasets to CSV in the current working directory
customer_profiles_file = "customer_profiles.csv"
bookings_file = "airline_bookings_data.csv"

# Both files are written (without the DataFrame index column) before either
# confirmation message is printed.
for frame, path in (
    (customer_df, customer_profiles_file),
    (bookings_df, bookings_file),
):
    frame.to_csv(path, index=False)

print(f"Customer profiles saved to {customer_profiles_file}")
print(f"Booking data saved to {bookings_file}")


Customer profiles saved to customer_profiles.csv
Booking data saved to airline_bookings_data.csv
