# Dataset Generator

This notebook generates synthetic data for the car rental dynamic booking prediction project using the SDV (Synthetic Data Vault) library.


In [1]:
import json
import os
import warnings

import numpy as np
import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer

# Module-level NumPy generator with a fixed seed (2025).
rng = np.random.default_rng(seed=2025)

# Silence FutureWarning and UserWarning raised from modules whose name starts
# with 'sdv', keeping cell output readable. Registration order is preserved.
for _sdv_warning_category in (FutureWarning, UserWarning):
    warnings.filterwarnings(
        action='ignore',
        category=_sdv_warning_category,
        module='sdv',
    )

print("✅ Libraries imported successfully")


✅ Libraries imported successfully


## Variables

In [2]:
# Hourly timestamps from 2024-01-01 00:00 through 2024-12-31 23:00 (inclusive).
hours = pd.date_range("2024-01-01", "2024-12-31 23:00", freq="h")
sample_rows = 30_000     # Price observation sample size

n_comp = 10_000          # Competitor price observations
# Daily timestamps for 2024; "D" is the canonical alias for calendar-day frequency.
comp_days = pd.date_range("2024-01-01", "2024-12-31", freq="D")

# Project root is configurable so the notebook is not tied to one machine.
# Set CAR_RENTAL_PROJECT_ROOT to relocate the data; the default keeps the
# original location, so existing setups behave exactly as before.
project_root = os.environ.get(
    "CAR_RENTAL_PROJECT_ROOT",
    "/Users/alejandro/workspace/car_rental_dynamic_book_prediction",
)
sample_dir = os.path.join(project_root, "data", "sample")            # input CSV/JSON samples
output_dir = os.path.join(project_root, "data", "synthetic_data")    # synthetic CSV output


## Load Data

In [3]:
# Load every sample table from `sample_dir` into its own DataFrame.
users_df = pd.read_csv(os.path.join(sample_dir, "users.csv"))
suppliers_df = pd.read_csv(os.path.join(sample_dir, "suppliers.csv"))
searches_df = pd.read_csv(os.path.join(sample_dir, "searches.csv"))
car_classes_df = pd.read_csv(os.path.join(sample_dir, "car_classes.csv"))
bookings_df = pd.read_csv(os.path.join(sample_dir, "bookings.csv"))
rental_prices_df = pd.read_csv(os.path.join(sample_dir, "rental_prices.csv"))
competitor_prices_df = pd.read_csv(os.path.join(sample_dir, "competitor_prices.csv"))

# Pin the encoding: without it, open() falls back to the platform's locale
# default, which is not UTF-8 everywhere and would break on non-ASCII content.
with open(os.path.join(sample_dir, "locations_weights.json"), "r", encoding="utf-8") as f:
    location_weights = json.load(f)

In [4]:
# NOTE(review): this whole cell is disabled (every line is commented out), so
# running it defines nothing. None of these names are used by the cells visible
# below — confirm against the rest of the notebook before deleting or re-enabling.
# n_searches = searches_df.shape[0]
# n_users = users_df.shape[0]
# n_suppliers = suppliers_df.shape[0]
# n_car_classes = car_classes_df.shape[0]
# n_locations = len(location_weights)

# # Realistic conversion rate: 2.5% (200,000 searches → 5,000 bookings)
# n_bookings = int(n_searches * 0.025)

In [5]:
# Report the (rows, columns) shape of each loaded sample table.
# One print call with a newline separator emits the same lines as
# individual print statements would.
print(
    "Data tables created successfully!",
    "Tables shapes:",
    f"  suppliers: {suppliers_df.shape}",
    f"  users: {users_df.shape}",
    f"  car_classes: {car_classes_df.shape}",
    f"  searches: {searches_df.shape}",
    f"  rental_prices: {rental_prices_df.shape}",
    f"  competitor_prices: {competitor_prices_df.shape}",
    f"  bookings: {bookings_df.shape}",
    sep="\n",
)

Data tables created successfully!
Tables shapes:
  suppliers: (92, 4)
  users: (20000, 3)
  car_classes: (4, 3)
  searches: (57282, 11)
  rental_prices: (8761, 10)
  competitor_prices: (366, 7)
  bookings: (3205, 16)


## 2 Create data dictionary and metadata

In [6]:
# Collect the loaded tables under their table names. Insertion order matters:
# it is the order in which tables are later iterated.
data = dict(
    suppliers=suppliers_df,
    car_classes=car_classes_df,
    users=users_df,
    searches=searches_df,
    rental_prices=rental_prices_df,
    competitor_prices=competitor_prices_df,
    bookings=bookings_df,
)

print(
    "✅ Data dictionary created with all tables",
    f"Tables: {list(data.keys())}",
    sep="\n",
)
for table_name, table_df in data.items():
    print(f"  {table_name}: {table_df.shape}")

# Multi-table metadata is deliberately not built here; each table is
# synthesized on its own instead.
print(
    "📋 Using INDIVIDUAL TABLE SYNTHESIS approach",
    "   This approach generates each table independently, avoiding metadata complexity",
    "   Results will maintain the same data distributions and characteristics",
    sep="\n",
)


✅ Data dictionary created with all tables
Tables: ['suppliers', 'car_classes', 'users', 'searches', 'rental_prices', 'competitor_prices', 'bookings']
  suppliers: (92, 4)
  car_classes: (4, 3)
  users: (20000, 3)
  searches: (57282, 11)
  rental_prices: (8761, 10)
  competitor_prices: (366, 7)
  bookings: (3205, 16)
📋 Using INDIVIDUAL TABLE SYNTHESIS approach
   This approach generates each table independently, avoiding metadata complexity
   Results will maintain the same data distributions and characteristics


## 3 🚀 INDIVIDUAL TABLE SYNTHESIS (Reliable & Fast)

In [7]:
print("🚀 Using individual table synthesis...")
print("   (Each table synthesized independently for optimal performance)")

# Maps table name -> synthetic DataFrame with the same row count as the original.
synthetic_data = {}

# Fit one GaussianCopulaSynthesizer per table and sample from it.
# NOTE(review): each table is fitted in isolation, so nothing here coordinates
# key columns (e.g. ids) across tables — confirm that is acceptable downstream.
for table_name, table_df in data.items():
    # "\n" (not "\\n") so a real blank line separates tables in the output.
    print(f"\n📊 Synthesizing {table_name}...")
    print(f"   Original shape: {table_df.shape}")

    # Auto-detect column types for this table only.
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(table_df)

    # Build the synthesizer from the detected metadata and fit it to the sample.
    synthesizer = GaussianCopulaSynthesizer(metadata)
    synthesizer.fit(table_df)

    # Generate as many synthetic rows as the original table has.
    synthetic_table = synthesizer.sample(num_rows=len(table_df))
    synthetic_data[table_name] = synthetic_table

    print(f"   ✅ Generated: {synthetic_table.shape}")

print("\n🎉 Individual table synthesis completed successfully!")
print(f"📋 Generated {len(synthetic_data)} synthetic tables")
print(f"📊 Total synthetic rows: {sum(len(df) for df in synthetic_data.values()):,}")

🚀 Using individual table synthesis...
   (Each table synthesized independently for optimal performance)
\n📊 Synthesizing suppliers...
   Original shape: (92, 4)
   ✅ Generated: (92, 4)
\n📊 Synthesizing car_classes...
   Original shape: (4, 3)
   ✅ Generated: (4, 3)
\n📊 Synthesizing users...
   Original shape: (20000, 3)
   ✅ Generated: (20000, 3)
\n📊 Synthesizing searches...
   Original shape: (57282, 11)
   ✅ Generated: (57282, 11)
\n📊 Synthesizing rental_prices...
   Original shape: (8761, 10)
   ✅ Generated: (8761, 10)
\n📊 Synthesizing competitor_prices...
   Original shape: (366, 7)
   ✅ Generated: (366, 7)
\n📊 Synthesizing bookings...
   Original shape: (3205, 16)
   ✅ Generated: (3205, 16)
\n🎉 Individual table synthesis completed successfully!
📋 Generated 7 synthetic tables
📊 Total synthetic rows: 89,710


## 💾 Save SDV Synthetic Data

In [8]:
# Ensure the destination exists; a no-op when the directory is already there.
os.makedirs(output_dir, exist_ok=True)

print("💾 Saving synthetic data to CSV files...")
print(f"📁 Output directory: {output_dir}/")

# "\n" (not "\\n") so the summary is preceded by a real blank line.
print("\n📊 Synthetic data summary:")
for table_name, df in synthetic_data.items():
    print(f"  {table_name}: {df.shape}")

    # One CSV per table, named synthetic_<table>.csv, without the index column.
    filename = os.path.join(output_dir, f"synthetic_{table_name}.csv")
    df.to_csv(filename, index=False)
    print(f"    ✅ Saved to {filename}")

print(f"\n🎉 All {len(synthetic_data)} synthetic data files saved successfully!")
print(f"📊 Total synthetic records: {sum(len(df) for df in synthetic_data.values()):,}")
print(f"📁 Files location: {os.path.abspath(output_dir)}/synthetic_*.csv")


💾 Saving synthetic data to CSV files...
📁 Output directory: /Users/alejandro/workspace/car_rental_dynamic_book_prediction/data/synthetic_data/
\n📊 Synthetic data summary:
  suppliers: (92, 4)
    ✅ Saved to /Users/alejandro/workspace/car_rental_dynamic_book_prediction/data/synthetic_data/synthetic_suppliers.csv
  car_classes: (4, 3)
    ✅ Saved to /Users/alejandro/workspace/car_rental_dynamic_book_prediction/data/synthetic_data/synthetic_car_classes.csv
  users: (20000, 3)
    ✅ Saved to /Users/alejandro/workspace/car_rental_dynamic_book_prediction/data/synthetic_data/synthetic_users.csv
  searches: (57282, 11)
    ✅ Saved to /Users/alejandro/workspace/car_rental_dynamic_book_prediction/data/synthetic_data/synthetic_searches.csv
  rental_prices: (8761, 10)
    ✅ Saved to /Users/alejandro/workspace/car_rental_dynamic_book_prediction/data/synthetic_data/synthetic_rental_prices.csv
  competitor_prices: (366, 7)
    ✅ Saved to /Users/alejandro/workspace/car_rental_dynamic_book_prediction/d

## 👁️ PREVIEW SYNTHETIC DATA


In [9]:
# Show the first three rows of every synthetic table, separated by a rule.
print("🔍 Sample of synthetic data:")
for table_name, df in synthetic_data.items():
    # "\n" (not "\\n") so each table header starts after a real blank line.
    print(f"\n📋 {table_name.upper()} (first 3 rows):")
    print(df.head(3))
    print("─" * 60)

🔍 Sample of synthetic data:
\n📋 SUPPLIERS (first 3 rows):
   supplier_id  location_id supplier_name               city
0            5            5        Budget  South Christopher
1            2           20          Avis         Jamesville
2            2            9          Avis         Weaverstad
────────────────────────────────────────────────────────────
\n📋 CAR_CLASSES (first 3 rows):
   car_class_id car_class_name  probabilities
0       5938957         luxury           0.20
1       1790039         luxury           0.22
2      15652449         luxury           0.21
────────────────────────────────────────────────────────────
\n📋 USERS (first 3 rows):
    user_id       segment  home_location_id
0   6138286   single_trip                 1
1   9216891  browser_only                 1
2  15482477   single_trip                 1
────────────────────────────────────────────────────────────
\n📋 SEARCHES (first 3 rows):
   user_id            search_ts  location_id car_class          sess