# Generate Synthetic Data with Ray and SDV

Learn more about SDV here:
- Docs: https://docs.sdv.dev/sdv/
- Demo: https://colab.research.google.com/drive/1UltaGqbvbp4a_s85FZb3Pyjc_Igad3Ea?usp=sharing

In [None]:
!pip install "ray[default,data]==2.3.0" sdv

## Create a Data Synthesizer with SDV

In [2]:
from sdv.datasets.demo import download_demo
from sdv.lite import SingleTablePreset

# Fetch the single-table "fake_hotel_guests" demo dataset along with its metadata.
real_data, metadata = download_demo(modality="single_table", dataset_name="fake_hotel_guests")

# Build the preset synthesizer from the metadata, then fit it on the real table.
synthesizer = SingleTablePreset(metadata, name="FAST_ML")
synthesizer.fit(data=real_data)

# Draw 100 synthetic rows modelled on real_data; .sample returns a pandas
# DataFrame, which is displayed as the cell's output.
synthesizer.sample(num_rows=100)

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,guest_email,has_rewards,room_type,amenities_fee,checkin_date,checkout_date,room_rate,billing_address,credit_card_number
0,cheryl93@example.com,False,BASIC,9.436498,20 Mar 2020,11 Apr 2020,141.635838,"093 Williams Lane Apt. 170\nNorth Charles, PA ...",180090934066211
1,sandersdarlene@example.org,False,BASIC,20.158516,20 Jun 2020,11 Aug 2020,185.529627,"PSC 5816, Box 0394\nAPO AA 74642",30083012986584
2,hsmith@example.org,False,BASIC,22.907020,16 Apr 2020,11 Apr 2020,145.403493,"819 Peck Curve Suite 448\nEast Johnmouth, WI 0...",2290394691481974
3,jonesernest@example.net,False,BASIC,25.121149,04 Jun 2020,17 Jun 2020,180.463870,"82758 Huffman Isle Suite 101\nHumphreyview, KS...",4444812351068428276
4,pstanton@example.com,False,BASIC,21.185741,11 Nov 2019,25 Oct 2019,180.288810,"228 Fisher Fork\nSouth Mitchell, WA 81769",4875851017747494
...,...,...,...,...,...,...,...,...,...
95,fosterkendra@example.com,True,BASIC,0.000000,15 Jul 2020,18 Jun 2020,189.518567,"90352 Denise Mountains\nPort Deniseport, NV 10049",4445723911043299
96,robert56@example.org,False,BASIC,25.997329,08 Oct 2020,29 Sep 2020,204.100893,"34077 Susan Square\nPort Paulmouth, SD 43548",3555495904467243
97,leonard11@example.net,False,BASIC,24.639520,13 May 2020,07 Apr 2020,123.194022,"031 Danny Junctions Suite 402\nStephanieshire,...",370509599596584
98,bmaxwell@example.net,False,BASIC,28.541776,04 Dec 2020,24 Dec 2020,163.677712,"58335 Kevin Parks Apt. 692\nNew Lanceview, DC ...",375075843181362


## Scale the Synthesizer with Ray

In [3]:
import ray

# Start (or attach to) a local Ray instance. ignore_reinit_error=True keeps this
# cell re-runnable: a second call in the same kernel no longer raises because
# Ray is already initialised.
ray.init(ignore_reinit_error=True)

2023-03-31 16:11:17,312	INFO worker.py:1544 -- Started a local Ray instance. View the dashboard at http://127.0.0.1:8265


0,1
Python version:,3.8.5
Ray version:,2.3.0
Dashboard:,http://127.0.0.1:8265


In [10]:
"""Custom Ray SDV DataSource implementation"""
import os
import random
import string
from ray.data.block import Block, BlockMetadata
from ray.data.datasource.datasource import Datasource, Reader, ReadTask


def _read_single_frame(synthesizer, num_rows) -> Block:
    filename = "".join(random.choice(string.ascii_lowercase) for i in range(8))
    df = synthesizer.sample(num_rows=num_rows, output_file_path=filename)
    os.remove(filename)
    return df


class _SDVDatasourceReader(Reader):
    """Reader that fans one synthesizer out into independent sampling tasks.

    Each read task calls ``_read_single_frame`` once and yields the resulting
    frame as a single block.

    Args:
        synthesizer: Object passed through to ``_read_single_frame``.
        num_samples: Number of read tasks (and therefore blocks) to create.
        num_rows: Number of rows each task asks the synthesizer for.
    """

    def __init__(self, synthesizer, num_samples, num_rows):
        self._synthesizer = synthesizer
        self._num_samples = num_samples
        self._num_rows = num_rows

    def estimate_inmemory_data_size(self):
        """Return ``None``; no size estimate is provided for the sampled data."""
        return None

    def get_read_tasks(self, parallelism: int):
        """Build one ``ReadTask`` per requested sample.

        Args:
            parallelism: Accepted for the ``Reader`` interface but not consulted;
                the number of tasks is always ``num_samples``.

        Returns:
            A list of ``num_samples`` read tasks. Every task carries its own
            ``BlockMetadata`` with all fields left as ``None``.
        """
        read_tasks = []
        for _ in range(self._num_samples):
            metadata = BlockMetadata(
                num_rows=None,
                size_bytes=None,
                schema=None,
                input_files=None,
                exec_stats=None,
            )
            # Bind both values as lambda defaults so the task function holds only
            # the synthesizer and the row count, not a closure over ``self``.
            read_task = ReadTask(
                lambda synthesizer=self._synthesizer, num_rows=self._num_rows: [
                    _read_single_frame(synthesizer, num_rows)
                ],
                metadata,
            )
            read_tasks.append(read_task)
        return read_tasks


class SDVSample(Datasource):
    """Datasource that produces synthetic rows by sampling an SDV synthesizer."""

    def create_reader(self, synthesizer, num_samples, num_rows):
        """Return the reader that will build the sampling tasks.

        Args:
            synthesizer: Synthesizer to sample from.
            num_samples: Number of sampling tasks to create.
            num_rows: Number of rows requested per task.
        """
        reader = _SDVDatasourceReader(
            synthesizer=synthesizer,
            num_samples=num_samples,
            num_rows=num_rows,
        )
        return reader


In [12]:
# Fan the SDV synthesizer out over the Ray cluster: each of the num_samples
# read tasks asks the synthesizer for num_rows rows. Raise either value to
# generate a larger data frame.
synthetic_ds = ray.data.read_datasource(
    SDVSample(), synthesizer=synthesizer, num_samples=10, num_rows=1000
)

# Convert the Ray dataset into a Modin data frame.
df = synthetic_ds.to_modin()

df  # 10 samples * 1000 rows

Read progress: 100%|██████████| 10/10 [00:01<00:00,  8.17it/s]


Unnamed: 0,guest_email,has_rewards,room_type,amenities_fee,checkin_date,checkout_date,room_rate,billing_address,credit_card_number
0,dixonrachel@example.org,False,DELUXE,10.964845,08 Jan 2021,30 Dec 2020,193.684972,"116 White Lights Apt. 673\nAmberfort, NH 94842",180099737579247
1,dharris@example.com,False,BASIC,11.830869,10 Dec 2020,07 Dec 2020,112.339902,"747 Bentley Manor Suite 036\nPort Katelynview,...",344416798640481
2,reneecooper@example.net,True,DELUXE,12.520167,31 Jul 2020,17 Jul 2020,300.257638,"2983 English Mountain Apt. 080\nDoylestad, NY ...",4077950265409504
3,ryan65@example.com,True,BASIC,,08 Mar 2020,,149.793036,"2233 Linda Drives Suite 801\nWest Robertview, ...",4329957852289450
4,isimmons@example.org,False,BASIC,39.503692,14 Sep 2020,09 Oct 2020,115.950327,"89162 Kelly Course\nPort James, AK 86270",4674902734165438
...,...,...,...,...,...,...,...,...,...
995,allenpamela@example.net,False,BASIC,0.000000,27 Feb 2021,17 Feb 2021,173.804938,"1086 Gabriel Oval Suite 956\nBrianmouth, OH 55995",4138358717829502807
996,tamara79@example.org,False,BASIC,10.992596,06 Sep 2020,08 Sep 2020,221.507093,"903 Morgan Grove\nWest Keithmouth, MN 93920",4210955015012
997,joseph73@example.org,False,DELUXE,42.573083,06 Apr 2020,30 Mar 2020,233.591536,Unit 9103 Box 2180\nDPO AE 15462,676362239110
998,stephaniereed@example.com,False,BASIC,,18 Aug 2020,10 Aug 2020,200.625045,"67052 Wilson Row Suite 212\nTracyland, WI 24891",376728711792905
