## Loading Data

In [1]:
from sdv.datasets.demo import download_demo

# Fetch the multi-table 'fake_hotels' demo dataset. The call returns two
# values: the tables (indexed by table name below) and their metadata.
real_data, metadata = download_demo(modality='multi_table', dataset_name='fake_hotels')

In [2]:
# Preview the first rows of the real 'hotels' table.
real_data['hotels'].head()

Unnamed: 0,hotel_id,city,state,rating,classification
0,HID_000,Boston,Massachusetts,4.8,RESORT
1,HID_001,Boston,Massachuesetts,4.1,CHAIN
2,HID_002,San Francisco,California,3.8,MOTEL
3,HID_003,San Francisco,California,4.0,CHAIN
4,HID_004,New York City,New York,3.7,MOTEL


In [3]:
# Preview the first rows of the real 'guests' table (linked to 'hotels' via hotel_id).
real_data['guests'].head()

Unnamed: 0,guest_email,hotel_id,has_rewards,room_type,amenities_fee,checkin_date,checkout_date,room_rate,billing_address,credit_card_number
0,awolf@phillips.com,HID_000,False,BASIC,37.89,27 Dec 2020,28 Dec 2020,156.23,"993 Rebecca Landing\nJesseburgh, PA 05072",4075084747483975747
1,tonya44@wilkinson-wilkins.com,HID_000,False,BASIC,24.37,30 Dec 2020,31 Dec 2020,139.43,"958 Beverly Bypass\nSouth Ronald, GA 46368",180072822063468
2,harriskathleen@goodwin.com,HID_000,True,DELUXE,0.0,17 Sep 2020,19 Sep 2020,403.33,"8302 Nathaniel Pike\nRileyland, TX 71613",38983476971380
3,kayladiaz@wallace-simmons.com,HID_000,False,BASIC,,28 Dec 2020,30 Dec 2020,140.61,"77 Massachusetts Ave\nCambridge, MA 02139",4969551998845740
4,paigemendoza@tran-martin.com,HID_000,True,DELUXE,0.0,05 Apr 2020,10 Apr 2020,197.41,"1234 Corporate Drive\nBoston, MA 02116",3558512986488983


## Generating Synthetic Data

In [4]:
from sdv.multi_table import HMASynthesizer

# Build the synthesizer from the dataset's metadata, then fit it on the real tables.
synthesizer = HMASynthesizer(metadata)
synthesizer.fit(real_data)

Preprocess Tables: 100%|█████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  8.56it/s]



Learning relationships:


(1/1) Tables 'hotels' and 'guests' ('hotel_id'): 100%|█████████████████████████████████| 10/10 [00:01<00:00,  6.10it/s]





Modeling Tables: 100%|███████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.19s/it]


In [5]:
# Draw synthetic tables from the fitted synthesizer.
# NOTE(review): scale=2 presumably requests about twice as many rows as the
# real data -- confirm against the SDV documentation.
synthetic_data = synthesizer.sample(scale=2)

In [6]:
# Show the first 3 rows of the synthetic 'hotels' table.
synthetic_data['hotels'].head(3)

Unnamed: 0,hotel_id,city,state,rating,classification
0,HID_000,Austin,California,3.9,CHAIN
1,HID_001,Boston,California,4.3,RESORT
2,HID_002,Austin,California,4.6,CHAIN


In [7]:
# Show the first 3 rows of the synthetic 'guests' table.
# NOTE(review): in the saved output, row 0 has a checkout_date earlier than
# its checkin_date; consider adding a constraint if that ordering matters.
synthetic_data['guests'].head(3)

Unnamed: 0,guest_email,hotel_id,has_rewards,room_type,amenities_fee,checkin_date,checkout_date,room_rate,billing_address,credit_card_number
0,moodyeric@example.net,HID_000,False,BASIC,28.69,17 Sep 2020,14 Sep 2020,96.57,"PSC 4273, Box 0888\nAPO AA 28423",5161033759518983
1,coryguzman@example.com,HID_000,False,BASIC,2.62,21 Apr 2020,28 Apr 2020,108.67,"417 Lawrence Hollow\nEast Ericshire, IN 65660",4133047413145475690
2,caitlinlee@example.net,HID_000,False,BASIC,3.25,27 Sep 2020,03 Oct 2020,121.35,"69754 Mcguire Haven Apt. 260\nCrawfordside, IN...",4977328103788


## Evaluating Data

In [8]:
from sdv.evaluation.multi_table import run_diagnostic

# Run the diagnostic report comparing the synthetic tables against the real
# ones; the result is kept in `diagnostic` for later inspection.
diagnostic = run_diagnostic(real_data=real_data, synthetic_data=synthetic_data, metadata=metadata)

Generating report ...
(1/3) Evaluating Data Validity: : 100%|██████████████████████████████████████████████| 15/15 [00:00<00:00, 1363.71it/s]
(2/3) Evaluating Data Structure: : 100%|████████████████████████████████████████████████| 2/2 [00:00<00:00, 400.03it/s]
(3/3) Evaluating Relationship Validity: : 100%|█████████████████████████████████████████| 1/1 [00:00<00:00, 250.02it/s]

Overall Score: 100.0%

Properties:
- Data Validity: 100.0%
- Data Structure: 100.0%
- Relationship Validity: 100.0%


## Visualizing the Data

In [9]:
from sdv.evaluation.multi_table import get_column_plot

# Plot the 'has_rewards' column of the 'guests' table, passing both the real
# and the synthetic data so the two can be compared.
fig = get_column_plot(
    real_data=real_data,
    synthetic_data=synthetic_data,
    metadata=metadata,
    table_name='guests',
    column_name='has_rewards',
)

# Render the figure in the notebook.
fig.show()