In [1]:
import torch

# Report whether PyTorch can see a CUDA device; when it can, also print the
# name of device 0 and the number of visible devices.
if not torch.cuda.is_available():
    print("GPU is not available")
else:
    print("GPU is available")
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"GPU Count: {torch.cuda.device_count()}")


GPU is available
GPU Name: NVIDIA GeForce GTX 1650
GPU Count: 1


In [2]:
!pip install sdv

^C


In [2]:
from sdv.metadata import SingleTableMetadata
from sdv.evaluation.single_table import evaluate_quality, get_column_plot
from sdv.single_table import GaussianCopulaSynthesizer, CTGANSynthesizer, TVAESynthesizer, CopulaGANSynthesizer

import pandas as pd

### Load dataset

In [3]:
# Read the breast-cancer CSV into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer=r'A:\Downloads\Remedy.ai\datasets\breast.csv',
)

# Preview the first rows as the cell output.
data.head()

Unnamed: 0,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,fractal_dimension1,...,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3,Diagnosis
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,M
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,M
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,M
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,M
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,M


In [4]:
# Start from an empty single-table metadata object and let SDV fill in the
# column sdtypes by inspecting the CSV file.
metadata = SingleTableMetadata()
metadata.detect_from_csv(
    filepath=r'A:\Downloads\Remedy.ai\datasets\breast.csv',
)

# Show the detected metadata as the cell output.
metadata

{
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1",
    "columns": {
        "radius1": {
            "sdtype": "numerical"
        },
        "texture1": {
            "sdtype": "numerical"
        },
        "perimeter1": {
            "sdtype": "numerical"
        },
        "area1": {
            "sdtype": "numerical"
        },
        "smoothness1": {
            "sdtype": "numerical"
        },
        "compactness1": {
            "sdtype": "numerical"
        },
        "concavity1": {
            "sdtype": "numerical"
        },
        "concave_points1": {
            "sdtype": "numerical"
        },
        "symmetry1": {
            "sdtype": "numerical"
        },
        "fractal_dimension1": {
            "sdtype": "numerical"
        },
        "radius2": {
            "sdtype": "numerical"
        },
        "texture2": {
            "sdtype": "numerical"
        },
        "perimeter2": {
            "sdtype": "numerical"
        },
        "area2": {
            "sdty

### GaussianCopulaSynthesizer

In [12]:
# `os` is not imported by the notebook's import cell, so import it here to keep
# this cell runnable on a fresh kernel.
import os

# Move up one directory level.
# NOTE: the move is relative to the current working directory, so re-running
# this cell moves up one more level each time.
os.chdir("..")

# Get the new working directory and report it.
new_dir = os.getcwd()
print("New Directory:", new_dir)

New Directory: c:\Users\akash\Desktop\Synthetic-Data-Generation-in-Medical-Applications


In [5]:
# Fit a Gaussian copula synthesizer on the real table using the detected
# metadata, then save the fitted synthesizer to a .pkl file.
synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.fit(data)
synthesizer.save(
    filepath=r'A:\Downloads\Remedy.ai\synthesizers\GCSbc.pkl',
)
# Optional inspection (left disabled): synthesizer.get_learned_distributions()



In [6]:
# Alternative to refitting: restore the previously saved Gaussian copula
# synthesizer from its .pkl file.
synthesizer = GaussianCopulaSynthesizer.load(filepath=r'A:\Downloads\Remedy.ai\synthesizers\GCSbc.pkl')

# Draw 1000 synthetic rows in batches of 100, passing the CSV path below as
# the sampler's `output_file_path`.
synthetic_data = synthesizer.sample(
    output_file_path=r'A:\Downloads\Remedy.ai\synthetic\GCSbc.csv',
    batch_size=100,
    num_rows=1000,
)

  0%|          | 0/1000 [00:00<?, ?it/s]

Sampling rows: 100%|██████████| 1000/1000 [00:01<00:00, 731.59it/s]


### CTGANSynthesizer

In [7]:
# Train a CTGAN synthesizer for 500 epochs on the real table, save the fitted
# model, and show the recorded loss values as the cell output.
synthesizer = CTGANSynthesizer(
    metadata,
    epochs=500,
)
synthesizer.fit(data)
# Raw string: this Windows path contains backslash pairs such as `\D` and `\s`
# that are invalid escape sequences in a normal literal (Python warns about
# them and plans to reject them). The resulting path is unchanged.
synthesizer.save(filepath=r'A:\Downloads\Remedy.ai\synthesizers\CTGANbc.pkl')
synthesizer.get_loss_values()

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,Epoch,Generator Loss,Discriminator Loss
0,0,0.854951,0.001737
1,1,0.885055,-0.047173
2,2,0.909199,-0.073515
3,3,0.913198,-0.092323
4,4,0.897798,-0.171141
...,...,...,...
495,495,-0.981625,-0.058435
496,496,-0.973408,-0.103055
497,497,-0.971506,-0.275430
498,498,-1.042479,-0.260550


In [8]:
# Alternative to retraining: restore the saved CTGAN synthesizer from disk.
synthesizer = CTGANSynthesizer.load(
    filepath=r'A:\Downloads\Remedy.ai\synthesizers\CTGANbc.pkl'
)
# Draw 1000 synthetic rows in batches of 100.
# The output path is a raw string so that backslash pairs like `\D` and `\s`
# are not parsed as (invalid) escape sequences; the path value is unchanged.
synthetic_data = synthesizer.sample(
    num_rows=1000,
    batch_size=100,
    output_file_path=r'A:\Downloads\Remedy.ai\synthetic\CTGANbc.csv',
)

Sampling rows: 100%|██████████| 1000/1000 [00:02<00:00, 416.22it/s]


### TVAESynthesizer

In [9]:
# Train a TVAE synthesizer for 500 epochs on the real table and save the
# fitted model.
synthesizer = TVAESynthesizer(
    metadata,
    epochs=500,
)
synthesizer.fit(data)
# Raw string: backslash pairs such as `\D` and `\T` in this Windows path are
# invalid escape sequences in a normal literal (Python warns about them and
# plans to reject them). The resulting path is unchanged.
synthesizer.save(filepath=r'A:\Downloads\Remedy.ai\synthesizers\TVAESbc.pkl')

In [10]:
# Display the loss values recorded for the current synthesizer as the cell
# output (a table with Epoch, Batch and Loss columns).
synthesizer.get_loss_values()

Unnamed: 0,Epoch,Batch,Loss
0,0,0,155.145752
1,0,1,127.926491
2,1,0,132.876816
3,1,1,118.581406
4,2,0,119.217270
...,...,...,...
995,497,1,-26.173100
996,498,0,-27.578339
997,498,1,-25.100630
998,499,0,-26.697193


In [11]:
# Alternative to retraining: restore the saved TVAE synthesizer from its
# .pkl file.
synthesizer = TVAESynthesizer.load(filepath=r'A:\Downloads\Remedy.ai\synthesizers\TVAESbc.pkl')

In [14]:
# Draw 1000 synthetic rows in batches of 100 from the current synthesizer.
# The output path is a raw string so that backslash pairs like `\D` and `\T`
# are not parsed as (invalid) escape sequences; the path value is unchanged.
synthetic_data = synthesizer.sample(
    num_rows=1000,
    batch_size=100,
    output_file_path=r'A:\Downloads\Remedy.ai\synthetic\TVAESbc.csv',
)

Sampling rows: 100%|██████████| 1000/1000 [00:03<00:00, 305.04it/s]


### CopulaGANSynthesizer

In [34]:
# Build a CopulaGAN synthesizer (500 epochs, verbose=True) and fit it on the
# real table.
synthesizer = CopulaGANSynthesizer(metadata, epochs=500, verbose=True)
synthesizer.fit(data)

# Save the fitted synthesizer; the path is relative, so it depends on the
# current working directory.
synthesizer.save(
    filepath='synthesizers/CGANbc.pkl',
)

Gen. (-2.26) | Discrim. (-0.23): 100%|██████████| 500/500 [00:23<00:00, 21.68it/s]


In [35]:
# Display the loss values recorded for the current synthesizer as the cell
# output (a table with Epoch, Generator Loss and Discriminator Loss columns).
synthesizer.get_loss_values()

Unnamed: 0,Epoch,Generator Loss,Discriminator Loss
0,0,0.817293,-0.003665
1,1,0.770676,-0.029333
2,2,0.782333,-0.093022
3,3,0.783737,-0.113403
4,4,0.739277,-0.188489
...,...,...,...
495,495,-2.712113,-0.443949
496,496,-2.386628,-0.423564
497,497,-2.203330,-0.059741
498,498,-2.323760,0.019763


In [None]:
# Alternative to retraining: restore the saved CopulaGAN synthesizer from the
# relative .pkl path (depends on the current working directory).
synthesizer = CopulaGANSynthesizer.load(filepath='synthesizers/CGANbc.pkl')

In [36]:
# Draw 100,000 synthetic rows in batches of 100, passing the relative CSV path
# below as the sampler's `output_file_path`.
synthetic_data = synthesizer.sample(
    output_file_path='synthetic/CGANbc.csv',
    batch_size=100,
    num_rows=100_000,
)

Sampling rows: 100%|██████████| 100000/100000 [04:25<00:00, 377.00it/s]
