In [1]:
!pip install sdv sdmetrics imbalanced-learn --quiet

In [2]:
# @title 2. Import Libraries and Modules (Corrected)
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE

# Import synthesizers from the Synthetic Data Vault (SDV) library
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer, CTGANSynthesizer

# Import evaluation metrics and visualization tools from the SDMetrics library
from sdmetrics.reports.single_table import QualityReport
from sdmetrics.single_table import MulticlassDecisionTreeClassifier, BinaryLogisticRegression
from sdmetrics.visualization import get_column_plot # <-- FIX: Added the missing import for visualization

# Suppress warnings for a cleaner output
warnings.filterwarnings('ignore')

In [3]:
file_path = 'C:/Users/paava/Documents/SynGen/heart_disease_uci.csv'
real_data = pd.read_csv(file_path)

# --- Robust Data Cleaning ---
# 1. Replace common non-standard missing value placeholders (e.g., '?') with NaN
real_data.replace('?', pd.NA, inplace=True)

# 2. Drop rows with any missing values to ensure data integrity for all models.
original_rows = len(real_data)
real_data.dropna(inplace=True)
cleaned_rows = len(real_data)

print("Dataset successfully loaded.")
if original_rows > cleaned_rows:
    print(f"Cleaned {original_rows - cleaned_rows} rows with missing values.")

print(f"Shape of the dataset after cleaning: {real_data.shape}")
real_data.head()

Dataset successfully loaded.
Cleaned 621 rows with missing values.
Shape of the dataset after cleaning: (299, 16)


Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [4]:
# @title 4. Interactive Configuration: Select Your Target Column
# This cell makes the script adaptable to any dataset.

column_names = real_data.columns.tolist()
print("Available columns in the dataset:")
for col in column_names:
    print(f"- {col}")

while True:
    TARGET_COLUMN = input("\nPlease enter the name of the target column from the list above: ")
    if TARGET_COLUMN in column_names:
        # Attempt to convert target to a numeric type for compatibility
        try:
            real_data = real_data.astype({TARGET_COLUMN: float}).astype({TARGET_COLUMN: int})
            print(f"\nTarget column set to: '{TARGET_COLUMN}'")
            break
        except (ValueError, TypeError):
            print(f"Error: The column '{TARGET_COLUMN}' contains non-numeric values and cannot be used as a target for this binary classification demo. Please choose a numeric column.")
    else:
        print(f"Error: '{TARGET_COLUMN}' is not a valid column name. Please try again.")

Available columns in the dataset:
- id
- age
- sex
- dataset
- cp
- trestbps
- chol
- fbs
- restecg
- thalch
- exang
- oldpeak
- slope
- ca
- thal
- num



Please enter the name of the target column from the list above:  num



Target column set to: 'num'


In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd

# Separate features from the target
features = real_data.drop(columns=[TARGET_COLUMN])
target = real_data[TARGET_COLUMN]

# Automatically identify numerical and categorical columns from the features
numerical_cols = features.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = features.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"Numerical columns identified: {numerical_cols}")
print(f"Categorical columns identified: {categorical_cols}")

# Create a preprocessing pipeline using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'  # Keep other columns (if any) untouched
)

# Apply the transformations
processed_features = preprocessor.fit_transform(features)

# Get feature names after one-hot encoding for the new DataFrame
new_column_names = preprocessor.get_feature_names_out()

# Create the preprocessed DataFrame and add target column back
preprocessed_data = pd.DataFrame(processed_features, columns=new_column_names, index=real_data.index)
preprocessed_data[TARGET_COLUMN] = target

print("\nData after preprocessing (scaling and one-hot encoding):")
display(preprocessed_data.head())

Numerical columns identified: ['id', 'age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca']
Categorical columns identified: ['sex', 'dataset', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']

Data after preprocessing (scaling and one-hot encoding):


Unnamed: 0,num__id,num__age,num__trestbps,num__chol,num__thalch,num__oldpeak,num__ca,cat__sex_Female,cat__sex_Male,cat__dataset_Cleveland,...,cat__restecg_st-t abnormality,cat__exang_False,cat__exang_True,cat__slope_downsloping,cat__slope_flat,cat__slope_upsloping,cat__thal_fixed defect,cat__thal_normal,cat__thal_reversable defect,num
0,-1.596821,0.940446,0.74976,-0.262867,0.029124,1.069475,-0.718306,0.0,1.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0
1,-1.586375,1.384143,1.596354,0.747722,-1.790447,0.380309,2.487269,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,2
2,-1.57593,1.384143,-0.661231,-0.339138,-0.880662,1.327912,1.418744,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1
3,-1.565485,-1.943588,-0.096835,0.061285,1.632079,2.103224,-0.718306,0.0,1.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0
4,-1.555039,-1.499891,-0.096835,-0.81583,0.982232,0.294163,-0.718306,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0


In [6]:
# @title 6. Split Data into Training and Hold-Out Test Sets (Fool-Proof)
# This cell now checks if stratification is possible before splitting the data.

# Check the distribution of the target column
target_counts = preprocessed_data.value_counts()
min_class_count = target_counts.min()

stratify_param = None
# Stratification requires at least 2 members of each class
if min_class_count >= 2:
    print("Stratification is possible. Splitting data with stratification.")
    stratify_param = preprocessed_data
else:
    print(f"Warning: The least populated class in '{TARGET_COLUMN}' has only {min_class_count} member(s).")
    print("Stratification is not possible and will be skipped.")

# Split the PREPROCESSED data once
real_train_data, real_test_data = train_test_split(
    preprocessed_data,
    test_size=0.2,
    stratify=stratify_param, # Will be None if stratification is not possible
    random_state=42
)

print(f"\nReal training data shape: {real_train_data.shape}")
print(f"Real hold-out test data shape: {real_test_data.shape}")

Stratification is not possible and will be skipped.

Real training data shape: (239, 30)
Real hold-out test data shape: (60, 30)


In [7]:
# @title 7. Train Generative Models and Create Synthetic Data (Fool-Proof)
# This cell includes a dynamic check to prevent SMOTE from failing on small minority classes.

metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=real_train_data)

# --- Model 1: Gaussian Copula ---
print("Training GaussianCopula model...")
gauss_copula_synthesizer = GaussianCopulaSynthesizer(metadata)
gauss_copula_synthesizer.fit(real_train_data)
gauss_copula_data = gauss_copula_synthesizer.sample(num_rows=len(real_train_data))
print("GaussianCopula training complete.\n")

# --- Model 2: CTGAN ---
print("Training CTGAN model...")
ctgan_synthesizer = CTGANSynthesizer(metadata, epochs=100, verbose=False)
ctgan_synthesizer.fit(real_train_data)
ctgan_data = ctgan_synthesizer.sample(num_rows=len(real_train_data))
print("CTGAN training complete.\n")

# --- Model 3: SMOTE (with safety check) ---
print("Applying SMOTE for data augmentation...")
X_train = real_train_data.drop(columns=TARGET_COLUMN)
y_train = real_train_data

# --- SMOTE Safety Check ---
minority_class_count = y_train.value_counts().min()
# SMOTE's k_neighbors must be less than the number of samples in the minority class. Default is 5.
k_neighbors = 5
if minority_class_count <= k_neighbors:
    k_neighbors = minority_class_count - 1 # Adjust k_neighbors to be safe
    print(f"Warning: Minority class has only {minority_class_count} samples. Adjusting SMOTE k_neighbors to {k_neighbors} to prevent an error.")

if k_neighbors > 0:
    smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
    smote_data = pd.DataFrame(X_resampled, columns=X_train.columns)
    smote_data = y_resampled
    print("SMOTE application complete.\n")
else:
    print("SMOTE cannot be applied (minority class has 1 or 0 samples). Skipping SMOTE.\n")
    smote_data = None # Set to None so it can be handled in the next step

print("All possible synthetic datasets have been generated.")

Training GaussianCopula model...
GaussianCopula training complete.

Training CTGAN model...


  File "C:\Users\paava\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Users\paava\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\paava\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\paava\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


CTGAN training complete.

Applying SMOTE for data augmentation...
SMOTE cannot be applied (minority class has 1 or 0 samples). Skipping SMOTE.

All possible synthetic datasets have been generated.


In [8]:
# @title 8. Evaluate the Synthetic Datasets (Corrected)
# This cell now uses the correct evaluation metric for multiclass targets.

# --- FIX: Import the correct multiclass evaluation metric ---
from sdmetrics.single_table import MulticlassDecisionTreeClassifier

synthetic_datasets = {
    "Gaussian Copula": gauss_copula_data,
    "CTGAN": ctgan_data
}
# Only add SMOTE to the evaluation if it was successfully generated
if smote_data is not None:
    synthetic_datasets = smote_data

results = []

metadata_dict = metadata.to_dict()

for model_name, synthetic_data in synthetic_datasets.items():
    print(f"Evaluating model: {model_name}...")

    # 1. Statistical Quality Score
    quality_report = QualityReport()
    quality_report.generate(real_train_data, synthetic_data, metadata_dict)
    quality_score = quality_report.get_score()

    # 2. Machine Learning Efficacy Score (TSTR)
    # --- FIX: Use the MulticlassDecisionTreeClassifier for evaluation ---
    ml_efficacy_score = MulticlassDecisionTreeClassifier.compute(
        train_data=synthetic_data,
        test_data=real_test_data,
        target=TARGET_COLUMN,
        metadata=metadata_dict
    )

    results.append({
        "Model": model_name,
        "Statistical Quality Score": quality_score,
        "ML Efficacy (F1-Score)": ml_efficacy_score
    })

print("\nEvaluation complete.")

Evaluating model: Gaussian Copula...
Generating report ...

(1/2) Evaluating Column Shapes: |███████████████████████████████████████████████████| 30/30 [00:00<00:00, 1251.66it/s]|
Column Shapes Score: 93.84%

(2/2) Evaluating Column Pair Trends: |█████████████████████████████████████████████| 435/435 [00:02<00:00, 181.02it/s]|
Column Pair Trends Score: 88.47%

Overall Score (Average): 91.15%

Evaluating model: CTGAN...
Generating report ...

(1/2) Evaluating Column Shapes: |████████████████████████████████████████████████████| 30/30 [00:00<00:00, 810.89it/s]|
Column Shapes Score: 89.14%

(2/2) Evaluating Column Pair Trends: |█████████████████████████████████████████████| 435/435 [00:02<00:00, 200.09it/s]|
Column Pair Trends Score: 82.49%

Overall Score (Average): 85.81%


Evaluation complete.


In [9]:
# @title 9. Display and Compare Results
# Finally, let's display the results in a clean table.

if results:
    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values(by="ML Efficacy (F1-Score)", ascending=False)
    print("--- Comparative Results of Synthetic Data Generation Models ---")
    display(results_df)
else:
    print("No models were successfully evaluated.")

--- Comparative Results of Synthetic Data Generation Models ---


Unnamed: 0,Model,Statistical Quality Score,ML Efficacy (F1-Score)
1,CTGAN,0.858139,0.200317
0,Gaussian Copula,0.911541,0.199111


In [10]:
# @title 10. Save Generated Datasets to CSV
# This cell saves each generated synthetic dataset to a CSV file.

for model_name, data in synthetic_datasets.items():
    if data is not None:
        filename = f"synthetic_data_{model_name.lower().replace(' ', '_')}.csv"
        data.to_csv(filename, index=False)
        print(f"Saved {model_name} data to '{filename}'")

Saved Gaussian Copula data to 'synthetic_data_gaussian_copula.csv'
Saved CTGAN data to 'synthetic_data_ctgan.csv'


In [11]:
# @title 11. Visually Compare Data Distributions (Corrected)
# This cell allows you to pick a column and visually compare the distributions
# of the real data vs. the synthetic data from each model.

# Get available columns from the original (pre-processed) data for selection
plot_columns = real_train_data.columns.tolist()
print("Available columns for plotting:")
for col in plot_columns:
    print(f"- {col}")

while True:
    try:
        column_to_plot = input("\nPlease enter the name of the column you want to visualize: ")
        if column_to_plot in plot_columns:
            break
        else:
            print(f"Error: '{column_to_plot}' is not a valid column name. Please try again.")
    except Exception as e:
        print(f"An error occurred: {e}")

print("\n--- Generating Comparison Plots ---")
for model_name, synthetic_data in synthetic_datasets.items():
    if synthetic_data is not None:
        print(f"\nComparing '{column_to_plot}' for {model_name} model:")
        # --- FIX: Removed the unexpected 'metadata' argument ---
        fig = get_column_plot(
            real_data=real_train_data,
            synthetic_data=synthetic_data,
            column_name=column_to_plot
        )
        fig.show()

Available columns for plotting:
- num__id
- num__age
- num__trestbps
- num__chol
- num__thalch
- num__oldpeak
- num__ca
- cat__sex_Female
- cat__sex_Male
- cat__dataset_Cleveland
- cat__dataset_Hungary
- cat__dataset_VA Long Beach
- cat__cp_asymptomatic
- cat__cp_atypical angina
- cat__cp_non-anginal
- cat__cp_typical angina
- cat__fbs_False
- cat__fbs_True
- cat__restecg_lv hypertrophy
- cat__restecg_normal
- cat__restecg_st-t abnormality
- cat__exang_False
- cat__exang_True
- cat__slope_downsloping
- cat__slope_flat
- cat__slope_upsloping
- cat__thal_fixed defect
- cat__thal_normal
- cat__thal_reversable defect
- num



Please enter the name of the column you want to visualize:  num



--- Generating Comparison Plots ---

Comparing 'num' for Gaussian Copula model:



Comparing 'num' for CTGAN model:
