In [1]:
from pathlib import Path

import pandas as pd
from dp_cgans import DP_CGAN


In [2]:
# Load the real sample; the first CSV column is used as the row index.
real_data = pd.read_csv(
    filepath_or_buffer="test_datasets/real_data_adult.csv",
    index_col=0,
)
print(f"Test dataset has {real_data.shape[0]} instances with {real_data.shape[1]} features")

Test dataset has 200 instances with 9 features


In [3]:
# Load the previously generated synthetic sample; the first CSV column is
# used as the row index.
synthetic_data = pd.read_csv(
    filepath_or_buffer="test_datasets/syn_data_adult.csv",
    index_col=0,
)
print(f"Synthetic dataset has {synthetic_data.shape[0]} instances with {synthetic_data.shape[1]} features")

Synthetic dataset has 100 instances with 9 features


In [4]:
# Preview the first five rows of the real dataset.
real_data.head(n=5)

Unnamed: 0,Age,Workclass,Education,Occupation,Relationship,Race,Sex,Hours-per-week,Over-50K
0,39,State-gov,Bachelors,Adm-clerical,Not-in-family,White,Male,40,<=50K
1,50,Self-emp-not-inc,Bachelors,Exec-managerial,Husband,White,Male,13,<=50K
2,38,Private,HS-grad,Handlers-cleaners,Not-in-family,White,Male,40,<=50K
3,53,Private,11th,Handlers-cleaners,Husband,Black,Male,40,<=50K
4,28,Private,Bachelors,Prof-specialty,Wife,Black,Female,40,<=50K


In [9]:
# Rows whose spouse role contradicts the recorded sex:
# a 'Husband' who is not 'Male', or a 'Wife' who is not 'Female'.
violations = real_data.loc[
    lambda frame: (
        ((frame['Relationship'] == 'Husband') & (frame['Sex'] != 'Male'))
        | ((frame['Relationship'] == 'Wife') & (frame['Sex'] != 'Female'))
    )
]

print('Number of violations in the real data:', len(violations))


Number of violations in the real data: 0


In [10]:
# Spouse/sex consistency check on the synthetic sample: select rows where a
# 'Husband' is not 'Male' or a 'Wife' is not 'Female'.
violations = synthetic_data.loc[
    lambda frame: (
        ((frame['Relationship'] == 'Husband') & (frame['Sex'] != 'Male'))
        | ((frame['Relationship'] == 'Wife') & (frame['Sex'] != 'Female'))
    )
]

# The count below is for the synthetic frame, so the label must say so.
print('Number of violations in the synthetic data:', len(violations))


Number of violations in the real data: 17


In [None]:
# Writing a Custom Constraint Class

def is_valid(column_names, data):
  """
  Check, row by row, whether the spouse role agrees with the recorded sex.

  Args:
    column_names: Sequence whose first item is the name of the Relationship
      column and whose second item is the name of the Sex column.
    data: pandas.DataFrame containing both columns.

  Returns:
    A boolean Series aligned with ``data``: True where the row satisfies
    the constraint, False where it does not.
  """

  relationship_col = column_names[0]
  sex_col = column_names[1]

  roles = data[relationship_col]
  sexes = data[sex_col]

  # A 'Husband' must be 'Male' and a 'Wife' must be 'Female'.
  husband_ok = (roles == 'Husband') & (sexes == 'Male')
  wife_ok = (roles == 'Wife') & (sexes == 'Female')

  # Every relationship other than the two spouse roles is accepted
  # regardless of sex.
  not_a_spouse = ~(roles.isin(['Husband', 'Wife']))

  return husband_ok | wife_ok | not_a_spouse


In [None]:
# Evaluate the constraint on every synthetic row, then tally how many rows
# pass (True) and how many fail (False).
validity_check = is_valid(
    column_names=['Relationship', 'Sex'],
    data=synthetic_data,
)
validity_check.value_counts()

In [None]:
def transform(column_names, data):
  """
  Prepare the constrained columns for model training.

  Every row whose Sex value is 'Female' or 'Male' has its Relationship value
  replaced by the most frequent Relationship value in ``data``. Rows with any
  other Sex value keep their Relationship unchanged.

  Args:
    column_names: Sequence whose first item is the name of the Relationship
      column and whose second item is the name of the Sex column.
    data: pandas.DataFrame containing both columns.

  Returns:
    A new DataFrame with the Relationship column masked as described.
    ``data`` itself is left unmodified.
  """

  # Assume the first column name is the Relationship column
  # and the second column is the Sex column
  relationship = column_names[0]
  sex = column_names[1]

  # Work on a copy so the frame the caller passed in (and may already have
  # displayed) is not silently rewritten.
  transformed = data.copy()

  # mode() is empty when the column is empty or entirely missing; in that
  # case there is no typical value to substitute, so return the copy as is.
  modes = transformed[relationship].mode()
  if modes.empty:
    return transformed
  typical_value = modes.iloc[0]

  # Use isin() for the "Female or Male" test. Writing
  # `data[sex] == 'Female' | data[sex] == 'Male'` does not work: `|` binds
  # tighter than `==`, so Python evaluates `'Female' | data[sex]` first and
  # the expression raises instead of producing a boolean mask.
  has_listed_sex = transformed[sex].isin(['Female', 'Male'])

  # NOTE(review): this masks the Relationship of every Female/Male row, not
  # only the Husband/Wife rows -- confirm that is the intended encoding.
  transformed[relationship] = transformed[relationship].mask(has_listed_sex, typical_value)

  return transformed



In [None]:
def reverse_transform(column_names, data):
  """
  Repair spouse roles so that they agree with the recorded sex.

  Only rows whose Relationship is 'Husband' or 'Wife' are touched: a spouse
  row with Sex 'Female' becomes 'Wife' and a spouse row with Sex 'Male'
  becomes 'Husband'. All other relationships are left as they are, because
  they are valid for either sex. Spouse rows whose Sex is neither 'Female'
  nor 'Male' are also left unchanged.

  Args:
    column_names: Sequence whose first item is the name of the Relationship
      column and whose second item is the name of the Sex column.
    data: pandas.DataFrame containing both columns.

  Returns:
    A new DataFrame with corrected spouse roles. ``data`` itself is left
    unmodified.
  """

  # Assume the first column name is the Relationship column
  # and the second column is the Sex column
  relationship = column_names[0]
  sex = column_names[1]

  # Work on a copy so the caller's frame is not rewritten in place.
  repaired = data.copy()

  # Restrict the rewrite to spouse rows. Masking on sex alone would turn
  # every Female row into 'Wife' and every Male row into 'Husband', wiping
  # out relationships such as 'Not-in-family' or 'Own-child'.
  is_spouse = repaired[relationship].isin(['Husband', 'Wife'])
  female_spouse = is_spouse & (repaired[sex] == 'Female')
  male_spouse = is_spouse & (repaired[sex] == 'Male')

  repaired[relationship] = repaired[relationship].mask(female_spouse, 'Wife')
  repaired[relationship] = repaired[relationship].mask(male_spouse, 'Husband')

  return repaired

In [None]:
from sdv.constraints import create_custom_constraint_class

# Bundle the three callbacks defined in this notebook (validity check,
# forward transform, reverse transform) into one constraint class.
SexToRelationshipConstraint = create_custom_constraint_class(
    transform_fn=transform,
    reverse_transform_fn=reverse_transform,
    is_valid_fn=is_valid,
)

In [None]:
# Constraint specification: names the constraint class to apply and the
# columns it receives, in the order (Relationship, Sex) that the callbacks
# index into.
wifes_are_females_husbands_are_males = {
    'constraint_parameters': {'column_names': ['Relationship', 'Sex']},
    'constraint_class': 'SexToRelationshipConstraint',
}


In [None]:
# Configure the DP_CGAN model; nothing is trained in this cell.
model = DP_CGAN(
    # Training schedule
    epochs=2,          # number of training epochs
    batch_size=100,    # the size of each batch
    discriminator_steps=1,
    # Network sizes and learning rates (same values for both networks)
    generator_dim=(128, 128, 128),
    discriminator_dim=(128, 128, 128),
    generator_lr=2e-4,
    discriminator_lr=2e-4,
    # Logging / mode switches
    log_frequency=True,
    verbose=True,
    private=False,     # NOTE(review): presumably turns the privacy mechanism off -- confirm
)


In [None]:
# Attach the spouse/sex constraint to the model.
# NOTE(review): the spec refers to the class by the name
# 'SexToRelationshipConstraint'; confirm the model can resolve that name to
# the class created in this notebook.
model.add_constraints([wifes_are_females_husbands_are_males])

In [None]:

# Single place to change where the trained generator and the sampled rows
# are written.
OUTPUT_DIR = Path("/content/gdrive/MyDrive/Notebooks/test data/generated data")
N_SYNTHETIC_ROWS = 100

# Check the destination before training: otherwise a missing directory is
# only discovered when writing the results, after the training run has
# already been paid for.
if not OUTPUT_DIR.is_dir():
    raise FileNotFoundError(f"Output directory does not exist: {OUTPUT_DIR}")

print("Start training model")
model.fit(real_data)
model.save(str(OUTPUT_DIR / "generator.pkl"))

# Generate synthetic rows and store them next to the saved generator.
syn_data = model.sample(N_SYNTHETIC_ROWS)
syn_data.to_csv(str(OUTPUT_DIR / "syn_data_file.csv"))