# Notebook used to evaluate the ML efficiency

In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
import pandas as pd
from datasets import load_dataset
import numpy as np
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def build_mapping(unique):
    letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    letter_idx = 0
    offset = -1
    mapping = {}
    for value in unique:
        if offset == -1:
            mapping[value] = letters[letter_idx]
            letter_idx += 1
        else:
            mapped = letters[offset] + letters[letter_idx]
            mapping[value] = mapped
            letter_idx += 1
            
        if letter_idx == 26:
            offset += 1
            letter_idx = 0
    return mapping

def reverse_mapping(mapping):
    res = {}
    for k in mapping.keys():
        res[mapping[k]] = k
    return res

In [3]:
def anonymize_dataset(dataframe : pd.DataFrame, cat_cols : list):
    """
    Anonymize the given dataset with the following properties
    - continuous values of features are scaled with MinMaxScaler (between 0 and 1)
    - categorical values of features are replaced by capital letters (A, B, ..., AA, AB, ...)
    - names of features are replaced by capital letters (A, B, ..., AA, AA, ...)
    
    args:
        * df: the dataframe to anonymize
        * cat_cols: list of categorical columns in df
    
    Returns the anonymized dataframe with the fit scaler, categorical mapping for each feature and column mapping
    """
    df = dataframe.copy()
    not_cat_cols = list(df.columns)
    cat_mapping = {}
    
    for c in cat_cols:
        not_cat_cols.remove(c)  # remove the column name from all the columns for later
        
        unique = df[c].unique()     # create mapping and replace values in categorical features
        mapping = build_mapping(unique)
        cat_mapping[c] = mapping
        df[c].replace(mapping, inplace=True)
        
    # scale continuous features    
    scaler = MinMaxScaler()
    df[not_cat_cols] = scaler.fit_transform(df[not_cat_cols])
    
    # replace columns names
    col_mapping = build_mapping(df.columns)
    df.rename(columns=col_mapping, inplace=True)
    
    return df, scaler, cat_mapping, col_mapping

In [2]:
data = load_dataset('scikit-learn/adult-census-income', split="train").to_pandas()
data.shape

(32561, 15)

In [10]:
cols = data.columns
letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
col_mapping = {}

for i, col in enumerate(cols):
    col_mapping[col] = letters[i]
col_mapping

{'age': 'A',
 'workclass': 'B',
 'fnlwgt': 'C',
 'education': 'D',
 'education.num': 'E',
 'marital.status': 'F',
 'occupation': 'G',
 'relationship': 'H',
 'race': 'I',
 'sex': 'J',
 'capital.gain': 'K',
 'capital.loss': 'L',
 'hours.per.week': 'M',
 'native.country': 'N',
 'income': 'O'}

In [11]:
renamed = data.copy()
renamed.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [13]:
# full renaming:
# values in categorical features are renamed with A B C D ... AA AB AC ... BA BB BC ...
# values in continuous features are scaled (one scaler for each continuous feature)
# mapping (for categorical) and scaler (for continuous) should be saved to do inverse operation afterwards

# process categorical columns
cat_cols = ["workclass", "education", "marital.status", "occupation", "relationship", "race", "sex", "native.country", "income"]
cat_mapping = {}            
        
for col in cat_cols:
    unique = data[col].unique()
    mapping = build_mapping(unique)
    cat_mapping[col] = mapping
    renamed[col].replace(mapping, inplace=True)
        
renamed.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,A,77053,A,9,A,A,A,A,A,0,4356,40,A,A
1,82,B,132870,A,9,A,B,A,A,A,0,4356,18,A,A
2,66,A,186061,B,10,A,A,B,B,A,0,4356,40,A,A
3,54,B,140359,C,4,B,C,B,A,A,0,3900,40,A,A
4,41,B,264663,B,10,C,D,C,A,A,0,3900,40,A,A


In [15]:
# process continuous features

cont_cols = data.drop(cat_cols, axis=1).columns

scaler = MinMaxScaler()
renamed[cont_cols] = scaler.fit_transform(renamed[cont_cols])
renamed.head()


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,1.0,A,0.043987,A,0.533333,A,A,A,A,A,0.0,1.0,0.397959,A,A
1,0.890411,B,0.081896,A,0.533333,A,B,A,A,A,0.0,1.0,0.173469,A,A
2,0.671233,A,0.118021,B,0.6,A,A,B,B,A,0.0,1.0,0.397959,A,A
3,0.506849,B,0.086982,C,0.2,B,C,B,A,A,0.0,0.895317,0.397959,A,A
4,0.328767,B,0.171404,B,0.6,C,D,C,A,A,0.0,0.895317,0.397959,A,A


In [16]:
renamed = renamed.rename(columns=col_mapping)
renamed.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O
0,1.0,A,0.043987,A,0.533333,A,A,A,A,A,0.0,1.0,0.397959,A,A
1,0.890411,B,0.081896,A,0.533333,A,B,A,A,A,0.0,1.0,0.173469,A,A
2,0.671233,A,0.118021,B,0.6,A,A,B,B,A,0.0,1.0,0.397959,A,A
3,0.506849,B,0.086982,C,0.2,B,C,B,A,A,0.0,0.895317,0.397959,A,A
4,0.328767,B,0.171404,B,0.6,C,D,C,A,A,0.0,0.895317,0.397959,A,A


In [17]:
renamed.to_csv("adult_full_rename.csv")

In [18]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [19]:
renamed.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O
0,1.0,A,0.043987,A,0.533333,A,A,A,A,A,0.0,1.0,0.397959,A,A
1,0.890411,B,0.081896,A,0.533333,A,B,A,A,A,0.0,1.0,0.173469,A,A
2,0.671233,A,0.118021,B,0.6,A,A,B,B,A,0.0,1.0,0.397959,A,A
3,0.506849,B,0.086982,C,0.2,B,C,B,A,A,0.0,0.895317,0.397959,A,A
4,0.328767,B,0.171404,B,0.6,C,D,C,A,A,0.0,0.895317,0.397959,A,A


In [23]:
with open("cat_mapping_scaler", "wb") as file:
    pickle.dump((cat_mapping, col_mapping, scaler), file)

In [3]:
with open('cat_mapping_scaler', 'rb') as file:
    cat_mapping, col_mapping, scaler = pickle.load(file)

In [12]:
# build reverse mapping

r_col_map = reverse_mapping(col_mapping)
r_cat_map = {}
for k in cat_mapping.keys():
    r_cat_map[k] = reverse_mapping(cat_mapping[k])
        


In [14]:
# build back the synth dataset

fake = pd.read_csv("synth_samples/adult_full_rename_synthetic.csv")
fake.drop("Unnamed: 0", axis=1, inplace=True)

fake.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O
0,0.09589,B,0.174026,A,0.533333,E,G,E,A,B,0.0,0.0,0.397959,A,A
1,0.438356,F,0.073244,B,0.6,E,H,E,A,B,0.0,0.0,0.602041,A,A
2,0.041096,B,0.086746,B,0.6,D,G,C,A,B,0.0,0.0,0.397959,A,A
3,0.30137,B,0.192878,A,0.533333,D,E,C,B,B,0.0,0.0,0.397959,A,A
4,0.246575,B,0.113557,B,0.6,D,F,C,A,A,0.0,0.0,0.397959,A,A


In [15]:
fake = fake.rename(columns=r_col_map)
fake.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,0.09589,B,0.174026,A,0.533333,E,G,E,A,B,0.0,0.0,0.397959,A,A
1,0.438356,F,0.073244,B,0.6,E,H,E,A,B,0.0,0.0,0.602041,A,A
2,0.041096,B,0.086746,B,0.6,D,G,C,A,B,0.0,0.0,0.397959,A,A
3,0.30137,B,0.192878,A,0.533333,D,E,C,B,B,0.0,0.0,0.397959,A,A
4,0.246575,B,0.113557,B,0.6,D,F,C,A,A,0.0,0.0,0.397959,A,A


In [17]:
cat_cols = ["workclass", "education", "marital.status", "occupation", "relationship", "race", "sex", "native.country", "income"]

for col in cat_cols:
    fake[col].replace(r_cat_map[col], inplace=True)
fake.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,0.09589,Private,0.174026,HS-grad,0.533333,Married-civ-spouse,Craft-repair,Husband,White,Male,0.0,0.0,0.397959,United-States,<=50K
1,0.438356,Self-emp-inc,0.073244,Some-college,0.6,Married-civ-spouse,Transport-moving,Husband,White,Male,0.0,0.0,0.602041,United-States,<=50K
2,0.041096,Private,0.086746,Some-college,0.6,Never-married,Craft-repair,Own-child,White,Male,0.0,0.0,0.397959,United-States,<=50K
3,0.30137,Private,0.192878,HS-grad,0.533333,Never-married,Other-service,Own-child,Black,Male,0.0,0.0,0.397959,United-States,<=50K
4,0.246575,Private,0.113557,Some-college,0.6,Never-married,Adm-clerical,Own-child,White,Female,0.0,0.0,0.397959,United-States,<=50K


In [18]:
tmp = fake.drop(cat_cols, axis=1)
not_cat_cols = tmp.columns
not_cat_cols

Index(['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss',
       'hours.per.week'],
      dtype='object')

In [19]:
fake[not_cat_cols] = scaler.inverse_transform(fake[not_cat_cols])
fake.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,24.0,Private,268525.0,HS-grad,9.0,Married-civ-spouse,Craft-repair,Husband,White,Male,0.0,0.0,40.0,United-States,<=50K
1,49.0,Self-emp-inc,120131.0,Some-college,10.0,Married-civ-spouse,Transport-moving,Husband,White,Male,0.0,0.0,60.0,United-States,<=50K
2,20.0,Private,140011.0,Some-college,10.0,Never-married,Craft-repair,Own-child,White,Male,0.0,0.0,40.0,United-States,<=50K
3,39.0,Private,296282.0,HS-grad,9.0,Never-married,Other-service,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,35.0,Private,179488.0,Some-college,10.0,Never-married,Adm-clerical,Own-child,White,Female,0.0,0.0,40.0,United-States,<=50K


In [None]:
fake.to_csv("synth_samples/adult_full_rename_synthetic_translated.csv", index=False)

### anonymize california

In [21]:
real = pd.read_csv("housing.csv")
cat_cols = ["ocean_proximity"]

In [22]:
renamed, scaler, cat_mapping, col_mapping = anonymize_dataset(real, cat_cols)
renamed.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
0,0.211155,0.567481,0.784314,0.022331,0.019863,0.008941,0.020556,0.539668,0.902266,A
1,0.212151,0.565356,0.392157,0.180503,0.171477,0.06721,0.186976,0.538027,0.708247,A
2,0.210159,0.564293,1.0,0.03726,0.02933,0.013818,0.028943,0.466028,0.695051,A
3,0.209163,0.564293,1.0,0.032352,0.036313,0.015555,0.035849,0.354699,0.672783,A
4,0.209163,0.564293,1.0,0.04133,0.043296,0.015752,0.042427,0.230776,0.674638,A


In [24]:
with open("scaler_mappings_housing", "wb") as file:
    pickle.dump((scaler, cat_mapping, col_mapping), file)

In [22]:
with open("scaler_mappings_housing", "rb") as file:
    scaler, cat_mapping, col_mapping = pickle.load(file)

In [None]:
renamed.to_csv("housing_renamed.csv", index=False)

In [23]:
real = pd.read_csv("housing.csv")
renamed = pd.read_csv("synth_samples/housing_renamed_synthetic.csv")
renamed

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
0,0.719124,0.015940,0.549020,0.031716,0.041279,0.022618,0.043907,0.112074,0.149486,D
1,0.594622,0.161530,0.666667,0.036167,0.040968,0.036128,0.039138,0.293968,1.000000,B
2,0.240040,0.510096,0.392157,0.025561,0.041434,0.019731,0.042427,0.082764,0.432990,B
3,0.703187,0.246546,0.627451,0.050791,0.057573,0.023739,0.057556,0.282334,0.190517,C
4,0.305777,0.638682,0.274510,0.030063,0.039727,0.023039,0.045387,0.066530,0.157321,C
...,...,...,...,...,...,...,...,...,...,...
20495,0.632470,0.134963,0.333333,0.037464,0.037709,0.020993,0.036507,0.212907,0.348660,B
20496,0.700199,0.167906,0.196078,0.051300,0.083178,0.065164,0.085183,0.136702,0.082476,C
20497,0.612550,0.142402,0.470588,0.034768,0.043296,0.024328,0.043743,0.167536,0.304125,B
20498,0.513944,0.192349,0.568627,0.044763,0.069212,0.029429,0.066930,0.229314,0.602474,D


In [24]:
not_cat_cols = list(real.columns)
not_cat_cols.remove("ocean_proximity")
col_mapping = reverse_mapping(col_mapping)
fake = renamed.rename(columns=col_mapping)
fake.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,0.719124,0.01594,0.54902,0.031716,0.041279,0.022618,0.043907,0.112074,0.149486,D
1,0.594622,0.16153,0.666667,0.036167,0.040968,0.036128,0.039138,0.293968,1.0,B
2,0.24004,0.510096,0.392157,0.025561,0.041434,0.019731,0.042427,0.082764,0.43299,B
3,0.703187,0.246546,0.627451,0.050791,0.057573,0.023739,0.057556,0.282334,0.190517,C
4,0.305777,0.638682,0.27451,0.030063,0.039727,0.023039,0.045387,0.06653,0.157321,C


In [25]:
fake["ocean_proximity"].replace(reverse_mapping(cat_mapping["ocean_proximity"]), inplace=True)
fake.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,0.719124,0.01594,0.54902,0.031716,0.041279,0.022618,0.043907,0.112074,0.149486,NEAR OCEAN
1,0.594622,0.16153,0.666667,0.036167,0.040968,0.036128,0.039138,0.293968,1.0,<1H OCEAN
2,0.24004,0.510096,0.392157,0.025561,0.041434,0.019731,0.042427,0.082764,0.43299,<1H OCEAN
3,0.703187,0.246546,0.627451,0.050791,0.057573,0.023739,0.057556,0.282334,0.190517,INLAND
4,0.305777,0.638682,0.27451,0.030063,0.039727,0.023039,0.045387,0.06653,0.157321,INLAND


In [26]:
fake[not_cat_cols] = scaler.inverse_transform(fake[not_cat_cols])
fake.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-117.13,32.69,29.0,1249.0,267.0,810.0,268.0,2.125,87500.0,NEAR OCEAN
1,-118.38,34.06,35.0,1424.0,265.0,1292.0,239.0,4.7625,500001.0,<1H OCEAN
2,-121.94,37.34,21.0,1007.0,268.0,707.0,259.0,1.7,225000.0,<1H OCEAN
3,-117.29,34.86,33.0,1999.0,372.0,850.0,351.0,4.5938,107400.0,INLAND
4,-121.28,38.55,15.0,1184.0,257.0,825.0,277.0,1.4646,91300.0,INLAND


In [None]:
fake.to_csv("synth_samples/housing_renamed_synthetic_translated.csv", index=False)

### anonymize HELOC

In [4]:
real = pd.read_csv("heloc.csv")
cat_cols = ["RiskPerformance"]
real

Unnamed: 0,RiskPerformance,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,...,PercentInstallTrades,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance
0,Bad,55,144,4,84,20,3,0,83,2,...,43,0,0,0,33,-8,8,1,1,69
1,Bad,61,58,15,41,2,4,4,100,-7,...,67,0,0,0,0,-8,0,-8,-8,0
2,Bad,67,66,5,24,9,0,0,100,-7,...,44,0,4,4,53,66,4,2,1,86
3,Bad,66,169,1,73,28,1,1,93,76,...,57,0,5,4,72,83,6,4,3,91
4,Bad,81,333,27,132,12,0,0,100,-7,...,25,0,1,1,51,89,3,1,0,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10454,Good,73,131,5,57,21,0,0,95,80,...,19,7,0,0,26,-8,5,2,0,100
10455,Bad,65,147,39,68,11,0,0,92,28,...,42,1,1,1,86,53,2,2,1,80
10456,Bad,74,129,6,64,18,1,1,100,-7,...,33,3,4,4,6,-8,5,-8,0,56
10457,Bad,72,234,12,113,42,2,2,96,35,...,20,6,0,0,19,-8,4,1,0,38


In [6]:
renamed, scaler, cat_mapping, col_mapping = anonymize_dataset(real, cat_cols)
renamed

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,...,O,P,Q,R,S,T,U,V,W,X
0,A,0.621359,0.188424,0.033163,0.237245,0.329545,0.428571,0.321429,0.844037,0.119565,...,0.477064,0.272727,0.120000,0.120000,0.174274,0.002083,0.414634,0.31250,0.370370,0.715596
1,A,0.679612,0.082512,0.061224,0.127551,0.125000,0.464286,0.464286,1.000000,0.021739,...,0.697248,0.272727,0.120000,0.120000,0.037344,0.002083,0.219512,0.03125,0.037037,0.082569
2,A,0.737864,0.092365,0.035714,0.084184,0.204545,0.321429,0.321429,1.000000,0.021739,...,0.486239,0.272727,0.173333,0.173333,0.257261,0.156250,0.317073,0.34375,0.370370,0.871560
3,A,0.728155,0.219212,0.025510,0.209184,0.420455,0.357143,0.357143,0.935780,0.923913,...,0.605505,0.272727,0.186667,0.173333,0.336100,0.191667,0.365854,0.40625,0.444444,0.917431
4,A,0.873786,0.421182,0.091837,0.359694,0.238636,0.321429,0.321429,1.000000,0.021739,...,0.311927,0.272727,0.133333,0.133333,0.248963,0.204167,0.292683,0.31250,0.333333,0.816514
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10454,B,0.796117,0.172414,0.035714,0.168367,0.340909,0.321429,0.321429,0.954128,0.967391,...,0.256881,0.484848,0.120000,0.120000,0.145228,0.002083,0.341463,0.34375,0.333333,1.000000
10455,A,0.718447,0.192118,0.122449,0.196429,0.227273,0.321429,0.321429,0.926606,0.402174,...,0.467890,0.303030,0.133333,0.133333,0.394191,0.129167,0.268293,0.34375,0.370370,0.816514
10456,A,0.805825,0.169951,0.038265,0.186224,0.306818,0.357143,0.357143,1.000000,0.021739,...,0.385321,0.363636,0.173333,0.173333,0.062241,0.002083,0.341463,0.03125,0.333333,0.596330
10457,A,0.786408,0.299261,0.053571,0.311224,0.579545,0.392857,0.392857,0.963303,0.478261,...,0.266055,0.454545,0.120000,0.120000,0.116183,0.002083,0.317073,0.31250,0.333333,0.431193


In [7]:
renamed.to_csv("heloc_renamed.csv", index=False)

In [None]:
with open('scaler_mapping_heloc', 'wb') as file:
    pickle.dump((scaler, cat_mapping, col_mapping), file)

In [4]:
with open('scaler_mapping_heloc', 'rb') as file:
    scaler, cat_mapping, col_mapping = pickle.load(file)

In [9]:
fake = pd.read_csv("synth_samples/heloc_renamed_synthetic.csv")
cat_cols = ["RiskPerformance"]
fake.shape

(10400, 24)

In [12]:
fake = fake.rename(columns=reverse_mapping(col_mapping))
not_cat_cols = list(fake.columns)
not_cat_cols.remove(cat_cols[0])
fake.head()

Unnamed: 0,RiskPerformance,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,...,PercentInstallTrades,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance
0,B,0.873786,0.253695,0.045918,0.270408,0.443182,0.321429,0.321429,0.944954,0.902174,...,0.357798,0.030303,0.12,0.12,0.070539,0.002083,0.292683,0.3125,0.333333,0.0
1,B,0.902913,0.26601,0.028061,0.27551,0.420455,0.321429,0.321429,0.908257,0.467391,...,0.238532,0.272727,0.133333,0.133333,0.045643,0.002083,0.243902,0.3125,0.333333,0.2844
2,B,0.815534,0.139163,0.053571,0.158163,0.397727,0.321429,0.321429,1.0,0.021739,...,0.220183,0.272727,0.12,0.12,0.190871,0.1,0.341463,0.3125,0.37037,0.48624
3,B,0.854369,0.219212,0.033163,0.239796,0.545455,0.321429,0.321429,0.981651,0.01087,...,0.33945,0.545455,0.12,0.12,0.037344,0.15625,0.414634,0.34375,0.333333,0.54128
4,A,0.747573,0.157635,0.028061,0.104592,0.193182,0.321429,0.321429,0.889908,0.315217,...,0.311927,0.272727,0.146667,0.146667,0.195021,0.225,0.292683,0.34375,0.333333,0.84404


In [13]:
fake[not_cat_cols] = scaler.inverse_transform(fake[not_cat_cols])
fake.head()

Unnamed: 0,RiskPerformance,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,...,PercentInstallTrades,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance
0,B,81.0,197.0,9.0,97.0,30.0,0.0,0.0,94.0,74.0,...,30.0,-8.0,-1.040834e-15,-1.040834e-15,8.0,-8.0,3.0,1.0,0.0,-9.0
1,B,84.0,207.0,2.0,99.0,28.0,0.0,0.0,90.0,34.0,...,17.0,0.0,1.0,1.0,2.0,-8.0,1.0,1.0,0.0,21.9996
2,B,75.0,104.0,12.0,53.0,26.0,0.0,0.0,100.0,-7.0,...,15.0,0.0,-1.040834e-15,-1.040834e-15,37.0,39.0,5.0,1.0,1.0,44.00016
3,B,79.0,169.0,4.0,85.0,39.0,0.0,0.0,98.0,-8.0,...,28.0,9.0,-1.040834e-15,-1.040834e-15,-1.505046e-14,66.0,8.0,2.0,0.0,49.99952
4,A,68.0,119.0,2.0,32.0,8.0,0.0,0.0,88.0,20.0,...,25.0,0.0,2.0,2.0,38.0,99.0,3.0,2.0,0.0,83.00036


In [14]:
fake[cat_cols[0]].replace(reverse_mapping(cat_mapping[cat_cols[0]]), inplace=True)
fake.head()

Unnamed: 0,RiskPerformance,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,...,PercentInstallTrades,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance
0,Good,81.0,197.0,9.0,97.0,30.0,0.0,0.0,94.0,74.0,...,30.0,-8.0,-1.040834e-15,-1.040834e-15,8.0,-8.0,3.0,1.0,0.0,-9.0
1,Good,84.0,207.0,2.0,99.0,28.0,0.0,0.0,90.0,34.0,...,17.0,0.0,1.0,1.0,2.0,-8.0,1.0,1.0,0.0,21.9996
2,Good,75.0,104.0,12.0,53.0,26.0,0.0,0.0,100.0,-7.0,...,15.0,0.0,-1.040834e-15,-1.040834e-15,37.0,39.0,5.0,1.0,1.0,44.00016
3,Good,79.0,169.0,4.0,85.0,39.0,0.0,0.0,98.0,-8.0,...,28.0,9.0,-1.040834e-15,-1.040834e-15,-1.505046e-14,66.0,8.0,2.0,0.0,49.99952
4,Bad,68.0,119.0,2.0,32.0,8.0,0.0,0.0,88.0,20.0,...,25.0,0.0,2.0,2.0,38.0,99.0,3.0,2.0,0.0,83.00036


In [15]:
fake.to_csv("synth_samples/heloc_renamed_synthetic.translated.csv", index=False)

## Adult income

NOTE: small mistake, GReaT trained on full adult income dataset, not split into train test
perf may be a bit higher than expected when trained on synthetic data

In [5]:
original = load_dataset('scikit-learn/adult-census-income', split="train").to_pandas()
# synth = pd.read_csv("synth_samples/adult_synthetic.csv")

synth = pd.read_csv("synth_samples/adult_rename_synthetic.csv")
mapping = {}
cols = original.columns
letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
for i, letter in enumerate(letters[:len(cols)]):
    mapping[letter] = cols[i]
synth = synth.rename(columns=mapping)

y_og = original["income"]
original.drop("income", axis=1, inplace=True)
y_syn = synth["income"]
synth.drop("income", axis=1, inplace=True)

synth.drop("Unnamed: 0", axis=1, inplace=True)

print(original.shape, synth.shape)

(32561, 14) (32500, 14)


In [6]:
cat_cols = ["workclass", "education", "marital.status", "occupation", "relationship", "race", "sex", "native.country"]

enc = OneHotEncoder()
og_one_hot = pd.DataFrame(enc.fit_transform(original[cat_cols]).toarray(), columns=enc.get_feature_names_out())

original.drop(cat_cols, axis=1, inplace=True)
original = pd.concat([original, og_one_hot, y_og], axis=1)



enc = OneHotEncoder()
synth_one_hot = pd.DataFrame(enc.fit_transform(synth[cat_cols]).toarray(), columns=enc.get_feature_names_out())

synth.drop(cat_cols, axis=1, inplace=True)
synth = pd.concat([synth, synth_one_hot, y_syn], axis=1)


print(original.shape, synth.shape)


(32561, 109) (32500, 104)


In [7]:
unk = []
for c in synth.columns:
    if c not in original.columns:
        unk.append(c)
        
for c in original.columns:
    if c not in synth.columns:
        unk.append(c)     
print(len(unk))

# deleting columns that are not in common
original.drop(unk, axis=1, inplace=True, errors="ignore")
synth.drop(unk, axis=1, inplace=True, errors="ignore")

print(original.shape, synth.shape)


5
(32561, 104) (32500, 104)


In [8]:
og_train, og_test = train_test_split(original, test_size=0.2, random_state=42)
synth_train, synth_test = train_test_split(synth, test_size=0.2, random_state=42)

y_og_train      = og_train["income"]
y_og_test       = og_test["income"]
y_synth_train   = synth_train["income"]

og_train.drop("income", axis=1, inplace=True)
og_test.drop("income", axis=1, inplace=True)
synth_train.drop("income", axis=1, inplace=True)
synth_test.drop("income", axis=1, inplace=True)


print(og_train.shape, og_test.shape, synth_train.shape)
print(y_og_train.shape, y_og_test.shape, y_synth_train.shape)

(26048, 103) (6513, 103) (26000, 103)
(26048,) (6513,) (26000,)


### Train real, test real

In [9]:
model = DecisionTreeClassifier(max_depth=8)

In [10]:
model.fit(og_train, y_og_train)

In [11]:
model.score(og_test, y_og_test)*100

85.5980346998311

In [11]:
model.score(og_test, y_og_test)*100

85.55197297712269

### train synthetic, test real

In [12]:
model_synth = DecisionTreeClassifier(max_depth=8)

In [13]:
model_synth.fit(synth_train, y_synth_train)

In [14]:
model_synth.score(og_test, y_og_test)*100  # without column renaming

84.79963150621833

In [14]:
model_synth.score(og_test, y_og_test)*100  # with column renaming

84.87640104406572

## California housing

In [15]:
real = pd.read_csv("housing.csv")
fake = pd.read_csv("synth_samples/housing_synthetic.csv")

fake.drop("Unnamed: 0", axis=1, inplace=True)

real.shape, fake.shape


((20640, 10), (20600, 10))

In [4]:
real.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [16]:

enc = OneHotEncoder()
real_prox = pd.DataFrame(enc.fit_transform(real["ocean_proximity"].values.reshape(-1, 1)).toarray(), columns=enc.get_feature_names_out())

real.drop("ocean_proximity", axis=1, inplace=True)
real = pd.concat([real, real_prox], axis=1)



enc = OneHotEncoder()
fake_prox = pd.DataFrame(enc.fit_transform(fake["ocean_proximity"].values.reshape(-1, 1)).toarray(), columns=enc.get_feature_names_out())

fake.drop("ocean_proximity", axis=1, inplace=True)
fake = pd.concat([fake, fake_prox], axis=1)

real.shape, fake.shape


((20640, 14), (20600, 14))

In [20]:
real_train, real_test = train_test_split(real, test_size=0.2, random_state=42)
fake_train, fake_test = train_test_split(fake, test_size=0.2, random_state=42)

y_train_real = real_train["median_house_value"]
y_test_real = real_test["median_house_value"]
y_train_fake = fake_train["median_house_value"]

real_train.drop("median_house_value", axis=1, inplace=True)
real_test.drop("median_house_value", axis=1, inplace=True)
fake_train.drop("median_house_value", axis=1, inplace=True)

print(real_train.shape, real_test.shape, fake_train.shape)
print(y_train_real.shape, y_test_real.shape, y_train_fake.shape)

(16512, 13) (4128, 13) (16480, 13)
(16512,) (4128,) (16480,)


### train real test real

In [21]:
from sklearn.tree import DecisionTreeRegressor

In [22]:
model = DecisionTreeRegressor(max_depth=10)

In [23]:
model.fit(real_train, y_train_real)

In [24]:
model.score(real_test, y_test_real)

0.7084213138706404

### Train synthetic test real

In [25]:
model = DecisionTreeRegressor(max_depth=10)

In [26]:
model.fit(fake_train, y_train_fake)

In [27]:
model.score(real_test, y_test_real)

0.5621995981355259

## HELOC

In [47]:
real = pd.read_csv("heloc.csv")
fake = pd.read_csv("synth_samples/heloc_synthetic.csv")

fake.drop("Unnamed: 0", axis=1, inplace=True)

real.shape, fake.shape

((10459, 24), (10400, 24))

In [48]:
real.head(5)

Unnamed: 0,RiskPerformance,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,...,PercentInstallTrades,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance
0,Bad,55,144,4,84,20,3,0,83,2,...,43,0,0,0,33,-8,8,1,1,69
1,Bad,61,58,15,41,2,4,4,100,-7,...,67,0,0,0,0,-8,0,-8,-8,0
2,Bad,67,66,5,24,9,0,0,100,-7,...,44,0,4,4,53,66,4,2,1,86
3,Bad,66,169,1,73,28,1,1,93,76,...,57,0,5,4,72,83,6,4,3,91
4,Bad,81,333,27,132,12,0,0,100,-7,...,25,0,1,1,51,89,3,1,0,80


In [49]:
y_real = real["RiskPerformance"]
X_real = real.drop("RiskPerformance", axis=1)

y_fake = fake["RiskPerformance"]
X_fake = fake.drop("RiskPerformance", axis=1)

print(X_real.shape, X_fake.shape)
print(y_real.shape, y_fake.shape)

(10459, 23) (10400, 23)
(10459,) (10400,)


In [50]:
X_real_train, X_real_test, y_real_train, y_real_test = train_test_split(X_real, y_real, test_size=0.2)
X_fake_train, X_fake_test, y_fake_train, y_fake_test = train_test_split(X_fake, y_fake, test_size=0.2)

X_fake_train.shape

(8320, 23)

### train real test real

In [51]:
model = DecisionTreeClassifier(max_depth=6)

model.fit(X_real_train, y_real_train)

In [53]:
model.score(X_real_test, y_real_test)*100

68.78585086042065

### train synthetic test real

In [54]:
model = DecisionTreeClassifier(max_depth=6)

model.fit(X_fake_train, y_fake_train)

In [55]:
score_fake = model.score(X_fake_test, y_fake_test)
score_tstr = model.score(X_real_test, y_real_test)

print(f"score fake: {score_fake*100}")
print(f"score TS-TR: {score_tstr*100}")

score fake: 77.59615384615385
score TS-TR: 67.97323135755258


In [20]:
scores_fake = cross_val_score(model, X_fake, y_fake, cv=cv)

print(np.mean(scores_fake)*100, np.std(scores_fake)*100)

78.52884615384615 0.5315490369648156


note: paper claims to avoid data leakage with split 0.8 0.2 when learning data but then uses a cross val score and do not mention on which data they do the cross val

## sick

In [37]:
# https://datahub.io/machine-learning/sick

real = pd.read_csv("sick.csv")
fake = pd.read_csv("synth_samples/sick_synthetic.csv")

fake.drop("Unnamed: 0", axis=1, inplace=True)

real.shape, fake.shape

((3772, 30), (3700, 30))

In [38]:
cat_cols = ["sex",
            "on_thyroxine",
            "query_on_thyroxine",
            'on_antithyroid_medication',
            'sick',
            "pregnant",
            'thyroid_surgery',
            'I131_treatment',
            'query_hypothyroid',
            'query_hyperthyroid',
            'lithium',
            'goitre',
            'tumor',
            'hypopituitary',
            'psych',
            'TSH_measured',
            'T3_measured',
            'TT4_measured',
            'T4U_measured',
            'FTI_measured',
            'TBG_measured',
            'TBG',
            'referral_source',
            ]

In [39]:
enc = OneHotEncoder()
real_enc = pd.DataFrame(enc.fit_transform(real[cat_cols]).toarray(), columns=enc.get_feature_names_out())

real.drop(cat_cols, axis=1, inplace=True)
real = pd.concat([real, real_enc], axis=1)


enc = OneHotEncoder()
fake_enc = pd.DataFrame(enc.fit_transform(fake[cat_cols]).toarray(), columns=enc.get_feature_names_out())

fake.drop(cat_cols, axis=1, inplace=True)
fake = pd.concat([fake, fake_enc], axis=1)

real.shape, fake.shape

((3772, 55), (3700, 54))

In [40]:
missing = []
for c in real.columns:
    if c not in fake.columns:
        missing.append(c)
        
real.drop(missing, axis=1, inplace=True)
print(real.shape)
print(missing)

(3772, 54)
['hypopituitary_t']


In [41]:
y_real = real["Class"]
X_real = real.drop("Class", axis=1)

y_fake = fake["Class"]
X_fake = fake.drop("Class", axis=1)

print(X_real.shape, X_fake.shape)
print(y_real.shape, y_fake.shape)

(3772, 53) (3700, 53)
(3772,) (3700,)


In [42]:
X_real_train, X_real_test, y_real_train, y_real_test = train_test_split(X_real, y_real, test_size=0.2)
X_fake_train, X_fake_test, y_fake_train, y_fake_test = train_test_split(X_fake, y_fake, test_size=0.2)

print(X_fake_train.shape)

(2960, 53)


### train real test real

In [43]:
model = DecisionTreeClassifier(max_depth=10)

model.fit(X_real_train, y_real_train)

In [44]:
model.score(X_real_test, y_real_test)

0.9841059602649007

### train synthetic test real

In [45]:
model = DecisionTreeClassifier(max_depth=10)

model.fit(X_fake_train, y_fake_train)

In [46]:
score_fake = model.score(X_fake_test, y_fake_test)
score_tstr = model.score(X_real_test, y_real_test)

print(f"score fake: {score_fake*100}")
print(f"score TS-TR: {score_tstr*100}")

score fake: 99.32432432432432
score TS-TR: 96.68874172185431
