# Notebook used to evaluate the ML efficiency

In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


NOTE: small mistake, GReaT trained on full adult income dataset, not split into train test
perf may be a bit higher than expected when trained on synthetic data

In [4]:
original = load_dataset('scikit-learn/adult-census-income', split="train").to_pandas()
synth = pd.read_csv("synth_samples/adult_synthetic.csv")

y_og = original["income"]
original.drop("income", axis=1, inplace=True)
y_syn = synth["income"]
synth.drop("income", axis=1, inplace=True)

synth.drop("Unnamed: 0", axis=1, inplace=True)

print(original.shape, synth.shape)

(32561, 14) (32500, 14)


In [5]:
cat_cols = ["workclass", "education", "marital.status", "occupation", "relationship", "race", "sex", "native.country"]

enc = OneHotEncoder()
og_one_hot = pd.DataFrame(enc.fit_transform(original[cat_cols]).toarray(), columns=enc.get_feature_names_out())

original.drop(cat_cols, axis=1, inplace=True)
original = pd.concat([original, og_one_hot, y_og], axis=1)



enc = OneHotEncoder()
synth_one_hot = pd.DataFrame(enc.fit_transform(synth[cat_cols]).toarray(), columns=enc.get_feature_names_out())

synth.drop(cat_cols, axis=1, inplace=True)
synth = pd.concat([synth, synth_one_hot, y_syn], axis=1)


print(original.shape, synth.shape)


(32561, 109) (32500, 159)


In [6]:
unk = []
for c in synth.columns:
    if c not in original.columns:
        unk.append(c)
        
for c in original.columns:
    if c not in synth.columns:
        unk.append(c)     
print(len(unk))

# deleting columns that are not in common
original.drop(unk, axis=1, inplace=True, errors="ignore")
synth.drop(unk, axis=1, inplace=True, errors="ignore")

print(original.shape, synth.shape)


62
(32561, 103) (32500, 103)


In [7]:
og_train, og_test = train_test_split(original, test_size=0.2, random_state=42)
synth_train, synth_test = train_test_split(synth, test_size=0.2, random_state=42)

y_og_train      = og_train["income"]
y_og_test       = og_test["income"]
y_synth_train   = synth_train["income"]

og_train.drop("income", axis=1, inplace=True)
og_test.drop("income", axis=1, inplace=True)
synth_train.drop("income", axis=1, inplace=True)
synth_test.drop("income", axis=1, inplace=True)


print(og_train.shape, og_test.shape, synth_train.shape)
print(y_og_train.shape, y_og_test.shape, y_synth_train.shape)

(26048, 102) (6513, 102) (26000, 102)
(26048,) (6513,) (26000,)


## Train real, test real

In [8]:
model = DecisionTreeClassifier(max_depth=8)

In [9]:
model.fit(og_train, y_og_train)

In [11]:
model.score(og_test, y_og_test)*100

85.5980346998311

## train synthetic, test real

In [12]:
model_synth = DecisionTreeClassifier(max_depth=8)

In [13]:
model_synth.fit(synth_train, y_synth_train)

In [14]:
model_synth.score(og_test, y_og_test)*100

84.79963150621833