In [1]:
import glob
import re
import os
def get_filenames_from_model(model_name, common_size, gamma, delta, other_schemes=None):
    # Define the search pattern, allowing any string for sample size part
    search_pattern_non_watermark = f"{model_name}_*_non-watermark.csv"
    search_pattern_watermark = f"{model_name}_*_with-watermark_gamma-{gamma}_delta-{delta}.csv"

    # Use glob to get all files matching the search
    directory = "/Users/minhkau/Documents/TUDelft/Year 3/RP/Code/tabular-gpt/samples"
    files_non_watermark = glob.glob(search_pattern_non_watermark, root_dir=directory)
    files_watermark = glob.glob(search_pattern_watermark, root_dir=directory)

    # Regular expressions to extract sample size from filenames
    regex_pattern_non_watermark = re.compile(rf"{model_name}_(\d+)_non-watermark.csv")
    regex_pattern_watermark = re.compile(rf"{model_name}_(\d+)_with-watermark_gamma-{gamma}_delta-{delta}.csv")

    largest_sample_size_non_watermark = -1
    largest_sample_size_with_watermark = -1
    largest_file_non_watermark = None
    largest_file_with_watermark = None
    # Iterate over the matching files and extract sample size
    for file in files_non_watermark:
        match = regex_pattern_non_watermark.match(os.path.basename(file))
        if match:
            sample_size = int(match.group(1))
            if sample_size == common_size:
                largest_file_non_watermark = file
                break
            if sample_size > largest_sample_size_non_watermark:
                largest_sample_size_non_watermark = sample_size
                largest_file_non_watermark = file

    for file in files_watermark:
        match = regex_pattern_watermark.match(os.path.basename(file))
        if match:
            sample_size = int(match.group(1))
            if sample_size == common_size:
                largest_file_with_watermark = file
                break
            if sample_size > largest_sample_size_with_watermark:
                largest_sample_size_with_watermark = sample_size
                largest_file_with_watermark = file
                
    if other_schemes:
        other_schemes_file_names = []
        for scheme_name in other_schemes:
            search_pattern_scheme = f"{model_name}_*_with-watermark_{scheme_name}.csv"
            regex_pattern_scheme = re.compile(rf"{model_name}_(\d+)_with-watermark_{scheme_name}.csv")
            files_scheme = glob.glob(search_pattern_scheme, root_dir=directory)
            largest_sample_size_scheme = -1
            largest_file_sheme = None
            for file in files_scheme:
                match = regex_pattern_scheme.match(os.path.basename(file))
                if match:
                    sample_size = int(match.group(1))
                    if sample_size == common_size:
                        largest_file_sheme = file
                        break
                    if sample_size > largest_sample_size_scheme:
                        largest_sample_size_scheme = sample_size
                        largest_file_sheme = file
            other_schemes_file_names.append(largest_file_sheme)
        return f"{model_name}.csv", largest_file_non_watermark, largest_file_with_watermark, other_schemes_file_names
            
    return f"{model_name}.csv", largest_file_non_watermark, largest_file_with_watermark

In [70]:
from sdmetrics.single_table import MLPRegressor, LinearRegression, LogisticDetection, BinaryDecisionTreeClassifier, BinaryMLPClassifier, LinearRegression, MLPRegressor


import pandas as pd


real_file_name, no_water_mark_file_name, with_water_mark_file_name = get_filenames_from_model('diabetes', common_size=1000, gamma=0.25, delta=2.0)
target = 'Outcome'


print(real_file_name)
print(no_water_mark_file_name)
print(with_water_mark_file_name)

samples_dir = "/Users/minhkau/Documents/TUDelft/Year 3/RP/Code/tabular-gpt/samples/"
real_path = samples_dir + real_file_name
synth_no_watermark_path = samples_dir + no_water_mark_file_name
synth_with_watermark_path = samples_dir + with_water_mark_file_name

real_table = pd.read_csv(real_path)
non_watermark_table = pd.read_csv(synth_no_watermark_path)
watermark_table = pd.read_csv(synth_with_watermark_path)

common_size = 1000
test_size = 200
frac = 0.8

# real_table = real_table.sample(common_size)
# non_watermark_table = non_watermark_table.sample(common_size)
# watermark_table = watermark_table.sample(common_size)

test_real = real_table.sample(test_size)
real_table = real_table.drop(test_real.index)
# train_real = real_table.sample(common_size)
train_real = real_table
train_synth_no_W = non_watermark_table
train_synth_W = watermark_table

# classification task: decision tree
print(BinaryDecisionTreeClassifier.compute(train_real, test_real, target=target))
print(BinaryDecisionTreeClassifier.compute(train_synth_no_W, test_real, target=target))
print(BinaryDecisionTreeClassifier.compute(train_synth_W, test_real, target=target))

# regression task
# print(LinearRegression.compute(train_real, test_real, target=target))
# print(LinearRegression.compute(train_synth_no_W, test_real, target=target))
# print(LinearRegression.compute(train_synth_W, test_real, target=target))
# print("ahhhhhh")
# print(MLPRegressor.compute(train_real, test_real, target=target))
# print(MLPRegressor.compute(train_synth_no_W, test_real, target=target))
# print(MLPRegressor.compute(train_synth_W, test_real, target=target))

diabetes.csv
diabetes_1000_non-watermark.csv
diabetes_1000_with-watermark_gamma-0.25_delta-2.0.csv
0.5260416666666666
0.3210702341137124
0.3435374149659864


In [244]:
from sklearn.impute import SimpleImputer
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def preprocess_data(train, test, target, label_encoder = None):
    X_train = train.drop(target, axis=1)
    y_train = train[target]   
    
    X_test = test.drop(target, axis=1)
    y_test = test[target]
    
    if label_encoder:
        y_train = label_encoder.fit_transform(y_train)
        y_test = label_encoder.fit_transform(y_test)

    
    numeric_features = X_train.select_dtypes(include=['int', 'float']).columns
    categorical_features = X_train.select_dtypes(include=['object']).columns
    
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])
    
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])
    
    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])
    
    return X_train, y_train, X_test, y_test, preprocessor
    
# this works for adult and diabetes
def logistic_classifier(train, test, target):
    X_train, y_train, X_test, y_test, preprocessor = preprocess_data(train, test, target, label_encoder=LabelEncoder())
    
    # Create a pipeline with preprocessing and the classifier
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', LogisticRegression())])
    
    pipeline.fit(X_train, y_train)
    
    predictions = pipeline.predict(X_test)
    
    # Calculate F1 score
    f1 = f1_score(y_test, predictions)  # '>50K' as the positive label
    # print("F1 Score:", f1)
    return f1

# this works for adult and diabetes
def linear_classifier(train, test, target):
    X_train, y_train, X_test, y_test, preprocessor = preprocess_data(train, test, target, label_encoder=LabelEncoder())

    # Create a pipeline with preprocessing and the classifier
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', LinearRegression())])
    
    pipeline.fit(X_train, y_train)
    
    # Predict probabilities (continuous values) and threshold them to get binary predictions
    predictions_prob = pipeline.predict(X_test)
    threshold = 0.5
    predictions = (predictions_prob >= threshold).astype(int)
    
    # Calculate F1 score
    f1 = f1_score(y_test, predictions)
    # print("F1 Score:", f1)
    return f1

def linear_regression(train, test, target):

    X_train, y_train, X_test, y_test, preprocessor = preprocess_data(train, test, target)

    # Create a pipeline with preprocessing and the classifier
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', LinearRegression())])
    
    pipeline.fit(X_train, y_train)
    
    predictions = pipeline.predict(X_test)
    
    # Calculate regression metrics
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    
    # print("Mean Absolute Error (MAE):", mae)
    # print("Mean Squared Error (MSE):", mse)
    # print("R-squared (R²):", r2)
    # 
    return mse

def logistic_regression(train, test, target):
    X_train, y_train, X_test, y_test, preprocessor = preprocess_data(train, test, target)
    # Create a pipeline with preprocessing and the classifier
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', LogisticRegression(max_iter=1000))])
    
    pipeline.fit(X_train, y_train)
    
    predictions = pipeline.predict(X_test)
    
    # Calculate regression metrics
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    
    # print("Mean Absolute Error (MAE):", mae)
    # print("Mean Squared Error (MSE):", mse)
    # print("R-squared (R²):", r2)
    # 
    return mse

In [243]:
real_file_name, no_water_mark_file_name, with_water_mark_file_name = get_filenames_from_model('adult', common_size=1000, gamma=0.25, delta=2.0)
target = 'class'

samples_dir = "/Users/minhkau/Documents/TUDelft/Year 3/RP/Code/tabular-gpt/samples/"
real_path = samples_dir + real_file_name
synth_no_watermark_path = samples_dir + no_water_mark_file_name
synth_with_watermark_path = samples_dir + with_water_mark_file_name

real_table = pd.read_csv(real_path)
non_watermark_table = pd.read_csv(synth_no_watermark_path)
watermark_table = pd.read_csv(synth_with_watermark_path)

common_size = 1000
test_size = 200
frac = 0.8

# real_table = real_table.sample(common_size)
non_watermark_table = non_watermark_table.sample(common_size)
watermark_table = watermark_table.sample(common_size)


test_real = real_table.sample(test_size)
real_table = real_table.drop(test_real.index)
# train_real = real_table.sample(common_size)
train_real = real_table
train_synth_no_W = non_watermark_table
train_synth_W = watermark_table


print(logistic_classifier(real_table, test_real, target))
print(logistic_classifier(non_watermark_table, test_real, target))
print(logistic_classifier(watermark_table, test_real, target))

print(linear_classifier(real_table, test_real, target))
print(linear_classifier(non_watermark_table, test_real, target))
print(linear_classifier(watermark_table, test_real, target))

# print(linear_regression(real_table, test_real, target))
# print(linear_regression(non_watermark_table, test_real, target))
# print(linear_regression(watermark_table, test_real, target))
# 
# 
# print(logistic_regression(real_table, test_real, target))
# print(logistic_regression(non_watermark_table, test_real, target))
# print(logistic_regression(watermark_table, test_real, target))

0.7474747474747475
0.7169811320754716
0.6666666666666666
0.6966292134831461
0.6875
0.5747126436781609
