In [188]:
import glob
import re
import os
def get_filenames_from_model(model_name, common_size, gamma, delta, other_schemes=None):
    # Define the search pattern, allowing any string for sample size part
    search_pattern_non_watermark = f"{model_name}_*_non-watermark.csv"
    search_pattern_watermark = f"{model_name}_*_with-watermark_gamma-{gamma}_delta-{delta}.csv"

    # Use glob to get all files matching the search
    directory = "/Users/minhkau/Documents/TUDelft/Year 3/RP/Code/tabular-gpt/samples"
    files_non_watermark = glob.glob(search_pattern_non_watermark, root_dir=directory)
    files_watermark = glob.glob(search_pattern_watermark, root_dir=directory)

    # Regular expressions to extract sample size from filenames
    regex_pattern_non_watermark = re.compile(rf"{model_name}_(\d+)_non-watermark.csv")
    regex_pattern_watermark = re.compile(rf"{model_name}_(\d+)_with-watermark_gamma-{gamma}_delta-{delta}.csv")

    largest_sample_size_non_watermark = -1
    largest_sample_size_with_watermark = -1
    largest_file_non_watermark = None
    largest_file_with_watermark = None
    # Iterate over the matching files and extract sample size
    for file in files_non_watermark:
        match = regex_pattern_non_watermark.match(os.path.basename(file))
        if match:
            sample_size = int(match.group(1))
            if sample_size == common_size:
                largest_file_non_watermark = file
                break
            if sample_size > largest_sample_size_non_watermark:
                largest_sample_size_non_watermark = sample_size
                largest_file_non_watermark = file

    for file in files_watermark:
        match = regex_pattern_watermark.match(os.path.basename(file))
        if match:
            sample_size = int(match.group(1))
            if sample_size == common_size:
                largest_file_with_watermark = file
                break
            if sample_size > largest_sample_size_with_watermark:
                largest_sample_size_with_watermark = sample_size
                largest_file_with_watermark = file
                
    if other_schemes:
        other_schemes_file_names = []
        for scheme_name in other_schemes:
            search_pattern_scheme = f"{model_name}_*_with-watermark_{scheme_name}.csv"
            regex_pattern_scheme = re.compile(rf"{model_name}_(\d+)_with-watermark_{scheme_name}.csv")
            files_scheme = glob.glob(search_pattern_scheme, root_dir=directory)
            largest_sample_size_scheme = -1
            largest_file_sheme = None
            for file in files_scheme:
                match = regex_pattern_scheme.match(os.path.basename(file))
                if match:
                    sample_size = int(match.group(1))
                    if sample_size == common_size:
                        largest_file_sheme = file
                        break
                    if sample_size > largest_sample_size_scheme:
                        largest_sample_size_scheme = sample_size
                        largest_file_sheme = file
            other_schemes_file_names.append(largest_file_sheme)
        return f"{model_name}.csv", largest_file_non_watermark, largest_file_with_watermark, other_schemes_file_names
            
    return f"{model_name}.csv", largest_file_non_watermark, largest_file_with_watermark

def get_tables_by_model_name(model_name, common_size, gamma, delta, other_schemes):
    real_file_name, no_water_mark_file_name, with_water_mark_file_name, other_schemes_file_names = get_filenames_from_model(model_name, common_size=common_size, gamma=gamma, delta=delta,other_schemes=other_schemes)
            
    samples_dir = "/Users/minhkau/Documents/TUDelft/Year 3/RP/Code/tabular-gpt/samples/"
    real_path = samples_dir + real_file_name
    synth_no_watermark_path = samples_dir + no_water_mark_file_name
    synth_with_watermark_path = samples_dir + with_water_mark_file_name
    exp_path = samples_dir + other_schemes_file_names[0]
    
    real_table = pd.read_csv(real_path)
    non_watermark_table = pd.read_csv(synth_no_watermark_path)
    watermark_table = pd.read_csv(synth_with_watermark_path)
    exp_table = pd.read_csv(exp_path)
    
    return real_table, non_watermark_table, watermark_table, exp_table
    

In [15]:
from sklearn.impute import SimpleImputer
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def preprocess_data(train, test, target, label_encoder = None):
    X_train = train.drop(target, axis=1)
    y_train = train[target]   
    
    X_test = test.drop(target, axis=1)
    y_test = test[target]
    
    if label_encoder:
        y_train = label_encoder.fit_transform(y_train)
        y_test = label_encoder.fit_transform(y_test)

    
    numeric_features = X_train.select_dtypes(include=['int', 'float']).columns
    categorical_features = X_train.select_dtypes(include=['object']).columns
    
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])
    
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])
    
    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])
    
    return X_train, y_train, X_test, y_test, preprocessor
    
# classifiers are used for adult and diabetes
def logistic_classifier(train, test, target):
    X_train, y_train, X_test, y_test, preprocessor = preprocess_data(train, test, target, label_encoder=LabelEncoder())
    
    # Create a pipeline with preprocessing and the classifier
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', LogisticRegression())])
    
    pipeline.fit(X_train, y_train)
    
    predictions_proba = pipeline.predict_proba(X_test)[:, 1]
    predictions = pipeline.predict(X_test)
    
    # Calculate F1 score
    f1 = f1_score(y_test, predictions)
    
    # Calculate AUC score
    auc = roc_auc_score(y_test, predictions_proba)
    
    return {'f1': f1, 'auc': auc}

def linear_classifier(train, test, target):
    X_train, y_train, X_test, y_test, preprocessor = preprocess_data(train, test, target, label_encoder=LabelEncoder())

    # Create a pipeline with preprocessing and the classifier
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', LinearRegression())])
    
    pipeline.fit(X_train, y_train)
    
    # Predict probabilities (continuous values) and threshold them to get binary predictions
    predictions_prob = pipeline.predict(X_test)
    threshold = 0.5
    predictions = (predictions_prob >= threshold).astype(int)
    
    # Calculate F1 score
    f1 = f1_score(y_test, predictions)
    auc = roc_auc_score(y_test, predictions_prob)    
    return {'f1': f1, 'auc': auc}

def decision_tree_classifier(train, test, target):

    label_encoder = LabelEncoder()

    # Preprocess the data
    X_train, y_train, X_test, y_test, preprocessor = preprocess_data(train, test, target, label_encoder)
    
    # Create a pipeline with preprocessing and the classifier
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', DecisionTreeClassifier(random_state=42))])
    
    pipeline.fit(X_train, y_train)
    
    predictions = pipeline.predict(X_test)
    # Predict probabilities for the positive class
    predictions_prob = pipeline.predict_proba(X_test)[:, 1]
    
    # Calculate F1 score
    f1 = f1_score(y_test, predictions)
    
    # Calculate AUC score
    auc = roc_auc_score(y_test, predictions_prob)
    
    return {'f1': f1, 'auc': auc}

# regression are used for california and abalone
def linear_regresser(train, test, target):

    X_train, y_train, X_test, y_test, preprocessor = preprocess_data(train, test, target)

    # Create a pipeline with preprocessing and the classifier
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', LinearRegression())])
    
    pipeline.fit(X_train, y_train)
    
    predictions = pipeline.predict(X_test)
    
    # Calculate regression metrics
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    
    return {'mae': mae, 'mse': mse, 'r2':r2}

def random_forest_regressor(train, test, target):
    X_train, y_train, X_test, y_test, preprocessor = preprocess_data(train, test, target)

    # Create a pipeline with preprocessing and the regressor
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))])
    
    pipeline.fit(X_train, y_train)
    
    predictions = pipeline.predict(X_test)
    
    # Calculate regression metrics
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    
    return {'mae': mae, 'mse': mse, 'r2': r2}

def mle_efficacy(models_meta_data, gamma_delta_pairs, common_size=1000, test_size=250, trial_num=5):
    final_results = []
    for meta in models_meta_data:
        model_name = meta['model_name']
        target = meta['target']
        task_name = meta['task_name'] # either regression or classification
        
        for (gamma, delta) in gamma_delta_pairs:
            real_table, non_watermark_table, watermark_table, exp_table = get_tables_by_model_name(model_name, common_size, gamma, delta, ["exp"])
            
            result = {
                'linear_classification': [],
                'logistic_classification': [],
                'decision_tree_classification': [],
                'linear_regression': [],
                # 'decision_tree_regression': []
                'random_forest_regression':[]
            }
            
            
            for i in range(trial_num):
                test_real = real_table.sample(test_size, random_state=i)
                real_table_dropped = real_table.drop(test_real.index, inplace=False)
                
                train_real = real_table_dropped.sample(common_size, random_state=i) if real_table.shape[0] > common_size else real_table_dropped
                train_synth_no_W = non_watermark_table.copy()
                train_synth_W = watermark_table.copy()
                train_exp = exp_table.copy()
                                
                if task_name == 'classification':
                    tasks = zip(['linear_classification', 'logistic_classification', 'decision_tree_classification'], [linear_classifier, logistic_classifier, decision_tree_classifier])
                else:
                    tasks = zip(['linear_regression', 'random_forest_regression'], [linear_regresser, random_forest_regressor])
                    
                for name, runner in tasks:
                    result[name].append({
                        'real': runner(train_real, test_real, target),
                        'non_watermark': runner(train_synth_no_W, test_real, target),
                        'watermark': runner(train_synth_W, test_real, target),
                        'exp': runner(train_exp, test_real, target)
                    })
            
            final_results.append({
                'model_name': model_name,
                'gamma':gamma,
                'delta':delta,
                'result':result,
                'tasks': ['linear_classification', 'logistic_classification', 'decision_tree_classification'] if task_name == 'classification' else ['linear_regression', 'random_forest_regression']
            })
    return final_results

In [16]:
models_metadata = [
    {'model_name': 'adult', 'target': 'class', 'task_name': 'classification'},
    {'model_name': 'california', 'target': 'MedHouseVal', 'task_name': 'regression'},
    {'model_name': 'abalone', 'target': 'Rings', 'task_name': 'regression'},
    # {'model_name': 'diabetes', 'target': 'Outcome', 'task_name': 'classification'},
]

gamma_delta_pairs = [(0.25, 5.0), (0.25, 2.0), (0.5, 1.0), (0.5, 2.0)]

mle_scores = mle_efficacy(models_metadata, gamma_delta_pairs, trial_num=15)

import json
# 
with open('with_exp_except_abalone.json', 'w') as json_file:
    json.dump(mle_scores, json_file, indent=4)



TypeError: can only concatenate str (not "NoneType") to str

In [19]:
import numpy as np
from scipy.stats import t
def calculate_confidence_interval(scores):
    # Extract R² scores
    
    # Calculate mean and standard deviation
    mean_score = np.mean(scores)
    std_score = np.std(scores, ddof=1)
    
    # Calculate the number of samples
    n = len(scores)
    
    # Calculate the standard error of the mean
    stderr_mean = std_score / np.sqrt(n)
    
    # Define the confidence level (e.g., 95%)
    confidence_level = 0.95
    
    # Calculate the critical value
    t_critical = t.ppf((1 + confidence_level) / 2, df=n - 1)
    
    # Calculate the margin of error
    margin_of_error = t_critical * stderr_mean
    
    return mean_score, margin_of_error

def get_avg_metrics(all_data):
    final_results = []
    for data in all_data:
        model_name = data['model_name']
        gamma = data['gamma']
        delta = data['delta']
        result = data['result']
        tasks= data['tasks']
        
        real_scores = []
        non_watermark_scores = []
        watermark_scores = []
        # exp_scores = []
        
        for task in tasks:
            for trial_result in result[task]:
                metric = 'r2' if 'regression' in task else 'auc' 
                real_scores.append(trial_result['real'][metric])
                non_watermark_scores.append(trial_result['non_watermark'][metric])
                watermark_scores.append(trial_result['watermark'][metric])
                # exp_scores.append(trial_result['exp'][metric])
        
        final_results.append({
            'model_name': model_name,
            'gamma':gamma,
            'delta':delta,
            'real_score': calculate_confidence_interval(real_scores),
            'non_watermark_score': calculate_confidence_interval(non_watermark_scores),
            'watermark_score': calculate_confidence_interval(watermark_scores),
            # 'exp_score': calculate_confidence_interval(exp_scores)
        })
             
    return final_results

def avg_mertrics_to_df(all_data, gamma_delta_pairs):
    processed_metrics = get_avg_metrics(all_data)
    
    model_names = []
    gammas = []
    deltas = []
    real_scores = []
    non_watermark_scores = []
    watermark_scores = []
    # exp_scores = []
    
    for data in processed_metrics:
        model_name = data['model_name']
        gamma = data['gamma']
        delta = data['delta']
        real_score, real_score_margin = data['real_score']
        non_watermark_score, non_watermark_score_margin = data['non_watermark_score']
        watermark_score, watermark_score_margin = data['watermark_score']
        # exp_score, exp_score_margin = data['exp_score']
        
        model_names.append(model_name)
        gammas.append(gamma)
        deltas.append(delta)
        
        real_scores.append(str(round(real_score, 3)) + "±" + str(round(real_score_margin, 3)))
        non_watermark_scores.append(str(round(non_watermark_score, 3)) + "±" + str(round(non_watermark_score_margin, 3)))
        watermark_scores.append(str(round(watermark_score, 3)) + "±" + str(round(watermark_score_margin, 3)))
        # exp_scores.append(str(round(exp_score, 3)) + "±" + str(round(exp_score_margin, 3)))

    avg_metrics_df = pd.DataFrame({
        'Model': model_names,
        'γ': gammas,
        'δ': deltas,
        'Real': real_scores,
        # 'Exp': exp_scores,
        'Non-Watermark': non_watermark_scores,
        'Watermark': watermark_scores,
    })
    
    return avg_metrics_df

with open('abalone_without_exp.json', 'r') as file:
    # Load the JSON data
    all_data = json.load(file)

allauh = avg_mertrics_to_df(all_data, gamma_delta_pairs)
# allauh.to_excel("output3.xlsx", index=False)
allauh

Unnamed: 0,Model,γ,δ,Real,Non-Watermark,Watermark
0,abalone,0.25,5.0,0.522±0.028,0.493±0.03,0.346±0.033
1,abalone,0.25,2.0,0.522±0.028,0.493±0.03,0.398±0.026
2,abalone,0.5,1.0,0.522±0.028,0.493±0.03,0.472±0.031
3,abalone,0.5,2.0,0.522±0.028,0.493±0.03,0.463±0.042
