In [2]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score
import pandas as pd
import os


def calculate_roc_auc(negative_scores, positive_scores):
    """Compute the ROC-AUC and ROC curve for negative vs. positive scores.

    Args:
        negative_scores: 1-D array-like (list, NumPy array, pandas Series, ...)
            of scores for class 0 (human-written text).
        positive_scores: 1-D array-like of scores for class 1
            (machine-generated text).

    Returns:
        Tuple ``(auc, fpr, tpr)``: the value returned by ``roc_auc_score`` and
        the first two arrays returned by ``roc_curve``.

    Entries that are NaN (or ``None``, which converts to NaN) are dropped
    together with their labels before scoring. Any exception raised by
    ``roc_auc_score`` / ``roc_curve`` on the remaining data is propagated.
    """
    # Coerce to float arrays first. `a + b` only concatenates Python lists; for
    # NumPy arrays or pandas Series it would add element-wise and silently
    # misalign scores and labels.
    negative = np.asarray(negative_scores, dtype=float).ravel()
    positive = np.asarray(positive_scores, dtype=float).ravel()

    # Labels: 0 = human-written, 1 = machine-generated.
    labels = np.concatenate([
        np.zeros(negative.size, dtype=int),
        np.ones(positive.size, dtype=int),
    ])
    # All scores, in the same order as the labels.
    scores = np.concatenate([negative, positive])

    # Only scores can be NaN; labels are constructed above.
    valid_indices = ~np.isnan(scores)
    labels = labels[valid_indices]
    scores = scores[valid_indices]

    # Compute the AUC and the curve on the filtered data.
    auc = roc_auc_score(labels, scores)
    fpr, tpr, _ = roc_curve(labels, scores)
    return auc, fpr, tpr



In [3]:
def get_auc(df, scores_prefix):
    """Compute one ROC-AUC per score prefix against the human scores.

    Args:
        df: DataFrame with a ``human_score`` column and, for every prefix, a
            ``{prefix}_watermarked_text_score`` column.
        scores_prefix: iterable of prefix strings to evaluate.

    Returns:
        Dict mapping ``{prefix}_auc`` to the AUC from ``calculate_roc_auc``,
        with human scores as negatives and the prefix's scores as positives.

    Raises:
        AssertionError: if any watermarked score for a prefix is NaN.
    """
    negatives = df['human_score'].to_list()
    aucs = {}

    for prefix in scores_prefix:
        positives = df[f'{prefix}_watermarked_text_score'].to_list()
        # Callers are expected to have removed incomplete rows already.
        assert not any(np.isnan(value) for value in positives), f'{prefix} watermarked text scores contain NaN values'
        # Only the AUC is kept; the curve points are discarded.
        aucs[f'{prefix}_auc'] = calculate_roc_auc(negatives, positives)[0]

    return aucs


In [4]:
import os
import re
import glob

# Output roots: our method's runs and the adaptive-text-watermark baseline.
root_path = "/blue/buyuheng/li_an.ucsb/projects/watermark-simcse/watermarking/outputs/"
baseline_root = rf'/blue/buyuheng/li_an.ucsb/projects/baselines/adaptive-text-watermark/outputs'

train_dataset_names = ['c4']
wm_dataset_names = ['c4', 'lfqa']
scores_prefix = ['adaptive', 'paraphrased', 'sentiment_spoofed', 'latter_sentiment_spoofed', 'hate_spoofed']
# Columns a CSV must contain to be evaluated.
scores = ['human_score'] + [f'{prefix}_watermarked_text_score' for prefix in scores_prefix]

# One dict per evaluated CSV: dataset names, hyper-parameters parsed from the
# file path, and one `{prefix}_auc` entry per score prefix.
results = []
baseline_results = []
for train_dataset_name in train_dataset_names:
    print(f'==================== train on {train_dataset_name} ====================')
    # Escape the names so they match literally. Under re.VERBOSE, unescaped
    # whitespace or '#' in a name would otherwise be dropped from the pattern.
    train_name_re = re.escape(train_dataset_name)
    for wm_dataset_name in wm_dataset_names:
        print(f'------- wm on {wm_dataset_name} -------')
        wm_name_re = re.escape(wm_dataset_name)
        # The named groups capture the run's hyper-parameters from the path.
        pattern = re.compile(rf"""
            .*/{train_name_re}/twitter-roberta-base-sentiment/(?P<batch_size>\d+)batch_(?P<num_epoch>\d+)epochs/
            llama(?P<llama_para>\d+)gpt(?P<gpt_para>\d+)-sent1-latter_sent1-fact0-hate1/
            loss_cl(?P<cl_weight>[\d\.]+)-tl(?P<tl_weight>[\d\.]+)-wneg(?P<neg_weight>[\d\.]+)-margin(?P<margin>[\d\.]+)/
            wm-{wm_name_re}-alpha(?P<alpha>[\d\.]+)-delta(?P<delta_0>[\d\.]+)\|(?P<delta>[\d\.]+)\.csv$
        """, re.VERBOSE)
        # Not used in this cell; kept so the name stays defined.
        baseline_pattern = re.compile(rf".*/wm-{wm_name_re}-alpha(?P<alpha>[\d\.]+)-delta(?P<delta_0>[\d\.]+)\|(?P<delta>[\d\.]+)\.csv$", re.VERBOSE)

        # Per-file ROC-AUC. Each score column drops its own NaN rows, so the
        # number of positives differs per prefix (this is NOT a mutual-row
        # intersection across columns or files).
        for filepath in glob.iglob(root_path + "/**/*.csv", recursive=True):
            match = pattern.match(filepath)
            if not match:
                continue
            df = pd.read_csv(filepath)
            print(filepath)
            # Skip files that lack any of the required score columns.
            if any(col not in df.columns for col in scores):
                continue
            auc_result_dict = {}
            # NaN human scores are filtered inside calculate_roc_auc.
            human_scores = df['human_score'].to_list()
            for prefix in scores_prefix:
                wm_scores = df[f'{prefix}_watermarked_text_score'].dropna().to_list()
                print(f'{prefix}\t', len(wm_scores), 'valid rows')
                if not wm_scores:
                    # No positive samples: AUC is undefined. Record NaN rather
                    # than letting one empty column abort the whole sweep.
                    auc_result_dict[f'{prefix}_auc'] = np.nan
                    continue
                auc, _, _ = calculate_roc_auc(human_scores, wm_scores)
                auc_result_dict[f'{prefix}_auc'] = auc

            result_dict = {'train_dataset_name': train_dataset_name, 'wm_dataset_name': wm_dataset_name}
            result_dict.update(match.groupdict())
            result_dict.update(auc_result_dict)
            results.append(result_dict)


------- wm on c4 -------
/blue/buyuheng/li_an.ucsb/projects/watermark-simcse/watermarking/outputs/c4/twitter-roberta-base-sentiment/128batch_30epochs/llama1gpt1-sent1-latter_sent1-fact0-hate1/loss_cl0.0-tl1.0-wneg999-margin0.7/wm-c4-alpha2.0-delta0.2|0.5.csv
adaptive	 200 valid rows
paraphrased	 197 valid rows
sentiment_spoofed	 189 valid rows
latter_sentiment_spoofed	 151 valid rows
hate_spoofed	 200 valid rows
/blue/buyuheng/li_an.ucsb/projects/watermark-simcse/watermarking/outputs/c4/twitter-roberta-base-sentiment/128batch_30epochs/llama1gpt1-sent1-latter_sent1-fact0-hate1/loss_cl0.0-tl1.0-wneg999-margin1.1/wm-c4-alpha2.0-delta0.2|0.5.csv
adaptive	 200 valid rows
paraphrased	 199 valid rows
sentiment_spoofed	 190 valid rows
latter_sentiment_spoofed	 142 valid rows
hate_spoofed	 200 valid rows
/blue/buyuheng/li_an.ucsb/projects/watermark-simcse/watermarking/outputs/c4/twitter-roberta-base-sentiment/128batch_30epochs/llama1gpt1-sent1-latter_sent1-fact0-hate1/loss_cl0.0-tl1.0-wneg999-m

In [16]:
import os
import re
import glob

# Output roots: our method's runs and the adaptive-text-watermark baseline.
root_path = "/blue/buyuheng/li_an.ucsb/projects/watermark-simcse/watermarking/outputs/"
baseline_root = rf'/blue/buyuheng/li_an.ucsb/projects/baselines/adaptive-text-watermark/outputs'

train_dataset_names = ['c4']
wm_dataset_names = ['c4']
scores_prefix = ['adaptive', 'paraphrased', 'latter_spoofing']
# Columns a CSV must contain to be evaluated.
scores = ['human_score'] + [f'{prefix}_watermarked_text_score' for prefix in scores_prefix]


# One dict per evaluated CSV: dataset names, hyper-parameters parsed from the
# file path, and one `{prefix}_auc` entry per score prefix.
results = []
baseline_results = []
for train_dataset_name in train_dataset_names:
    print(f'==================== train on {train_dataset_name} ====================')
    # Escape the names so they match literally. Under re.VERBOSE, unescaped
    # whitespace or '#' in a name would otherwise be dropped from the pattern.
    train_name_re = re.escape(train_dataset_name)
    for wm_dataset_name in wm_dataset_names:
        print(f'------- wm on {wm_dataset_name} -------')
        wm_name_re = re.escape(wm_dataset_name)
        # The named groups capture the run's hyper-parameters from the path.
        pattern = re.compile(rf"""
            .*/{train_name_re}/twitter-roberta-base-sentiment/64batch_12epochs/
            llama(?P<num_paraphrased_llama>\d+)-(?P<num_negative_llama>\d+)gpt(?P<num_paraphrased_gpt>\d+)-(?P<num_negative_gpt>\d+)-(?P<num_summary>\d+)/
            loss_cl(?P<cl_weight>[\d\.]+)-tl(?P<tl_weight>[\d\.]+)-wneg(?P<neg_weight>[\d\.]+)-margin(?P<margin>[\d\.]+)/
            wm-{wm_name_re}-alpha(?P<alpha>[\d\.]+)-delta(?P<delta_0>[\d\.]+)\|(?P<delta>[\d\.]+)\.csv$
        """, re.VERBOSE)
        # Only referenced by the disabled baseline code below.
        baseline_pattern = re.compile(rf".*/wm-{wm_name_re}-alpha(?P<alpha>[\d\.]+)-delta(?P<delta_0>[\d\.]+)\|(?P<delta>[\d\.]+)\.csv$", re.VERBOSE)

        # Pass 1: read every matching CSV once, keep its score columns, and
        # intersect the row indices that have no NaN in any score column.
        matched_frames = []  # (regex match, score-only DataFrame) per usable file
        mutual_non_empty_indices = None
        for filepath in glob.iglob(root_path + "/**/*.csv", recursive=True):
            match = pattern.match(filepath)
            if not match:
                continue
            df = pd.read_csv(filepath)
            # Skip files that lack any of the required score columns.
            if any(col not in df.columns for col in scores):
                continue
            df = df[scores]
            non_empty_indices = set(df.dropna().index)
            print(f'Number of non-empty rows: {len(non_empty_indices)}')
            print(filepath)
            if mutual_non_empty_indices is None:
                mutual_non_empty_indices = non_empty_indices
            else:
                mutual_non_empty_indices = mutual_non_empty_indices.intersection(non_empty_indices)
            # Cache the frame so pass 2 does not re-walk the tree or re-read it.
            matched_frames.append((match, df))
        # Baseline contribution to the intersection (disabled).
        # for filepath in glob.iglob(baseline_root + "/**/*.csv", recursive=True):
        #     match = baseline_pattern.match(filepath)
        #     if match:
        #         df = pd.read_csv(filepath)
        #         if any(col not in df.columns for col in scores):
        #             continue
        #         df = df[scores]
        #         non_empty_indices = set(df.dropna().index)
        #         mutual_non_empty_indices = mutual_non_empty_indices.intersection(non_empty_indices)

        # Pass 2: ROC-AUC restricted to the rows that are complete in every file.
        if mutual_non_empty_indices is not None:
            # Sorted so row selection is deterministic (set order is not).
            mutual_non_empty_indices = sorted(mutual_non_empty_indices)
            print(f'Number of mutual non-empty rows: {len(mutual_non_empty_indices)}')
            if not mutual_non_empty_indices:
                # With no common rows there is nothing to score; skip instead
                # of passing empty data to the ROC-AUC computation.
                print('No mutual non-empty rows; skipping ROC-AUC computation')
            else:
                for match, df in matched_frames:
                    df = df.loc[mutual_non_empty_indices]
                    auc_result_dict = get_auc(df, scores_prefix)

                    result_dict = {'train_dataset_name': train_dataset_name, 'wm_dataset_name': wm_dataset_name}
                    result_dict.update(match.groupdict())
                    result_dict.update(auc_result_dict)
                    results.append(result_dict)

            # Baseline ROC-AUC on the same rows (disabled).
            # for filepath in glob.iglob(baseline_root + "/**/*.csv", recursive=True):
            #     match = baseline_pattern.match(filepath)
            #     if match:
            #         df = pd.read_csv(filepath)
            #         if any(col not in df.columns for col in scores):
            #             continue
            #         df = df[scores]
            #         df = df.loc[mutual_non_empty_indices]
            #         auc_result_dict = get_auc(df, scores_prefix)
            #         baseline_result_dict = {'wm_dataset_name': wm_dataset_name}
            #         baseline_result_dict.update(match.groupdict())
            #         baseline_result_dict.update(auc_result_dict)
            #         baseline_results.append(baseline_result_dict)

------- wm on c4 -------


In [5]:
# Turn the accumulated per-file result dicts into a table and write it to
# `roc_auc_results.csv` under `root_path` (without the index column).
# NOTE: this rebinds `results` from a list to a DataFrame; the sweep cells
# reset it to an empty list when they are re-run.
results = pd.DataFrame(results)
results.to_csv(os.path.join(root_path, 'roc_auc_results.csv'), index=False)

In [54]:
# Write the baseline results table to `baseline_roc_auc_results.csv` under
# `root_path` (without the index column).
# NOTE(review): in the sweep cells visible here the code that appends to
# `baseline_results` is commented out, so the list may be empty - confirm
# before relying on this file.
baseline_results = pd.DataFrame(baseline_results)
baseline_results.to_csv(os.path.join(root_path, 'baseline_roc_auc_results.csv'), index=False)

In [34]:
# Shell escape: print the current working directory.
!pwd

/blue/buyuheng/li_an.ucsb/projects/watermark-simcse/watermarking/util


In [2]:
# Load a single output CSV and display its column names.
# NOTE: hard-coded absolute path to one specific run.
path = r"/blue/buyuheng/li_an.ucsb/projects/watermark-simcse/watermarking/outputs/c4/twitter-roberta-base-sentiment/128batch_30epochs/llama8-0gpt8-1-0/loss_cl0.0-tl1.0-wneg999-margin0.8/wm-c4-alpha2.0-delta0.2|0.5.csv"
df = pd.read_csv(path)
# Last expression of the cell, so the column Index is shown as its output.
df.columns

Index(['text_id', 'original_text', 'adaptive_watermarked_text',
       'watermarked_corrected_text', 'paraphrased_watermarked_text',
       'hate_watermarked_text', 'human_score',
       'adaptive_watermarked_text_score', 'corrected_watermarked_score',
       'paraphrased_watermarked_text_score', 'hate_watermarked_text_score',
       'hate_attack_original_output', 'original_sentiment',
       'target_modified_sentiment', 'modified_sentiment',
       'ppl_adaptive_watermarked_text', 'factual_watermarked_text',
       'factual_watermarked_text_score', 'spoofing_watermarked_text',
       'spoofing_attack_original_output', 'spoofing_watermarked_text_score',
       'success_spoofing', 'final_call_spoofing_watermarked_text',
       'final_call_spoofing_watermarked_text_score',
       'edit_distance_ori_spoofing', 'latter_spoofing_watermarked_text',
       'latter_spoofing_watermarked_text_score', 'success_latter_spoofing',
       'final_call_latter_spoofing_watermarked_text',
       'final_c