In [8]:
import pandas as pd
from datasets import load_dataset
import nltk


In [5]:
def pre_process(dataset, min_length, data_size=500):
    """Collect texts that contain at least ``min_length`` whitespace-separated words.

    Args:
        dataset: Object indexable as ``dataset['train']['text']``, yielding
            an iterable of strings.
        min_length: Number of leading words kept as the prefix; texts with
            fewer words are skipped.
        data_size: Stop once this many records have been collected. ``0``
            returns an empty list.

    Returns:
        A list of dicts ``{'text0': prefix, 'text': original}`` where
        ``prefix`` is the first ``min_length`` words joined by single spaces.
    """
    data = []

    # The size check below only runs after an append, so a request for zero
    # records would otherwise never trigger it and every matching text would
    # be returned. Handle that case explicitly.
    if data_size == 0:
        return data

    for text in dataset['train']['text']:
        prefix_words = text.split()[:min_length]
        if len(prefix_words) < min_length:
            # Too short to provide a full-length prefix.
            continue

        data.append({'text0': ' '.join(prefix_words), 'text': text})
        if len(data) == data_size:
            break

    return data


In [None]:
# One shard of the C4 `realnewslike` training split, addressed by a fixed
# revision hash in the URL.
data = "https://huggingface.co/datasets/allenai/c4/resolve/1ddc917116b730e1859edef32896ec5c16be51d0/realnewslike/c4-train.00000-of-00512.json.gz"
dataset = load_dataset('json', data_files=data)
dataset = pre_process(dataset, min_length=200, data_size=100)   # [{text0: 'text0', text: 'text'}]

# Number of sentences NLTK finds in each retained full text.
num_sent = [len(nltk.sent_tokenize(item['text'])) for item in dataset]

df = pd.Series(num_sent)

In [13]:
# Summary statistics of the per-text sentence counts held in `df`.
df.describe()

count    100.000000
mean      26.680000
std       16.866008
min        7.000000
25%       13.750000
50%       23.000000
75%       33.000000
max       85.000000
dtype: float64

In [None]:
data_path = r'/mnt/data2/lian/projects/watermark/adaptive-text-watermark-yepeng/outputs/watermark-8b-2step.csv'
df = pd.read_csv(data_path)
# Split every unwatermarked text into sentences, then keep only how many there are.
df['num_of_sent'] = df['unwatermarked_text'].apply(nltk.sent_tokenize).apply(len)

In [19]:
# Summary statistics of the sentence count per row.
df['num_of_sent'].describe()

count    185.000000
mean       9.843243
std        2.338773
min        1.000000
25%        9.000000
50%       10.000000
75%       11.000000
max       15.000000
Name: num_of_sent, dtype: float64

# Get average text length (in words)

In [None]:
import os
import pandas as pd

data_dir = r'/mnt/data2/lian/projects/watermark/adaptive-text-watermark-yepeng/outputs/continue'

# os.listdir returns entries in arbitrary order; sort them so the printed
# report comes out in the same order on every run.
for filename in sorted(os.listdir(data_dir)):
    # Require the dot so only files with a real `.csv` extension are read.
    if not filename.endswith('.csv'):
        continue

    print(filename)
    df = pd.read_csv(os.path.join(data_dir, filename))
    # Whitespace-separated word count of each watermarked text. Missing
    # texts stay NaN (instead of raising) and are ignored by mean().
    df['wm_avg_len'] = df['adaptive_watermarked_text'].str.split().str.len()
    avg_len = df['wm_avg_len'].mean()
    print(round(avg_len, 2))



watermark-8b-1step-10sent.csv
227.0


# Edit distance

In [15]:
import numpy as np

def word_level_edit_distance(text1, text2):
    """Return the Levenshtein distance between two texts, measured in words.

    Each text is split on whitespace, and the distance is the minimum number
    of word insertions, deletions and substitutions that turn the first word
    sequence into the second.

    Returns ``None`` when either argument is not a ``str``.
    """
    both_are_text = isinstance(text1, str) and isinstance(text2, str)
    if not both_are_text:
        return None

    # Tokenize both texts into words.
    source_words = text1.split()
    target_words = text2.split()
    n_source = len(source_words)
    n_target = len(target_words)

    # table[r, c] is the distance between the first r source words and the
    # first c target words.
    table = np.zeros((n_source + 1, n_target + 1), dtype=int)

    # Against an empty sequence the only option is r deletions / c insertions.
    table[:, 0] = np.arange(n_source + 1)
    table[0, :] = np.arange(n_target + 1)

    for r, source_word in enumerate(source_words, start=1):
        for c, target_word in enumerate(target_words, start=1):
            if source_word == target_word:
                # Matching words cost nothing extra.
                table[r, c] = table[r - 1, c - 1]
                continue

            cheapest_neighbour = min(
                table[r - 1, c],      # delete
                table[r, c - 1],      # insert
                table[r - 1, c - 1],  # substitute
            )
            table[r, c] = cheapest_neighbour + 1

    return table[n_source, n_target]


In [16]:
import pandas as pd

# Load the CSV at `path` into `df`; `path` is reused later when the frame is written back.
path = r'/blue/buyuheng/li_an.ucsb/projects/baselines/adaptive-text-watermark/outputs/wm-model-Llama-3.1-8B-Instruct/wm-c4-alpha2.0-delta0.2|0.5.csv'
df = pd.read_csv(path)

In [17]:
import matplotlib.pyplot as plt
from tqdm import tqdm
# Register `progress_apply` on pandas objects so the row-wise pass below
# reports its progress.
tqdm.pandas()

# For every spoofed-text column, add an `edit_distance_<column>` column with
# its word-level edit distance from the adaptive watermarked text.
sentiment_spoofing_columns = ['spoofing_watermarked_text', 'latter_sentiment_spoof']
for col in sentiment_spoofing_columns:
    df[f'edit_distance_{col}'] = df.progress_apply(
        lambda row: word_level_edit_distance(row['adaptive_watermarked_text'], row[col]),
        axis=1,
    )


  0%|          | 0/200 [00:00<?, ?it/s]

100%|██████████| 200/200 [00:04<00:00, 40.95it/s]
100%|██████████| 200/200 [00:01<00:00, 145.15it/s]


In [18]:
# NOTE: `path` is the file `df` was loaded from, so this overwrites the input
# CSV in place (including any columns added since loading).
df.to_csv(path, index=False)
