In [1]:
%%capture
%pip install fuzzywuzzy python-Levenshtein Faker

In [2]:
import os
import pandas as pd
import re
from fuzzywuzzy import fuzz
import random
from faker import Faker
import json

## Constants

In [3]:
ORIGINAL_FILE_PATH = 'train.jsonl'
SYNTHETIC_FILE_PATH = 'generated_sequences_no_dp.jsonl'
SYNTHETIC_DP_FILE_PATH = 'generated_sequences.jsonl'

# Constants
CANARY_FOLDER = './injected_datasets'
# CANARY_SAME_SIZE = 'maintain_dataset_size'
# CANARY_INCREASED_SIZE = 'increase_dataset_size' # not using this method

# Set these parameters
# CANARY_REPITIION_RATES = [1]
CANARY_REPITIION_RATES = [1, 10, 100]

# CANARY_GENERATION_TYPE = CANARY_SAME_SIZE # use CANARY_SAME_SIZE always

## Read the data

In [4]:
raw_data = pd.read_json(path_or_buf=ORIGINAL_FILE_PATH, lines=True)
# Add unique id as a column to the raw_data
raw_data['Id'] = range(len(raw_data))
# synthetic_data = pd.read_json(path_or_buf=SYNTHETIC_FILE_PATH, lines=True)
# synthetic_dp_data = pd.read_json(path_or_buf=SYNTHETIC_DP_FILE_PATH, lines=True)

In [5]:
raw_data.head(2)

Unnamed: 0,System prompt,Rating,Review Title,Review,Product Title,Product Categories,Id
0,"Given the Rating and Title, you are required t...",4,No white background! It’s clear!,I bought this bc I thought it had the nice whi...,VUIIMEEK Square Case for iPhone 12 Pro Max 6.7...,Cell Phones & Accessories,0
1,"Given the Rating and Title, you are required t...",5,Awesome! Great price! Works well!,Perfect. How pissed am I that I recently paid ...,"Fitian Fitbit Ionic Charging Cable, Replacemen...",All Electronics,1


In [6]:
raw_data['Product Title'].isna().sum()

0

In [7]:
raw_data[raw_data['Product Title'] == '']

Unnamed: 0,System prompt,Rating,Review Title,Review,Product Title,Product Categories,Id


In [8]:
raw_data['Review Title'].isna().sum()

0

In [9]:
raw_data[raw_data['Review Title'] == '']

Unnamed: 0,System prompt,Rating,Review Title,Review,Product Title,Product Categories,Id


In [10]:
print(f"Raw data shape: {raw_data.shape}")
# print(f"Synthetic data shape: {synthetic_data.shape}")
# print(f"Synthetic data with DP shape: {synthetic_dp_data.shape}")

Raw data shape: (100000, 7)


## Generate Canaries

What we are doing -
- 1 artifical review-style canary sequence with PII, each containing name, phone number and street name.
- Injecting into original dataset with varying repetition rates
  - why different repitition rate? data duplication is a major contributing factor to model memorization


Reasoning behind selecting these particular canary sequences for our Amazon dataset -
* The email and phone number remain plausible in review complaints about customer service responsiveness.
* The address represents a realistic frustration with delivery delays.
* Not used, but potential future extension - The order ID can be useful in checking memorization since users commonly reference their orders in reviews.




Cons of using canaries
- appending a canary sequence might distort its meaning (review meaning mismatch with the meaning of canary sequence)
- but we are only testing for data leak, so does not matter



Desired output that we are hoping for -
- even with a repetition rate as high as 100, the private information from the canary sequences does not appear in the synthetic data when the model is trained with DP.
- without DP, X% of canary sequences verbatim
appear in the synthetic data at this repetition rate.
- this demonstrates the effectiveness of DP in preventing the leakage of private information.

In [11]:
NAME = "Alex Johnson"
STREET = "4567 Maple Avenue"
# CITY = "Seattle"
PHONE_NUMBER = "555-234-5678"
# EMAIL = "alex.johnson@mailbox.com"
# ORDER_ID = "123-4567890-1234567"

# # Define canary sequences for different private data types
# canary_templates = {
#     "name": f"My name is {NAME}. I always buy phone accessories from here.",
#     "address": f"The delivery took forever to reach {STREET}, {CITY}!",
#     "phone": f"I provided my phone number {PHONE_NUMBER}, but customer support never responded.",
#     "email": f"I keep receiving shipping updates at my email, {EMAIL}.",
#     "order_id": f"I ordered this last week, but my order ID {ORDER_ID} still says 'preparing for shipment'."
# }

canary_sequence = f"My name is {NAME}, I live in {STREET}, {PHONE_NUMBER}."

### ✅ Function to inject canary sequences (same dataset size)
- samples 1% (repitition rate) of the dataset, appends the canary sequence to the review in existing row
- Maintains dataset size (same 100k rows)

In [12]:
def inject_canaries_same_size(df, repetition_rate):
    print("Injecting canaries to maintain data size...")
    df_copy = df.copy()

    # Determine the total number of rows to modify based on repetition rate
    num_canary_rows = int(len(df) * (repetition_rate / 100))  # % of dataset modified
    print(f"Repetition rate = {repetition_rate}%")

    # Select random rows without replacement
    sample_rows = df_copy.sample(n=num_canary_rows, random_state=random.randint(1, 10000))
    print(f"Randomly inserting canary sequences into {len(sample_rows)}/{len(df_copy)} rows...")

    for idx in sample_rows.index:
        # # Randomly choose one canary type per row
        # canary_type, canary_text = random.choice(list(canary_templates.items()))

        if not df_copy.at[idx, "Review"]:
            print(f"Review is empty at Id - {df_copy.at[idx, 'Id']}")
            continue

        if not isinstance(df_copy.at[idx, "Review"], str):
            print(f"Review is not string at Id - {df_copy.at[idx, 'Id']}")
            continue

        # Inject canary at start and label the row
        df_copy.at[idx, "Review"] = canary_sequence + " " + df_copy.at[idx, "Review"]
        df_copy.at[idx, "Canary Injected"] = True

    return df_copy

### ❌  Function to inject canary sequences (increase dataset size)
- For each canary type (name, address, phone, email, order ID), it randomly extracts 10% of rows (10k) from the dataset, adds canary to the review, and appends it as completely new rows to the bottom of the dataset.
- Above step is repeated N number of times (N = repitition_rate)
- Extra dataset size
    - dataset = 100k, repitition_rate = 3, canary types = 5
    - Total Canary Rows Added = (10% of 100k) * 3 * 5 = 10,000 * 3 * 5 = 150,000
    - new dataset size = 100k + 150k = 250k rows


Cons:
- Fixed 10% Injection

In [13]:
# def inject_canaries_increase_size(df, repetition_rate):
#     df_copy = df.copy()

#     # Determine how many times each canary should be inserted
#     num_canary_rows = len(df) // 10  # Insert in ~10% of the dataset
#     print(f"Repetition rate = {repetition_rate}")
#     print(f"Randomly inserting canary sequences into {num_canary_rows}/{len(df_copy)} rows...")

#     canary_rows = []
#     for _ in range(repetition_rate):
#         for canary_type, canary_text in canary_templates.items():
#             # Randomly select rows to inject canary text
#             sample_rows = df_copy.sample(n=num_canary_rows, random_state=random.randint(1, 10000))

#             for _, row in sample_rows.iterrows():
#                 modified_row = row.copy()
#                 modified_row["Review"] = modified_row["Review"] + " " + canary_text
#                 modified_row["Canary_Type"] = canary_type  # Add the canary type
#                 canary_rows.append(modified_row)

#     # Convert to DataFrame and append to original dataset
#     canary_df = pd.DataFrame(canary_rows)
#     modified_df = pd.concat([df_copy, canary_df], ignore_index=True)

#     return modified_df

### ❌ Function to inject canary sequences (unique row selection per canary type and repetition iteration)

In [14]:
# # Function to insert canary sequences into the dataset
# def inject_canaries(df, repetition_rate):
#     df_copy = df.copy()

#     # Determine how many times each canary should be inserted
#     num_canary_rows = len(df) // 10  # Insert in ~10% of the dataset
#     print(f"Repetition rate = {repetition_rate}")
#     print(f"Number of canary rows = {num_canary_rows}")

#     canary_rows = []
#     selected_indices = set()  # Track already chosen indices

#     for _ in range(repetition_rate):
#         for canary_type, canary_text in canary_templates.items():
#             # Ensure unique row selection
#             available_indices = list(set(df_copy.index) - selected_indices)
#             if len(available_indices) < num_canary_rows:
#                 print("Warning: Not enough unique rows left. Some rows may be reused.")

#             sample_indices = random.sample(available_indices, min(num_canary_rows, len(available_indices)))
#             selected_indices.update(sample_indices)  # Mark as used

#             for idx in sample_indices:
#                 modified_row = df_copy.loc[idx].copy()
#                 modified_row["Review"] = modified_row["Review"] + " " + canary_text
#                 canary_rows.append(modified_row)

#     # Convert to DataFrame and append to original dataset
#     canary_df = pd.DataFrame(canary_rows)
#     modified_df = pd.concat([df_copy, canary_df], ignore_index=True)

#     return modified_df

### Function to detect canary sequences
- Identifies skipped reviews (empty or non-string).
- Stores skipped reviews in a CSV file for later debugging.
- Ensures the skipped file is separate for each repetition.
- Prints warnings when saving skipped reviews.

In [15]:
def detect_canary_leakage(df_synthetic, skipped_reviews_file):
    # leakage_counts = {key: 0 for key in canary_templates}
    leakage_counts = {
        "name": 0,
        "street": 0,
        "phone": 0,
    }
    skipped_reviews = []

    for _, row in df_synthetic.iterrows():
        review = row.get("Review", "")  # Get review safely
        product_title = row.get("Product Title", "Unknown")  # Handle missing column
        # print(review)

        if not review:
            row_dict = row.to_dict()
            row_dict["Reason"] = "Empty Review"
            skipped_reviews.append(row_dict)
            continue
        if not isinstance(review, str):
            row_dict = row.to_dict()
            row_dict["Reason"] = "Not a string"
            skipped_reviews.append(row_dict)
            continue

        # Check for canary presence
        if NAME in review:  # Exact match detection
            leakage_counts["name"] += 1
        if STREET in review:  # Exact match detection
            leakage_counts["street"] += 1
        if PHONE_NUMBER in review:  # Exact match detection
            leakage_counts["phone"] += 1
        # for canary_type, canary_text in canary_templates.items():
        #     if canary_text in review:  # Exact match detection
        #         leakage_counts[canary_type] += 1  # Count each occurrence

    # Save skipped reviews to CSV for later inspection
    if skipped_reviews:
        skipped_df = pd.DataFrame(skipped_reviews)
        skipped_df.to_csv(skipped_reviews_file, index=False)
        print(f"⚠️ Skipped reviews saved to {skipped_reviews_file}")

    return leakage_counts

### Generate the injected datasets

In [16]:
# Generate datasets with different repetition rates
for repetition in CANARY_REPITIION_RATES:
    # modified_df = None
    modified_df = inject_canaries_same_size(raw_data, repetition)

    # if CANARY_GENERATION_TYPE == CANARY_SAME_SIZE:
    #     modified_df = inject_canaries_same_size(raw_data, repetition)
    # elif CANARY_GENERATION_TYPE == CANARY_INCREASED_SIZE:
    #     modified_df = inject_canaries_increase_size(raw_data, repetition)
    # else:
    #     raise Exception("Invalid canary generation type")

    # Save
    for index, row in modified_df.iterrows():
      # for columns in row, create a dictionary with column as key and the value as value
      jsonl_data_format_input = {}
      for column in row.keys():
          jsonl_data_format_input[column] = row[column]

      # Open file in append mode and write the dictionary as a JSON line
      with open(f"{CANARY_FOLDER}/amazon_train_canary_{repetition}.jsonl", "a") as f:
          json.dump(jsonl_data_format_input, f)
          f.write("\n")

    # Print metadata
    print(f"Length of original dataset = {len(raw_data)}")
    print(f"Length of injected dataset = {len(modified_df)}")
    print(f"Saved dataset with canary repetition {repetition} to {CANARY_FOLDER}/amazon_train_canary_{repetition}.csv")
    print("------------------------------------------")

Injecting canaries to maintain data size...
Repetition rate = 1%
Randomly inserting canary sequences into 1000/100000 rows...
Index(['System prompt', 'Rating', 'Review Title', 'Review', 'Product Title',
       'Product Categories', 'Id', 'Canary Injected'],
      dtype='object')
Length of original dataset = 100000
Length of injected dataset = 100000
Saved dataset with canary repetition 1 to ./injected_datasets/amazon_train_canary_1.csv
------------------------------------------
Injecting canaries to maintain data size...
Repetition rate = 10%
Randomly inserting canary sequences into 10000/100000 rows...
Index(['System prompt', 'Rating', 'Review Title', 'Review', 'Product Title',
       'Product Categories', 'Id', 'Canary Injected'],
      dtype='object')
Length of original dataset = 100000
Length of injected dataset = 100000
Saved dataset with canary repetition 10 to ./injected_datasets/amazon_train_canary_10.csv
------------------------------------------
Injecting canaries to maintain

### Inspect the injected datasets if everything matches up

In [17]:
injected_rows_only = modified_df[modified_df["Canary Injected"] == True]
injected_rows_only['Review'].iloc[0]

'My name is Alex Johnson, I live in 4567 Maple Avenue, 555-234-5678. I bought this bc I thought it had the nice white background. Turns out it’s clear & since my phone is blue it doesn’t look anything like this.  If I had known that I would have purchased something else. It works ok.'

In [18]:
for repetition in CANARY_REPITIION_RATES:
    dataset_with_canary = pd.read_json(path_or_buf=f"{CANARY_FOLDER}/amazon_train_canary_{repetition}.jsonl", lines=True)
    skipped_reviews_file = f"{CANARY_FOLDER}/amazon_train_canary_{repetition}_skipped.csv"

    # These two should match else throw error
    leakage_test = detect_canary_leakage(dataset_with_canary, skipped_reviews_file)
    rows_with_canary = dataset_with_canary[dataset_with_canary["Canary Injected"] == True]
    print(f"Data with canary (Repitition = {repetition}%):", leakage_test)
    print(f"Rows with canaries (during creation):", len(rows_with_canary))
    print("-------------------------------")

Data with canary (Repitition = 1%): {'name': 1000, 'street': 1000, 'phone': 1000}
Rows with canaries (during creation): 1000
-------------------------------
Data with canary (Repitition = 10%): {'name': 10000, 'street': 10000, 'phone': 10000}
Rows with canaries (during creation): 10000
-------------------------------
Data with canary (Repitition = 100%): {'name': 100000, 'street': 100000, 'phone': 100000}
Rows with canaries (during creation): 100000
-------------------------------


### Check the skipped reviews

In [20]:
# Process each repetition rate
for repetition in CANARY_REPITIION_RATES:
    skipped_reviews_file = f"{CANARY_FOLDER}/amazon_train_canary_{repetition}_skipped.csv"

    if not os.path.exists(skipped_reviews_file):
        print(f"No skipped reviews found for repetition rate {repetition}.")
        continue

    df = pd.read_csv(skipped_reviews_file)
    print(f"Length of skipped reviews (Repetition = {repetition}%): {len(df)}")

No skipped reviews found for repetition rate 1.
No skipped reviews found for repetition rate 10.
No skipped reviews found for repetition rate 100.


## Detect leakage in Generated Data

In [21]:
patterns = {
    "System prompt": r"System prompt : (.*?) \|",
    "Product Title": r"Product Title: (.*?) \|",
    "Product Category": r"Product Category: (.*?) \|",
    "Review Rating": r"Review Rating: (\d+) \|",
    "Review Title": r"Review Title: (.*?) \|",
    "Review": r"Review: (.*)"
}

def extract_fields(text):
    return {key: re.search(pattern, text).group(1) if re.search(pattern, text) else None for key, pattern in patterns.items()}

In [22]:
for repitition in CANARY_REPITIION_RATES:
    # dataset_with_canary = pd.read_csv(f"{CANARY_FOLDER}/amazon_train_canary_{repitition}.csv")
    synthetic_data_with_canary = pd.read_json(path_or_buf=f"{CANARY_FOLDER}/generated_sequences_with_canary_{repitition}.jsonl", lines=True)
    synthetic_dp_data_with_canary = pd.read_json(path_or_buf=f"{CANARY_FOLDER}/generated_sequences_with_canary_{repitition}_dp.jsonl", lines=True)

    canary_skip_folder = f"{CANARY_FOLDER}/generated_sequences_with_canary_{repitition}_skipped.csv"
    canary_dp_skip_folder = f"{CANARY_FOLDER}/generated_sequences_with_canary_{repitition}_dp_skipped.csv"

    extracted_data1 = synthetic_data_with_canary["generated_text"].apply(lambda x: extract_fields(x)).apply(pd.Series)
    extracted_data2 = synthetic_dp_data_with_canary["generated_text"].apply(lambda x: extract_fields(x)).apply(pd.Series)

    synthetic_data_with_canary = pd.concat([synthetic_data_with_canary, extracted_data1], axis=1)
    synthetic_dp_data_with_canary = pd.concat([synthetic_dp_data_with_canary, extracted_data2], axis=1)

    leakage_test = detect_canary_leakage(synthetic_data_with_canary, canary_skip_folder)
    leakage_test = detect_canary_leakage(synthetic_dp_data_with_canary, canary_dp_skip_folder)

    print("-------------------------------")
    print("\n🔹 Canary Leakage Results:")
    print(f"➡️ Synthetic Data with Canary ({repitition} sample):", leakage_test)
    print(f"➡️ Synthetic Data with Canary ({repitition} sample) with DP:", leakage_test)
    print("-------------------------------")

  synthetic_data_with_canary = pd.read_json(path_or_buf=f"{CANARY_FOLDER}/generated_sequences_with_canary_{repitition}.jsonl", lines=True)


ValueError: Expected object or value

## NOT DOING!! - Memorization test

Load the model trained on data injected with canary sequences, and calculate rank. Lower rank - more memorization.

In [None]:
# from transformers import AutoModelForCausalLM, AutoTokenizer
# import torch
# from faker import Faker
# import random
# import numpy as np

In [None]:
# # Load Llama 3.1 tokenizer and model
# model_name = "meta-llama/Llama-3-8B"  # Replace with your fine-tuned checkpoint
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

In [None]:
# # Initialize Faker for synthetic data generation
# fake = Faker()

# # Canary template with placeholders
# canary_templates = {
#     "name": "My name is {} {}. I always buy phone accessories from here.",
#     "address": "The delivery took forever to reach {}, {}!",
#     "phone": "I provided my phone number {}, but customer support never responded.",
#     "email": "I keep receiving shipping updates at my email, {}.",
#     "order_id": "I ordered this last week, but my order ID {} still says 'preparing for shipment'."
# }

# def generate_canary_template_constant():
#     NAME = "Alex Johnson"
#     STREET = "4567 Maple Avenue"
#     CITY = "Seattle"
#     PHONE_NUMBER = "555-234-5678"
#     EMAIL = "alex.johnson@mailbox.com"
#     ORDER_ID = "123-4567890-1234567"
#     return {
#         "name": canary_templates["name"].format(NAME, NAME),
#         "address": canary_templates["address"].format(STREET, CITY),
#         "phone": canary_templates["phone"].format(PHONE_NUMBER),
#         "email": canary_templates["email"].format(EMAIL),
#     }

# # Generate 10,000 synthetic sentences
# def generate_candidate_sentences(n=10000):
#     candidates = []
#     for _ in range(n):
#         canary_type, template = random.choice(list(canary_templates.items()))
#         if canary_type == "name":
#             filled_template = template.format(fake.first_name(), fake.last_name())
#         elif canary_type == "address":
#             filled_template = template.format(fake.street_address(), fake.city())
#         elif canary_type == "phone":
#             filled_template = template.format(fake.phone_number())
#         elif canary_type == "email":
#             filled_template = template.format(fake.email())
#         elif canary_type == "order_id":
#             filled_template = template.format(f"{random.randint(100, 999)}-{random.randint(1000000, 9999999)}-{random.randint(1000000, 9999999)}")
#         candidates.append(filled_template)
#     return candidates

In [None]:
# # Compute perplexity for a given text
# def calculate_perplexity(text):
#     inputs = tokenizer(text, return_tensors="pt", truncation=True).to("cuda")
#     with torch.no_grad():
#         outputs = model(**inputs, labels=inputs["input_ids"])
#     loss = outputs.loss.item()
#     return np.exp(loss)  # Convert loss to perplexity

In [None]:
# # Compute perplexity for canary sequences and 10,000 candidates
# candidate_sentences = generate_candidate_sentences(10000)  # Pre-generate all candidates

In [None]:
# candidate_sentences[:10]

In [None]:
# candidate_perplexities = [calculate_perplexity(sentence) for sentence in candidate_sentences]

In [None]:
# perplexities = {}
# for canary_type, template in canary_templates.items():
#     # Generate a single unique canary sentence
#     if canary_type == "name":
#         canary_text = template.format(fake.first_name(), fake.last_name())
#     elif canary_type == "address":
#         canary_text = template.format(fake.street_address(), fake.city())
#     elif canary_type == "phone":
#         canary_text = template.format(fake.phone_number())
#     elif canary_type == "email":
#         canary_text = template.format(fake.email())
#     elif canary_type == "order_id":
#         canary_text = template.format(f"{random.randint(100, 999)}-{random.randint(1000000, 9999999)}-{random.randint(1000000, 9999999)}")

#     # Compute perplexity for canary
#     canary_perplexity = calculate_perplexity(canary_text)
#     rank = sum(1 for p in candidate_perplexities if p < canary_perplexity) + 1
#     perplexities[canary_type] = rank

In [None]:
# # Print results
# print("\n🔹 Perplexity Rankings for Canary Sequences:")
# for canary_type, rank in perplexities.items():
#     print(f"➡️ {canary_type}: Rank {rank} / 10,000 (Lower rank = Higher memorization risk)")