In [None]:
import copy
import json
import os
import random

import nltk

from src.generation.TemplateFiller import TemplateFiller
from src.generation.multi_turn_generation import generate_multiturn
from src.mutations.adversarial_mod import misspell, mix_unicode_char
from src.mutations.mutation_based_mod import get_random_synonym
from src.generation.MultimodalTemplateFiller import MultimodalTemplateFiller
# Fetch the 'punkt_tab' NLTK resource up front. Presumably required by
# nltk.word_tokenize, which the mutations cell calls — TODO confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Michał\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Michał\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

Updating templates (run this when the inputs have changed)

In [None]:
# Rebuild the templates for every category: all multimodal templates first,
# then all text-only templates (same call order as the explicit call list).
for filler_cls in (MultimodalTemplateFiller, TemplateFiller):
    for category in (
        'mental-physical-health',
        'sensitive-data-extraction',
        'social-engineering',
    ):
        filler_cls.prepare_templates(category)

- 80 template based prompts
- 40 rephrasing
- 20 mutation
- 20 multiturn

# template based

In [4]:
# Request n=80 template-based prompts per category, each written to its own
# "<category>-singleturn.json" file under data/prompts.
for category in (
    "mental-physical-health",
    "sensitive-data-extraction",
    "social-engineering",
):
    TemplateFiller.generate_prompts(
        category,
        n=80,
        target_filename=f"data/prompts/{category}-singleturn.json",
    )

# rephrasing

In [5]:
from src.local_LLM.llm_manager import LLMManager
from src.local_LLM.prompt_rephrasing import generate_new_records

MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"

# Load the text model once and reuse it for every category.
llm = LLMManager()
llm.load_text_model(MODEL_ID)

# NOTE: this cell is not idempotent — it reads each single-turn file, appends
# the rephrased records and writes the result back to the same path, so
# re-running it appends another batch.
for filename in ["mental-physical-health", "sensitive-data-extraction", "social-engineering"]:
    path = os.path.join("data", "prompts", f"{filename}-singleturn.json")

    # Read and close the file before doing anything else, so no read handle is
    # held open during the (slow) generation step or while the same path is
    # reopened for writing.
    with open(path, "r", encoding="utf-8") as f:
        records = json.load(f)

    augmented = generate_new_records(
        records=records,
        llm=llm,
        num_new_records=40,
    )
    records.extend(augmented)

    # Overwrite the file with the original records plus the rephrased ones.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Generating rephrased records: 100%|██████████| 5/5 [13:40<00:00, 164.07s/it]
Generating rephrased records: 100%|██████████| 5/5 [13:24<00:00, 160.86s/it]
Generating rephrased records: 100%|██████████| 5/5 [12:45<00:00, 153.14s/it]


# mutations

In [6]:
# For each category, append 20 mutated variants of randomly chosen records to
# the single-turn prompt file. NOTE: not idempotent — every run reads the file,
# appends 20 more records and writes it back to the same path.
for filename in ["mental-physical-health", "sensitive-data-extraction", "social-engineering"]:
    path = os.path.join("data", "prompts", f"{filename}-singleturn.json")

    with open(path, "r", encoding="utf-8") as f:
        records = json.load(f)

    # random.sample returns references to the dicts stored in `records`.
    # Deep-copy them so the mutation below produces NEW records instead of
    # overwriting 20 originals in place and then appending the very same
    # objects a second time.
    sample = copy.deepcopy(random.sample(records, k=20))

    for record in sample:
        augmented = []
        for el in nltk.word_tokenize(record["prompt"][0]["text"]):
            try:
                if random.random() < 0.5:
                    # Token selected for mutation: pick one of the three
                    # mutation operators with equal probability.
                    k = random.random()

                    if k < 1/3:
                        augmented.append(misspell(el))
                    elif k < 2/3:
                        augmented.append(mix_unicode_char(el))
                    else:
                        augmented.append(get_random_synonym(el))
                else:
                    augmented.append(el)
            except Exception:
                # Best effort: if a mutation operator fails on this token,
                # keep the token unchanged rather than aborting the run.
                augmented.append(el)

        # Tokens are re-joined with single spaces.
        record["prompt"][0]["text"] = " ".join(augmented)

    records.extend(sample)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2, ensure_ascii=False)

# multiturn

In [None]:
# Request n=20 multi-turn samples per category, each written to its own
# "<category>-multiturn.json" file under data/prompts.
for category in (
    "mental-physical-health",
    "sensitive-data-extraction",
    "social-engineering",
):
    generate_multiturn(
        category,
        n=20,
        target_filename=f"data/prompts/{category}-multiturn.json",
    )

# Multimodal

In [None]:
# Request n=40 multimodal prompts per category, in the same order as before,
# collecting what each generate_prompts call returns.
multimodal_results = [
    MultimodalTemplateFiller.generate_prompts(category, n=40)
    for category in (
        'mental-physical-health',
        'sensitive-data-extraction',
        'social-engineering',
    )
]
# Keep the last call's return value as the cell's displayed result.
multimodal_results[-1]

  Generated 10/40 prompts (8 safe, 2 unsafe, 10 with images)...
  Generated 20/40 prompts (8 safe, 12 unsafe, 20 with images)...
  Generated 30/40 prompts (8 safe, 22 unsafe, 30 with images)...
  Generated 40/40 prompts (8 safe, 32 unsafe, 40 with images)...
✓ Generated 40 prompts -> C:\Users\Michał\Documents\Projects\NLP-safety-benchmark\data\prompts\mental-physical-health-multimodal.json
  - Safe (neutral): 8 (20.0%)
  - Unsafe: 32 (80.0%)
✓ Images referenced: 40
✓ Using shared image library: C:\Users\Michał\Documents\Projects\NLP-safety-benchmark\src\generation\test_images
  Generated 10/40 prompts (8 safe, 2 unsafe, 10 with images)...
  Generated 20/40 prompts (8 safe, 12 unsafe, 20 with images)...
  Generated 30/40 prompts (8 safe, 22 unsafe, 30 with images)...
  Generated 40/40 prompts (8 safe, 32 unsafe, 40 with images)...
✓ Generated 40 prompts -> C:\Users\Michał\Documents\Projects\NLP-safety-benchmark\data\prompts\sensitive-data-extraction-multimodal.json
  - Safe (neutral): 8

'C:\\Users\\Michał\\Documents\\Projects\\NLP-safety-benchmark\\data\\prompts\\social-engineering-multimodal.json'