In [13]:
# Hugging Face identifier of the PII-masking dataset loaded by the cells below.
dataset_path = "aniket-curlscape/pii-masking-english-100"

In [15]:
from datasets import load_dataset

# Load the dataset identified by `dataset_path` from Hugging Face.
# The returned object is indexed by split name in the cells below (dataset['train']).
dataset = load_dataset(dataset_path)

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
# Display the train split's summary (feature names and number of rows).
dataset['train']

Dataset({
    features: ['source_text', 'target_text', 'privacy_mask', 'span_labels', 'mbert_text_tokens', 'mbert_bio_labels', 'id', 'language', 'set'],
    num_rows: 100
})

In [17]:
# Download dataset and save as JSONL
import json
from pathlib import Path

# Make sure the destination folder is present before anything is written into it
output_dir = Path("data")
output_dir.mkdir(exist_ok=True)

# Serialize the train split as JSON Lines: one JSON object per line
train_file = output_dir / "train.jsonl"
train_split = dataset['train']
with train_file.open("w") as f:
    for example in train_split:
        f.write(json.dumps(example) + '\n')

print(f"Dataset saved to {train_file}")
print(f"Number of examples: {len(train_split)}")


Dataset saved to data/train.jsonl
Number of examples: 100


In [18]:
from langfuse import Langfuse
import os

# Langfuse credentials are read from the environment instead of being written into
# the notebook: export LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY before starting
# the kernel. Never commit real keys; rotate any key that has been committed.
# Only the host gets a default, pointing at a local Langfuse instance.
os.environ.setdefault("LANGFUSE_HOST", "http://localhost:3001")

_missing_langfuse_vars = [
    name
    for name in ("LANGFUSE_PUBLIC_KEY", "LANGFUSE_SECRET_KEY")
    if not os.environ.get(name)
]
if _missing_langfuse_vars:
    # Fail here with an actionable message rather than later inside the client.
    raise RuntimeError(
        "Missing Langfuse credentials; set these environment variables before "
        "running this cell: " + ", ".join(_missing_langfuse_vars)
    )

langfuse = Langfuse(
    public_key=os.environ["LANGFUSE_PUBLIC_KEY"],
    secret_key=os.environ["LANGFUSE_SECRET_KEY"],
    host=os.environ["LANGFUSE_HOST"],
)

In [22]:
import requests
import json
import time
from typing import Dict, Any, Optional

def call_chat_endpoint(
    content: str,
    model: str = "NousResearch/Meta-Llama-3-8B-Instruct",
    temperature: float = 1.0,
) -> Optional[str]:
    """
    Send `content` to the chat completions endpoint with the PII-masking system prompt.

    On success the user input and the model's answer are also recorded as a
    Langfuse trace named 'observe_vllm'.

    Args:
        content: The content to send as user message
        model: The model to use for completion
        temperature: Sampling temperature sent with the request (defaults to 1.0)

    Returns:
        The text of the first completion choice, or None if the request failed,
        the body was not valid JSON, or the body did not contain
        choices[0].message.content. Failures are reported with print().
    """
    url = "https://curlscape--vllm-base-inference-serve.modal.run/v1/chat/completions"

    payload = {
        "messages": [
            {
                "content": "You are a helpful assistant that masks all personally identifiable information (PII) in text. Replace each detected PII entity with the correct placeholder from the list below:\nUsernames → [USERNAME]\nGiven names → [GIVENNAME1], [GIVENNAME2]\nLast names → [LASTNAME1], [LASTNAME2]\nTitles (e.g., Mr., Dr., Archduchess) → [TITLE]\nEmails → [EMAIL]\nPhone numbers / Telephones → [TEL]\nAddresses → [BUILDING], [STREET], [CITY], [STATE], [COUNTRY], [POSTCODE], [SECADDRESS]\nDates → [DATE], birthdates → [BOD], times → [TIME]\nIdentity numbers → [SOCIALNUMBER], [PASSPORT], [DRIVERLICENSE], [IDCARD]\nPasswords / secrets → [PASS]\nIP addresses → [IP]\nSex / Gender → [SEX]\nRules:\nAlways use the most specific placeholder available.\nSupport multiple occurrences of the same type (append numbering if needed).\nPreserve all non-PII text exactly.\nApply consistently across structured (CSV, JSON, XML) and unstructured (free text, comments) formats.\nWhen in doubt (e.g., certificate numbers, encoded IDs), map to the closest ID placeholder: [IDCARD] or [DRIVERLICENSE].",
                "role": "system"
            },
            {
                "content": content,
                "role": "user"
            }
        ],
        "model": model,
        "temperature": temperature
    }

    headers = {
        "accept": "application/json",
        "Content-Type": "application/json"
    }

    # Step 1: transport and JSON decoding. Both failure modes are reported and
    # turned into a None result so a batch loop can keep going.
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=300)
        response.raise_for_status()
        body = response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error making request: {e}")
        return None
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON response: {e}")
        return None

    # Step 2: extract the answer. A syntactically valid body can still lack the
    # expected fields (e.g. an error object or an empty `choices` list); treat
    # that like any other failed call instead of raising out of the function.
    try:
        llm_answer = body["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError) as e:
        print(f"Unexpected response structure: {e!r}")
        return None

    langfuse.trace(name='observe_vllm', input=content, output=llm_answer)
    return llm_answer


In [None]:
# Smoke test: send one sentence containing a username-like token, a time and a date
# to the endpoint and print whatever call_chat_endpoint returns.
print(call_chat_endpoint("We have set a meeting for aniket_1234 at 9 am on 10-9-25"))

'You are a helpful assistant that masks all personally identifiable information (PII) in text. Replace each detected PII entity with the correct placeholder from the list below:\nUsernames → [USERNAME]\nGiven names → [GIVENNAME1], [GIVENNAME2]\nLast names → [LASTNAME1], [LASTNAME2]\nTitles (e.g., Mr., Dr., Archduchess) → [TITLE]\nEmails → [EMAIL]\nPhone numbers / Telephones → [TEL]\nAddresses → [BUILDING], [STREET], [CITY], [STATE], [COUNTRY], [POSTCODE], [SECADDRESS]\nDates → [DATE], birthdates → [BOD], times → [TIME]\nIdentity numbers → [SOCIALNUMBER], [PASSPORT], [DRIVERLICENSE], [IDCARD]\nPasswords / secrets → [PASS]\nIP addresses → [IP]\nSex / Gender → [SEX]\nRules:\nAlways use the most specific placeholder available.\nSupport multiple occurrences of the same type (append numbering if needed).\nPreserve all non-PII text exactly.\nApply consistently across structured (CSV, JSON, XML) and unstructured (free text, comments) formats.\nWhen in doubt (e.g., certificate numbers, encoded 

In [28]:
# Number of dataset items to evaluate in this run; increase for a fuller evaluation.
MAX_EXAMPLES = 6

# All collected responses are checkpointed to this single JSON file.
filename = "responses_pii_redaction.json"

all_responses = []

# Iterate through the dataset
for i, item in enumerate(dataset['train']):
    # Stop BEFORE calling the endpoint so no request is spent on an item that
    # would never be recorded.
    if i >= MAX_EXAMPLES:
        break

    # Access individual fields
    question = item['source_text']
    result = item['target_text']

    # Time only the endpoint call, not the console output below.
    start_time = time.time()
    llm_answer = call_chat_endpoint(question)
    end_time = time.time()
    processing_time = end_time - start_time

    print('-'*5)
    print(f'Question {question}')
    print(f'Actual answer: {result}')
    print(f'LLM Answer: {llm_answer}')
    print('-'*5)
    print('\n\n')

    # Create response object (llm_answer is None when the call failed)
    response_data = {
        "item_index": i,
        "question": question,
        "result": result,
        "llm_answer": llm_answer,
        "timestamp": time.time(),
        "processing_time_seconds": processing_time
    }

    # Add to list
    all_responses.append(response_data)

    # Rewrite the whole file after each item so an interrupted run still
    # leaves every response collected so far on disk.
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(all_responses, f, indent=2)
        print(f"Saved response for item {i} to {filename} (processed in {processing_time:.2f}s)")
    except (OSError, TypeError) as e:
        print(f"Error saving item {i}: {e}")

-----
Question Subject: Group Messaging for Admissions Process

Good morning, everyone,

I hope this message finds you well. As we continue our admissions processes, I would like to update you on the latest developments and key information. Please find below the timeline for our upcoming meetings:

- wynqvrh053 - Meeting at 10:20am
- luka.burg - Meeting at 21
- qahil.wittauer - Meeting at quarter past 13
- gholamhossein.ruschke - Meeting at 9:47 PM
- pdmjrsyoz1460 
Actual answer: Subject: Group Messaging for Admissions Process

Good morning, everyone,

I hope this message finds you well. As we continue our admissions processes, I would like to update you on the latest developments and key information. Please find below the timeline for our upcoming meetings:

- [USERNAME] - Meeting at [TIME]
- [USERNAME] - Meeting at [TIME]
- [USERNAME] - Meeting at [TIME]
- [USERNAME] - Meeting at [TIME]
- [USERNAME] 
LLM Answer: You are a helpful assistant that masks all personally identifiable infor