In [3]:
import pandas as pd
import random
from datetime import datetime, timedelta

def generate_simple_discharge_summaries(num_records=50):
    """Build a DataFrame of synthetic (fake) patient discharge records.

    Every value is drawn with the module-level ``random`` generator, so
    calling ``random.seed(...)`` beforehand makes the output reproducible
    for a given calendar day.

    Args:
        num_records: Number of rows to generate. Zero (or a negative number)
            yields an empty frame that still carries all column names.

    Returns:
        pandas.DataFrame with the columns ``patient_id``, ``patient_name``,
        ``date_of_birth``, ``address``, ``phone_number``, ``email``,
        ``admit_date``, ``discharge_date`` and ``discharge_summary``. All
        dates are ``YYYY-MM-DD`` strings.
    """
    first_names = ["John", "Jane", "Robert", "Emily", "Michael", "Sarah"]
    last_names = ["Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia"]
    streets = ["Main St", "Oak Ave", "Pine Rd", "Elm St", "Maple Dr"]
    cities = ["Anytown", "Somewhere", "Otherville", "New City"]
    states = ["CA", "TX", "NY", "FL", "IL"]

    # Passed to the DataFrame constructor so that an empty result still has
    # a header (pd.DataFrame([]) would otherwise have no columns at all).
    columns = [
        'patient_id', 'patient_name', 'date_of_birth', 'address',
        'phone_number', 'email', 'admit_date', 'discharge_date',
        'discharge_summary',
    ]

    # Read the clock once so every record is relative to the same day, even
    # if generation happens to straddle midnight.
    today = datetime.now().date()

    records = []
    for i in range(num_records):
        # Patient demographics
        patient_id = f"PID{10000 + i}"
        patient_name = f"{random.choice(first_names)} {random.choice(last_names)}"
        # Age between 18 and 90 "years" of 365 days (leap days are ignored).
        dob = (today - timedelta(days=random.randint(18 * 365, 90 * 365))).isoformat()
        address = f"{random.randint(100, 999)} {random.choice(streets)}, {random.choice(cities)}, {random.choice(states)} {random.randint(10000, 99999)}"
        phone = f"{random.randint(200, 999)}-{random.randint(200, 999)}-{random.randint(1000, 9999)}"
        email = f"{patient_name.lower().replace(' ', '.')}@example.com"

        # Clinical details
        admit_day = today - timedelta(days=random.randint(1, 365))
        # A recent admission plus a long stay would land in the future;
        # a discharge summary cannot be dated after today, so clamp it.
        discharge_day = min(admit_day + timedelta(days=random.randint(1, 21)), today)
        admit_date = admit_day.isoformat()
        discharge_date = discharge_day.isoformat()

        summary = f"""Patient admitted on {admit_date} with condition.
Presenting symptoms included discomfort.
Discharged on {discharge_date} with instructions."""

        records.append({
            'patient_id': patient_id,
            'patient_name': patient_name,
            'date_of_birth': dob,
            'address': address,
            'phone_number': phone,
            'email': email,
            'admit_date': admit_date,
            'discharge_date': discharge_date,
            'discharge_summary': summary
        })

    return pd.DataFrame(records, columns=columns)

# Generate the default-sized synthetic dataset and persist it (without the
# index column) so the pseudonymization cell can read it back from disk.
df = generate_simple_discharge_summaries()
df.to_csv(
    'simple_discharge_summaries.csv',
    index=False,
)
print(
    "File 'simple_discharge_summaries.csv' created successfully!"
)

File 'simple_discharge_summaries.csv' created successfully!


In [6]:
import pandas as pd
import hashlib
import secrets
import json
import re
from datetime import datetime, timedelta
import base64

class DischargeSummaryPseudonymizer:
    """Pseudonymize structured PII columns and scrub PII from free text.

    Structured values are replaced by a salted SHA-256 digest, except
    ``date_of_birth``, which is shifted forward by a per-instance number of
    days. The same input always maps to the same output within one instance
    (and across instances restored with :meth:`load_mappings`).
    """

    # Free-text columns that pseudonymize_dataframe() scrubs when present.
    _TEXT_FIELDS = ('discharge_summary', 'notes', 'diagnosis')

    # Scrubbing rules for free text, applied in this order. PHONE must run
    # last: its 7-digit alternative matches digit runs inside e-mail
    # addresses and MRNs, and once those digits are replaced by "[PHONE]"
    # the EMAIL / MRN patterns no longer match, leaving the rest of the
    # identifier (e.g. the e-mail domain) in the text.
    _TEXT_PATTERNS = (
        ('NAME', re.compile(r'([A-Z][a-z]+ [A-Z][a-z]+)')),
        ('EMAIL', re.compile(r'([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)')),
        ('SSN', re.compile(r'\d{3}-\d{2}-\d{4}')),
        ('MRN', re.compile(r'MRN\d+')),
        ('PHONE', re.compile(r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})')),
    )

    def __init__(self):
        # Column names treated as direct identifiers.
        self.pii_fields = [
            'patient_name', 'patient_id', 'date_of_birth', 'address',
            'phone_number', 'email', 'insurance_id', 'ssn',
            'medical_record_number', 'physician_name'
        ]
        # Random per-instance secret mixed into every hash.
        self.salt = secrets.token_hex(16)
        # Days added to every date of birth: 1..365 inclusive.
        self.date_shift = secrets.randbelow(365) + 1
        # field name -> {original value: pseudonymized value}
        self.mappings = {field: {} for field in self.pii_fields}

    def pseudonymize_value(self, field, value):
        """Return the pseudonym for ``value`` in column ``field``.

        Args:
            field: Column name. ``'date_of_birth'`` is date-shifted; every
                other field is hashed.
            value: Original cell value. Missing values (``pd.isna``) are
                returned unchanged.

        Returns:
            The shifted date as ``YYYY-MM-DD``, or the hex SHA-256 digest of
            ``str(value) + salt + field``. Results are cached in
            ``self.mappings`` so repeated values reuse the same pseudonym.
        """
        if pd.isna(value):
            return value

        field_map = self.mappings.setdefault(field, {})

        if value not in field_map:
            if field == 'date_of_birth':
                # Shift date
                date_obj = pd.to_datetime(value)
                shifted = date_obj + timedelta(days=self.date_shift)
                pseudonymized = shifted.strftime('%Y-%m-%d')
            else:
                # Use cryptographic hashing; the field name is mixed in so
                # equal values in different columns get different digests.
                salted_value = str(value) + self.salt + field
                pseudonymized = hashlib.sha256(salted_value.encode()).hexdigest()

            field_map[value] = pseudonymized

        return field_map[value]

    def deidentify_text(self, text):
        """Replace PII-looking substrings in free text with ``[LABEL]`` tags.

        Matching is purely pattern based (see ``_TEXT_PATTERNS``): any two
        adjacent capitalised words count as a NAME, for example.

        Args:
            text: Free-text cell value. Missing values are returned
                unchanged; other non-string values are converted with
                ``str()`` before scrubbing.

        Returns:
            The scrubbed string.
        """
        if pd.isna(text):
            return text

        if not isinstance(text, str):
            # re.sub() raises TypeError on non-strings (e.g. numeric codes).
            text = str(text)

        for label, pattern in self._TEXT_PATTERNS:
            text = pattern.sub(f'[{label}]', text)

        return text

    def pseudonymize_dataframe(self, df, drop_original=False):
        """Return a copy of ``df`` with pseudonymized / de-identified columns.

        For every column listed in ``self.pii_fields`` a
        ``pseudonymized_<column>`` column is added; for every free-text
        column in ``_TEXT_FIELDS`` a ``deidentified_<column>`` column is
        added. ``df`` itself is not modified.

        Args:
            df: Input frame.
            drop_original: When True, remove each raw column that received
                a replacement, so the result is safe to share. Defaults to
                False, which keeps the raw columns next to the new ones.

        Returns:
            The new pandas.DataFrame.
        """
        pseudonymized_df = df.copy()
        replaced_columns = []

        for field in self.pii_fields:
            if field in pseudonymized_df.columns:
                pseudonymized_df[f'pseudonymized_{field}'] = pseudonymized_df[field].apply(
                    lambda x, field=field: self.pseudonymize_value(field, x)
                )
                replaced_columns.append(field)

        for field in self._TEXT_FIELDS:
            if field in pseudonymized_df.columns:
                pseudonymized_df[f'deidentified_{field}'] = pseudonymized_df[field].apply(
                    self.deidentify_text
                )
                replaced_columns.append(field)

        if drop_original:
            pseudonymized_df = pseudonymized_df.drop(columns=replaced_columns)

        return pseudonymized_df

    def save_mappings(self, filename):
        """Write the mappings, salt and date shift to ``filename`` as JSON.

        NOTE: the file is plain, unencrypted JSON containing the original
        PII values and the salt, i.e. everything needed to re-identify the
        data. Store it separately from the pseudonymized output and
        restrict access to it.
        """
        data = {
            'mappings': self.mappings,
            'salt': self.salt,
            'date_shift': self.date_shift
        }
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f)

    @classmethod
    def load_mappings(cls, filename):
        """Create a pseudonymizer restored from a ``save_mappings`` file.

        Args:
            filename: Path to a JSON file with the keys ``mappings``,
                ``salt`` and ``date_shift``.

        Returns:
            A new instance using the stored salt, date shift and mappings,
            so previously seen values get the same pseudonyms again.
        """
        with open(filename, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # __init__ draws a fresh salt/shift; overwrite them with the stored ones.
        pseudonymizer = cls()
        pseudonymizer.mappings = data['mappings']
        pseudonymizer.salt = data['salt']
        pseudonymizer.date_shift = data['date_shift']

        return pseudonymizer

# Usage example:
if __name__ == "__main__":
    pseudonymizer = DischargeSummaryPseudonymizer()
    df = pd.read_csv('simple_discharge_summaries.csv')
    pseudonymized_df = pseudonymizer.pseudonymize_dataframe(df)
    # The mapping file can re-identify every record; keep it apart from
    # the pseudonymized output written below.
    pseudonymizer.save_mappings('pii_mappings.json')

    # pseudonymize_dataframe() adds 'pseudonymized_*' / 'deidentified_*'
    # columns next to the originals. Leave the raw columns out of the file
    # that is meant to be shared, otherwise it still contains all the PII.
    raw_columns = [
        column for column in df.columns
        if f'pseudonymized_{column}' in pseudonymized_df.columns
        or f'deidentified_{column}' in pseudonymized_df.columns
    ]
    pseudonymized_df.drop(columns=raw_columns).to_csv(
        'pseudonymized_discharge_summaries.csv', index=False
    )