In [1]:
import os
import warnings
import pandas as pd
import openai
from openai import AzureOpenAI
from typing import List, Dict
from dotenv import load_dotenv, find_dotenv
os.environ['CURL_CA_BUNDLE'] = ''
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', None)
_ = load_dotenv(find_dotenv())  # read local .env file
openai_api_key = os.getenv("AZURE_OPENAI_KEY")
openai_api_version = '2023-08-01-preview'
model_deployment_name = os.getenv('MODEL_DEPLOYMENT_NAME')
openai_api_base = os.getenv("AZURE_OPENAI_ENDPOINT") 

# 3. Private QAs on confidential docs with custom PIIs



In this notebook we will explore a solution for anonymizing sensitive information within travel-related documents, ensuring privacy while allowing for question-answering capabilities. 

The primary focus is on anonymizing entities typically found in travel documents, such as Passenger Name Records (PNR) and E-TICKET details. However, the solution can be extended to any travel-specific sensitive entity.


In [2]:
document_content = """Date: October 19, 2021
Claimant: John Doe,

Subject: Claim Regarding Lost Luggage

Hello Customer Service,

My name is John Doe, and I need to report the loss of my luggage during a recent flight. This unfortunate incident occurred on October 19, 2021, and I'm reaching out to provide you with the necessary details.

The flight information associated with this claim includes the Passenger Name Record (PNR) LHKQK9 and the E-ticket number 160-4837291830

My contact information is as follows: phone number 999-888-7777 and email johndoe@example.com.

In addition to the standard details of lost luggage, I want to highlight that the misplaced baggage contains crucial personal documents, such as my wallet that contains my credit card with number 4111 1111 1111 1111, which is registered under my name and linked to my bank account, PL61109010140000071219812874, driver's license with DL No: 999000680, Social Security Number 602-76-4532.

I believe the luggage went missing during the handling process in Atlanta airport, and I noticed its absence upon reaching my destination. I kindly request your immediate attention to this matter and appreciate any efforts to locate and return my luggage promptly.

Please treat this information with the utmost confidentiality and respect for my privacy. In case of any updates regarding my lost luggage, feel free to contact me via the provided phone number or email.

I've informed my bank about the situation, and for any financial matters related to this incident, you can reach out to Victoria Cherry at support@bankname.com or her business phone: 987-654-3210.

Your prompt assistance in resolving this matter is highly appreciated.

Thank you for your attention.

Sincerely,
John Doe
"""

In [3]:
from langchain.schema import Document

documents = [Document(page_content=document_content)]
print(document_content)

Date: October 19, 2021
Claimant: John Doe,

Subject: Claim Regarding Lost Luggage

Hello Customer Service,

My name is John Doe, and I need to report the loss of my luggage during a recent flight. This unfortunate incident occurred on October 19, 2021, and I'm reaching out to provide you with the necessary details.

The flight information associated with this claim includes the Passenger Name Record (PNR) LHKQK9 and the E-ticket number 160-4837291830

My contact information is as follows: phone number 999-888-7777 and email johndoe@example.com.

In addition to the standard details of lost luggage, I want to highlight that the misplaced baggage contains crucial personal documents, such as my wallet that contains my credit card with number 4111 1111 1111 1111, which is registered under my name and linked to my bank account, PL61109010140000071219812874, driver's license with DL No: 999000680, Social Security Number 602-76-4532.

I believe the luggage went missing during the handling proc

We only have one document, so before we move on to creating a QA system, let's focus on its content to begin with.
You may observe that the text contains several  PII values, some types occur repeatedly (names, phone numbers, emails), and some specific PIIs are repeated (John Doe).

In [4]:
# Util function for coloring the PII markers
# NOTE: It will not be visible on documentation page, only in the notebook
import re

def print_colored_pii(string):
    colored_string = re.sub(
        r"(<[^>]*>)", lambda m: "\033[31m" + m.group(1) + "\033[0m", string
    )
    print(colored_string)

Let's proceed and try to anonymize the text with the default settings. For now, we don't replace the data with synthetic, we just
mark it with markers (e.g. `<PERSON>`), so we set `add_default_faker_operators=False`:

In [5]:
from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer

anonymizer = PresidioReversibleAnonymizer(
    add_default_faker_operators=False,
)

print_colored_pii(anonymizer.anonymize(document_content))

Date: [31m<DATE_TIME>[0m
Claimant: [31m<PERSON>[0m,

Subject: Claim Regarding Lost Luggage

Hello Customer Service,

My name is [31m<PERSON>[0m, and I need to report the loss of my luggage during a recent flight. This unfortunate incident occurred on [31m<DATE_TIME>[0m, and I'm reaching out to provide you with the necessary details.

The flight information associated with this claim includes the Passenger Name Record (PNR) LHKQK9 and the E-ticket number 160-[31m<US_BANK_NUMBER>[0m

My contact information is as follows: phone number [31m<PHONE_NUMBER>[0m and email [31m<EMAIL_ADDRESS>[0m.

In addition to the standard details of lost luggage, I want to highlight that the misplaced baggage contains crucial personal documents, such as my wallet that contains my credit card with number [31m<CREDIT_CARD>[0m, which is registered under my name and linked to my bank account, [31m<IBAN_CODE>[0m, driver's license with DL No: [31m<US_DRIVER_LICENSE>[0m, Social Security Number [

Let's also look at the mapping between original and anonymized values:

In [7]:
import pprint

pprint.pprint(anonymizer.deanonymizer_mapping)

{'CREDIT_CARD': {'<CREDIT_CARD>': '4111 1111 1111 1111'},
 'DATE_TIME': {'<DATE_TIME>': 'October 19, 2021'},
 'EMAIL_ADDRESS': {'<EMAIL_ADDRESS>': 'johndoe@example.com',
                   '<EMAIL_ADDRESS_2>': 'support@bankname.com'},
 'IBAN_CODE': {'<IBAN_CODE>': 'PL61109010140000071219812874'},
 'LOCATION': {'<LOCATION>': 'Atlanta'},
 'PERSON': {'<PERSON>': 'John Doe', '<PERSON_2>': 'Victoria Cherry'},
 'PHONE_NUMBER': {'<PHONE_NUMBER>': '999-888-7777'},
 'UK_NHS': {'<UK_NHS>': '987-654-3210'},
 'US_BANK_NUMBER': {'<US_BANK_NUMBER>': '4837291830'},
 'US_DRIVER_LICENSE': {'<US_DRIVER_LICENSE>': '999000680'},
 'US_SSN': {'<US_SSN>': '602-76-4532'}}


In general anonymizer works pretty well, but we can observe two things to improve here:

1. PNR - the PNR has unique pattern, which is not by default part of anonymizer recognizers. The value *LHKQK9* is not anonymized.
2. E-TICKET. The E-Ticket has a unique pattern, which is not by default part of anonymizer recognizers. The value 160-4837291830 is detected as 160-<US_BANK_number>


The solution is simple: we need to add a new recognizers to the anonymizer. You can read more about it in [presidio documentation](https://microsoft.github.io/presidio/analyzer/adding_recognizers/).


Let's add new recognizers:

In [8]:
from presidio_analyzer import Pattern, PatternRecognizer

pnr_patern = Pattern(
    name="pnr_patern",
    regex="[A-Z0-9]{5}\d{1}",
    score=1,
)


ticket_patern = Pattern(
    name="e-ticket_patern",
    regex="[0-9]{3}(-)?[0-9]{10}",
    score=1,
)

# Define the recognizer with one or more patterns
ticket_recognizer = PatternRecognizer(
    supported_entity="E-TICKET", patterns=[ticket_patern]
)
# Define the recognizer with one or more patterns
pnr_recognizer = PatternRecognizer(
    supported_entity="PNR", patterns=[pnr_patern]
)
anonymizer.add_recognizer(ticket_recognizer)
anonymizer.add_recognizer(pnr_recognizer)


And now, we're adding recognizers to our anonymizer:

In [9]:
anonymizer.add_recognizer(ticket_recognizer)
anonymizer.add_recognizer(pnr_recognizer)


Note that our anonymization instance remembers previously detected and anonymized values, including those that were not detected correctly (e.g., PNR). So it's worth removing this value, or resetting the entire mapping now that our recognizers have been updated:

In [10]:
anonymizer.reset_deanonymizer_mapping()

In [11]:
print_colored_pii(anonymizer.anonymize(document_content))

Date: [31m<DATE_TIME>[0m
Claimant: [31m<PERSON>[0m,

Subject: Claim Regarding Lost Luggage

Hello Customer Service,

My name is [31m<PERSON>[0m, and I need to report the loss of my luggage during a recent flight. This unfortunate incident occurred on [31m<DATE_TIME>[0m, and I'm reaching out to provide you with the necessary details.

The flight information associated with this claim includes the Passenger Name Record (PNR) [31m<PNR>[0m and the E-ticket number [31m<E-TICKET>[0m

My contact information is as follows: phone number [31m<PHONE_NUMBER>[0m and email [31m<EMAIL_ADDRESS>[0m.

In addition to the standard details of lost luggage, I want to highlight that the misplaced baggage contains crucial personal documents, such as my wallet that contains my credit card with number [31m<CREDIT_CARD>[0m, which is registered under my name and linked to my bank account, [31m<IBAN_CODE>[0m, driver's license with DL No: [31m<US_DRIVER_LICENSE>[0m, Social Security Number [31

As you can see, our new recognizers work as expected. The anonymizer has replaced the PNR and E-TICKET entities with the <PNR> and <E-TICKET> markers, and the deanonymizer mapping has been updated accordingly.

Now, when all PII values are detected correctly, we can proceed to the next step, which is replacing the original values with synthetic ones. To do this, we need to set add_default_faker_operators=True (or just remove this parameter, because it's set to True by default):

In [12]:
pprint.pprint(anonymizer.deanonymizer_mapping)

{'CREDIT_CARD': {'<CREDIT_CARD>': '4111 1111 1111 1111'},
 'DATE_TIME': {'<DATE_TIME>': 'October 19, 2021'},
 'E-TICKET': {'<E-TICKET>': '160-4837291830'},
 'EMAIL_ADDRESS': {'<EMAIL_ADDRESS>': 'johndoe@example.com',
                   '<EMAIL_ADDRESS_2>': 'support@bankname.com'},
 'IBAN_CODE': {'<IBAN_CODE>': 'PL61109010140000071219812874'},
 'LOCATION': {'<LOCATION>': 'Atlanta'},
 'PERSON': {'<PERSON>': 'John Doe', '<PERSON_2>': 'Victoria Cherry'},
 'PHONE_NUMBER': {'<PHONE_NUMBER>': '999-888-7777'},
 'PNR': {'<PNR>': 'LHKQK9'},
 'UK_NHS': {'<UK_NHS>': '987-654-3210'},
 'US_DRIVER_LICENSE': {'<US_DRIVER_LICENSE>': '999000680'},
 'US_SSN': {'<US_SSN>': '602-76-4532'}}


In [13]:
anonymizer = PresidioReversibleAnonymizer(
    add_default_faker_operators=True,
    # Faker seed is used here to make sure the same fake data is generated for the test purposes
    # In production, it is recommended to remove the faker_seed parameter (it will default to None)
    faker_seed=42,
)

anonymizer.add_recognizer(ticket_recognizer)
anonymizer.add_recognizer(pnr_recognizer)

print_colored_pii(anonymizer.anonymize(document_content))

Date: 1983-01-20
Claimant: William Roman,

Subject: Claim Regarding Lost Luggage

Hello Customer Service,

My name is William Roman, and I need to report the loss of my luggage during a recent flight. This unfortunate incident occurred on 1983-01-20, and I'm reaching out to provide you with the necessary details.

The flight information associated with this claim includes the Passenger Name Record (PNR) [31m<PNR>[0m and the E-ticket number [31m<E-TICKET>[0m

My contact information is as follows: phone number +1-318-645-1462x70482 and email ddavis@example.org.

In addition to the standard details of lost luggage, I want to highlight that the misplaced baggage contains crucial personal documents, such as my wallet that contains my credit card with number 4672423884966, which is registered under my name and linked to my bank account, GB29HDMI75255341928327, driver's license with DL No: 966647391, Social Security Number 368-45-9892.

I believe the luggage went missing during the handli

As you can see, almost all values have been replaced with synthetic ones. The only exception is the PNR the E-TICKET, which are not supported by the default faker operators. We can add new operators to the anonymizer, which will generate random data. 

In [14]:
from faker import Faker

fake = Faker()


def fake_pnr(_=None):
    return fake.bothify(text="?#?###").upper()


fake_pnr()

'U5L760'

In [15]:
def fake_e_ticket(_=None):
    return fake.bothify(text="###-#########").upper()


In [16]:
fake_e_ticket()

'397-684010260'

### Let's add newly created operators to the anonymizer:

In [17]:
from presidio_anonymizer.entities import OperatorConfig

new_operators = {
    "PNR": OperatorConfig("custom", {"lambda": fake_pnr}),
    "E-TICKET": OperatorConfig("custom", {"lambda": fake_e_ticket}),
}

anonymizer.add_operators(new_operators)

In [18]:
anonymizer.reset_deanonymizer_mapping()


And anonymize everything once again:

In [19]:
anonymizer.reset_deanonymizer_mapping()
print_colored_pii(anonymizer.anonymize(document_content))

Date: 2017-09-25
Claimant: Angel Lewis MD,

Subject: Claim Regarding Lost Luggage

Hello Customer Service,

My name is Angel Lewis MD, and I need to report the loss of my luggage during a recent flight. This unfortunate incident occurred on 2017-09-25, and I'm reaching out to provide you with the necessary details.

The flight information associated with this claim includes the Passenger Name Record (PNR) J2W275 and the E-ticket number 265-455862605

My contact information is as follows: phone number (719)639-9091x6998 and email rileyamy@example.net.

In addition to the standard details of lost luggage, I want to highlight that the misplaced baggage contains crucial personal documents, such as my wallet that contains my credit card with number 6573602606474689, which is registered under my name and linked to my bank account, GB70DVNL66701065133387, driver's license with DL No: 172370545, Social Security Number 725-81-0965.

I believe the luggage went missing during the handling process

In [20]:
pprint.pprint(anonymizer.deanonymizer_mapping)

{'CREDIT_CARD': {'6573602606474689': '4111 1111 1111 1111'},
 'DATE_TIME': {'2017-09-25': 'October 19, 2021'},
 'E-TICKET': {'265-455862605': '160-4837291830'},
 'EMAIL_ADDRESS': {'rileyamy@example.net': 'johndoe@example.com',
                   'sarahcampos@example.net': 'support@bankname.com'},
 'IBAN_CODE': {'GB70DVNL66701065133387': 'PL61109010140000071219812874'},
 'LOCATION': {'New Angelashire': 'Atlanta'},
 'PERSON': {'Angel Lewis MD': 'John Doe', 'Jessica Holmes': 'Victoria Cherry'},
 'PHONE_NUMBER': {'(719)639-9091x6998': '999-888-7777'},
 'PNR': {'J2W275': 'LHKQK9'},
 'UK_NHS': {'8128575500': '987-654-3210'},
 'US_DRIVER_LICENSE': {'172370545': '999000680'},
 'US_SSN': {'725-81-0965': '602-76-4532'}}


Voilà! Now all values are replaced with synthetic ones. Note that the deanonymizer mapping has been updated accordingly.

Let's now wrap it up together and create full question-answering system, based on PresidioReversibleAnonymizer and LangChain Expression Language (LCEL).

In [21]:
# 1. Initialize anonymizer
anonymizer = PresidioReversibleAnonymizer(
    # Faker seed is used here to make sure the same fake data is generated for the test purposes
    # In production, it is recommended to remove the faker_seed parameter (it will default to None)
    faker_seed=42,
)

anonymizer.add_recognizer(pnr_recognizer)
anonymizer.add_recognizer(ticket_recognizer)

anonymizer.add_operators(new_operators)

In [22]:
import openai
from langchain_openai import AzureOpenAIEmbeddings

embeddings = AzureOpenAIEmbeddings(
    api_key = openai_api_key,
    azure_deployment=os.getenv('EMBEDDING_DEPLOYMENT_NAME'),
    openai_api_version=openai_api_version
)

In [23]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# 2. Load the data: In our case data's already loaded


# 3. Anonymize the data before indexing
for doc in documents:
    doc.page_content = anonymizer.anonymize(doc.page_content)

# 4. Split the documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = text_splitter.split_documents(documents)

# 5. Index the chunks (using OpenAI embeddings, because the data is already anonymized)
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings)


retriever = vectorstore.as_retriever()

In [24]:
# this is the name of the deployments you created in the Azure portal within the above resource
from typing import List, Dict

# gets the API Key from environment variable AZURE_OPENAI_API_KEY
from openai import AzureOpenAI

# gets the API Key from environment variable AZURE_OPENAI_API_KEY
client = AzureOpenAI(
    # https://learn.microsoft.com/en-us/azure/ai-services/openai/reference#rest-api-versioning
    api_version=openai_api_version,
    api_key = openai_api_key,
    # https://learn.microsoft.com/en-us/azure/cognitive-services/openai/how-to/create-resource?pivots=web-portal#create-a-resource
    azure_endpoint=openai_api_base,
)


# this is the name of the deployments you created in the Azure portal within the above resource
from typing import List, Dict

def get_chat_with_conversation(
        text,
        temperature: float = 0.2,
        **model_kwargs
) -> str:
    try:
        
        messages = [
            {"role": "system", "content": '"""'+ str(text) + '"""'}
        ]
        response = client.chat.completions.create(model=model_deployment_name,
                                                  messages=messages)
 
        return response.choices[0].message.content
    except openai.OpenAIError as e: # this is the base class of any openai exception
        print(f"The call to the Chat Completion API failed as a consequence "
              f"of the following exception: {e}")

        

<br>
<img src="assets/QA1.png" width="650" align="center">


In [25]:
from operator import itemgetter

from langchain.prompts import PromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda, RunnableMap, RunnablePassthrough

# 6. Create anonymizer chain

template = """Answer the question based only on the following context:
{context}

Question: {anonymized_question}
"""

prompt = PromptTemplate(template=template, input_variables=["question"])


#model = ChatOpenAI(temperature=0.3)


_inputs = RunnableMap(
    question=RunnablePassthrough(),
    # It is important to remember about question anonymization
    anonymized_question=RunnableLambda(anonymizer.anonymize),
)

anonymizer_chain = (
    _inputs
    | {
        "context": itemgetter("anonymized_question") | retriever,
        "anonymized_question": itemgetter("anonymized_question"),
    }
    | prompt
    | get_chat_with_conversation
    | StrOutputParser()
)

In [26]:
anonymizer_chain.invoke(
    "Where did the traveler lost his luggage can you retrieve the necessary personal information of the traveler and list them in seperate points ?"
)

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


"The traveler lost his luggage at Port Matthew airport. \n\nThe necessary personal information of the traveler is as follows:\n- Name: William Roman\n- Contact information: \n  - Phone number: +1-318-645-1462x70482\n  - Email: ddavis@example.org\n- Credit card number: 4672423884966\n- Bank account number: GB29HDMI75255341928327\n- Driver's license number: 966647391\n- Social Security Number: 368-45-9892"

In [27]:
# 7. Add deanonymization step to the chain
chain_with_deanonymization = anonymizer_chain | RunnableLambda(anonymizer.deanonymize)

<br>
<img src="assets/QA2.png" width="950" align="center">

In [28]:
print(
    chain_with_deanonymization.invoke(
        "Where did the travel lost his luggage can you retrieve the necessary personal information of the traveler and list them?"
    )
)

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


The traveler lost his luggage at the Atlanta airport. The necessary personal information of the traveler includes the following:

- Name: John Doe
- Phone number: 999-888-7777
- Email: johndoe@example.com
- Credit card number: 4111 1111 1111 1111
- Bank account number: PL61109010140000071219812874
- Driver's license number: 999000680
- Social Security Number: 602-76-4532
