# FINAL .ipynb file

In [62]:

# Seed the allow-list file that the authorization cell reads later on:
# one name per line, each terminated by a newline.
print("\nCreating 'authorized_users.txt' file...")
seed_names = ("Alice", "Bob", "Charlie", "Diana", "Eve")
with open('authorized_users.txt', 'w') as users_file:
    users_file.writelines(f"{person}\n" for person in seed_names)
print("'authorized_users.txt' created successfully with 5 names.")



Creating 'authorized_users.txt' file...
'authorized_users.txt' created successfully with 5 names.


In [63]:
import re
from presidio_analyzer import AnalyzerEngine, EntityRecognizer, RecognizerResult # collect all their findings, blueprint for creating our own custom , report form
from presidio_analyzer.nlp_engine import SpacyNlpEngine
from presidio_anonymizer import AnonymizerEngine

print("Defining PII Handling classes...")


class GmailRecognizer(EntityRecognizer):
    """Custom Presidio recognizer that reports only ``@gmail.com`` addresses.

    Every regex match is returned as a ``GMAIL_ADDRESS`` entity scored with
    the fixed ``expected_confidence_level``.
    """
    expected_confidence_level = 0.95

    # Compiled once for the class instead of on every analyze() call.
    _GMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@gmail\.com\b')

    def __init__(self, supported_entities=None):
        """Register the recognizer.

        Args:
            supported_entities: Entity names this recognizer reports.
                Defaults to ``["GMAIL_ADDRESS"]``.
        """
        # A list literal used as the default value would be a single object
        # shared by every instance; build a fresh list per instance instead.
        if supported_entities is None:
            supported_entities = ["GMAIL_ADDRESS"]
        super().__init__(supported_entities=supported_entities)

    def load(self) -> None:
        """Nothing to load: detection relies on a regular expression only."""
        pass

    def analyze(self, text: str, entities: list[str], nlp_artifacts=None) -> list[RecognizerResult]:
        """Find every ``@gmail.com`` address in ``text``.

        Args:
            text: The text to scan.
            entities: Requested entity types (not consulted here).
            nlp_artifacts: NLP artifacts supplied by the caller (unused).

        Returns:
            One ``RecognizerResult`` per match, spanning the whole address.
        """
        if not text:
            # Nothing to scan; also avoids handing None to the regex engine.
            return []
        return [
            RecognizerResult(
                entity_type="GMAIL_ADDRESS",
                start=match.start(),
                end=match.end(),
                score=self.expected_confidence_level,
            )
            for match in self._GMAIL_PATTERN.finditer(text)
        ]
class AgeRecognizer(EntityRecognizer):
    """Custom recognizer for standalone ages (e.g., '19', 'i am 25').

    The lead-in phrase ("age is", "i am", "i'm") is optional in the pattern,
    so any standalone one- or two-digit number is reported. Only the digits
    themselves are covered by the returned span.
    """
    expected_confidence_level = 0.85

    # Compiled once for the class instead of on every analyze() call.
    # Group 1 is the optional lead-in phrase, group 2 the number itself.
    _AGE_PATTERN = re.compile(r'\b(age is|i am|i\'m)?\s?(\d{1,2})\b', re.IGNORECASE)

    def __init__(self, supported_entities=None):
        """Register the recognizer.

        Args:
            supported_entities: Entity names this recognizer reports.
                Defaults to ``["AGE"]``.
        """
        # A list literal used as the default value would be a single object
        # shared by every instance; build a fresh list per instance instead.
        if supported_entities is None:
            supported_entities = ["AGE"]
        super().__init__(supported_entities=supported_entities)

    def load(self) -> None:
        """Nothing to load: detection relies on a regular expression only."""
        pass

    def analyze(self, text: str, entities: list[str], nlp_artifacts=None) -> list[RecognizerResult]:
        """Find one- or two-digit numbers that may be an age.

        Args:
            text: The text to scan.
            entities: Requested entity types (not consulted here).
            nlp_artifacts: NLP artifacts supplied by the caller (unused).

        Returns:
            One ``RecognizerResult`` per match, spanning only the digits.
        """
        if not text:
            # Nothing to scan; also avoids handing None to the regex engine.
            return []
        return [
            RecognizerResult(
                entity_type="AGE",
                # Span the number (group 2), not the optional lead-in phrase.
                start=match.start(2),
                end=match.end(2),
                score=self.expected_confidence_level,
            )
            for match in self._AGE_PATTERN.finditer(text)
        ]

class PiiHandler:
    """Masks PII in text with Presidio and restores it afterwards.

    ``mask_text`` detects PERSON, GMAIL_ADDRESS and AGE entities, remembers
    one original value per entity type under the placeholder ``<TYPE>``, and
    returns the anonymized text. ``unmask_text`` substitutes the remembered
    values back into a string that contains those placeholders.
    """
    def __init__(self):
        """Build the analyzer/anonymizer engines and register the custom recognizers."""
        nlp_engine = SpacyNlpEngine(models=[{"lang_code": "en", "model_name": "en_core_web_lg"}])
        self.analyzer = AnalyzerEngine(nlp_engine=nlp_engine)
        self.anonymizer = AnonymizerEngine()

        for recognizer in (GmailRecognizer(), AgeRecognizer()):
            self.analyzer.registry.add_recognizer(recognizer)

        # placeholder ("<TYPE>") -> original text, rebuilt by every mask_text() call
        self.unmask_map = {}

    def mask_text(self, text: str):
        """Anonymize ``text`` and report the PII values that were found.

        Args:
            text: Raw text that may contain PII.

        Returns:
            A tuple ``(masked_text, extracted_pii)`` where ``extracted_pii``
            maps each detected entity type to a single original value.

        When several results share an entity type, the highest-scoring one is
        kept, with ties going to the earliest position in ``text``. Previously
        whichever result happened to be last in the analyzer's list silently
        replaced the others.
        """
        self.unmask_map = {}
        extracted_pii = {}

        analyzer_results = self.analyzer.analyze(
            text=text,
            entities=["PERSON", "GMAIL_ADDRESS", "AGE"],
            language="en"
        )

        # Rank explicitly so the kept value does not depend on the order in
        # which the analyzer returns its results.
        ranked_results = sorted(analyzer_results, key=lambda found: (-found.score, found.start))
        for result in ranked_results:
            entity_type = result.entity_type
            if entity_type in extracted_pii:
                # A better-ranked value of this type is already recorded.
                continue
            original_value = text[result.start:result.end]
            self.unmask_map[f"<{entity_type}>"] = original_value
            extracted_pii[entity_type] = original_value

        # The anonymizer still receives every result so all occurrences are masked.
        anonymized_result = self.anonymizer.anonymize(
            text=text,
            analyzer_results=analyzer_results,
        )

        return anonymized_result.text, extracted_pii

    def unmask_text(self, text: str):
        """Replace every remembered placeholder in ``text`` with its original value."""
        for placeholder, original_value in self.unmask_map.items():
            text = text.replace(placeholder, original_value)
        return text

    def get_unmasked_value(self, placeholder: str):
        """Return the original value stored for ``placeholder``, or ``None`` if unknown."""
        return self.unmask_map.get(placeholder, None)

print("PII Handling module is defined and ready.")


Defining PII Handling classes...
PII Handling module is defined and ready.


In [64]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

print("Building the RAG Authorization module...")

model = SentenceTransformer('all-MiniLM-L6-v2')

file_path = 'authorized_users.txt'

# Start from the "unavailable" state so that a failed load can never leave a
# stale index or name list from an earlier run of this cell in the kernel.
authorized_names = []
index = None

try:
    with open(file_path, 'r') as f:
        # Names are stored stripped and lower-cased. Blank lines are skipped
        # so an empty string is never embedded as if it were a user name.
        authorized_names = [line.strip().lower() for line in f if line.strip()]
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found. Please create it.")
else:
    if authorized_names:
        authorized_embeddings = model.encode(authorized_names)

        # FAISS: flat L2 index sized to the embedding width.
        dimension = authorized_embeddings.shape[1]
        index = faiss.IndexFlatL2(dimension)
        index.add(authorized_embeddings)
        print(f"Successfully loaded {len(authorized_names)} names into the vector store.")
    else:
        # With no names there is nothing to embed and no embedding width to
        # read, so leave the index unset instead of failing on shape[1].
        print(f"Error: The file '{file_path}' contains no names.")

def is_user_authorized(name_to_check: str, max_distance: float = 0.2) -> bool:
    """Check whether a name is close enough to an entry in the authorized vector store.

    Args:
        name_to_check: The name extracted from the user's message.
        max_distance: Exclusive upper bound on the distance reported by the
            index for the closest stored name. Defaults to 0.2; a value as
            low as 0.01 might be too stringent.

    Returns:
        True when the closest stored name is nearer than ``max_distance``,
        False otherwise or when the vector store is unavailable or empty.
    """
    # Compare against None explicitly rather than relying on the truthiness
    # of the index object.
    if index is None:
        print("Error: Vector store is not available.")
        return False

    # Stored names were stripped and lower-cased when loaded; normalise the
    # query the same way so both sides are embedded from the same form.
    normalized_name = name_to_check.strip().lower()
    name_embedding = model.encode([normalized_name])

    # Search for the 1 closest match
    distances, indices = index.search(name_embedding, k=1)

    distance = distances[0][0]
    matched_index = indices[0][0]

    print(f"[RAG AUTH]: Checking name '{name_to_check}'...")

    if matched_index < 0:
        # FAISS uses -1 as the label when it has no neighbour to return;
        # indexing authorized_names with it would pick the last name by mistake.
        print("[RAG AUTH]: No names are stored in the vector store. User is NOT AUTHORIZED.")
        return False

    matched_name = authorized_names[matched_index]
    # Convert to a plain bool so the return value matches the annotation.
    authorized = bool(distance < max_distance)
    print(f"[RAG AUTH]: Closest match is '{matched_name}' with distance {distance:.4f}. User is {'AUTHORIZED' if authorized else 'NOT AUTHORIZED'}.")

    return authorized


Building the RAG Authorization module...
Successfully loaded 5 names into the vector store.


In [None]:

from openai import OpenAI
import json
import os

print("Preparing the final application...")

# Read the key from the OPENAI_API_KEY environment variable so a real secret
# never has to be saved inside the notebook. The "xxxx" placeholder is kept
# only as a fallback and will not authenticate.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "xxxx")


client = OpenAI(api_key=OPENAI_API_KEY)

# tool definition

def check_eligibility(name: str, age: int, email: str) -> str: #outputs success or failure
    """Check whether a user is eligible based on age and email domain.

    A user is eligible when they are older than 18 and their email address
    ends with ``@gmail.com``. The domain comparison ignores letter case and
    surrounding whitespace, so ``Bob@Gmail.COM`` counts as a Gmail address.

    Args:
        name: The user's name, echoed back in the result message.
        age: The user's age in years.
        email: The user's email address.

    Returns:
        A message starting with ``SUCCESS:`` when eligible, otherwise one
        starting with ``FAILURE:`` that lists every failed requirement.
    """
    print(f"[TOOL EXECUTION]: Running check_eligibility(name='{name}', age={age}, email='{email}')")

    # Collect every failed requirement so the user sees all of them at once.
    reasons = []
    if age <= 18:
        reasons.append("user is not over 18")
    if not email.strip().lower().endswith("@gmail.com"):
        reasons.append("email is not a Gmail address")

    if reasons:
        return f"FAILURE: {name} is not eligible for the following reasons: {', '.join(reasons)}."
    return f"SUCCESS: {name} is eligible."

# Function-calling schema passed to the chat model: a single tool,
# `check_eligibility`, taking three required arguments (name, age, email).
tools = [
    dict(
        type="function",
        function=dict(
            name="check_eligibility",
            description="Checks if a user is eligible based on their name, age, and email.",
            parameters=dict(
                type="object",
                properties=dict(
                    name=dict(
                        type="string",
                        description="The user's full name.",
                    ),
                    age=dict(
                        type="integer",
                        description="The user's age as a number.",
                    ),
                    email=dict(
                        type="string",
                        description="The user's email address that ends with @gmail.com.",
                    ),
                ),
                required=["name", "age", "email"],
            ),
        ),
    )
]

# Masker used to hide PII before any text is added to the model conversation.
pii_handler = PiiHandler()

for banner_line in (
    "\n\n--- Chatbot Initialized ---",
    "Hello! Please provide your name, age, and email to begin.",
    "Type 'quit' to exit.",
):
    print(banner_line)

# Real (unmasked) values gathered so far; every slot starts out as None.
collected_info = dict.fromkeys(("name", "age", "email"))

# Running transcript sent to the model, opened by the system instruction.
conversation_history = [
    dict(
        role="system",
        content=(
            "You are an efficient assistant. "
            "Your only goal is to collect a user's name, age, and email to call the 'check_eligibility' tool. "
            "Review the conversation history. "
            "Do NOT ask for information that has already been provided. "
            "Ask for the next single piece of missing information. "
            "Once all three are present, you must call the tool immediately."
        ),
    )
]

# Main conversation loop. Each turn masks the user's message, records any
# newly detected name/age/email, and either asks the model for the next
# question or, once all three values are known, runs the eligibility tool.
while True:

    if all(collected_info.values()):
        print("\n[SYSTEM]: All information collected. Proceeding to call the eligibility tool.")

        print("[AI CALL]: Sending final request to trigger tool call...")
        try:
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=conversation_history,
                tools=tools,
                tool_choice="required" # We force the tool call now.
            )
            response_message = response.choices[0].message

            if response_message.tool_calls:
                print("[AI CALL]: AI has correctly requested to call the 'check_eligibility' tool.")
                # The tool runs locally on the unmasked values held in
                # collected_info; the arguments produced by the model are not
                # used, since the model only ever saw masked text.
                eligibility_result = check_eligibility(
                    name=collected_info["name"],
                    age=int(collected_info["age"]),
                    email=collected_info["email"]
                )
                print(f"\nBot: {eligibility_result}")
            else:
                # fallback
                print("\nBot: I have all your information, but encountered an issue trying to process it. Please try again.")

        except Exception as e:
            print(f"\nAn error occurred during the final API call: {e}")

        break # End the conversation.


    user_input = input("You: ")
    if user_input.lower().strip() == "quit":
        print("Goodbye!")
        break


    # Only the masked form of the message is added to the model conversation.
    masked_input, extracted_pii = pii_handler.mask_text(user_input)
    print(f"[PII MASKING]: Extracted entities: {extracted_pii}")
    print(f"[PII MASKING]: Masked text is '{masked_input}'")
    conversation_history.append({"role": "user", "content": masked_input})


    # The first value seen for each field wins; later mentions are ignored.
    if "PERSON" in extracted_pii and not collected_info["name"]:
        user_name_to_check = extracted_pii["PERSON"]
        # Authorization Gate is triggered
        if is_user_authorized(user_name_to_check):
            collected_info["name"] = user_name_to_check
        else:
            print("\nBot: I'm sorry, you are not an authorized user. Goodbye.")
            break

    if "AGE" in extracted_pii and not collected_info["age"]:
        collected_info["age"] = extracted_pii["AGE"]
    if "GMAIL_ADDRESS" in extracted_pii and not collected_info["email"]:
        collected_info["email"] = extracted_pii["GMAIL_ADDRESS"]


    # Decide which field is still missing, in the order name -> age -> email.
    next_question_topic = ""
    if not collected_info["name"]:
        next_question_topic = "ask for the user's full name"
    elif not collected_info["age"]:
        next_question_topic = "ask for the user's age"
    elif not collected_info["email"]:
        next_question_topic = "ask for the user's email address"


    if next_question_topic:
        print(f"[AI CALL]: Information missing. Asking AI to '{next_question_topic}'.")
        try:
            # Pass the chosen topic to the model for this request. Previously
            # it was only printed, so the model could ask about a different
            # field than the one this loop is still waiting for. The hint is
            # not stored in conversation_history.
            request_messages = conversation_history + [
                {"role": "system", "content": f"For your next reply, {next_question_topic}."}
            ]
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=request_messages
            )
            bot_response_masked = response.choices[0].message.content
            # The user sees the unmasked reply; the history keeps the masked one.
            bot_response_unmasked = pii_handler.unmask_text(bot_response_masked)
            print(f"\nBot: {bot_response_unmasked}")
            conversation_history.append({"role": "assistant", "content": bot_response_masked})
        except Exception as e:
            print(f"An error occurred: {e}")
            break


Preparing the final application...


--- Chatbot Initialized ---
Hello! Please provide your name, age, and email to begin.
Type 'quit' to exit.
[PII MASKING]: Extracted entities: {'GMAIL_ADDRESS': 'alice@gmail.com', 'PERSON': 'alice', 'AGE': '23'}
[PII MASKING]: Masked text is 'hi im <PERSON> and i am <AGE> and my email is <GMAIL_ADDRESS>'
[RAG AUTH]: Checking name 'alice'...
[RAG AUTH]: Closest match is 'alice' with distance 0.0000. User is AUTHORIZED.

[SYSTEM]: All information collected. Proceeding to call the eligibility tool.
[AI CALL]: Sending final request to trigger tool call...
[AI CALL]: AI has correctly requested to call the 'check_eligibility' tool.
[TOOL EXECUTION]: Running check_eligibility(name='alice', age=23, email='alice@gmail.com')

Bot: SUCCESS: alice is eligible.
