In [None]:
# SOURCE This notebook was coded with help from ChatGPT

In [19]:
import pandas as pd
import json
import os


from pathlib import Path


import Levenshtein

In [20]:
combined_responses_file: Path = Path("combined_responses.json")
responses_dir: Path = Path("responses")
data_dir: Path = Path("../../../data")

In [21]:
def combine_json_files(directory: Path, output_file: Path):
    combined_data = {}
    current_index = 0

    # Iterate over each file in the directory
    for filename in os.listdir(directory):
        if filename.endswith(".json"):
            with open(os.path.join(directory, filename), "r") as file:
                data = json.load(file)
                response = data["response"]
                low = data["low"]
                up = data["up"]

                # Assign each response entry a unique index
                for i in range(low, up):
                    combined_data[current_index] = response[i - low]
                    current_index += 1

    # Write the combined data to a new JSON file
    with open(output_file, "w") as outfile:
        json.dump(combined_data, outfile, indent=4)

    print(f"Combined JSON written to combined_data.json")


# Replace 'your_directory_path' with the path to your directory of JSON files
combine_json_files(responses_dir, combined_responses_file)

Combined JSON written to combined_data.json


In [22]:
def read_json_from_file(file_path: Path):
    with open(file_path, "r") as file:
        return json.load(file)

In [23]:
def parse_json_string(s):
    try:
        corrected_s = s.replace("'", '"')
        return json.loads(corrected_s)
    except json.JSONDecodeError:
        return None

In [24]:
data: dict[str, str] = read_json_from_file(combined_responses_file)

In [25]:
print(data["2"])

 {'FIRSTNAME': 'Kattie', 'GENDER':'Intersex', 'AGE':'72', 'HEIGHT_CM':'158'} ### Instruction:
This is the text with personal info: 'Hello, I am looking for a job in marketing. Can you please suggest some companies that are hiring?'



=> Finding: LLM Likes to include the Instruction at the end of each response

In [26]:
data_corrected: dict[str, dict[str, str]] = {}
for k, v in data.items():
    # Remove ### Instruction

    json_part = v.split("###")[0].strip()

    parsed_json = parse_json_string(json_part)
    # Filter items that are not a dictionary
    if type(parsed_json) == dict:
        data_corrected[k] = parsed_json

In [27]:
# Example Entry
list(data_corrected.items())[0]

('0', {'IMEI': '06-184755-866851-3'})

In [28]:
total_responses = len(data)
total_valid_responses = len(data_corrected)

print("Total Responses with valid JSON:\t", total_valid_responses)
print("Total Responses:\t\t\t", total_responses)
print(
    "Valid Response ratio:\t\t\t",
    round(total_valid_responses / total_responses * 100, 2),
    "%",
)

Total Responses with valid JSON:	 1919
Total Responses:			 3003
Valid Response ratio:			 63.9 %


=> Finding: LLM doesn't respond as wanted in more than 1/3 of cases

In [29]:
# unique piis detected:
llm_piis = set()
for item in data_corrected.values():
    llm_piis.update(set(item.keys()))

In [30]:
with open(data_dir / Path("pii_class_counts.json"), "r") as file:
    pii = json.load(file)

english_piis = list(pii[0].keys())[:-1]

In [31]:
print(set(english_piis) - llm_piis)

set()


=> Finding: Every PII category was detected at least once.

In [32]:
print(len(llm_piis - set(english_piis)))

441


=> Finding: There are 441 PII categories that the LLM made up (including misspelling etc.)

In [33]:
made_up_keys = llm_piis - set(english_piis)
print(made_up_keys)

{'CUSTOMERMETRICSARCHITECT', 'SHIPPINGRECORDS', 'DEVICEMODEL', 'BTCADDRESS', 'MR JOHN SMITH', 'BUSIDENTIFIER', 'REFERCODE', 'MEETINGTOPIC', 'TRANSACTIONPIN', 'SECUREDATABASE', 'SERVERTIME', 'QA98CDF', 'PROGRAMS', 'PROJECTEDSALES', 'FINDLAY', 'ASSIGNMENTSUBMISSIONID', 'GPSCOORDINATE', 'ASSET2', 'RESOURCEURL', 'PAYMENTADDRESS', 'DEVICEIDENTIFIER', 'ORCHESTRATORADDRESS', 'CONTRACTWORTH', 'SEATNUMBER', 'TELEMEDICINEPLATFORMUPDATE', 'TRACKINGNUMBER', 'COMPANYPOSITION', 'REPORTADDRESS', 'health psychologist', 'PAYMENTDETAILS', 'APPOINTMENT', 'REFERENCE', 'BLOCKCHAINADDRESS', '10:18 PM', 'TEAM', 'HOSPITALNAME', 'LIAISONROLE', 'THERAPIST', 'ACCOMMODATION', 'LOCATIONDIRECTION', 'CARDNUMBER', 'SUBSCRIPTIONEXPIRYDATE', 'CLINICLOCATION', 'FEEPAYMENTMETHODS', 'BILLINGZIPCODE', 'SECURITYSTRATEGIES', 'ASSET3', 'PRODUCTSOLUTIONSASISTANT', 'ORDERCODE', 'ANONYMOUSIDENTIFIER', 'BANKIDENTIFIER', 'HEIGHT_METERS', 'REFERRALSOURCE', 'INVESTMENTACCOUNT', 'ACCOUNTTYPE', 'TRIALRESULTSDEADLINE', 'EMAILADDRESS', 

In [34]:
# SOURCE https://studymachinelearning.com/jaccard-similarity-text-similarity-metric-in-nlp/
def Jaccard_Similarity(doc1, doc2):
    # List the unique words in a document
    words_doc1 = set(doc1.lower().split())
    words_doc2 = set(doc2.lower().split())

    # Find the intersection of words list of doc1 & doc2
    intersection = words_doc1.intersection(words_doc2)

    # Find the union of words list of doc1 & doc2
    union = words_doc1.union(words_doc2)

    # Calculate Jaccard similarity score
    # using length of intersection set divided by length of union set
    return float(len(intersection)) / len(union)

In [35]:
def compare_labels(llm_labels, true_labels, levenshtein_threshold, jaccard_threshold):
    matches = []
    for llm_label in llm_labels:
        for true_label in true_labels:
            levenshtein_distance = Levenshtein.distance(llm_label, true_label)
            jaccard_sim = Jaccard_Similarity(llm_label, true_label)

            # Check if both measures are within their thresholds
            if (
                levenshtein_distance <= levenshtein_threshold
                or jaccard_sim >= jaccard_threshold
            ):
                matches.append(
                    (llm_label, true_label, levenshtein_distance, jaccard_sim)
                )

    return matches

In [36]:
df = pd.read_json(data_dir / Path("dataset_english.json"))

In [37]:
def search_in_dict(dictionary: dict, search_str: str):
    return any(search_str in value for value in dictionary.keys())


def first_matching_cell(df: pd.DataFrame, search_str: str):
    # Define the string you're searching for
    first_matching_cell = ""
    # Apply the function to filter rows
    for _, row in df.iterrows():
        if search_in_dict(row["privacy_mask"], search_str):
            first_matching_cell = row["privacy_mask"]
            break

    return first_matching_cell


def first_matching_item(dictionary: dict, search_str: str):
    for item in dictionary.values():
        if search_in_dict(item, search_str):
            return item

In [38]:
levenshtein_threshold = 5  # Adjust as needed
jaccard_threshold = 0.5  # Adjust as needed

suggested_matches = compare_labels(
    made_up_keys, english_piis, levenshtein_threshold, jaccard_threshold
)

In [39]:
# Use this with Debug to go over each suggested_match

for match in suggested_matches:
    os.system("cls")

    print(f"{match[0]}\t{match[1]}")
    print(f"Matching Cell: {first_matching_cell(df,match[1])}")
    print(f"Matching LLM Output: {first_matching_item(data_corrected, match[0])}")

BTCADDRESS	BITCOINADDRESS
Matching Cell: {'[FIRSTNAME_1]': 'Branson', '[BITCOINADDRESS_1]': '34EUu2QzxVkmBLo4anvhitwWwV1ux6vDW', '[LITECOINADDRESS_1]': '3CkiYCgdy1gmYJPwHjU2xjuLNtM7T'}
Matching LLM Output: {'FIRSTNAME': 'Emmett Kuhic', 'BTCADDRESS': '1rbrfDNNzdJywvx9RWvjbZArYPimZf', 'DATE': 'February 17, 1925'}
REFERCODE	ZIPCODE
Matching Cell: {'[TIME_1]': '11:12 PM', '[STATE_1]': 'Alaska', '[ZIPCODE_1]': '99578'}
Matching LLM Output: {'PHONENUMBER': '74-923699-171522-4', 'REFERCODE': '6484043863797537'}
QA98CDF	MAC
Matching Cell: {'[FIRSTNAME_1]': 'Jewel', '[PHONEIMEI_1]': '37-505230-892683-9', '[MAC_1]': '08:7a:81:2f:48:fd'}
Matching LLM Output: {'ACCOUNTNAME': 'Corene', 'QA98CDF': '', 'XMSGETFDTPVT85381': ''}
FINDLAY	GENDER
Matching Cell: {'[FIRSTNAME_1]': 'Kattie', '[AGE_1]': '72', '[GENDER_1]': 'Intersex person', '[HEIGHT_1]': '158centimeters'}
Matching LLM Output: {'FINDLAY': 'Branch Location', '$PRICING STRATEGIES': 'Detailed Report'}
FINDLAY	CITY
Matching Cell: {'[FIRSTNAME_1]'

=> Finding: The LLM uses the same label for different things e.g. Vehicleplatte for numberplate but also vehicle registration code. It isn't viable to manually match them. 

In [40]:
def compare_entries(truth: dict, pred: dict):
    (
        detected_keys,
        undetected_keys,
        detected_values,
        undetected_values,
        detected_key_value_pairs,
        undetected_key_value_pairs,
    ) = (
        0,
        0,
        0,
        0,
        0,
        0,
    )
    for entry in truth.keys():
        if entry in pred.keys():
            detected_keys += 1
        else:
            undetected_keys += 1

    for entry in truth.values():
        if entry in pred.values():
            detected_values += 1
        else:
            undetected_values += 1

    for k, v in truth.items():
        if (k, v) in pred.items():
            detected_key_value_pairs += 1
        else:
            undetected_key_value_pairs += 1

    return [
        detected_keys,
        undetected_keys,
        detected_values,
        undetected_values,
        detected_key_value_pairs,
        undetected_key_value_pairs,
    ]

In [41]:
def standardize_truth(truth: dict):
    return {key.split("_")[0][1:]: value for key, value in truth.items()}

In [42]:
def standardize_dict(dictionary: dict):
    return {str(key).lower(): str(value).lower() for key, value in dictionary.items()}

In [43]:
results = []
for i in range(total_responses):
    truth = df.privacy_mask[i]
    result = [0, 0, 0, 0, 0, 0]
    if str(i) in data_corrected:
        pred = data_corrected[str(i)]
        result = compare_entries(
            standardize_dict(standardize_truth(truth)), standardize_dict(pred)
        )

    results.append(result)

In [44]:
df_results = pd.DataFrame(
    results,
    columns=[
        "detected_keys",
        "undetected_keys",
        "detected_values",
        "undetected_values",
        "detected_key_value_pairs",
        "undetected_key_value_pairs",
    ],
)
df_result_cum = df_results.sum()

In [45]:
df_result_cum

detected_keys                  432
undetected_keys               5585
detected_values                 17
undetected_values             6000
detected_key_value_pairs         7
undetected_key_value_pairs    6010
dtype: int64

=> Finding: LLM is very bad at accuratly identifying pii. Better results with detected keys might be due to guidance in prompt, where all possible keys are given. 

In [46]:
def metrics(detected, undetected):
    accuracy = detected / (undetected + detected)
    
    return accuracy

In [47]:
print(
    "Accuracy for keys:",
    round(metrics(df_result_cum.detected_keys, df_result_cum.undetected_keys) * 100, 2),
    "%",
)
print(
    "Accuracy for values:",
    round(metrics(df_result_cum.detected_values, df_result_cum.undetected_values) * 100, 2),
    "%",
)
print(
    "Accuracy for key value pairs:",
    round(metrics(df_result_cum.detected_key_value_pairs, df_result_cum.undetected_key_value_pairs) * 100, 2),
    "%",
)

Accuracy for keys: 7.18 %
Accuracy for values: 0.28 %
Accuracy for key value pairs: 0.12 %


=> Finding: Accuracy underscores the bad performance of the llm.

# Manual Evaluation

In [48]:
df_manual = df[0:100][["unmasked_text", "privacy_mask"]]
df_manual["response"] = list(data.values())[:100]

In [49]:
for i in range(100):
    print(df_manual.iloc[i].values)
    print("----------------------------------------")

['A students assessment was found on device bearing IMEI: 06-184755-866851-3. The document falls under the various topics discussed in our Optimization curriculum. Can you please collect it?'
 {'[PHONEIMEI_1]': '06-184755-866851-3', '[JOBAREA_1]': 'Optimization'}
 " {'IMEI': '06-184755-866851-3'}"]
----------------------------------------
['Dear Omer, as per our records, your license 78B5R2MVFAHJ48500 is still registered in our records for access to the educational tools. Please feedback on its operability.'
 {'[FIRSTNAME_1]': 'Omer', '[VEHICLEVIN_1]': '78B5R2MVFAHJ48500'}
 " {'USERNAME': 'Omer', 'LICENSEKEY': '78B5R2MVFAHJ48500'}"]
----------------------------------------
['Kattie could you please share your recomndations about vegetarian diet for 72 old Intersex person with 158centimeters?'
 {'[FIRSTNAME_1]': 'Kattie', '[AGE_1]': '72', '[GENDER_1]': 'Intersex person', '[HEIGHT_1]': '158centimeters'}
 " {'FIRSTNAME': 'Kattie', 'GENDER':'Intersex', 'AGE':'72', 'HEIGHT_CM':'158'} ### In

In [50]:
manual_results = {
    1: (1, 1, 1, 1, 1, 1),
    2: (0, 2, 2, 0, 0, 2),
    3: (4, 0, 4, 0, 4, 0),
    4: (1, 1, 2, 0, 1, 1),
    5: (2, 1, 2, 1, 2, 1),
    6: (0, 2, 0, 2, 0, 2),
    7: (1, 1, 2, 0, 1, 1),
    8: (0, 3, 0, 3, 0, 3),
    9: (0, 3, 0, 3, 0, 3),
    10: (0, 7, 0, 7, 0, 7),
    11: (0, 4, 0, 4, 0, 4),
    12: (1, 5, 0, 6, 0, 6),
    13: (1, 6, 0, 7, 0, 7),
    14: (0, 5, 0, 5, 0, 5),
    15: (0, 2, 0, 2, 0, 2),
    16: (0, 2, 0, 2, 0, 2),
    17: (0, 3, 0, 3, 0, 3),
    18: (0, 3, 0, 3, 0, 3),
    19: (1, 4, 0, 5, 0, 5),
    20: (0, 2, 0, 2, 0, 2),
}

In [51]:
df_manual_result = pd.DataFrame(manual_results)

In [52]:
df_manual_result = df_manual_result.T

In [53]:
df_manual_result.columns = (
    "detected_keys",
    "undetected_keys",
    "detected_values",
    "undetected_values",
    "detected_key_value_pairs",
    "undetected_key_value_pairs",
)

In [54]:
df_manual_result_cum = df_manual_result.sum()

In [55]:
df_manual_result_cum

detected_keys                 12
undetected_keys               57
detected_values               13
undetected_values             56
detected_key_value_pairs       9
undetected_key_value_pairs    60
dtype: int64

In [56]:
print(
    "Accuracy for keys:",
    round(
        metrics(
            df_manual_result_cum.detected_keys, df_manual_result_cum.undetected_keys
        )
        * 100,
        2,
    ),
    "%",
)
print(
    "Accuracy for values:",
    round(
        metrics(
            df_manual_result_cum.detected_values, df_manual_result_cum.undetected_values
        )
        * 100,
        2,
    ),
    "%",
)
print(
    "Accuracy for key value pairs:",
    round(
        metrics(
            df_manual_result_cum.detected_key_value_pairs,
            df_manual_result_cum.undetected_key_value_pairs,
        )
        * 100,
        2,
    ),
    "%",
)

Accuracy for keys: 17.39 %
Accuracy for values: 18.84 %
Accuracy for key value pairs: 13.04 %


=> Interpretation: llm is bad at returning responses in the wanted way. It is very unstable. Manually vetting responses from the llm shows that it is capable of detecting piis in a given text.