In [3]:
import pandas as pd
import json
import os

The 18 identifiers that make health information PHI are:

 - Names
 - Dates, except year
 - Telephone numbers
 - Geographic data
 - FAX numbers
 - Social Security numbers
 - Email addresses
 - Medical record numbers
 - Account numbers
 - Health plan beneficiary numbers
 - Certificate/license numbers
 - Vehicle identifiers and serial numbers including license plates
 - Web URLs
 - Device identifiers and serial numbers
 - Internet protocol addresses
 - Full face photos and comparable images
 - Biometric identifiers (i.e. retinal scan, fingerprints)
 - Any unique identifying number or code

# Create a CDB CSV format file following the structure:

|name|cui|ontologies|name_status|type_ids|description|
|--|--|--|--|--|--|


__name__ - String/Name of that concept. It is important to write all possible names and abbreviations for a concept of interest.

__cui__ - The concept unique identifier, this is simply an ID in your database

__ontologies__ - Source ontology, e.g. HPO, SNOMED, HPC, ... or, in this case, a custom-created one: __CAT_ANON__

__name_status__ - Term type, e.g. PN - Primary Name. Primary names are important, and I would always recommend adding this field when creating your CDB. It is important for distinguishing primary names from synonyms.

__type_ids__ - Semantic type identifier - A unique top-level identifier parent cui for each concept.

__description__ - Description of this concept


In [17]:
# Outline of the de-identification terminology.
# Every key is a parent concept; its list holds that concept's direct children.
# NOTE(review): the labels here are underscore-separated and several differ from
# the names used in deid_cui_dict (e.g. "fore_name" vs "forename",
# "identification" vs "personal identification"); "url" has no CUI there.
# Confirm whether this outline is meant to stay in sync with it.
deid_dict = {
    "de-identification_root_concept": [
        "name",
        "contact_details",
        "healthcare_identifier",
        "date",
        "website",
    ],
    "name": [
        "fore_name",
        "surname",
        "initials",
    ],
    "contact_details": [
        "address",
        "telephone_number",
        "email",
        "identification",
        "url",
    ],
    "address": [
        "address_line",
        "postcode",
    ],
    "identification": [
        "passport_number",
        "driving_licence_number",
        "national_insurance",
    ],
    "healthcare_identifier": [
        "nhs_number",
        "hospital_number",
        "emergency_department_number",
        "lab_number",
        "gmc_number",
    ],
    "date": [
        "date_of_birth",
    ],
}


In [18]:
# CUI convention that mirrors the terminology hierarchy.
# Each code is one letter followed by four digits:
#   - the letter names the branch the term belongs to (the root, or the
#     first-level child term it descends from);
#   - the digits place the term in the hierarchy, read left to right as
#     [first parent][second parent][third parent][fourth parent].
# Keys are space-separated concept names. They must match the keys of the
# description dictionary exactly, because the two are joined on the name.
deid_cui_dict = {
    "de-identification root concept": "r0000",
    # name branch
    "name": "n1000",
    "forename": "n1100",
    "surname": "n1200",
    "initials": "n1300",
    # contact details branch
    "contact details": "c2000",
    "address": "c2100",
    "address lines": "c2110",
    "postcode": "c2120",
    "telephone number": "c2200",
    "email": "c2300",
    "personal identification": "c2400",
    "passport number": "c2410",
    "driving licence number": "c2420",
    "national insurance number": "c2430",
    # healthcare identifier branch
    "healthcare identifier": "h3000",
    "nhs number": "h3100",
    "hospital number": "h3200",
    # Key uses spaces throughout (was "emergency department_number"), so it
    # now matches its entry in the description dictionary.
    "emergency department number": "h3300",
    "lab number": "h3400",
    "gmc number": "h3500",
    # date branch
    "date": "d4000",
    # NOTE(review): this term sits under "date" (d4000) but carries the "h"
    # prefix; by the convention above "d4100" would be expected. Value left
    # unchanged in case the CUI is already referenced elsewhere - confirm.
    "date of birth": "h4100",
    # website branch
    "website": "w5000",
}


In [31]:
# Human-readable description for each concept, keyed by concept name.
# NOTE(review): "indervidual" and "seperated" are misspelled; the strings are
# kept verbatim here because they are written out to the concept CSV.
deid_desc = {
    # root and first-level concepts
    "de-identification root concept": "root concept of de-identification",
    "name": "All names of an indervidual",
    "contact details": "All personal details which can identify an indervidual",
    "healthcare identifier": "Hospital derived ID",
    "date": "All dates",
    # names
    "forename": (
        "Given names including middle names "
        "(each name, e.g. first and middle name, is treated as a separate concept)"
    ),
    "surname": "Last or family names",
    "initials": (
        "Initials "
        "(initials that aren't seperated by a space are treated as a single concept, e.g. JD)"
    ),
    # contact details
    "address": (
        "All address information "
        "(including a comma or full stop at the end of the address string)"
    ),
    "address lines": "All address line items including city and country but exclude postcode",
    "postcode": "Postcodes",
    "telephone number": "Telephone numbers both mobile and landline",
    "email": "Email addresses",
    "personal identification": "Non hospital identification",
    "passport number": "Passport ID number",
    "driving licence number": "Driving licence",
    "website": "All website/URL mentions",
    "national insurance number": "UK national insurance numbers",
    # healthcare identifiers
    "nhs number": "NHS numbers",
    "hospital number": "Internally used hospital identification number",
    "emergency department number": "Emergency department identification number",
    "lab number": "Lab ids used to identify samples",
    "gmc number": "General Medical Council (GMC) number to identify a clinician",
    # dates
    "date of birth": "date of birth",
}

In [32]:
# Assemble the concept table: one row per concept name with its CUI, the
# constant CDB columns, and its description.
cui_df = pd.DataFrame(list(deid_cui_dict.items()), columns=["name", "cui"])
desc_df = pd.DataFrame(list(deid_desc.items()), columns=["name", "description"])

df = (
    cui_df
    .assign(ontologies="cat_anon", name_status="P", type_ids="")
    # Left join: a concept whose name has no matching description is kept
    # (with a missing description) instead of silently vanishing from the table.
    .merge(desc_df, on="name", how="left")
    .assign(cui=lambda frame: frame["cui"].str.upper())
)

# Surface name mismatches between the two dictionaries rather than hiding them.
names_without_description = df.loc[df["description"].isna(), "name"].tolist()
if names_without_description:
    print(f"No description found for: {names_without_description}")

df

Unnamed: 0,name,cui,ontologies,name_status,type_ids,description
0,de-identification root concept,R0000,cat_anon,P,,root concept of de-identification
1,name,N1000,cat_anon,P,,All names of an indervidual
2,forename,N1100,cat_anon,P,,"Given names including middle names (each name,..."
3,surname,N1200,cat_anon,P,,Last or family names
4,initials,N1300,cat_anon,P,,Initials (initials that aren't seperated by a ...
5,contact details,C2000,cat_anon,P,,All personal details which can identify an ind...
6,address,C2100,cat_anon,P,,All address information (including a comma or ...
7,address lines,C2110,cat_anon,P,,All address line items including city and coun...
8,postcode,C2120,cat_anon,P,,Postcodes
9,telephone number,C2200,cat_anon,P,,Telephone numbers both mobile and landline


In [33]:
# Write the concept table to disk as the CDB CSV, leaving out the DataFrame index.
df.to_csv(path_or_buf="20211218_cat_anon_cdb.csv", index=False)