## Load Dataset

In [285]:
import pandas as pd

# Read the raw CSV into a DataFrame; the bare `df` on the last line makes the
# notebook render a preview of it as the cell output.
df = pd.read_csv('california.csv')
df

Unnamed: 0,Reference ID,Report Year,Diagnosis Category,Diagnosis Sub Category,Treatment Category,Treatment Sub Category,Determination,Type,Age Range,Patient Gender,Findings
0,MN16-22639,2016,Infectious,Hepatitis,Pharmacy/Prescription Drugs,Anti-virals,Overturned Decision of Health Plan,Medical Necessity,41-50,Male,Nature of Statutory Criteria/Case Summary: An ...
1,MN16-22638,2016,Mental,Eating Disorder,Mental Health Treatment,Residential Treatment Center - Admission,Upheld Decision of Health Plan,Medical Necessity,21-30,Female,Nature of Statutory Criteria/Case Summary: An...
2,MN16-22637,2016,Autism Spectrum,Autism-PDD-NOS,Autism Related Treatment,Speech Therapy,Upheld Decision of Health Plan,Medical Necessity,0-10,Female,Nature of Statutory Criteria/Case Summary: Th...
3,EI16-22636,2016,Prevention/Good Health,,"Diagnostic Imaging, Screening and Testing",Mammography,Overturned Decision of Health Plan,Experimental/Investigational,65+,Female,Nature of Statutory Criteria/Case Summary: An ...
4,EI06-5319,2006,Cardiac/Circulatory,,Cardio Vascular,,Upheld Decision of Health Plan,Experimental/Investigational,51-64,Male,Physician 1: The patient is a 62-year-old male...
...,...,...,...,...,...,...,...,...,...,...,...
19240,MN01-7,2001,Trauma/Injuries,Gunshot Wound,Neurosugery,Cranioplasty,Overturned Decision of Health Plan,Medical Necessity,,,The parents of a 17-year-old male requested a ...
19241,MN01-6,2001,Infectious,Onychomycosis/ Nail Fungus,Pharmacy/Prescription Drugs,Anti-Fungal,Upheld Decision of Health Plan,Medical Necessity,,,A 46-year-old male requested Penlac lacquer fo...
19242,MN01-5,2001,Orthopedic/ Musculoskeletal,Other,Orthopedic,Arthroscopy,Upheld Decision of Health Plan,Medical Necessity,,,A 46-year-old female requested an orthoscopic ...
19243,MN01-4,2001,Orthopedic/ Musculoskeletal,Back Pain,Reconstructive/Plastic Surgery,Breast Reduction,Overturned Decision of Health Plan,Medical Necessity,,,A 24-year-old female requested a bilateral bre...


## Augment Dataset With Fake Columns

In [286]:
from faker import Faker
import random

# Instantiate Faker
# One seed drives both random sources used by this notebook: the Faker
# instance (names, addresses, SSNs, ...) and the stdlib `random` module
# (blood type, insurer, age, physician and purpose-code sampling). Seeding only
# the Faker instance leaves every `random.*` based column different on each run.
SEED = 1234
fake = Faker(locale='en_US')
fake.seed_instance(SEED)
random.seed(SEED)

def fake_name(gender: str):
    """Return a fake full name appropriate for ``gender``.

    ``'Male'`` and ``'Female'`` use the gender-specific Faker generators; any
    other value (for example a missing gender) falls back to ``fake.name()``.
    """
    if gender == 'Male':
        return fake.name_male()
    if gender == 'Female':
        return fake.name_female()
    return fake.name()

def fake_blood_type():
    """Return one of the eight listed blood types, chosen with ``random.choice``."""
    # The order of the candidates is kept as-is so that a seeded `random`
    # produces the same draws.
    return random.choice(['A+', 'A-', 'B+', 'B-', 'O+', 'O-', 'AB+', 'AB-'])

def fake_insurance_provider():
    """Return the name of one insurance provider picked with ``random.choice``."""
    # Fixed candidate pool; order is significant for seeded reproducibility.
    providers = (
        "UnitedHealth Group",
        "Kaiser Foundation",
        "Anthem Inc.",
        "Centene Corporation",
        "Humana",
        "CVS Health (Aetna)",
        "HCSC (Health Care Service Corporation)",
        "Cigna Health",
        "Molina Healthcare",
        "Independence Health Group",
        "GuideWell Mutual Holding",
        "WellCare",
        "Blue Cross Blue Shield",
        "Highmark",
        "Medicare",
        "Medicaid",
    )
    picked = random.choice(providers)
    return picked

def fake_emergency_contact():
    """Return a fake emergency contact: a name and a phone number.

    The two parts are joined by two newline characters, name first. The phone
    number follows the ``+1-###-###-####`` pattern with each ``#`` replaced by
    ``fake.bothify``.
    """
    # The name is generated before the phone number so the seeded Faker
    # instance is consumed in the same order as before.
    contact_name = fake.name()
    contact_phone = fake.bothify(text='+1-###-###-####')
    # Double quotes on the outside: reusing the enclosing quote character
    # inside an f-string replacement field is a SyntaxError before Python 3.12.
    return f"{contact_name}\n\n{contact_phone}"

def fake_age_within_range(age_range):
    """Draw a random integer age inside the bucket named by ``age_range``.

    Both bucket ends are inclusive. Returns ``None`` when ``age_range`` is not
    one of the known labels (for example a missing value).
    """
    # Labels as reported by df['Age Range'].unique():
    # ['41-50', '21-30', '0-10', '65+', '51-64', '11_20', '31-40', nan]
    # Note that '11_20' is listed with an underscore, unlike the other labels.
    buckets = (
        ('0-10', 0, 10),
        ('11_20', 11, 20),
        ('21-30', 21, 30),
        ('31-40', 31, 40),
        ('41-50', 41, 50),
        ('51-64', 51, 64),
        ('65+', 65, 100),  # open-ended bucket; 100 is an assumed maximum age
    )
    for label, youngest, oldest in buckets:
        if age_range == label:
            return random.randint(youngest, oldest)
    return None

def fake_consulting_physicians():
    """Return one physician name from a fixed roster of six, via ``random.choice``."""
    roster = [
        'Dr. Alexandria Gaines',
        'Dr. Eddie Young',
        'Dr. James Barber',
        'Dr. Jerry Daniels',
        'Dr. Michelle Lamb',
        'Dr. Shelly Hunt',
    ]
    return random.choice(roster)

In [287]:
# Add the synthetic patient columns to df, one generated value per row.
# NOTE: the statement order matters. Every column draws from the `fake`
# instance and/or the `random` module in sequence, so reordering these lines
# changes which values each column receives.

# These two columns are derived from existing columns (gender / age bucket).
df['Patient Name'] = df['Patient Gender'].apply(fake_name)
df['Patient Age'] = df['Age Range'].apply(fake_age_within_range)
# The remaining columns ignore the row content: `lambda _` is only used to call
# the generator once per row.
df['Patient Phone'] = df.apply(lambda _: fake.bothify(text='+1-###-###-####'), axis=1)
df['Patient Address'] = df.apply(lambda _: fake.address(), axis=1)
df['Patient Blood Type'] = df.apply(lambda _: fake_blood_type(), axis=1)
df['Patient SSN'] = df.apply(lambda _: fake.ssn(), axis=1)
df['Patient Insurance Provider'] = df.apply(lambda _: fake_insurance_provider(), axis=1)
# bothify pattern: '?' becomes a letter from `letters`, '#' becomes a digit.
df['Patient Insurance Number'] = df.apply(lambda _: fake.bothify(text='?#??##?#?##?', letters='ABCDEFGHIJKLMNOPQRSTUVWXYZ'), axis=1)
df['Patient Emergency Contact'] = df.apply(lambda _: fake_emergency_contact(), axis=1)
df['Patient Occupation'] = df.apply(lambda _: fake.job(), axis=1)
df['Consulting Physician'] = df.apply(lambda _: fake_consulting_physicians(), axis=1)

## Rename Columns

In [288]:
# Map every current column header to its snake_case replacement.
# Most names are a plain lowercase/underscore transliteration; 'Type' and
# 'Age Range' additionally gain a 'treatment_' / 'patient_' prefix.
rename_dict = dict([
    # Columns that came from the source CSV
    ('Reference ID', 'reference_id'),
    ('Report Year', 'report_year'),
    ('Diagnosis Category', 'diagnosis_category'),
    ('Diagnosis Sub Category', 'diagnosis_sub_category'),
    ('Treatment Category', 'treatment_category'),
    ('Treatment Sub Category', 'treatment_sub_category'),
    ('Determination', 'determination'),
    ('Type', 'treatment_type'),
    ('Age Range', 'patient_age_range'),
    ('Patient Gender', 'patient_gender'),
    ('Findings', 'findings'),
    # Synthetic columns generated in this notebook
    ('Patient Name', 'patient_name'),
    ('Patient Age', 'patient_age'),
    ('Patient Phone', 'patient_phone'),
    ('Patient Address', 'patient_address'),
    ('Patient Blood Type', 'patient_blood_type'),
    ('Patient SSN', 'patient_ssn'),
    ('Patient Insurance Provider', 'patient_insurance_provider'),
    ('Patient Insurance Number', 'patient_insurance_number'),
    ('Patient Emergency Contact', 'patient_emergency_contact'),
    ('Patient Occupation', 'patient_occupation'),
    ('Consulting Physician', 'consulting_physician'),
])

# Apply the mapping to df itself (in place), then display the renamed frame.
df.rename(columns=rename_dict, inplace=True)
df

Unnamed: 0,reference_id,report_year,diagnosis_category,diagnosis_sub_category,treatment_category,treatment_sub_category,determination,treatment_type,patient_age_range,patient_gender,...,patient_age,patient_phone,patient_address,patient_blood_type,patient_ssn,patient_insurance_provider,patient_insurance_number,patient_emergency_contact,patient_occupation,consulting_physician
0,MN16-22639,2016,Infectious,Hepatitis,Pharmacy/Prescription Drugs,Anti-virals,Overturned Decision of Health Plan,Medical Necessity,41-50,Male,...,42.0,+1-935-919-4421,"854 Mullins Hill Suite 006\nEast Chadland, NY ...",A-,536-99-6044,Kaiser Foundation,J0AK42S6G80E,Madison Santana\n\n+1-811-740-6445,Chartered loss adjuster,Dr. Jerry Daniels
1,MN16-22638,2016,Mental,Eating Disorder,Mental Health Treatment,Residential Treatment Center - Admission,Upheld Decision of Health Plan,Medical Necessity,21-30,Female,...,24.0,+1-903-435-4762,"89616 Walker Rest Apt. 322\nNorth Priscilla, I...",B+,790-58-4187,Highmark,C6ZI48W2C52H,Sandra Hicks\n\n+1-884-456-3480,Curator,Dr. Eddie Young
2,MN16-22637,2016,Autism Spectrum,Autism-PDD-NOS,Autism Related Treatment,Speech Therapy,Upheld Decision of Health Plan,Medical Necessity,0-10,Female,...,4.0,+1-987-485-6213,"3979 Werner Ports\nDixonberg, NY 79155",A-,245-67-0156,Anthem Inc.,Z0MD30C9B77O,Edward Hall\n\n+1-231-421-2973,Community arts worker,Dr. Michelle Lamb
3,EI16-22636,2016,Prevention/Good Health,,"Diagnostic Imaging, Screening and Testing",Mammography,Overturned Decision of Health Plan,Experimental/Investigational,65+,Female,...,75.0,+1-161-237-5048,"028 Kelly Forest Apt. 505\nEast Christine, DC ...",O+,234-07-1435,Cigna Health,M3YF76J3F57S,Richard Reilly\n\n+1-854-455-5639,Diagnostic radiographer,Dr. Eddie Young
4,EI06-5319,2006,Cardiac/Circulatory,,Cardio Vascular,,Upheld Decision of Health Plan,Experimental/Investigational,51-64,Male,...,60.0,+1-336-511-2487,"35460 Kimberly Ranch\nJohnville, AL 32011",O+,374-70-5150,HCSC (Health Care Service Corporation),Q0OJ75S8Z35D,Andrew Bailey MD\n\n+1-897-601-7345,Educational psychologist,Dr. Shelly Hunt
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19240,MN01-7,2001,Trauma/Injuries,Gunshot Wound,Neurosugery,Cranioplasty,Overturned Decision of Health Plan,Medical Necessity,,,...,,+1-394-880-1197,"875 Misty Terrace\nAmyfort, CO 79231",O-,614-48-4940,Kaiser Foundation,Z6OW60C3J59N,Stephanie Berger\n\n+1-664-209-8786,Publishing rights manager,Dr. Jerry Daniels
19241,MN01-6,2001,Infectious,Onychomycosis/ Nail Fungus,Pharmacy/Prescription Drugs,Anti-Fungal,Upheld Decision of Health Plan,Medical Necessity,,,...,,+1-386-867-7901,"97158 Bush Fall Apt. 895\nFloreston, RI 14489",O+,738-95-4536,Kaiser Foundation,U1DU58X1C35I,Felicia Wagner\n\n+1-654-390-3043,"Psychologist, clinical",Dr. Alexandria Gaines
19242,MN01-5,2001,Orthopedic/ Musculoskeletal,Other,Orthopedic,Arthroscopy,Upheld Decision of Health Plan,Medical Necessity,,,...,,+1-634-671-0883,"56361 Ryan Lock\nLarsonside, MS 45295",O-,459-54-9909,Anthem Inc.,O5IL61E4R50K,Valerie Boyle\n\n+1-399-412-6569,Estate agent,Dr. Alexandria Gaines
19243,MN01-4,2001,Orthopedic/ Musculoskeletal,Back Pain,Reconstructive/Plastic Surgery,Breast Reduction,Overturned Decision of Health Plan,Medical Necessity,,,...,,+1-305-612-9521,"1676 Marc Crossroad\nSantostown, DC 89113",AB-,588-98-4283,Anthem Inc.,N4TU52P0O89K,Crystal Ayers\n\n+1-839-248-2666,Writer,Dr. Michelle Lamb


## Assign AIP & PIP

In [289]:
# Purpose hierarchy behind the AIP/PIP bit masks, keyed by purpose name.
# Each row below is (name, id, parent_id, code, aip_code, pip_code):
#   * parent_id - id of the parent purpose (None for the root)
#   * code      - the purpose's own bit; every purpose has a distinct power of two
#   * aip_code  - in this table: own code OR-ed with the codes of all descendants
#   * pip_code  - in this table: aip_code OR-ed with the codes of all ancestors
purposes = {
    name: {
        'id': purpose_id,
        'parent_id': parent_id,
        'code': code,
        'aip_code': aip_code,
        'pip_code': pip_code,
    }
    for name, purpose_id, parent_id, code, aip_code, pip_code in [
        ('General-Purpose',         1,  None, 1,    8191, 8191),
        ('Clinical-Care',           2,  1,    2,    2,    3),
        ('Research',                3,  1,    4,    124,  125),
        ('Public-Research',         4,  3,    8,    56,   61),
        ('Military-Research',       5,  4,    16,   16,   29),
        ('Non-Military-Research',   6,  4,    32,   32,   45),
        ('Private-Research',        7,  3,    64,   64,   69),
        ('Patient-Support-Service', 8,  1,    128,  896,  897),
        ('Billing',                 9,  8,    256,  256,  385),
        ('Communication',           10, 8,    512,  512,  641),
        ('Third-Party',             11, 1,    1024, 7168, 7169),
        ('Marketing',               12, 11,   2048, 2048, 3073),
        ('Product-Development',     13, 11,   4096, 4096, 5121),
    ]
}

In [290]:
# Display the column labels after renaming; the metadata columns built below
# are named after these.
df.columns

Index(['reference_id', 'report_year', 'diagnosis_category',
       'diagnosis_sub_category', 'treatment_category',
       'treatment_sub_category', 'determination', 'treatment_type',
       'patient_age_range', 'patient_gender', 'findings', 'patient_name',
       'patient_age', 'patient_phone', 'patient_address', 'patient_blood_type',
       'patient_ssn', 'patient_insurance_provider', 'patient_insurance_number',
       'patient_emergency_contact', 'patient_occupation',
       'consulting_physician'],
      dtype='object')

In [291]:
def encode(codes: list):
    """Combine integer codes into a single bit mask with bitwise OR.

    Returns 0 for an empty list.
    """
    mask = 0
    for flag in codes:
        mask = mask | flag
    return mask

# Pools of optional purpose codes that fake_aip / fake_pip sample from by
# default. Both pools cover the same eight purposes, in the same order; they
# differ only in which mask ('aip_code' vs 'pip_code') is read from `purposes`.
aip_purpose_codes_optional = [
    purposes.get(name).get('aip_code')
    for name in ('Research', 'Public-Research', 'Military-Research',
                 'Non-Military-Research', 'Private-Research',
                 'Third-Party', 'Marketing', 'Product-Development')
]

pip_purpose_codes_optional = [
    purposes.get(name).get('pip_code')
    for name in ('Research', 'Public-Research', 'Military-Research',
                 'Non-Military-Research', 'Private-Research',
                 'Third-Party', 'Marketing', 'Product-Development')
]

def fake_aip(required: list[str], optional: list[int] = aip_purpose_codes_optional):
    """Build a random AIP bit mask.

    Args:
        required: purpose names; each is looked up in ``purposes`` and its
            ``'aip_code'`` is always included in the result.
        optional: pool of integer purpose codes (not names). A random subset
            of it, of any size from 0 to ``len(optional)``, is included.

    Returns:
        The bitwise OR of all selected codes, as an int.
    """
    purpose_codes = [purposes.get(name).get('aip_code') for name in required]
    # Draw the subset size first, then the subset, in that order, so the
    # `random` generator is consumed exactly as before.
    how_many = random.randint(0, len(optional))
    purpose_codes.extend(random.sample(optional, how_many))

    return encode(purpose_codes)

def fake_pip(optional: list[int] = pip_purpose_codes_optional):
    """Build a random PIP bit mask.

    Args:
        optional: pool of integer purpose codes (not names). A random subset
            of it, of any size from 0 to ``len(optional)``, is selected.

    Returns:
        The bitwise OR of the selected codes as an int; 0 when the subset is
        empty.
    """
    # Subset size is drawn before the subset itself, matching the order in
    # which the `random` generator was consumed originally.
    how_many = random.randint(0, len(optional))

    return encode(random.sample(optional, how_many))

def add_aip(code: int, add_purposes: list[str]):
    """Return ``code`` with the ``'aip_code'`` of each named purpose OR-ed in.

    Args:
        code: the starting bit mask.
        add_purposes: purpose names to look up in ``purposes``.
    """
    mask = code
    for name in add_purposes:
        entry = purposes.get(name)
        mask |= entry.get('aip_code')
    return mask

def add_pip(code: int, add_purposes: list[str]):
    """Return ``code`` with the ``'pip_code'`` of each named purpose OR-ed in.

    Args:
        code: the starting PIP bit mask.
        add_purposes: purpose names to look up in ``purposes``.

    Returns:
        The combined bit mask as an int.
    """
    result = code
    for p in add_purposes:
        # Must read 'pip_code' here; reading 'aip_code' (as add_aip does)
        # would leave out the ancestor bits that a PIP mask carries.
        result |= purposes.get(p).get('pip_code')
    return result
    


## Create Metadata df

In [292]:
# Start the metadata frame as an independent one-column copy of the record
# ids, so columns added to df_meta never touch df.
df_meta = df['reference_id'].copy().to_frame()
df_meta

Unnamed: 0,reference_id
0,MN16-22639
1,MN16-22638
2,MN16-22637
3,EI16-22636
4,EI06-5319
...,...
19240,MN01-7
19241,MN01-6
19242,MN01-5
19243,MN01-4


In [293]:
# Build one `<column>_aip` / `<column>_pip` pair of bit masks for every data
# column. Three patterns are used:
#   * base masks     - drawn randomly per row for diagnosis_category;
#   * inherited      - a copy of the base mask, optionally with extra purposes
#                      OR-ed in via add_aip / add_pip;
#   * fixed          - the same constant for every row (AIP of one purpose,
#                      PIP of 0).
# The first two statements are the only ones that consume random numbers, so
# their order must not change.
df_meta['diagnosis_category_aip'] = df_meta.apply(lambda _: fake_aip(['Clinical-Care']), axis=1)
df_meta['diagnosis_category_pip'] = df_meta.apply(lambda _: fake_pip(), axis=1)

df_meta['diagnosis_sub_category_aip'] = df_meta['diagnosis_category_aip']
df_meta['diagnosis_sub_category_pip'] = df_meta['diagnosis_category_pip']

df_meta['treatment_category_aip'] = df_meta['diagnosis_category_aip'].apply(lambda aip_code: add_aip(aip_code, ['Billing']))
df_meta['treatment_category_pip'] = df_meta['diagnosis_category_pip']

df_meta['treatment_sub_category_aip'] = df_meta['diagnosis_category_aip']
df_meta['treatment_sub_category_pip'] = df_meta['diagnosis_category_pip']

df_meta['determination_aip'] = df_meta['diagnosis_category_aip']
df_meta['determination_pip'] = df_meta['diagnosis_category_pip']

df_meta['treatment_type_aip'] = df_meta['diagnosis_category_aip']
df_meta['treatment_type_pip'] = df_meta['diagnosis_category_pip']

df_meta['patient_gender_aip'] = df_meta['diagnosis_category_aip'].apply(lambda aip_code: add_aip(aip_code, ['Patient-Support-Service']))
df_meta['patient_gender_pip'] = df_meta['diagnosis_category_pip']

# The only column whose PIP mask is extended rather than copied.
df_meta['patient_age_range_aip'] = df_meta['diagnosis_category_aip']
df_meta['patient_age_range_pip'] = df_meta['diagnosis_category_pip'].apply(lambda pip_code: add_pip(pip_code, ['Clinical-Care']))

df_meta['findings_aip'] = df_meta['diagnosis_category_aip']
df_meta['findings_pip'] = df_meta['diagnosis_category_pip']

df_meta['patient_name_aip'] = df_meta['diagnosis_category_aip'].apply(lambda aip_code: add_aip(aip_code, ['Patient-Support-Service']))
df_meta['patient_name_pip'] = df_meta['diagnosis_category_pip']

df_meta['patient_age_aip'] = df_meta['diagnosis_category_aip'].apply(lambda aip_code: add_aip(aip_code, ['Patient-Support-Service']))
df_meta['patient_age_pip'] = df_meta['diagnosis_category_pip']

df_meta['patient_phone_aip'] = df_meta['diagnosis_category_aip'].apply(lambda aip_code: add_aip(aip_code, ['Patient-Support-Service']))
# Inherit the base PIP mask like every other inherited column; this previously
# copied the base AIP mask by mistake.
df_meta['patient_phone_pip'] = df_meta['diagnosis_category_pip']

df_meta['patient_address_aip'] = df_meta['diagnosis_category_aip'].apply(lambda aip_code: add_aip(aip_code, ['Patient-Support-Service']))
df_meta['patient_address_pip'] = df_meta['diagnosis_category_pip']

df_meta['patient_blood_type_aip'] = df_meta['diagnosis_category_aip']
df_meta['patient_blood_type_pip'] = df_meta['diagnosis_category_pip']

# Fixed masks: identical for every row.
df_meta['patient_ssn_aip'] = encode([purposes.get('Billing').get('aip_code')])
df_meta['patient_ssn_pip'] = 0

df_meta['patient_insurance_provider_aip'] = df_meta['diagnosis_category_aip'].apply(lambda aip_code: add_aip(aip_code, ['Billing']))
df_meta['patient_insurance_provider_pip'] = df_meta['diagnosis_category_pip']

df_meta['patient_insurance_number_aip'] = encode([purposes.get('Billing').get('aip_code')])
df_meta['patient_insurance_number_pip'] = 0

df_meta['patient_emergency_contact_aip'] = encode([purposes.get('Communication').get('aip_code')])
df_meta['patient_emergency_contact_pip'] = 0

df_meta['patient_occupation_aip'] = df_meta['diagnosis_category_aip'].apply(lambda aip_code: add_aip(aip_code, ['Billing']))
df_meta['patient_occupation_pip'] = df_meta['diagnosis_category_pip']

df_meta['consulting_physician_aip'] = encode([purposes.get('Clinical-Care').get('aip_code')])
df_meta['consulting_physician_pip'] = 0


## Export

In [294]:
# Write the augmented dataset to CSV without the row index, then display df
# as the cell output.
df.to_csv('california_fake_data.csv', index=False)
df

Unnamed: 0,reference_id,report_year,diagnosis_category,diagnosis_sub_category,treatment_category,treatment_sub_category,determination,treatment_type,patient_age_range,patient_gender,...,patient_age,patient_phone,patient_address,patient_blood_type,patient_ssn,patient_insurance_provider,patient_insurance_number,patient_emergency_contact,patient_occupation,consulting_physician
0,MN16-22639,2016,Infectious,Hepatitis,Pharmacy/Prescription Drugs,Anti-virals,Overturned Decision of Health Plan,Medical Necessity,41-50,Male,...,42.0,+1-935-919-4421,"854 Mullins Hill Suite 006\nEast Chadland, NY ...",A-,536-99-6044,Kaiser Foundation,J0AK42S6G80E,Madison Santana\n\n+1-811-740-6445,Chartered loss adjuster,Dr. Jerry Daniels
1,MN16-22638,2016,Mental,Eating Disorder,Mental Health Treatment,Residential Treatment Center - Admission,Upheld Decision of Health Plan,Medical Necessity,21-30,Female,...,24.0,+1-903-435-4762,"89616 Walker Rest Apt. 322\nNorth Priscilla, I...",B+,790-58-4187,Highmark,C6ZI48W2C52H,Sandra Hicks\n\n+1-884-456-3480,Curator,Dr. Eddie Young
2,MN16-22637,2016,Autism Spectrum,Autism-PDD-NOS,Autism Related Treatment,Speech Therapy,Upheld Decision of Health Plan,Medical Necessity,0-10,Female,...,4.0,+1-987-485-6213,"3979 Werner Ports\nDixonberg, NY 79155",A-,245-67-0156,Anthem Inc.,Z0MD30C9B77O,Edward Hall\n\n+1-231-421-2973,Community arts worker,Dr. Michelle Lamb
3,EI16-22636,2016,Prevention/Good Health,,"Diagnostic Imaging, Screening and Testing",Mammography,Overturned Decision of Health Plan,Experimental/Investigational,65+,Female,...,75.0,+1-161-237-5048,"028 Kelly Forest Apt. 505\nEast Christine, DC ...",O+,234-07-1435,Cigna Health,M3YF76J3F57S,Richard Reilly\n\n+1-854-455-5639,Diagnostic radiographer,Dr. Eddie Young
4,EI06-5319,2006,Cardiac/Circulatory,,Cardio Vascular,,Upheld Decision of Health Plan,Experimental/Investigational,51-64,Male,...,60.0,+1-336-511-2487,"35460 Kimberly Ranch\nJohnville, AL 32011",O+,374-70-5150,HCSC (Health Care Service Corporation),Q0OJ75S8Z35D,Andrew Bailey MD\n\n+1-897-601-7345,Educational psychologist,Dr. Shelly Hunt
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19240,MN01-7,2001,Trauma/Injuries,Gunshot Wound,Neurosugery,Cranioplasty,Overturned Decision of Health Plan,Medical Necessity,,,...,,+1-394-880-1197,"875 Misty Terrace\nAmyfort, CO 79231",O-,614-48-4940,Kaiser Foundation,Z6OW60C3J59N,Stephanie Berger\n\n+1-664-209-8786,Publishing rights manager,Dr. Jerry Daniels
19241,MN01-6,2001,Infectious,Onychomycosis/ Nail Fungus,Pharmacy/Prescription Drugs,Anti-Fungal,Upheld Decision of Health Plan,Medical Necessity,,,...,,+1-386-867-7901,"97158 Bush Fall Apt. 895\nFloreston, RI 14489",O+,738-95-4536,Kaiser Foundation,U1DU58X1C35I,Felicia Wagner\n\n+1-654-390-3043,"Psychologist, clinical",Dr. Alexandria Gaines
19242,MN01-5,2001,Orthopedic/ Musculoskeletal,Other,Orthopedic,Arthroscopy,Upheld Decision of Health Plan,Medical Necessity,,,...,,+1-634-671-0883,"56361 Ryan Lock\nLarsonside, MS 45295",O-,459-54-9909,Anthem Inc.,O5IL61E4R50K,Valerie Boyle\n\n+1-399-412-6569,Estate agent,Dr. Alexandria Gaines
19243,MN01-4,2001,Orthopedic/ Musculoskeletal,Back Pain,Reconstructive/Plastic Surgery,Breast Reduction,Overturned Decision of Health Plan,Medical Necessity,,,...,,+1-305-612-9521,"1676 Marc Crossroad\nSantostown, DC 89113",AB-,588-98-4283,Anthem Inc.,N4TU52P0O89K,Crystal Ayers\n\n+1-839-248-2666,Writer,Dr. Michelle Lamb


In [295]:
# Write the per-column AIP/PIP masks to CSV without the row index, then
# display df_meta as the cell output.
df_meta.to_csv('california_fake_metadata.csv', index=False)
df_meta

Unnamed: 0,reference_id,diagnosis_category_aip,diagnosis_category_pip,diagnosis_sub_category_aip,diagnosis_sub_category_pip,treatment_category_aip,treatment_category_pip,treatment_sub_category_aip,treatment_sub_category_pip,determination_aip,...,patient_insurance_provider_aip,patient_insurance_provider_pip,patient_insurance_number_aip,patient_insurance_number_pip,patient_emergency_contact_aip,patient_emergency_contact_pip,patient_occupation_aip,patient_occupation_pip,consulting_physician_aip,consulting_physician_pip
0,MN16-22639,7294,7293,7294,7293,7550,7293,7294,7293,7294,...,7550,7293,256,0,512,0,7550,7293,2,0
1,MN16-22638,2050,7293,2050,7293,2306,7293,2050,7293,2050,...,2306,7293,256,0,512,0,2306,7293,2,0
2,MN16-22637,7290,7293,7290,7293,7546,7293,7290,7293,7290,...,7546,7293,256,0,512,0,7546,7293,2,0
3,EI16-22636,126,7293,126,7293,382,7293,126,7293,126,...,382,7293,256,0,512,0,382,7293,2,0
4,EI06-5319,7294,5245,7294,5245,7550,5245,7294,5245,7294,...,7550,5245,256,0,512,0,7550,5245,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19240,MN01-7,7294,0,7294,0,7550,0,7294,0,7294,...,7550,0,256,0,512,0,7550,0,2,0
19241,MN01-6,2,7293,2,7293,258,7293,2,7293,2,...,258,7293,256,0,512,0,258,7293,2,0
19242,MN01-5,2098,125,2098,125,2354,125,2098,125,2098,...,2354,125,256,0,512,0,2354,125,2,0
19243,MN01-4,7294,7293,7294,7293,7550,7293,7294,7293,7294,...,7550,7293,256,0,512,0,7550,7293,2,0


In [3]:
import pandas as pd

# Reload the exported dataset and report, for every column, how many distinct
# values it holds (a missing value counts as one distinct value).
df = pd.read_csv("california_fake_data.csv")
for column, values in df.items():
    print(column, len(values.unique()))

reference_id 19245
report_year 16
diagnosis_category 30
diagnosis_sub_category 279
treatment_category 32
treatment_sub_category 284
determination 2
treatment_type 3
patient_age_range 8
patient_gender 3
findings 19199
patient_name 17285
patient_age 102
patient_phone 19245
patient_address 19245
patient_blood_type 8
patient_ssn 19244
patient_insurance_provider 16
patient_insurance_number 19245
patient_emergency_contact 19245
patient_occupation 639
consulting_physician 6


patient_gender
Female    10400
Male       7635
Name: count, dtype: int64

In [5]:
# List the actual values of every low-cardinality column (fewer than 33
# distinct values, missing value included).
for column, values in df.items():
    distinct = values.unique()
    if len(distinct) < 33:
        print(column, distinct, '\n')

report_year [2016 2006 2015 2014 2010 2005 2004 2009 2008 2007 2001 2013 2012 2002
 2003 2011] 

diagnosis_category ['Infectious' 'Mental' 'Autism Spectrum' 'Prevention/Good Health'
 'Cardiac/Circulatory' 'OB-Gyn/ Pregnancy'
 'Digestive System/ Gastrointestinal' 'Orthopedic/ Musculoskeletal'
 'Central Nervous System/ Neuromuscular' 'Endocrine/ Metabolic'
 'Pediatrics' 'Chronic Pain' 'Respiratory System' 'Cancer'
 'Morbid Obesity' 'Ears, Nose, Throat' 'Post Surgical Complication'
 'Immunologic' 'Skin' 'Not Applicable' 'Foot' 'Dental' 'Blood Related'
 'Genetic' 'Genitourinary/ Kidney' 'Vision' 'Trauma/Injuries' nan
 'Organ Failure' 'Alcohol and Drug Addiction'] 

treatment_category ['Pharmacy/Prescription Drugs' 'Mental Health Treatment'
 'Autism Related Treatment' 'Diagnostic Imaging, Screening and Testing'
 'Cardio Vascular' 'Durable Medical Equipment'
 'Diagnostic/Physician Evaluation' 'Orthopedic' 'Emergency/Urgent Care'
 'General Surgery' 'Acute Medical Services - Outpatient' 'Not A