In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# sets the theme of the charts
plt.style.use('seaborn-v0_8-darkgrid')

%matplotlib inline

In [11]:
# imports the csv files
px_df = pd.read_csv('px.csv', low_memory=False)
doctors_df = pd.read_csv('doctors.csv', low_memory=False, encoding='unicode_escape')
clinics_df = pd.read_csv('clinics.csv', low_memory=False, encoding='unicode_escape')
appointments_df = pd.read_csv('appointments.csv', low_memory=False, encoding='unicode_escape')

In [12]:
# checks the content of the csv files
px_df.info()
doctors_df.info()
clinics_df.info()
appointments_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6507813 entries, 0 to 6507812
Data columns (total 3 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   pxid    object
 1   age     object
 2   gender  object
dtypes: object(3)
memory usage: 149.0+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60024 entries, 0 to 60023
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   doctorid       60024 non-null  object 
 1   mainspecialty  27055 non-null  object 
 2   age            20028 non-null  float64
dtypes: float64(1), object(2)
memory usage: 1.4+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53962 entries, 0 to 53961
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   clinicid      53962 non-null  object
 1   hospitalname  17538 non-null  object
 2   IsHospital    53962 non-null  bool  
 3   City          53962 non-null  object
 4   P

## doctors dataset
1. `doctorid`: unique identifiers for doctors
2. `mainspecialty`: main specialty of the doctors
3. `age`: age of the doctors

In [13]:
doctors_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60024 entries, 0 to 60023
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   doctorid       60024 non-null  object 
 1   mainspecialty  27055 non-null  object 
 2   age            20028 non-null  float64
dtypes: float64(1), object(2)
memory usage: 1.4+ MB


`doctorid` data cleaning

In [14]:
doctors_df['doctorid']

0        AD61AB143223EFBC24C7D2583BE69251
1        D09BF41544A3365A46C9077EBB5E35C3
2        FBD7939D674997CDB4692D34DE8633C4
3        28DD2C7955CE926456240B2FF0100BDE
4        35F4A8D465E6E1EDC05F3D8AB658C551
                       ...               
60019    CD532DBEF6547A66D2138FAB49AA3B94
60020    4473D870B5E31FAA40D2C45E1FF6DC27
60021    A4F554EB2C0934E7FDE2511E8C1573BA
60022    E540A361D93D37A33BB2F55D43DA79D9
60023    23BA85862DD19C3550E7C0F0AF84C7ED
Name: doctorid, Length: 60024, dtype: object

In [15]:
# step 1: check for duplicates
unique_doctorid_count = doctors_df['doctorid'].nunique()

# step 2: check for missing values
missing_doctorid = doctors_df['doctorid'].isnull().sum()

# step 3: check if values are in capital letters
lowercase_doctorid = doctors_df['doctorid'].str.islower().sum()

# step 4: check if datatype is consistent
nonstring_doctorid = doctors_df['doctorid'].apply(type).ne(str).sum()

print("no. of unique doctor ids:", unique_doctorid_count)
print("no. of missing values:", missing_doctorid)
print("no. of lowercase values:",  lowercase_doctorid)
print("no. of non-string values:", nonstring_doctorid)


no. of unique doctor ids: 60024
no. of missing values: 0
no. of lowercase values: 0
no. of non-string values: 0


`mainspecialty` data cleaning

In [16]:
doctors_df['mainspecialty']

0            General Medicine
1             Family Medicine
2           Vascular Medicine
3           Otolaryngologists
4           General Dentistry
                 ...         
60019    General Practitioner
60020                     NaN
60021                     NaN
60022    General Practitioner
60023       Internal Medicine
Name: mainspecialty, Length: 60024, dtype: object

In [17]:
# check for unique values
unique_specialties = doctors_df['mainspecialty'].unique()
for specialty in unique_specialties:
    print(specialty)

General Medicine
Family Medicine
Vascular Medicine
Otolaryngologists
General Dentistry
Orthopedic
Acupunturist1
Orthopaedic Sports Medicine
Masters of Science in Preventive & Regenerative Medicine
Anti Aging and Regenerative Medicine
Surgery
Internal Medicine
Otolaryngology
Pediatrics
Ophthalmology
Orthopedic surgery
Orthopedic Surgeon
Internal medicine
General Surgery
Radiology
nan
Anesthesiology
Gastroenterology and Hepatology
General Physician
Obstetrics &Gynecology
HPB Surgery
PEDIATRICS
Orthopedics
Dermatology
Orthodontics
Family medicine
docmark_coli@yahoo.com
s
a
Pedia
Neurosurgery
Otolaryngology Head and Neck Surgery
Software Developer
Family and general dentistry
Otolaryngology-Head & Neck Surgery
Ob gyn
Obstetrics and Gynecology
Internal Medicine - Endocrinology
general pediatrics
internal medicine
Family dentistry
Pediatrician
Urology, Endoscopic Urology, Laparoscopic and Minimally Invasive Urology
Pediatric Dentistry
orthodontist 
Endocrinology
Orthodontist
Neurosurgery 
En

In [18]:
# duplicate the original dataframe
doctors_df_copy = doctors_df

# since fuzzywuzzy is slower the higher the number of rows, we split the dataset into three
split = np.array_split(doctors_df_copy, 3)
split_1, split_2, split_3, = split[0], split[1], split[2]

  return bound(*args, **kwds)


In [19]:
# step 1: define pecialties (sorted according to NowServing filters)
specialties = [
    "Primary Care", "General Medicine", "Emergency Medicine", "Family Medicine", "Urgent Care Medicine", "Internal Medicine",
    "Public Health", "Addiction Medicine", "Vascular Medicine", "Nursing", "Obstetrics And Gynecology (OBGYN)",
    "Reproductive Endocrinology and Infertility", "Ob Sonology", "Transgender Health And Wellness", "Pediatrics",
    "Diabetology", "Endocrinology", "Ophthalmology", "Optometry", "Cardiology", "Aesthetic, Reconstructive & Burn",
    "Dermatology", "Dermatopathology", "Pulmonology", "Respiratory Therapy", "Gastroenterology", "Bariatric Endoscopy",
    "Otolaryngology (ENT)", "Neurotology", "Nephrology", "Urology", "Neurology", "Neurosurgery", "Neuropsychology",
    "General Surgery", "Bariatric Surgery", "Cardiothoracic Surgery", "Colorectal Surgery", "Cosmetic Surgery",
    "Oral Surgery", "Hepato-Pancreatico-Biliary (HPB) Surgery", "Hepatobiliary Surgery", "Maxillofacial Surgery",
    "Oncology - Surgical", "Orthopedic Surgery", "Orthopedic Surgery - Foot & Ankle", "Orthopedics - Interventional",
    "Pancreatic Surgery", "Plastic Surgery", "Transplant Surgery", "Vascular Surgery", "Orthopedic Surgery - Spine",
    "Thoracic Cardiovascular Surgery", "Colorectal Surgery", "Veterinary Surgery",
    "Oncology", "Oncology - Medical", "Oncology - Radiation", "Oncology - Surgical", "Oncology - Trophoblastic Diseases",
    "Oncology - Gynecology", "Psychiatry", "Psychiatry - Child & Adolescent", "Psychiatry - Interventional",
    "Psychology", "Psychotherapy", "Guidance & Counseling", "Allergology & Immunology", "Infectious Disease",
    "Sleep Medicine", "Pain Medicine", "Physiatry", "Physical Medicine & Rehabilitation", "Sports Medicine",
    "Physical Therapy", "Physiotherapy", "Occupational Therapy", "Neonatology", "Perinatology", "Lactation Consultant",
    "Midwifery", "Hematology", "Hepatology", "Foot, Ankle, and Lower Leg", "Podiatry", "Joints, Muscles, and Bones",
    "Rheumatology", "Geriatric Medicine", "Gerontology", "Proctology", "Hearing", "Audiology", "Otology",
    "Speech-Language Pathology", "Hospice And Palliative Care", "Anesthesiology", "Imaging and Radiology", "Poisons and Toxins",
    "Toxicology", "General Dentistry", "Dentistry - Endodontics", "Dentistry - Oral Surgery", "Dentistry - Orthodontics",
    "Dentistry - Pediatrics", "Dentistry - Periodontics", "Dentistry - Prosthodontics", "Alternative Medicine", "Acupuncture",
    "Chiropractic", "Integrative Medicine", "Naturopathy", "Veterinary"
]

# step 2: perform matching via fuzzywuzzy
def match_specialty(value, choices, score_cutoff=80):
    match, score = process.extractOne(str(value), choices)
    if score > score_cutoff:
        return match
    return "NA"

# step 3: copy changes to new column
split_1['mainspecialty_cleaned'] = split_1['mainspecialty'].apply(lambda x: match_specialty(x, specialties))



In [20]:
# step 1: define pecialties (sorted according to NowServing filters)
specialties = [
    "Primary Care", "General Medicine", "Emergency Medicine", "Family Medicine", "Urgent Care Medicine", "Internal Medicine",
    "Public Health", "Addiction Medicine", "Vascular Medicine", "Nursing", "Obstetrics And Gynecology (OBGYN)",
    "Reproductive Endocrinology and Infertility", "Ob Sonology", "Transgender Health And Wellness", "Pediatrics",
    "Diabetology", "Endocrinology", "Ophthalmology", "Optometry", "Cardiology", "Aesthetic, Reconstructive & Burn",
    "Dermatology", "Dermatopathology", "Pulmonology", "Respiratory Therapy", "Gastroenterology", "Bariatric Endoscopy",
    "Otolaryngology (ENT)", "Neurotology", "Nephrology", "Urology", "Neurology", "Neurosurgery", "Neuropsychology",
    "General Surgery", "Bariatric Surgery", "Cardiothoracic Surgery", "Colorectal Surgery", "Cosmetic Surgery",
    "Oral Surgery", "Hepato-Pancreatico-Biliary (HPB) Surgery", "Hepatobiliary Surgery", "Maxillofacial Surgery",
    "Oncology - Surgical", "Orthopedic Surgery", "Orthopedic Surgery - Foot & Ankle", "Orthopedics - Interventional",
    "Pancreatic Surgery", "Plastic Surgery", "Transplant Surgery", "Vascular Surgery", "Orthopedic Surgery - Spine",
    "Thoracic Cardiovascular Surgery", "Colorectal Surgery", "Veterinary Surgery",
    "Oncology", "Oncology - Medical", "Oncology - Radiation", "Oncology - Surgical", "Oncology - Trophoblastic Diseases",
    "Oncology - Gynecology", "Psychiatry", "Psychiatry - Child & Adolescent", "Psychiatry - Interventional",
    "Psychology", "Psychotherapy", "Guidance & Counseling", "Allergology & Immunology", "Infectious Disease",
    "Sleep Medicine", "Pain Medicine", "Physiatry", "Physical Medicine & Rehabilitation", "Sports Medicine",
    "Physical Therapy", "Physiotherapy", "Occupational Therapy", "Neonatology", "Perinatology", "Lactation Consultant",
    "Midwifery", "Hematology", "Hepatology", "Foot, Ankle, and Lower Leg", "Podiatry", "Joints, Muscles, and Bones",
    "Rheumatology", "Geriatric Medicine", "Gerontology", "Proctology", "Hearing", "Audiology", "Otology",
    "Speech-Language Pathology", "Hospice And Palliative Care", "Anesthesiology", "Imaging and Radiology", "Poisons and Toxins",
    "Toxicology", "General Dentistry", "Dentistry - Endodontics", "Dentistry - Oral Surgery", "Dentistry - Orthodontics",
    "Dentistry - Pediatrics", "Dentistry - Periodontics", "Dentistry - Prosthodontics", "Alternative Medicine", "Acupuncture",
    "Chiropractic", "Integrative Medicine", "Naturopathy", "Veterinary"
]

# step 2: perform matching via fuzzywuzzy
def match_specialty(value, choices, score_cutoff=80):
    match, score = process.extractOne(str(value), choices)
    if score > score_cutoff:
        return match
    return "NA"

# step 3: copy changes to new column
split_2['mainspecialty_cleaned'] = split_2['mainspecialty'].apply(lambda x: match_specialty(x, specialties))



In [21]:
# step 1: define pecialties (sorted according to NowServing filters)
specialties = [
    "Primary Care", "General Medicine", "Emergency Medicine", "Family Medicine", "Urgent Care Medicine", "Internal Medicine",
    "Public Health", "Addiction Medicine", "Vascular Medicine", "Nursing", "Obstetrics And Gynecology (OBGYN)",
    "Reproductive Endocrinology and Infertility", "Ob Sonology", "Transgender Health And Wellness", "Pediatrics",
    "Diabetology", "Endocrinology", "Ophthalmology", "Optometry", "Cardiology", "Aesthetic, Reconstructive & Burn",
    "Dermatology", "Dermatopathology", "Pulmonology", "Respiratory Therapy", "Gastroenterology", "Bariatric Endoscopy",
    "Otolaryngology (ENT)", "Neurotology", "Nephrology", "Urology", "Neurology", "Neurosurgery", "Neuropsychology",
    "General Surgery", "Bariatric Surgery", "Cardiothoracic Surgery", "Colorectal Surgery", "Cosmetic Surgery",
    "Oral Surgery", "Hepato-Pancreatico-Biliary (HPB) Surgery", "Hepatobiliary Surgery", "Maxillofacial Surgery",
    "Oncology - Surgical", "Orthopedic Surgery", "Orthopedic Surgery - Foot & Ankle", "Orthopedics - Interventional",
    "Pancreatic Surgery", "Plastic Surgery", "Transplant Surgery", "Vascular Surgery", "Orthopedic Surgery - Spine",
    "Thoracic Cardiovascular Surgery", "Colorectal Surgery", "Veterinary Surgery",
    "Oncology", "Oncology - Medical", "Oncology - Radiation", "Oncology - Surgical", "Oncology - Trophoblastic Diseases",
    "Oncology - Gynecology", "Psychiatry", "Psychiatry - Child & Adolescent", "Psychiatry - Interventional",
    "Psychology", "Psychotherapy", "Guidance & Counseling", "Allergology & Immunology", "Infectious Disease",
    "Sleep Medicine", "Pain Medicine", "Physiatry", "Physical Medicine & Rehabilitation", "Sports Medicine",
    "Physical Therapy", "Physiotherapy", "Occupational Therapy", "Neonatology", "Perinatology", "Lactation Consultant",
    "Midwifery", "Hematology", "Hepatology", "Foot, Ankle, and Lower Leg", "Podiatry", "Joints, Muscles, and Bones",
    "Rheumatology", "Geriatric Medicine", "Gerontology", "Proctology", "Hearing", "Audiology", "Otology",
    "Speech-Language Pathology", "Hospice And Palliative Care", "Anesthesiology", "Imaging and Radiology", "Poisons and Toxins",
    "Toxicology", "General Dentistry", "Dentistry - Endodontics", "Dentistry - Oral Surgery", "Dentistry - Orthodontics",
    "Dentistry - Pediatrics", "Dentistry - Periodontics", "Dentistry - Prosthodontics", "Alternative Medicine", "Acupuncture",
    "Chiropractic", "Integrative Medicine", "Naturopathy", "Veterinary"
]

# step 2: perform matching via fuzzywuzzy
def match_specialty(value, choices, score_cutoff=80):
    match, score = process.extractOne(str(value), choices)
    if score > score_cutoff:
        return match
    return "NA"

# step 3: copy changes to new column
split_3['mainspecialty_cleaned'] = split_3['mainspecialty'].apply(lambda x: match_specialty(x, specialties))



In [25]:
# after cleaning the data per batch, concatenate the splits to a new dataframe
merge_df = pd.concat([split_1, split_2, split_3])

# copy the new row to the mainspecialty row in the dataframe copy
doctors_df_copy['mainspecialty'] = merge_df['mainspecialty_cleaned']

# drop the rows with NA mainspecialty
doctors_df_copy = doctors_df_copy[doctors_df_copy['mainspecialty'] != 'NA']

# check if rows are successfully dropped
print("Before cleaning:", len(doctors_df))
print("After cleaning:", len(doctors_df_copy))

# replace the copy to the original
doctors_df = doctors_df_copy

Before cleaning: 60024
After cleaning: 22508


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  doctors_df_copy['mainspecialty'] = merge_df['mainspecialty_cleaned']
