In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply matplotlib's 'seaborn-v0_8-darkgrid' style sheet so that every chart
# drawn later in the notebook shares the same look.
plt.style.use('seaborn-v0_8-darkgrid')

# IPython magic: render matplotlib figures inline, below the cell that draws them.
%matplotlib inline

In [2]:
# Load the four raw CSV extracts into DataFrames.
# Every read passes low_memory=False so pandas infers each column's dtype from
# the whole file instead of chunk by chunk.
# NOTE(review): px.csv is read with the default encoding while the other three
# use 'unicode_escape' -- confirm this difference is intentional.
escaped_csv_options = dict(low_memory=False, encoding='unicode_escape')

px_df = pd.read_csv('px.csv', low_memory=False)
doctors_df = pd.read_csv('doctors.csv', **escaped_csv_options)
clinics_df = pd.read_csv('clinics.csv', **escaped_csv_options)
appointments_df = pd.read_csv('appointments.csv', **escaped_csv_options)

In [3]:
# Print the structural summary (index range, columns, dtypes, memory usage)
# of each loaded table, in the same order the files were read.
for loaded_frame in (px_df, doctors_df, clinics_df, appointments_df):
    loaded_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6507813 entries, 0 to 6507812
Data columns (total 3 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   pxid    object
 1   age     object
 2   gender  object
dtypes: object(3)
memory usage: 149.0+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60024 entries, 0 to 60023
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   doctorid       60024 non-null  object 
 1   mainspecialty  27055 non-null  object 
 2   age            20028 non-null  float64
dtypes: float64(1), object(2)
memory usage: 1.4+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53962 entries, 0 to 53961
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   clinicid      53962 non-null  object
 1   hospitalname  17538 non-null  object
 2   IsHospital    53962 non-null  bool  
 3   City          53962 non-null  object
 4   P

## doctors dataset
1. `doctorid`: unique identifiers for doctors
2. `mainspecialty`: main specialty of the doctors
3. `age`: age of the doctors

In [4]:
# Summarise the doctors table on its own (columns, non-null counts, dtypes)
# as the starting point for this section.
doctors_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60024 entries, 0 to 60023
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   doctorid       60024 non-null  object 
 1   mainspecialty  27055 non-null  object 
 2   age            20028 non-null  float64
dtypes: float64(1), object(2)
memory usage: 1.4+ MB


### `doctorid` data cleaning

In [6]:
# Display the raw `doctorid` column to eyeball the format of the identifiers.
doctors_df['doctorid']

0        AD61AB143223EFBC24C7D2583BE69251
1        D09BF41544A3365A46C9077EBB5E35C3
2        FBD7939D674997CDB4692D34DE8633C4
3        28DD2C7955CE926456240B2FF0100BDE
4        35F4A8D465E6E1EDC05F3D8AB658C551
                       ...               
60019    CD532DBEF6547A66D2138FAB49AA3B94
60020    4473D870B5E31FAA40D2C45E1FF6DC27
60021    A4F554EB2C0934E7FDE2511E8C1573BA
60022    E540A361D93D37A33BB2F55D43DA79D9
60023    23BA85862DD19C3550E7C0F0AF84C7ED
Name: doctorid, Length: 60024, dtype: object

In [10]:
# Run four read-only sanity checks on `doctorid` and print one count per check.
# doctors_df itself is not modified by this cell.
doctorid_values = doctors_df['doctorid']

# step 1: check for duplicates
# nunique() counts distinct non-null ids; duplicates exist when this number is
# smaller than the number of rows in doctors_df.
unique_doctorid_count = doctorid_values.nunique()

# step 2: check for missing values
missing_doctorid = doctorid_values.isnull().sum()

# step 3: check if values are in capital letters
# Count every id that is not already fully upper-case by comparing it with its
# upper-cased form. `.str.islower()` is True only when *all* cased characters
# are lower-case, so it would silently skip mixed-case ids such as 'Ad61...'.
# Comparing against `.str.upper()` catches any lower-case letter and does not
# flag ids made only of digits. Missing values are excluded here because they
# are already counted in step 2.
present_doctorid_text = doctorid_values.dropna().astype(str)
lowercase_doctorid = present_doctorid_text.ne(present_doctorid_text.str.upper()).sum()

# step 4: check if datatype is consistent
# Look at the Python type of each element; anything that is not `str` counts.
nonstring_doctorid = doctorid_values.apply(type).ne(str).sum()

print("no. of unique doctor ids:", unique_doctorid_count)
print("no. of missing values:", missing_doctorid)
# Label kept as-is; the count now covers ids containing any lower-case letter.
print("no. of lowercase values:", lowercase_doctorid)
print("no. of non-string values:", nonstring_doctorid)


no. of unique doctor ids: 60024
no. of missing values: 0
no. of lowercase values: 0
no. of non-string values: 0
