In [18]:
# Load the dataset
import pandas as pd

# Read the raw appointment records from the CSV in the working directory.
df = pd.read_csv("KaggleV2-May-2016.csv")

# Preview the first rows, then report the frame's (rows, columns) shape.
print("Initial Data:", df.head(), sep="\n")
print("\nShape:", df.shape)


Initial Data:
      PatientId  AppointmentID Gender          ScheduledDay  \
0  2.987250e+13        5642903      F  2016-04-29T18:38:08Z   
1  5.589978e+14        5642503      M  2016-04-29T16:08:27Z   
2  4.262962e+12        5642549      F  2016-04-29T16:19:04Z   
3  8.679512e+11        5642828      F  2016-04-29T17:29:31Z   
4  8.841186e+12        5642494      F  2016-04-29T16:07:23Z   

         AppointmentDay  Age      Neighbourhood  Scholarship  Hipertension  \
0  2016-04-29T00:00:00Z   62    JARDIM DA PENHA            0             1   
1  2016-04-29T00:00:00Z   56    JARDIM DA PENHA            0             0   
2  2016-04-29T00:00:00Z   62      MATA DA PRAIA            0             0   
3  2016-04-29T00:00:00Z    8  PONTAL DE CAMBURI            0             0   
4  2016-04-29T00:00:00Z   56    JARDIM DA PENHA            0             1   

   Diabetes  Alcoholism  Handcap  SMS_received No-show  
0         0           0        0             0      No  
1         0           0 

In [19]:
# Check missing values: count the null entries in each column.
null_counts = df.isnull().sum()
print("\nMissing Values:\n", null_counts)



Missing Values:
 PatientId         0
AppointmentID     0
Gender            0
ScheduledDay      0
AppointmentDay    0
Age               0
Neighbourhood     0
Scholarship       0
Hipertension      0
Diabetes          0
Alcoholism        0
Handcap           0
SMS_received      0
No-show           0
dtype: int64


In [20]:
# Remove duplicate rows, keeping the first occurrence of each repeated row.
df = df.drop_duplicates(keep="first")
print("\nShape after removing duplicates:", df.shape)



Shape after removing duplicates: (110527, 14)


In [21]:
# Clean text data (e.g., Gender, No-show): trim surrounding whitespace and
# upper-case the values so that variants of the same label compare equal.
for text_col in ('Gender', 'No-show'):
    df[text_col] = df[text_col].str.strip().str.upper()

# Show the distinct labels left in each cleaned column.
print("\nUnique values in Gender:", df['Gender'].unique())
print("Unique values in No-show:", df['No-show'].unique())



Unique values in Gender: ['F' 'M']
Unique values in No-show: ['NO' 'YES']


In [22]:
# Convert date columns: parse both timestamp columns into pandas datetimes.
for date_col in ('ScheduledDay', 'AppointmentDay'):
    df[date_col] = pd.to_datetime(df[date_col])

# Report the dtype of every column after the conversion.
print("\nData types after datetime conversion:\n", df.dtypes)



Data types after datetime conversion:
 PatientId                     float64
AppointmentID                   int64
Gender                         object
ScheduledDay      datetime64[ns, UTC]
AppointmentDay    datetime64[ns, UTC]
Age                             int64
Neighbourhood                  object
Scholarship                     int64
Hipertension                    int64
Diabetes                        int64
Alcoholism                      int64
Handcap                         int64
SMS_received                    int64
No-show                        object
dtype: object


In [23]:
# Rename columns: strip whitespace, lower-case, and swap hyphens for
# underscores in every column label.
df.columns = [label.strip().lower().replace('-', '_') for label in df.columns]
print("\nRenamed columns:\n", df.columns)



Renamed columns:
 Index(['patientid', 'appointmentid', 'gender', 'scheduledday',
       'appointmentday', 'age', 'neighbourhood', 'scholarship', 'hipertension',
       'diabetes', 'alcoholism', 'handcap', 'sms_received', 'no_show'],
      dtype='object')


In [24]:
# Check and fix data types
print("\nAge column type:", df['age'].dtype)

# Keep only rows with a non-negative age. The filtered result is copied
# explicitly so that the column assignment below writes to an independent
# frame; assigning into a boolean-filtered slice without the copy raises
# SettingWithCopyWarning on pandas versions without copy-on-write.
df = df.loc[df['age'] >= 0].copy()

# Ensure the age column is stored as an integer dtype.
df['age'] = df['age'].astype(int)



Age column type: int64


In [25]:
# Save cleaned data: write the frame to CSV, omitting the index column.
df.to_csv(path_or_buf="cleaned_medical_appointments.csv", index=False)
print("\nSaved as cleaned_medical_appointments.csv")



Saved as cleaned_medical_appointments.csv
