In [1]:
import pandas as pd 
from datetime import datetime

In [2]:
# Reading data from input file
data = pd.read_csv("Appointment-No-Show-Data.csv")
print(data.dtypes)
print("------------------------------------------")
data.shape

PatientId         float64
AppointmentID       int64
Gender             object
ScheduledDay       object
AppointmentDay     object
Age                 int64
Neighbourhood      object
Scholarship         int64
Hipertension        int64
Diabetes            int64
Alcoholism          int64
Handcap             int64
SMS_received        int64
No-show            object
dtype: object
------------------------------------------


(110527, 14)

In [3]:
# Converting the data type to date time
data["ScheduledDay"] = pd.to_datetime(data["ScheduledDay"])
data["AppointmentDay"] = pd.to_datetime(data["AppointmentDay"])

In [4]:
# Weekday function identifies the day of the week. 0-6 corresponds to Monday-Sunday

data["WeekdayScheduled"] = data["ScheduledDay"].dt.weekday
data["WeekdayAppointment"] = data["AppointmentDay"].dt.weekday

In [5]:
data['isweekdayschedule'] = data['WeekdayScheduled'].apply(lambda x: 1 if x >= 5 else 0)

print('Yes. We have {} entries that are scheduled on a weekend.'.format(sum(data['isweekdayschedule'])))
print("--------------------------------------------------------------------------")
print("Suggestion : We can delete all those entries for our analysis")

Yes. We have 24 entries that are scheduled on a weekend.
--------------------------------------------------------------------------
Suggestion : We can delete all those entries for our analysis


In [6]:
data['isweekdayappointment'] = data['WeekdayAppointment'].apply(lambda x: 1 if x >= 5 else 0)
 
print('Yes. We have {} entries that have an appointment on Weekend. These entries are incorrect.'.format(sum(data['isweekdayappointment'])))
print("--------------------------------------------------------------------------")
print("Suggestion : We can delete all those entries for our analysis")

Yes. We have 39 entries that have an appointment on Weekend. These entries are incorrect.
--------------------------------------------------------------------------
Suggestion : We can delete all those entries for our analysis


In [7]:
# Part 4
# (a) - Scheduleday should take place before appointment day
data["Differencedays"] = data["ScheduledDay"] - data["AppointmentDay"]

print('Yes. We have {} entries instances where in the appointmentdate is earlier than scheduled date.'.format(sum(data['Differencedays'].apply(lambda x: 1 if x.days > 0 else 0))))
print("--------------------------------------------------------------------------")
print("Suggestion : We can delete all those entries for our analysis")

Yes. We have 5 entries instances where in the appointmentdate is earlier than scheduled date.
--------------------------------------------------------------------------
Suggestion : We can delete all those entries for our analysis


In [8]:
# (b) - Gender, handicap and scholarship should have unique value for a patient
gender_dict = {}
for i in range(len(data)): 
    if data.loc[i, "PatientId"] in gender_dict.keys():
        if(data.loc[i, "Gender"] in gender_dict[data.loc[i, "PatientId"]]):
            continue
        else:
            gender_dict[data.loc[i, "PatientId"]].append(data.loc[i, "Gender"]) 
                
    else:
        gender_dict[data.loc[i, "PatientId"]] = [data.loc[i,"Gender"]]
            
print(sum(1 for i in gender_dict.values() if len(i) >= 2))

print("Conclusion: None of the patient has difference in gender values multiple visits")

0
Conclusion: None of the patient has difference in gender values multiple visits


In [9]:
handicap = {}
for i in range(len(data)): 
    if data.loc[i, "PatientId"] in handicap.keys():
        if(data.loc[i, "Handcap"] in handicap[data.loc[i, "PatientId"]]):
            continue
        else:
            handicap[data.loc[i, "PatientId"]].append(data.loc[i, "Handcap"]) 
                
    else:
        handicap[data.loc[i, "PatientId"]] = [data.loc[i,"Handcap"]]
            
print(sum(1 for i in handicap.values() if len(i) >= 2))

print("Conclusion: None of the patient has difference in handicap values for multiple visits")

0
Conclusion: None of the patient has difference in handicap values for multiple visits


In [10]:
Scholarship = {}
for i in range(len(data)): 
    if data.loc[i, "PatientId"] in Scholarship.keys():
        if(data.loc[i, "Scholarship"] in Scholarship[data.loc[i, "PatientId"]]):
            continue
        else:
            Scholarship[data.loc[i, "PatientId"]].append(data.loc[i, "Scholarship"]) 
                
    else:
        Scholarship[data.loc[i, "PatientId"]] = [data.loc[i,"Scholarship"]]
            
print(sum(1 for i in Scholarship.values() if len(i) >= 2))

print("Conclusion: None of the patient has difference in Scholarship values for multiple visits")

0
Conclusion: None of the patient has difference in Scholarship values for multiple visits


In [11]:
age = {}


for i in range(len(data)): 
    if data.loc[i, "PatientId"] in age.keys():
        #if data.loc[i, "Age"] not in age[data.loc[i, "PatientId"]]:
            age[data.loc[i, "PatientId"]].append(data.loc[i, "Age"]) 
        
    else:
        age[data.loc[i, "PatientId"]] = [data.loc[i,"Age"]]

for i in age.keys():
    if(age[i] != sorted(age[i])):
        print(i,age[i])

# Conclusion : We have entried with mismatch in age. 
# Criteria not met - The age is different in different appointments, then the appointment with older age should take place after the appointment with younger age.

441431798896847.0 [89, 90, 89, 89]
5567159523622.0 [2, 3, 2, 3, 3, 3, 3]
526457271918178.0 [17, 18, 17, 17]
94866145193937.0 [26, 26, 26, 27, 26, 27]
7343477663565.0 [77, 77, 77, 78, 77]
24448321454.0 [1, 1, 1, 2, 2, 1]
91581872616666.0 [34, 35, 34]
81213966782532.0 [47, 47, 47, 47, 48, 48, 48, 48, 48, 48, 48, 48, 48, 47, 48, 48, 47, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48]
67793226134888.0 [2, 3, 2, 3]
841147961165.0 [17, 18, 18, 18, 17]
27989988316.0 [38, 38, 39, 38, 38, 38, 38]
9712431915876.0 [21, 22, 21, 22, 22]
726946859782.0 [73, 73, 73, 73, 73, 73, 74, 73, 73, 74, 74]
834798139251.0 [49, 49, 49, 49, 50, 49, 49, 49, 50, 49, 49, 49, 49, 50, 50, 50, 50]
28532485743924.0 [61, 62, 62, 62, 61, 62, 62, 62, 61, 62, 62, 62, 62, 62, 62]
31359178796118.0 [48, 48, 48, 49, 48, 48, 49, 49, 49, 48, 48, 49, 48, 48, 49, 49]
863266615443557.0 [52, 53, 52, 53, 53, 53, 52, 53, 53, 53, 52, 53, 53, 52, 53, 53, 53, 53]
548284689473265.0 [60, 60, 60, 60, 60, 60, 60, 61, 60, 60, 60, 60, 61, 61, 

In [12]:
print("Conclusion: We have entried with mismatch in age. Criteria not met - The age is different in different appointments, then the appointment with older age should take place after the appointment with younger age. ")

Conclusion: We have entried with mismatch in age. Criteria not met - The age is different in different appointments, then the appointment with older age should take place after the appointment with younger age. 


In [13]:
# Part 5
# Negative ages
data[data["Age"] < 0]

print('Yes. We have {} entries where patient is having negative age.'.format(len(data[data["Age"] < 0])))
print("--------------------------------------------------------------------------")
print("Suggestion : We can delete all those entries for our analysis")

Yes. We have 1 entries where patient is having negative age.
--------------------------------------------------------------------------
Suggestion : We can delete all those entries for our analysis


In [14]:
# Part 6 - Removing people with age greater than 100 as they are outliers
print('Yes. We have {} entries where patient is having 100+ age.'.format(len(data[data["Age"] >100])))
print("Removing people with age greater than 100")
data = data[data["Age"] <100]
print(data.shape)

Yes. We have 7 entries where patient is having 100+ age.
Removing people with age greater than 100
(110516, 19)
