In [43]:
import numpy as np
import pandas as pd

In [None]:
data = pd.read_csv("Patient Record")

In [45]:
data.isnull().sum()

Unnamed: 0,0
PatientId,0
AppointmentID,0
Gender,0
ScheduledDay,0
AppointmentDay,0
Age,0
Neighbourhood,0
Scholarship,0
Hipertension,0
Diabetes,0


In [46]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   PatientId       110527 non-null  float64
 1   AppointmentID   110527 non-null  int64  
 2   Gender          110527 non-null  object 
 3   ScheduledDay    110527 non-null  object 
 4   AppointmentDay  110527 non-null  object 
 5   Age             110527 non-null  int64  
 6   Neighbourhood   110527 non-null  object 
 7   Scholarship     110527 non-null  int64  
 8   Hipertension    110527 non-null  int64  
 9   Diabetes        110527 non-null  int64  
 10  Alcoholism      110527 non-null  int64  
 11  Handcap         110527 non-null  int64  
 12  SMS_received    110527 non-null  int64  
 13  No-show         110527 non-null  object 
dtypes: float64(1), int64(8), object(5)
memory usage: 11.8+ MB


In [47]:
data.head(2)

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No


In [48]:
def clean_data(df : pd.DataFrame) -> pd.DataFrame:
  df = df.copy()

  df.columns = df.columns.str.lower().str.replace('-','_').str.strip()

  df = df.rename(columns={'handcap':'handicap'})

  df['scheduledday'] = pd.to_datetime(df['scheduledday'])
  df['appointmentday'] = pd.to_datetime(df['appointmentday'])

  df['waiting_days'] = (df['appointmentday'] - df['scheduledday']).dt.days

  df = df[df['waiting_days'] >= 0]

  df['gender'] = df['gender'].str.strip().str.upper()
  df['gender'] = df['gender'].map({'F':0,'M':1})
  df['no_show'] = df['no_show'].map({'No':0,'Yes':1})

  df.loc[df['age']<0,'age'] = None
  df.loc[df['age']>=110,'age'] = None
  df['age'] = df['age'].fillna(df['age'].median())

  return df


In [49]:
data = clean_data(data)

In [50]:
data.head(2)

Unnamed: 0,patientid,appointmentid,gender,scheduledday,appointmentday,age,neighbourhood,scholarship,hipertension,diabetes,alcoholism,handicap,sms_received,no_show,waiting_days
5,95985130000000.0,5626772,0,2016-04-27 08:36:51+00:00,2016-04-29 00:00:00+00:00,76.0,REPÚBLICA,0,1,0,0,0,0,0,1
6,733688200000000.0,5630279,0,2016-04-27 15:05:12+00:00,2016-04-29 00:00:00+00:00,23.0,GOIABEIRAS,0,0,0,0,0,0,1,1


In [51]:
numeric_columns = [
    'age',
    'waiting_days',
    'scholarship',
    'hipertension',
    'diabetes',
    'alcoholism',
    'handicap',
    'sms_received',
    'gender'
    ]

categorical_columns = ['neighbourhood']

target = 'no_show'

In [52]:
data.head(2)

Unnamed: 0,patientid,appointmentid,gender,scheduledday,appointmentday,age,neighbourhood,scholarship,hipertension,diabetes,alcoholism,handicap,sms_received,no_show,waiting_days
5,95985130000000.0,5626772,0,2016-04-27 08:36:51+00:00,2016-04-29 00:00:00+00:00,76.0,REPÚBLICA,0,1,0,0,0,0,0,1
6,733688200000000.0,5630279,0,2016-04-27 15:05:12+00:00,2016-04-29 00:00:00+00:00,23.0,GOIABEIRAS,0,0,0,0,0,0,1,1


In [54]:
from sklearn.preprocessing import OneHotEncoder , StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X = data[numeric_columns + categorical_columns]

y = data[target]

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=1/3,random_state=42)

preprocess = ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),numeric_columns),
        ('cat',OneHotEncoder(handle_unknown='ignore',drop='first'),categorical_columns)
    ],
    remainder='drop'
)

clf = Pipeline(steps=[
    ('preprocess',preprocess),
    ('model',LogisticRegression(max_iter=1000))
])

clf.fit(X_train,y_train)

print("Score :",clf.score(X_train,y_train))

Score : 0.714937880430251


In [56]:
sample = data[numeric_columns + categorical_columns].sample(5, random_state=0)

clf.predict(sample)
clf.predict_proba(sample)


array([[0.73299109, 0.26700891],
       [0.70057133, 0.29942867],
       [0.70456722, 0.29543278],
       [0.67045202, 0.32954798],
       [0.64104957, 0.35895043]])

In [57]:
import joblib

joblib.dump(clf, "noshow_pipeline.pkl")

# later:
loaded = joblib.load("noshow_pipeline.pkl")
loaded.predict(sample)


array([0, 0, 0, 0, 0])