In [1]:
! pip install cardea
! pip install 'urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1'



In [2]:
# imports 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

from cardea import Cardea

In [3]:
# Optional: install a blanket "ignore" filter so that no Python warnings are
# shown for the rest of the session. Skip this cell to keep warnings visible.
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Create the Cardea instance; the later cells shown here all operate on `cd`.
cd = Cardea()

In [5]:
! curl -O https://dai-cardea.s3.amazonaws.com/kaggle.zip && unzip kaggle.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 2988k  100 2988k    0     0  15.0M      0 --:--:-- --:--:-- --:--:-- 15.0M
Archive:  kaggle.zip
  inflating: Patient.csv             
  inflating: Coding.csv              
  inflating: Appointment_Participant.csv  
  inflating: Address.csv             
 extracting: CodeableConcept.csv     
  inflating: Reference.csv           
  inflating: Observation.csv         
  inflating: Identifier.csv          
  inflating: Appointment.csv         


In [6]:
# Load the entityset, passing 'kaggle' as the data argument.
# NOTE(review): presumably this points at the CSV files unpacked from
# kaggle.zip in the working directory — confirm against the Cardea docs.
cd.load_entityset(data='kaggle')

# Show the loaded entityset (entities and relationships) as the cell output.
cd.es

Entityset: fhir
  Entities:
    Reference [Rows: 6100, Columns: 1]
    Address [Rows: 81, Columns: 2]
    CodeableConcept [Rows: 4, Columns: 2]
    Appointment [Rows: 110527, Columns: 5]
    Coding [Rows: 3, Columns: 2]
    Observation [Rows: 110527, Columns: 3]
    Patient [Rows: 6100, Columns: 4]
    Identifier [Rows: 227151, Columns: 1]
    Appointment_Participant [Rows: 6100, Columns: 2]
  Relationships:
    CodeableConcept.coding -> Coding.object_id
    Appointment.participant -> Appointment_Participant.object_id
    Observation.code -> CodeableConcept.object_id
    Observation.subject -> Reference.identifier
    Patient.address -> Address.object_id
    Appointment_Participant.actor -> Reference.identifier

In [7]:
# Show the names of the prediction problems this Cardea instance offers.
cd.list_problems()

{'DiagnosisPrediction',
 'LengthOfStay',
 'MissedAppointment',
 'MortalityPrediction',
 'ProlongedLengthOfStay',
 'Readmission'}

In [8]:
# select problem
label_times = cd.select_problem('MissedAppointment')

In [9]:
# feature engineering
feature_matrix = cd.generate_features(label_times[:1000]) # takes a while for the full dataset
feature_matrix.head(5)

Built 13 features
Elapsed: 00:32 | Progress: 100%|██████████


Unnamed: 0,participant = 2680425062,participant = 4275143764,participant = 2615334244,participant = 2410824900,participant = 2406221984,participant = 1868414665,participant = 1692482157,participant = 1125465544,participant = 846537388,participant = 4121228070,participant is unknown,DAY(created) = 29,DAY(created) = 28,DAY(created) = 27,DAY(created) = 18,DAY(created) = 15,DAY(created) = 26,DAY(created) = 25,DAY(created) = 5,DAY(created) = 1,DAY(created) = 8,DAY(created) is unknown,DAY(start) = 29,DAY(start) is unknown,IS_WEEKEND(created),IS_WEEKEND(start),MONTH(created) = 4,MONTH(created) = 3,MONTH(created) = 2,MONTH(created) = 1,MONTH(created) is unknown,MONTH(start) = 4,MONTH(start) is unknown,WEEKDAY(created) = 4,WEEKDAY(created) = 2,WEEKDAY(created) = 1,WEEKDAY(created) = 3,WEEKDAY(created) = 0,WEEKDAY(created) is unknown,WEEKDAY(start) = 4,WEEKDAY(start) is unknown,YEAR(created) = 2016,YEAR(created) is unknown,YEAR(start) = 2016,YEAR(start) is unknown,Appointment_Participant.actor = 74200000000000,Appointment_Participant.actor = 713000000000000,Appointment_Participant.actor = 41400000000000,Appointment_Participant.actor = 28200000000000,Appointment_Participant.actor = 7270000000000,Appointment_Participant.actor = 7230000000000,Appointment_Participant.actor = 4920000000000,Appointment_Participant.actor = 3880000000000,Appointment_Participant.actor = 2760000000000,Appointment_Participant.actor = 735000000000000,Appointment_Participant.actor is unknown,Appointment_Participant.COUNT(Appointment),label
0,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,True,False,True,False,False,False,False,False,True,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,True,39,noshow
1,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,True,False,True,False,False,False,False,False,True,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,True,27,noshow
2,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,True,False,True,False,False,False,False,False,True,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,True,55,noshow
3,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,True,False,True,False,False,False,False,False,True,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,True,39,noshow
4,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,True,False,True,False,False,False,False,False,True,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,True,28,noshow


In [10]:
# Shuffle into a new frame with a fixed seed: the row order is repeatable and
# feature_matrix itself is left untouched.
SEED = 42
shuffled_matrix = feature_matrix.sample(frac=1, random_state=SEED)

# Separate the target from the features without mutating any frame. Using
# pop('label') here would delete the column in place, so running this cell a
# second time would fail with a KeyError.
y = list(shuffled_matrix['label'])
X = shuffled_matrix.drop(columns='label').values

# Hold out 20% of the rows for testing.
# NOTE(review): this call shuffles again; whether cd.train_test_split accepts
# a seed is not visible here, so the split itself may still vary between runs.
X_train, X_test, y_train, y_test = cd.train_test_split(
    X, y, test_size=0.2, shuffle=True)

In [11]:
# modeling
cd.select_pipeline('Random Forest')
cd.fit(X_train, y_train)
y_pred = cd.predict(X_test)

In [12]:
# Evaluate the currently selected pipeline on the full X, y with a 20% test
# share; the returned metrics are shown as the cell output.
# NOTE(review): this receives all of X, y rather than the X_test/y_test split
# made earlier — presumably evaluate() performs its own split; confirm that
# the model is refit so test rows do not overlap the data it was trained on.
cd.evaluate(X, y, test_size=0.2, shuffle=True)

{'Accuracy': 0.77, 'Confusion Matrix': array([[ 10,  15],
        [ 31, 144]]), 'F1 Macro': 0.5826528760660498, 'Precision': 0.5747814081914404, 'Recall': 0.6114285714285714}

In [13]:
# modeling1
cd.select_pipeline('Logistic Regression')
cd.fit(X_train, y_train)
y_pred = cd.predict(X_test)

In [14]:
# Evaluate the currently selected pipeline on the full X, y with a 20% test
# share; the returned metrics are shown as the cell output.
# NOTE(review): as with the Random Forest evaluation, this presumably makes
# its own unseeded split, so the two results may come from different test
# rows — confirm before comparing the models.
cd.evaluate(X, y, test_size=0.2, shuffle=True)

{'Accuracy': 0.835, 'Confusion Matrix': array([[  0,  31],
        [  2, 167]]), 'F1 Macro': 0.45504087193460496, 'Precision': 0.4217171717171717, 'Recall': 0.4940828402366864}