In [1]:
import os
import pandas as pd
from datetime import datetime
import re
import numpy as np

In [2]:
# Directory holding the input CSV, given relative to the notebook's working directory.
data_path = "../data/"

In [3]:
# Build the CSV location first, then load the annotated clinic documents into `df`.
csv_path = os.path.join(
    data_path,
    "20210616_recent_past_epilepsy_clinic_document_anns_with_ASD.csv",
)
df = pd.read_csv(csv_path)

In [4]:
# Replace the index with a fresh 0..n-1 range, discarding the old one.
df = df.reset_index(drop=True)

# Fix dates

In [5]:
# Parse the raw `clinic_date` column into pandas datetimes.
clinic_date_parsed = pd.to_datetime(df['clinic_date'])
df['clinic_date'] = clinic_date_parsed

In [6]:
# Order rows by patient id and then clinic date, both descending.
sort_keys = ['client_idcode', 'clinic_date']
df = df.sort_values(by=sort_keys, ascending=False)

In [7]:
# Count distinct patient ids before any date filtering is applied.
n_unique_patients = df['client_idcode'].nunique()
print(f" The number of unique patients are {n_unique_patients}")

 The number of unique patients are 9816


In [8]:
# Report the earliest and latest values of the parsed `clinic_date` column.
earliest_clinic_date = df['clinic_date'].min()
latest_clinic_date = df['clinic_date'].max()
print(f"The clinic dates range from {earliest_clinic_date} to {latest_clinic_date}")

The clinic dates range from 1957-08-25 00:00:00 to 2213-08-09 00:00:00


In [9]:
# Pull the date that follows "Typed on" out of the document text, e.g. "12th June 2019".
# The month name uses [A-Za-z]: the previous class [A-z] also matched the ASCII
# characters between 'Z' and 'a' ('[', '\', ']', '^', '_', '`').
# expand=False returns a Series (one capture group) rather than a one-column DataFrame.
df['typed_on'] = df['body_analysed'].str.extract(
    r'Typed on (\d{1,2}[a-z]{0,2} [A-Za-z]{3,9} \d{4})',
    expand=False,
)

In [10]:
# Pull the date that follows "Entered on" out of the document text, e.g. "12th June 2019".
# The month name uses [A-Za-z]: the previous class [A-z] also matched the ASCII
# characters between 'Z' and 'a' ('[', '\', ']', '^', '_', '`').
# expand=False returns a Series (one capture group) rather than a one-column DataFrame.
df['entered_on'] = df['body_analysed'].str.extract(
    r'Entered on (\d{1,2}[a-z]{0,2} [A-Za-z]{3,9} \d{4})',
    expand=False,
)

In [11]:
# Capture a numeric date written shortly after the words "clinic date" (any letter case).
clinic_date_pattern = r'clinic date.{1,3}(\d{1,2}-\d{2}-\d{2,4})'
clinic_date_matches = df['body_analysed'].str.extract(clinic_date_pattern, flags=re.IGNORECASE)
df['clinic_date2'] = clinic_date_matches

In [12]:
# Convert the extracted "clinic date" strings to datetimes; unparseable values become NaT.
# dayfirst=True: the extraction pattern allows a 1-2 digit first field followed by a
# 2-digit second field, and the manual correction later in this notebook is written as
# "26-11-2019", so the strings are treated as day-month-year. Without it, ambiguous values
# such as "03-04-2019" are read month-first.
# NOTE(review): confirm the source letters really use day-first numeric dates.
df['clinic_date2'] = pd.to_datetime(df['clinic_date2'], errors='coerce', dayfirst=True)

In [13]:
# Turn both text-derived date columns into datetimes; values that fail to parse become NaT.
for text_date_col in ('typed_on', 'entered_on'):
    df[text_date_col] = pd.to_datetime(df[text_date_col], errors='coerce')

In [14]:
# Spot check: is `entered_on` populated for the row at position 24454?
pd.notnull(df['entered_on'].iloc[24454])

False

In [15]:
# Spot check: show the `entered_on` value for the row at position 24454.
df['entered_on'].iloc[24454]

NaT

In [16]:
# Clean dates: for every row take the first non-missing date, in this priority order:
#   'typed_on' -> 'entered_on' -> 'clinic_date2' -> 'clinic_date'
# Each later column is used only where all earlier ones are missing (NaT).
# Chained fillna does this column-wise and replaces a row-by-row iterrows loop.
correct_clinic_date = (
    df['typed_on']
    .fillna(df['entered_on'])
    .fillna(df['clinic_date2'])
    .fillna(df['clinic_date'])
    .tolist()  # kept as a plain list, one entry per row in the current row order
)
    

In [17]:
# Convert the collected per-row dates to datetimes and store them as a new column.
cleaned_dates = pd.to_datetime(correct_clinic_date)
df['correct_clinic_date'] = cleaned_dates

In [18]:
# Manual correction: set the cleaned clinic date of one row to 26 November 2019.
# The column is looked up by name instead of assuming it is the last one, and the date is
# an explicit Timestamp instead of the day-first string "26-11-2019".
# NOTE(review): 2628 is a positional row index and depends on the current row order —
# confirm it still targets the intended document if the input or sorting changes.
date_col_pos = df.columns.get_loc('correct_clinic_date')
df.iloc[2628, date_col_pos] = pd.Timestamp(year=2019, month=11, day=26)

In [19]:
# Keep only rows whose cleaned clinic date lies strictly between the two bounds.
after_start = df['correct_clinic_date'] > "2013-01-01"
before_end = df['correct_clinic_date'] < "2020-01-01"
df = df[after_start & before_end]

In [20]:
# Drop the intermediate date columns; only 'correct_clinic_date' is kept from here on.
superseded_date_cols = ['clinic_date', 'typed_on', 'entered_on', 'clinic_date2']
df = df.drop(columns=superseded_date_cols)

In [22]:
# Print every column name together with its positional index.
for position, column_name in enumerate(df.columns):
    print([position, column_name])

[0, 'id']
[1, 'client_idcode']
[2, 'documentoutput_doc_dob']
[3, 'client_dob']
[4, 'client_gendercode']
[5, 'client_racecode']
[6, 'document_filename']
[7, 'updatetime']
[8, 'document_dateadded']
[9, 'body_analysed']
[10, 'clientvisit_providerdisplayname_analysed']
[11, 'clientvisit_touchedwhen']
[12, 'client_createdwhen']
[13, 'document_description']
[14, 'clientvisit_typecode']
[15, 'clientvisit_guid']
[16, 'Epilepsy']
[17, 'epilepsy_class_Unknown']
[18, 'epilepsy_class_Focal']
[19, 'epilepsy_class_Generalised']
[20, 'epilepsy_class_Combined generalised and focal']
[21, 'epilepsy_class_None']
[22, 'epilepsy_class_seizureonset_Unknown']
[23, 'epilepsy_class_seizureonset_Focal']
[24, 'epilepsy_class_seizureonset_Generalised']
[25, 'epilepsy_class_seizureonset_None']
[26, 'aetiology_Unknown']
[27, 'aetiology_Unknown presumed genetic']
[28, 'aetiology_Genetic']
[29, 'aetiology_Structural']
[30, 'aetiology_Metabolic']
[31, 'aetiology_Immune']
[32, 'aetiology_Infectious']
[33, 'aetiology_N

In [23]:
# Positional column indices used for aggregation: column 1 followed by columns 16..116.
relevant_cols = [1, *range(16, 117)]

In [24]:
# Collapse to one row per patient by summing the selected columns across that patient's
# documents. The groupby's own .sum() is called instead of passing the Python builtin
# `sum` to .agg(), which newer pandas versions deprecate.
agg_df = (
    df.iloc[:, relevant_cols]
    .groupby('client_idcode', as_index=False)
    .sum()
)

# Descriptive stats

In [30]:
# Report the earliest and latest cleaned clinic dates that remain after filtering.
first_clean_date = df['correct_clinic_date'].min()
last_clean_date = df['correct_clinic_date'].max()
print(f"The clinic dates range from {first_clean_date} to {last_clean_date}")

The clinic dates range from 2013-01-02 00:00:00 to 2019-12-30 00:00:00


In [25]:
# Count distinct patient ids remaining in `df`.
n_patients_remaining = df['client_idcode'].nunique()
print(f" The number of unique patients are {n_patients_remaining}")

 The number of unique patients are 9793


In [26]:
# Per column: number of patients whose aggregated value is truthy (non-zero).
agg_df.astype(bool).sum(axis=0)

client_idcode                          9793
Epilepsy                               4011
epilepsy_class_Unknown                 2854
epilepsy_class_Focal                   1592
epilepsy_class_Generalised              718
                                       ... 
Tiagabine_hydrochloride                   0
Tiagabine_hydrochloride_monohydrate       0
Gabapentin_enacarbil                      1
Valpromide                                0
Autism                                  653
Length: 102, dtype: int64

In [27]:
# with epilepsy
has_epilepsy = agg_df['Epilepsy'] > 0
agg_df.loc[has_epilepsy].astype(bool).sum()

client_idcode                          4011
Epilepsy                               4011
epilepsy_class_Unknown                 2854
epilepsy_class_Focal                   1592
epilepsy_class_Generalised              718
                                       ... 
Tiagabine_hydrochloride                   0
Tiagabine_hydrochloride_monohydrate       0
Gabapentin_enacarbil                      0
Valpromide                                0
Autism                                  403
Length: 102, dtype: int64

In [31]:
# with epilepsy but seizure free
has_epilepsy = agg_df['Epilepsy'] > 0
is_seizure_free = agg_df['seizure free'] > 0
agg_df.loc[has_epilepsy & is_seizure_free].astype(bool).sum()

client_idcode                          1425
Epilepsy                               1425
epilepsy_class_Unknown                 1031
epilepsy_class_Focal                    659
epilepsy_class_Generalised              285
                                       ... 
Tiagabine_hydrochloride                   0
Tiagabine_hydrochloride_monohydrate       0
Gabapentin_enacarbil                      0
Valpromide                                0
Autism                                  165
Length: 102, dtype: int64

In [33]:
# with epilepsy or seizures
has_epilepsy = agg_df['Epilepsy'] > 0
has_seizure = agg_df['seizure'] > 0
agg_df.loc[has_epilepsy | has_seizure].astype(bool).sum()

client_idcode                          6810
Epilepsy                               4011
epilepsy_class_Unknown                 2854
epilepsy_class_Focal                   1592
epilepsy_class_Generalised              718
                                       ... 
Tiagabine_hydrochloride                   0
Tiagabine_hydrochloride_monohydrate       0
Gabapentin_enacarbil                      1
Valpromide                                0
Autism                                  548
Length: 102, dtype: int64

In [29]:
# without epilepsy or seizures
no_epilepsy = agg_df['Epilepsy'] == 0
no_seizure = agg_df['seizure'] == 0
agg_df.loc[no_epilepsy & no_seizure].astype(bool).sum()

client_idcode                          2983
Epilepsy                                  0
epilepsy_class_Unknown                    0
epilepsy_class_Focal                      0
epilepsy_class_Generalised                0
                                       ... 
Tiagabine_hydrochloride                   0
Tiagabine_hydrochloride_monohydrate       0
Gabapentin_enacarbil                      0
Valpromide                                0
Autism                                  105
Length: 102, dtype: int64

# Classifier

In [None]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn import metrics

In [None]:
# TODO: edit this!!
# NOTE(review): `data` and "ARRIVAL_DELAY" are not defined or used in any other visible
# cell of this notebook; they look like placeholders. Replace them with the real feature
# table and target column before running this cell.
# Splits features and target into 75% train / 25% test with a fixed random seed.
train, test, y_train, y_test = train_test_split(data.drop(["ARRIVAL_DELAY"], axis=1), data["ARRIVAL_DELAY"],
                                                random_state=10, test_size=0.25)

In [None]:
def auc(m, train, test):
    """Return the (train, test) ROC AUC pair for a classifier.

    Scores are taken from column 1 of ``m.predict_proba``. The true labels are
    not passed in: they are read from the notebook-level ``y_train`` and
    ``y_test`` variables.

    Parameters
    ----------
    m : model exposing ``predict_proba``.
    train : features scored against ``y_train``.
    test : features scored against ``y_test``.

    Returns
    -------
    tuple
        ``(train_auc, test_auc)`` as computed by ``metrics.roc_auc_score``.
    """
    train_scores = m.predict_proba(train)[:, 1]
    train_auc = metrics.roc_auc_score(y_train, train_scores)
    test_scores = m.predict_proba(test)[:, 1]
    test_auc = metrics.roc_auc_score(y_test, test_scores)
    return (train_auc, test_auc)

In [None]:
# Parameter Tuning
# GridSearchCV is imported here because no visible import cell brings it into scope;
# without this the cell fails with a NameError.
from sklearn.model_selection import GridSearchCV

# Exhaustive search over 3 x 3 x 1 x 3 = 27 parameter combinations with 3-fold CV.
model = xgb.XGBClassifier()
param_dist = {"max_depth": [10,30,50],
              "min_child_weight" : [1,3,6],
              "n_estimators": [200],
              "learning_rate": [0.05, 0.1,0.16],}
grid_search = GridSearchCV(model, param_grid=param_dist, cv = 3,
                                   verbose=10, n_jobs=-1)
grid_search.fit(train, y_train)

In [None]:
# Show the estimator that the grid search selected as best.
grid_search.best_estimator_

In [None]:
# Final classifier with hard-coded hyperparameters.
# `verbosity` is the XGBClassifier constructor parameter for log level; the previous
# `verbose=1` is not a constructor parameter and was passed through as an unknown kwarg.
# NOTE(review): these values are presumably copied from grid_search.best_estimator_ —
# confirm they still match after re-running the search.
model = xgb.XGBClassifier(
    max_depth=50,
    min_child_weight=1,
    n_estimators=200,
    n_jobs=-1,
    verbosity=1,
    learning_rate=0.16,
)

In [None]:
# Fit `model` on the training features and labels.
model.fit(train,y_train)

In [None]:
# Display the (train, test) ROC AUC pair returned by the `auc` helper.
auc(model, train, test)