In [1]:
import os
import pickle
from typing import Tuple

import numpy
import numpy as np
import pandas
import pandas as pd
import tensorflow as tf
import tqdm
from keras.layers import TextVectorization
from numba import cuda
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from configuration import FilePaths
device = cuda.get_current_device()
device.reset()
from configuration import Config

In [2]:

FILE_PATHS = FilePaths()

In [3]:
%run constants.py
# Set seeds to ensure reproduceability
tf.random.set_seed(Config.seed)
np.random.seed(Config.seed)
os.environ["PYTHONHASHSEED"] = str(Config.seed)

In [4]:
# Read all three CSVs into a pandas data frame (I was provided with a combined_documents.csv, secondary_patient_level_annotations.csv and a secondary_problem_level_annotations.csv)
for i in tqdm(range(0, 3), ncols=100, desc="Loading data.."):
    raw_df_as_read = pd.read_csv(FILE_PATHS.RAW_DATA)
    patient_level_annotations_as_read = pd.read_csv(FILE_PATHS.PATIENT_LEVEL_DATA)
    problem_level_annotations_as_read = pd.read_csv(FILE_PATHS.PROBLEM_LEVEL_DATA)
print("------Loading is completed ------")

Loading data..: 100%|█████████████████████████████████████████████████| 3/3 [00:36<00:00, 12.05s/it]

------Loading is completed ------





In [5]:
# We are given lots of extraeneous information, so lets strip non relavent data. Note that patient ID is unique to each patient. 
# Firstly we create a copy of the data, create an additional column called Date which contains the date record was made
# Then we drop unnessicary columns, and filter all records by the study date (1/1/2019 -> 31/12/2019)

raw_df = raw_df_as_read.copy()

# Then we split the recorded date to DateTimeDay column, so we can get concat records existing on a single day"
raw_df[["DateTimeDay", "DateTimeSeconds"]] = raw_df["RecordedDate"].str.split(
    "T", expand=True
)
raw_df["Date"] = pd.to_datetime(raw_df["DateTimeDay"], format="%Y-%m-%d")


raw_df.drop(Config.raw_data_columns_drop, axis=1, inplace=True, errors='ignore') #Drop un-needed columns
raw_df = raw_df[(raw_df["Date"] >= Config.study_start_date) & (raw_df["Date"] <= Config.study_end_date)] #Only include patients withint study start date
raw_df

Unnamed: 0,PatientId,RecordedDate,Document,DateTimeDay,Date
0,20482934,2019-12-12T08:55:12,Reason: 1-8 1st Vac And Microchips \r\nAppoint...,2019-12-12,2019-12-12
1,20482934,2019-12-19T09:50:16,Reason: 1-8 1st Vac And Microchips \r\nAppoint...,2019-12-19,2019-12-19
2,20482934,2019-12-19T09:50:16,DHPPi vaccination; Leptospira vaccination,2019-12-19,2019-12-19
3,20482934,2019-12-19T09:50:16,Doc:Vaccination Certificate ; Discount ; Updat...,2019-12-19,2019-12-19
15,8549342,2019-02-15T11:21:45,Discount: Care Plan ; Bravecto sml dog tablet ...,2019-02-15,2019-02-15
...,...,...,...,...,...
4371310,12688144,2019-12-01T00:00:00,Advocate 100 m/dog (4 -10kg) per pack (3) Alfi...,2019-12-01,2019-12-01
4371323,16198379,2019-09-11T15:48:24,"Reason: Restart Vaccines, Vet\r\nAppointment N...",2019-09-11,2019-09-11
4371324,16198379,2019-09-11T15:48:24,Vacc 2wk Dog ; VAR-Vaccinations & Health Chec...,2019-09-11,2019-09-11
4371325,16198379,2019-10-12T11:34:29,Reason: 2nd Restart Vac/nurse\r\nAppointment N...,2019-10-12,2019-10-12


In [6]:
patient_level_annotations = patient_level_annotations_as_read.copy()

patient_level_annotations.drop(Config.patient_level_columns_drop, axis=1, inplace=True)
patient_level_annotations

Unnamed: 0,PatientID,DateFirstRegistered,BirthDate,Colour,Sex,IsNeutered,SexSource,SourceBreed,SourceClinicPostcode,Is this patient included in the study_1
0,1368834,01/09/2014 12:46:00,06/07/2014 00:00:00,Tan Black,Male,True,Male,Shih Tzu X,,Yes
1,1446525,24/06/2015 12:07:23,01/05/2015 00:00:00,White & Gray,Male,True,Male,Border Terrier,,Yes
2,1528314,03/11/2015 10:26:44,08/05/2011 00:00:00,Brindle,Male,True,Male,French Bulldog,,Yes
3,2029771,15/10/2010 12:06:17,30/08/2010 00:00:00,Red,Female,True,Female,Cavalier King Charles Spaniel X,,Yes
4,2115458,14/01/2013 13:58:31,14/11/2012 00:00:00,White & Brown,Male,True,Male,Shih Tzu,,Yes
...,...,...,...,...,...,...,...,...,...,...
1617,20545218,,17/03/2010 00:00:00,,Unrecorded,True,Male - Neutered,MIX BREED,NG9 7AS,Yes
1618,20612250,09/09/2019 15:49:29,09/09/2008 00:00:00,,Unrecorded,True,Male - Neutered,TERRIER - JACK RUSSELL,SS8 7BW,Yes
1619,20627848,14/10/2019 16:28:11,27/08/2019 00:00:00,,Male,False,Male,Cavapoo,PE3 8DJ,Yes
1620,20650226,29/08/2019 09:13:05,29/04/2008 00:00:00,,Female,False,Female,Retriever - Labrador,ST14 5JU,Yes


In [7]:
study_patients = set(patient_level_annotations.loc[patient_level_annotations['Is this patient included in the study_1'] == 'Yes', 'PatientID'].tolist())

In [8]:
problem_level_annotations = problem_level_annotations_as_read.copy()

# Drop all patients NOT in study
problem_level_annotations  =  problem_level_annotations[problem_level_annotations.PatientID.isin(study_patients)]

problem_level_annotations[["DateTimeDay", "DateTimeSeconds"]] = problem_level_annotations["DocumentDate"].str.split(
    " ", expand=True
)
problem_level_annotations["Date"] = pd.to_datetime(problem_level_annotations["DateTimeDay"], format="%d/%m/%Y")

problem_level_annotations.drop(Config.problem_level_columns_drop, axis=1, inplace=True)

In [9]:
def check_multiple_entries(df)->list:
    "Check there are not multiple entries per day, if so remove these days from the data pool "
    multiple_entries_list = []
    for i in set(df.PatientID.tolist()):
        results = df.loc[df.PatientID ==i]
        if not results.Date.is_unique:
            multiple_entries_list.append(i)
    return multiple_entries_list

df = problem_level_annotations[~problem_level_annotations.PatientID.isin(check_multiple_entries(problem_level_annotations))]

if not len(check_multiple_entries(df)) == 0:
    raise Exception("Sorry, but it appears you have multiple categorised days!")
else:
    print("Data looks good")

Data looks good


In [10]:

print(f"You have {len(df.PatientID.unique())} patients included in this study")
print(f"and a total of {len(df.index)} 24 hour periods classified")

You have 1523 patients included in this study
and a total of 7449 24 hour periods classified


In [11]:
#Merge the EHRs to the DF
ehrs = []
for index, row in df.iterrows():
    ehr = raw_df[(raw_df.PatientId ==row.PatientID) & (raw_df.Date==row.Date)]
    ehr_list = ehr.Document.to_list()
    s = ' '.join(ehr_list)
    ehrs.append(s)
    
df.loc[:,'ehr'] = ehrs.copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:,'ehr'] = ehrs.copy()


In [12]:
patient_id_set:set = set(df[Config.patient_id_column].values)

In [13]:
tmp_list: list = []
for patient_id in patient_id_set:
    print(patient_id)

15056905
8183822
19824655
12427280
13787152
20447252
19824664
2220078
8626223
2498616
1302593
8339527
12697678
17506384
18718801
11886676
13541461
20193368
17662044
18661481
8028270
19193979
8487038
11501694
8233092
18686092
20029583
13574290
18186389
18645143
12550300
2441376
18432167
18456744
13811886
18997423
19177651
13852852
11272373
19939524
2162887
11722951
11739335
19407050
3834060
20340948
7864540
7135464
8339691
14508270
20701427
14229749
13803775
11837698
13975831
18194715
13689116
13852996
19308870
19882315
3293526
16327015
19939687
19898732
8618355
16417150
12542337
20078984
20627848
8233357
13377939
2023828
7840149
16318870
19276182
2589082
19382688
19726769
8528306
3834295
18792889
19825081
20095428
14590411
16384471
19653090
8233447
18948587
7291373
8511986
8290804
17146358
20570618
20185597
13279750
17023500
13263380
20718104
14254617
17392153
17695262
3957284
13615663
18244153
12132926
20439623
11813452
12632654
20693583
20513365
20464215
19399259
6881888
19989094
150

In [16]:
df[df[Config.is_visit_column] == 'Yes']

Unnamed: 0,PatientID,DateFirstRegistered,BirthDate,Colour,Sex,IsNeutered,SexSource,SourceBreed,SourceClinicPostcode,Is this note a visit_2,Date,ehr
2,977030,06/04/2015 09:53:45,16/07/2011 00:00:00,Cream & Ginger,Female,True,Female,Jack Russell Terrier X,,Yes,2019-06-20,Health Assessment Reason: Loose Tooth - Vet\r\...
3,977030,06/04/2015 09:53:45,16/07/2011 00:00:00,Cream & Ginger,Female,True,Female,Jack Russell Terrier X,,Yes,2019-06-25,Re-check / Repeat / Recall / Medical Progress ...
4,977030,06/04/2015 09:53:45,16/07/2011 00:00:00,Cream & Ginger,Female,True,Female,Jack Russell Terrier X,,Yes,2019-07-02,Reason: Discuss Dental And Tooth Brushing - Nu...
6,977030,06/04/2015 09:53:45,16/07/2011 00:00:00,Cream & Ginger,Female,True,Female,Jack Russell Terrier X,,Yes,2019-09-14,Health Assessment Reason: Cut Paw - Vet\r\nApp...
7,977030,06/04/2015 09:53:45,16/07/2011 00:00:00,Cream & Ginger,Female,True,Female,Jack Russell Terrier X,,Yes,2019-09-24,Reason: Recheck - Vet\r\nAppointment Notes: sw...
...,...,...,...,...,...,...,...,...,...,...,...,...
8585,21228285,01/10/2019 10:15:00,28/07/2019 00:00:00,Red And White,Male,False,Male,Pembroke Welsh Corgi,SW8 1UQ,Yes,2019-10-03,consultation ; Drontal Dog Tasty Bone Tab Give...
8586,21228285,01/10/2019 10:15:00,28/07/2019 00:00:00,Red And White,Male,False,Male,Pembroke Welsh Corgi,SW8 1UQ,Yes,2019-10-07,2nd vac
8587,21228285,01/10/2019 10:15:00,28/07/2019 00:00:00,Red And White,Male,False,Male,Pembroke Welsh Corgi,SW8 1UQ,Yes,2019-12-21,BAR\nOral exam NAD\nTracheal palp NAD\nThoraci...
8588,21335865,20/12/2019 13:54:53,02/11/2019 00:00:00,,Male,False,Male,Lurcher,UB4 9AX,Yes,2019-12-28,vac puppy 1st including health check check all...
