# Model for detecting if a patient electronic health record is a physical visit

## About the notebook
- This jupyter notebook will take VetCompass data, format it for use within machine learning models, then use simple embedding techniques to train a model for classification.
- A physical visit is any electronic health record in which a patient received a physical examination.
- The embedding technique used is pre-trained GloVe word vectors (the 100-dimensional `glove.6B.100d` file).
- While multiple neural network architectures are available to solve this problem, this notebook uses a convolutional neural network as its main example.

## About the author
- This notebook was born from a MSc project which used this very data - link to it


## Notes to reader
- Explanatory instructions provided within cells


In [None]:
# Set imports and relative data paths.

import pandas as pd
from tqdm import tqdm
import numpy as np
import os
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
from nltk.tokenize import word_tokenize
from tensorflow.keras.layers import Embedding
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow import keras
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D, Bidirectional
import matplotlib.pyplot as plt   
plt.style.use('ggplot')

# Download the NLTK resources this notebook relies on: the stopword corpus
# (used by remove_stopwords) and the "punkt" tokenizer models.
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# Single seed shared by every random number generator in the notebook.
SEED = 2023


# NOTE: these are absolute, machine-specific paths; adjust them before running
# the notebook anywhere else.
DATA_DIR: str = "/home/aaron/aaron_data/"
UPDATED_DATA_DIR: str = "/home/aaron/updated_data"
_REARRANGED_DATA_DIR: str = "/home/aaron/timeseries_nlp/data/"

# Raw clinical notes plus the two annotation exports.
RAW_DF: str = os.path.join(DATA_DIR, "combined_documents.csv")
PATIENT_LEVEL_DF: str = os.path.join(
    UPDATED_DATA_DIR, "secondary_patient_level_annotations_v1.csv"
)
PROBLEM_LEVEL_DF: str = os.path.join(
    UPDATED_DATA_DIR, "secondary_problem_level_annotations_v1.csv"
)

# Output locations for rearranged copies of the data.
REARRANGED_DATA_FILEPATH: str = os.path.join(
    _REARRANGED_DATA_DIR, "rearranged_data.csv"
)
REARRANGED_DATA_FILEPATH_FLAT: str = os.path.join(
    _REARRANGED_DATA_DIR, "rearranged_data_flat.csv"
)

class Config:
    """Notebook-wide settings: dataframe column names and training hyper-parameters."""

    # --- dataframe column names ---
    ehr_column: str = "ehr"                          # raw concatenated note text
    tokenized_ehr_column: str = "tokenized_ehr"      # output of tokenize()
    stopword_ehr_column: str = "stopwords_removed"   # output of remove_stopwords()
    cleaned_ehr_column: str = "rejoined"             # final text fed to the models
    target_column: str = "Visit"                     # binary label column
    labels = ['Visit']

    # --- training hyper-parameters ---
    batch_size: int = 16
    epochs: int = 50
    max_len: int = 200          # output sequence length of the text vectorizer

    # --- settings not referenced elsewhere in the visible notebook ---
    vocab_size: int = 20000
    max_features: int = 5000
    dimensions: int = 100


config = Config()

# Seed every random number generator in play (Python, NumPy, TensorFlow) so
# that runs are reproducible.
import random

# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes - confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Background reading:
# https://odsc.medium.com/properly-setting-the-random-seed-in-ml-experiments-not-as-simple-as-you-might-imagine-219969c84752

# Organise VC Files

In [None]:
# Read the three source CSVs into pandas data frames: combined_documents.csv,
# secondary_patient_level_annotations.csv and
# secondary_problem_level_annotations.csv.
# Each file is read exactly once; the progress bar advances once per file.
(
    raw_df_as_read,
    patient_level_annotations_as_read,
    problem_level_annotations_as_read,
) = [
    pd.read_csv(csv_path)
    for csv_path in tqdm(
        (RAW_DF, PATIENT_LEVEL_DF, PROBLEM_LEVEL_DF),
        ncols=100,
        desc="Loading data..",
    )
]
print("------Loading is completed ------")


In [None]:
# The raw export carries a lot of extraneous information, so strip what is not
# relevant. Note that patient ID is unique to each patient.
# Steps: copy the data, derive a Date column holding the day each record was
# made, drop unnecessary columns, then keep only records inside the study
# period (1/1/2019 -> 31/12/2019).

raw_df = raw_df_as_read.copy()

# Split RecordedDate at the "T" separator so records can be grouped by day.
recorded_date_parts = raw_df["RecordedDate"].str.split("T", expand=True)
raw_df[["DateTimeDay", "DateTimeSeconds"]] = recorded_date_parts
raw_df["Date"] = pd.to_datetime(raw_df["DateTimeDay"], format="%Y-%m-%d")


# Drop un-needed columns (missing ones are ignored).
unneeded_raw_columns = ['Type', 'DateTimeSeconds', 'CaseNumber', 'DataSiloName', 'LatestPatientVersionID']
raw_df.drop(columns=unneeded_raw_columns, inplace=True, errors='ignore')

# Only include records within the study period (both bounds inclusive).
on_or_after_study_start = raw_df["Date"] >= "2019-1-1"
on_or_before_study_end = raw_df["Date"] <= "2019-12-31"
raw_df = raw_df[on_or_after_study_start & on_or_before_study_end]
raw_df

In [None]:
# Work on a copy so the data as read from disk stays untouched.
patient_level_annotations = patient_level_annotations_as_read.copy()

# Columns that are not needed for the rest of the notebook.
unused_patient_columns = [
    'CaseNumber',
    'LatestPatientVersionID',
    'BreedVeNomID',
    'DataSiloName',
    'VetCompassBreed',
    'SpeciesVeNomID',
    'SourceSpecies',
    'VetCompassSpecies',
    'FirstNoteDate',
    'LatestPatientVersionDate',
    'FirstVersionDate',
    'LastNoteDate',
    'CodingStarted',
    'FirstClinicId',
    'SourceClinicName',
    'IsArchived',
]
patient_level_annotations.drop(columns=unused_patient_columns, inplace=True)
patient_level_annotations

In [None]:
# IDs of the patients whose annotation marks them as included in the study.
included_in_study = patient_level_annotations['Is this patient included in the study_1'] == 'Yes'
study_patients = set(patient_level_annotations.loc[included_in_study, 'PatientID'].tolist())

In [None]:
problem_level_annotations = problem_level_annotations_as_read.copy()

# Drop all patients NOT in the study. The explicit .copy() makes the filtered
# frame independent, so the column assignments below do not write into a
# slice of another frame (which triggers pandas' SettingWithCopyWarning).
problem_level_annotations = problem_level_annotations[
    problem_level_annotations.PatientID.isin(study_patients)
].copy()

# DocumentDate is split at the space; the first part is parsed as a
# day/month/year date.
problem_level_annotations[["DateTimeDay", "DateTimeSeconds"]] = problem_level_annotations["DocumentDate"].str.split(
    " ", expand=True
)
problem_level_annotations["Date"] = pd.to_datetime(problem_level_annotations["DateTimeDay"], format="%d/%m/%Y")

# Remove the helper columns created above together with every column that is
# not used further on.
problem_level_annotations.drop([ 'DataSiloName', 'ProblemID', 'VeNomID',
       'VetCompassProblemID', 'TermName', 'Context',
       'TextHighlightedWhenCreated','CodingStarted',
       'CaseNumber', 'LatestPatientVersionID', 'BreedVeNomID',
       'VetCompassBreed', 'SpeciesVeNomID', 'SourceSpecies',
       'VetCompassSpecies', 'FirstVersionDate', 'LatestPatientVersionDate',
       'FirstNoteDate', 'LastNoteDate', 'FirstClinicId', 'SourceClinicName',"DateTimeDay", "DateTimeSeconds","DocumentDate",
 'IsArchived'], axis=1, inplace=True)

In [None]:
def check_multiple_entries(df)->list:
    """Return the PatientIDs that have more than one row for the same Date.

    The function only reports the offending patients; removing them from the
    data pool is left to the caller.

    Args:
        df: DataFrame with ``PatientID`` and ``Date`` columns.

    Returns:
        list: Each PatientID (listed once, in order of first appearance) that
        has at least two rows sharing a Date. Empty when there are none.
    """
    # A single vectorised pass marks every row whose (PatientID, Date) pair
    # occurs more than once, instead of re-filtering the frame per patient.
    repeated_day = df.duplicated(subset=["PatientID", "Date"], keep=False)
    # Rows with a missing PatientID are never reported.
    return df.loc[repeated_day, "PatientID"].dropna().unique().tolist()

# Exclude every patient that has more than one annotated row on the same day.
patients_with_repeated_days = check_multiple_entries(problem_level_annotations)
keep_row = ~problem_level_annotations.PatientID.isin(patients_with_repeated_days)
df = problem_level_annotations[keep_row]

# Sanity check: after filtering no patient may still have a repeated day.
if check_multiple_entries(df):
    raise Exception("Sorry, but it appears you have multiple categorised days!")
    

In [None]:

# Summarise how many patients and annotated days remain after filtering.
patient_count = len(df.PatientID.unique())
classified_period_count = len(df.index)
print(f"You have {patient_count} patients included in this study")
print(f"and a total of {classified_period_count} 24 hour periods classified")


In [None]:
# Merge the EHRs to the DF.
# All documents recorded for the same patient on the same day are joined with
# single spaces, in the order they appear in raw_df. raw_df is grouped once
# up front rather than re-scanned for every annotated row.
documents_by_patient_day = (
    raw_df.groupby(["PatientId", "Date"], sort=False)["Document"]
    .agg(" ".join)
    .to_dict()
)

# Annotated days with no matching document get an empty string.
ehrs = [
    documents_by_patient_day.get((patient_id, visit_date), "")
    for patient_id, visit_date in zip(df.PatientID, df.Date)
]

# assign() returns a new frame, so nothing is written into a slice of
# problem_level_annotations.
df = df.assign(ehr=ehrs)

# Data Preprocessing

In [None]:
# Tidy the label column: encode Yes/No as 1/0 and give it a shorter name.
visit_annotation_column = 'Is this note a visit_2'
df = df.replace({visit_annotation_column: {'Yes': 1, 'No': 0}})
df = df.rename(columns={visit_annotation_column: 'Visit'})

# Rules-based breed disambiguation (currently disabled; kept for reference).

# df['SourceBreed'].replace('TERRIER - WEST HIGHLAND W','West Highland White Terrier',inplace=True)
# df['SourceBreed'].replace('SPANIEL - CAVALIER KING C','Cavalier King Charles Spaniel',inplace=True)
# df['SourceBreed'].replace('King Charles Spaniel','Cavalier King Charles Spaniel',inplace=True)
# df['SourceBreed'].replace('English Cocker Spaniel','Spaniel (Cocker)',inplace=True)
# df['SourceBreed'].replace('German Shepherd','German Shepherd Dog',inplace=True)
# df['SourceBreed'].replace('SHEPHERD - GERMAN UNSPEC','German Shepherd Dog',inplace=True)
# df['SourceBreed'].replace('SHEPHERD - GERMAN OLD','German Shepherd Dog',inplace=True)
# df['SourceBreed'].replace('SHEPHERD DOG - GERMAN','German Shepherd Dog',inplace=True)
# df['SourceBreed'].replace('Chihuahua - Longhaired','Chihuahua (Long Coat)',inplace=True)
# df['SourceBreed'].replace('Chihuahua','Chihuahua (Smooth Coat)',inplace=True)
# df['SourceBreed'].replace('Chihuahua Smooth Coat','Chihuahua (Smooth Coat)',inplace=True)
# df['SourceBreed'].replace('Chihuahua, Short-Haired','Chihuahua (Smooth Coat)',inplace=True)
# df['SourceBreed'].replace('Chihuahua, Long-Haired','Chihuahua (Long Coat)',inplace=True)
# df['SourceBreed'].replace('Wire Fox Terrier','Fox Terrier (Wire)',inplace=True)
# df['SourceBreed'].replace('TERRIER - FOX (UNSPECIFIE','Fox Terrier (Smooth)',inplace=True)
# df['SourceBreed'].replace('Husky','Siberian Husky',inplace=True)
# df['SourceBreed'].replace('Husky - Siberian','Siberian Husky',inplace=True)
# df['SourceBreed'].replace('Doberman Pinscher','Dobermann',inplace=True)
# df['SourceBreed'].replace('Chinese Shar-Pei','Shar Pei',inplace=True)
# df['SourceBreed'].replace('English Springer Spaniel','Spaniel (English Springer)',inplace=True)
# df['SourceBreed'].replace('Springer Spaniel','Spaniel (English Springer)',inplace=True)
# df['SourceBreed'].replace('Mastiff','Mastiff',inplace=True)
# df['SourceBreed'].replace('Poodle Standard','Poodle (Standard)',inplace=True)
# df['SourceBreed'].replace('Collie','Border Collie',inplace=True)
# df['SourceBreed'].replace('Collie - Border','Border Collie',inplace=True)
# df['SourceBreed'].replace('Dachshund Miniature           ','Dachshund',inplace=True)
# df['SourceBreed'].replace('Dachshund Miniature Smooth Haired','Dachshund',inplace=True)
# df['SourceBreed'].replace('Dachshund Miniature Long Haired','Dachshund',inplace=True)
# df['SourceBreed'].replace('Dachshund Miniature (Smooth-Haired)','Dachshund',inplace=True)
# df['SourceBreed'].replace('DACHSHUND, MINI LONG-HAIR','Dachshund',inplace=True)
# df['SourceBreed'].replace('DACHSHUND, STAND WIRE-H','Dachshund',inplace=True)
# df['SourceBreed'].replace('DACHSHUND, MINI SMOOTH-H','Dachshund',inplace=True)
# df['SourceBreed'].replace('Dachshund Standard','Dachshund',inplace=True)
# df['SourceBreed'].replace('Dachshund Smooth Haired','Dachshund',inplace=True)
# df['SourceBreed'].replace('Longhaired Miniature Dachshund','Dachshund',inplace=True)
# df['SourceBreed'].replace('Golden Retriever','Retriever (Golden)',inplace=True)
# df['SourceBreed'].replace('RETRIEVER - GOLDEN (GOLDE','Retriever (Golden)',inplace=True)
# df['SourceBreed'].replace('RETRIEVER - LABRADOR (LAB','Retriever (Labrador)',inplace=True)
# df['SourceBreed'].replace('Retriever - Labrador','Retriever (Labrador)',inplace=True)
# df['SourceBreed'].replace('Retriever','Retriever (Labrador)',inplace=True)
# df['SourceBreed'].replace('JAPANESE  AKITA','Japanese Akita Inu',inplace=True)
# df['SourceBreed'].replace('Akita ','Japanese Akita Inu',inplace=True)
# df['SourceBreed'].replace('American Akita','Japanese Akita Inu',inplace=True)
# df['SourceBreed'].replace('Akita - Japanese Inu','Japanese Akita Inu',inplace=True)
# df['SourceBreed'].replace('Shiba Inu','Japanese Shiba Inu',inplace=True)
# df['SourceBreed'].replace('Weimeraner','Weimaraner',inplace=True)
# df['SourceBreed'].replace('Shih-tzu','Shih Tzu',inplace=True)
# df['SourceBreed'].replace('Miniature Poodle','Poodle (Miniature)',inplace=True)
# df['SourceBreed'].replace('Chinese Crested, Hairless','Chinese Crested',inplace=True)
# df['SourceBreed'].replace('Poodle, Toy','Poodle (Miniature)',inplace=True)
# df['SourceBreed'].replace('Cocker Spaniel','Spaniel (Cocker)',inplace=True)
# df['SourceBreed'].replace('SPANIEL - COCKER (UNSPECI','Spaniel (Cocker)',inplace=True)
# df['SourceBreed'].replace('Spaniel - Cocker, working','Spaniel (Cocker)',inplace=True)
# df['SourceBreed'].replace('SPANIEL - SPRINGER, ENGLI','Spaniel (English Springer)',inplace=True)
# df['SourceBreed'].replace('SPANIEL - SPRINGER (UNSPE','Spaniel (English Springer)',inplace=True)
# df['SourceBreed'].replace('ENGLISH SPANIEL','Spaniel (English Springer)',inplace=True)
# df['SourceBreed'].replace('STAFFORDSHIRE BULL TERRIE','Staffordshire Bull Terrior',inplace=True)
# df['SourceBreed'].replace('TERRIER - STAFF BULL ENG','Staffordshire Bull Terrior',inplace=True)
# df['SourceBreed'].replace('TERRIER - AMERICAN PIT BU','Staffordshire Bull Terrior',inplace=True)
# df['SourceBreed'].replace('TERRIER - STAFF BULL UNSP','Staffordshire Bull Terrior',inplace=True)
# df['SourceBreed'].replace('TERRIER - STAFFORDSHIRE BULL','Staffordshire Bull Terrior',inplace=True)
# df['SourceBreed'].replace('Terrier - Staffordshire Bull, English','Staffordshire Bull Terrior',inplace=True)
# df['SourceBreed'].replace('Irish Staffordshire Bull Terrier','Staffordshire Bull Terrior',inplace=True)
# df['SourceBreed'].replace('Olde English Bulldogge','Bulldog',inplace=True)
# df['SourceBreed'].replace('SCHNAUZER, MINIATURE (MIN','Minature Schanuzer',inplace=True)
# df['SourceBreed'].replace('SCHNAUZER, MINIATURE (MIN','Minature Schanuzer',inplace=True)
# df['SourceBreed'].replace('Setter - Irish, Red','Irish Setter',inplace=True)
# df['SourceBreed'].replace('SETTER - IRISH, RED AND W','Irish Setter',inplace=True)
# df['SourceBreed'].replace('TERRIER - YORKSHIRE (YORK','Yorkshire Terrier',inplace=True)
# df['SourceBreed'].replace('TERRIER - YORKSHIRE, MINI','Yorkshire Terrier',inplace=True)
# df['SourceBreed'].replace('Terrier - Yorkshire','Yorkshire Terrier',inplace=True)
# df['SourceBreed'].replace('TERRIER - JACK RUSSELL','Jack Russell Terrier',inplace=True)
# df['SourceBreed'].replace('American Pit Bull Terrier','Bull Terrier',inplace=True)
# df['SourceBreed'].replace('Terrier - Tibetan','Tibetan Terrior',inplace=True)
# df['SourceBreed'].replace('Terrier - Tibetan','Tibetan Terrior',inplace=True)
# df['SourceBreed'].replace('Bulldog - British','Bulldog',inplace=True)
# df['SourceBreed'].replace('Collie - Scottish Smooth','Collie (Smooth)',inplace=True)
# df['SourceBreed'].replace('Collie - Scottish Rough','Collie (Rough)',inplace=True)
# df['SourceBreed'].replace('TERRIER - BORDER (BORDER','Border Terrier',inplace=True)
# df['SourceBreed'].replace('POINTER - GERMAN, ROUGH-H','German Wirehaired Pointer',inplace=True)
# df['SourceBreed'].replace('Setter - English','English Setter',inplace=True)
# df['SourceBreed'].replace('HUNGARIAN VIZSLA, SMOOTH-','Hungarian Visla',inplace=True)
# df['SourceBreed'].replace('Ridgeback - Rhodesian','Rohdesian Ridgeback',inplace=True)
# df['SourceBreed'].replace('English Bull Terrier','Bull Terrier',inplace=True)
# df['SourceBreed'].replace('Poodle Toy','Poodle (Toy)',inplace=True)
# df['SourceBreed'].replace('Bulldog - French','French Bulldog',inplace=True)
# df['SourceBreed'].replace('Terrier - Bull American','Bull Terrier',inplace=True)
# df['SourceBreed'].replace('HEELER - LANCASHIRE (HEEL','Lancashire Heeler',inplace=True)
# df['SourceBreed'].replace('Bichon - Frise','Bichon Frise',inplace=True)
# df['SourceBreed'].replace('bichon','Bichon Frise',inplace=True)
# df['SourceBreed'].replace('Sheepdog - Old English','Old English Sheepdog',inplace=True)
# df['SourceBreed'].replace('Terrier - Boston','Boston Terrier',inplace=True)
# df['SourceBreed'].replace('Terrier - Patterdale','Patterdale Terrier',inplace=True)
# df['SourceBreed'].replace('Terrier - Welse','Welsh Terrier',inplace=True)
# df['SourceBreed'].replace('Italian Spinone (Spinoni)','Italian Spinone',inplace=True)
# df['SourceBreed'].replace('SHEEPDOG - SHETLAND (SHEL','Shetland Sheepdog',inplace=True)
# df['SourceBreed'].replace('Basset','Basset Hound',inplace=True)
# df['SourceBreed'].replace('Schnauzer, Giant','Giant Schnauzer',inplace=True)
# df['SourceBreed'].replace('Terrier - Scottish','Scottish Terrier',inplace=True)
# df['SourceBreed'].replace('Hound - Basset','Basset Hound',inplace=True)
# df['SourceBreed'].replace('Terrier - Lakeland','Lakeland Terrier',inplace=True)
# df['SourceBreed'].replace('POINTER - GERMAN, SHORT-H','German Shorthaired Pointed',inplace=True)
# df['SourceBreed'].replace('TERRIER - JACK RUSS. MINI','Jack Russel Terrier',inplace=True)
# df['SourceBreed'].replace('Schnauzer, Miniature','Minature Schnauzer',inplace=True)
# df['SourceBreed'].replace('TERRIER - STAFFORDSHIRE,','Staffordshire Terrier',inplace=True)
# df['SourceBreed'].replace('Beagle - English','Beagle',inplace=True)
# df['SourceBreed'].replace('Toy Poodle','Poodle (Toy)',inplace=True)
# df['SourceBreed'].replace('Nova Scotia Duck Tolling Retriever ','Retriever (Nova Scotia Duck Tolling)',inplace=True)

# df['SourceBreed'].replace('English Cocker Spaniel X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Pedenco Maneto','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Jug','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Plummer Terrier','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Shikoku','Crossbreed',inplace=True)
# df['SourceBreed'].replace('CHINA JACK','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Cavachon','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Cavapoo','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Cockerpoo','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Huntaway','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Sprocker','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Labrador Retriever X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Shih Tzu X ','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Jack Russell Terrier X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Collie X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Golden Retriever X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('English Springer Spaniel X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Maltese X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Border Collie X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Bichon Frise X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Northern Inuit Dog X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Rottweiler X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Chihuahua X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Yorkshire Terrier X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Schnauzer X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Chihuahua X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('CROSSBREED - SMALL','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Lhasa Apso X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Irish Wolfhound X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('German Shepherd X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Cavalier King Charles Spaniel X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Pug X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Boxer X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Rhodesian Ridgeback X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Lurcher X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Staffie X German Shepherd','Crossbreed',inplace=True)
# df['SourceBreed'].replace('X breed sprocker','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Cross Breed Small','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Terrier X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('jack russellxchihuahua','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Staffordshire Bull Terrier X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Border Terrier X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Husky X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('BORDER COLLIE CROSS DACHSHUND','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Cross Breed Medium','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Springerdor (springer x lab)','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Jack Russell Cross','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Australian labradoodle','Crossbreed',inplace=True)
# df['SourceBreed'].replace('crossbreed small','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Mixed','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Doberman X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('crossbreed - medium','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Labradoodle - medium','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Basset Hound X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Poodle x Schnauzer','Crossbreed',inplace=True)
# df['SourceBreed'].replace('French Bulldog X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Pomeranian X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Akita X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Bulldog X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Miniature Schnauzer X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('German Pointer X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Beagle X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('MIX BREED','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Plummer Terrier X','Crossbreed',inplace=True)
# df['SourceBreed'].replace('SHIH-TZU CROSS','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Cross Breed','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Yorkshire Terrier cross Poodle','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Terrier Cross','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Crossbreed - Large','Crossbreed',inplace=True)
# df['SourceBreed'].replace('SHIH-TZU CROSS','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Patterdale Cross','Crossbreed',inplace=True)
# df['SourceBreed'].replace('Chorkie','Crossbreed',inplace=True)

In [None]:
def tokenize(column):
    """Split a piece of text into word tokens, keeping alphabetic tokens only.

    Args:
        column: The text to tokenize. Despite the parameter name, the value is
            passed straight to ``nltk.word_tokenize``, so it is presumably a
            single string (one cell of a dataframe column) - confirm against
            callers.

    Returns:
        list: The tokens for which ``str.isalpha()`` is true; numbers,
        punctuation and mixed tokens are discarded.
    """
    return list(filter(str.isalpha, nltk.word_tokenize(column)))

In [None]:

def remove_stopwords(tokenized_column):
    """Return the tokens with English stopwords removed.

    The match is case-insensitive: the NLTK stopword list is lower-case, so a
    plain membership test would let capitalised stopwords such as "The" or
    "And" through. The surviving tokens keep their original casing.

    Args:
        tokenized_column (list): Tokens for one record, e.g. the output of
            tokenize().

    Returns:
        list: The tokens that are not English stopwords, in their original
        order.
    """
    stops = set(stopwords.words("english"))
    return [word for word in tokenized_column if word.lower() not in stops]



In [None]:
def rejoin_words(tokenized_column):
    """Glue a list of tokens back together into one space-separated string.

    Args:
        tokenized_column (list): Word tokens to combine.

    Returns:
        string: The tokens joined by single spaces (empty string for an
        empty list).
    """
    separator = " "
    return separator.join(tokenized_column)


In [None]:
# Text-cleaning pipeline: tokenize -> drop stopwords -> rejoin -> lower-case.
# Each step reads the column written by the previous one.
df[config.tokenized_ehr_column] = df[config.ehr_column].apply(tokenize)
df[config.stopword_ehr_column] = df[config.tokenized_ehr_column].apply(remove_stopwords)
df[config.cleaned_ehr_column] = df[config.stopword_ehr_column].apply(rejoin_words)
df[config.cleaned_ehr_column] = df[config.cleaned_ehr_column].astype(str).str.lower()

In [None]:
# Preview five random rows to eyeball the result of the preprocessing steps.
df.sample(5)

# Helper Functions

In [None]:
def df_split(df:pd.DataFrame, split_ratio: float):
    """
    Shuffle a dataframe's text/label pairs and split them into two parts.

    Text is read from the ``config.cleaned_ehr_column`` column and labels from
    the ``config.target_column`` column. Both lists are shuffled with
    identically seeded generators, so text/label pairs stay aligned and the
    split is reproducible.

    Parameters:
        df: pandas dataframe holding the text and label columns
        split_ratio: float between 0.0 and 1.0; the proportion of rows placed
            in the second (validation) split

    Returns:
        train_samples: list of strings in the training dataset
        val_samples: list of strings in the validation dataset
        train_labels: list of labels in the training dataset
        val_labels: list of labels in the validation dataset

    Raises:
        ValueError: if split_ratio is outside the range [0.0, 1.0]
    """
    if not 0.0 <= split_ratio <= 1.0:
        raise ValueError(
            f"split_ratio must be between 0.0 and 1.0, got {split_ratio!r}"
        )

    text = df[config.cleaned_ehr_column].values.tolist()
    targets = df[config.target_column].values.tolist()

    # Two generators with the same seed produce the same permutation for two
    # lists of equal length, which keeps samples and labels paired up.
    np.random.RandomState(SEED).shuffle(text)
    np.random.RandomState(SEED).shuffle(targets)

    num_validation_samples = int(split_ratio * len(text))
    # Slice by an explicit index: a negative-index slice breaks when the
    # validation count is 0 (text[:-0] is empty and text[-0:] is everything).
    split_at = len(text) - num_validation_samples

    train_samples = text[:split_at]
    val_samples = text[split_at:]
    train_labels = targets[:split_at]
    val_labels = targets[split_at:]

    return train_samples, val_samples, train_labels, val_labels

In [None]:
# Rows without a label cannot be used for supervised training.
has_label = df[config.target_column].notna()
df = df[has_label]

# First split: hold out 20% of the labelled rows.
train_samples, val_samples, train_labels, val_labels = df_split(df, 0.2)

# Second split: 30% of the held-out rows become the test set, the remainder
# stays as the validation set.
holdout_df = pd.DataFrame({
    config.cleaned_ehr_column: val_samples,
    config.target_column: val_labels,
})
val_samples, test_samples, val_labels, test_labels = df_split(holdout_df, 0.3)

print(f"Total size of the dataset: {len(df)}.")
print(f"Training dataset: {len(train_samples)}.")
print(f"Validation dataset: {len(val_samples)}.")
print(f"Test dataset: {len(test_samples)}.")

In [None]:
def make_embedding_matrix(train_samples, val_samples, embeddings_index):
    """
    Build the embedding matrix for the embedding layer and the text
    vectorizer that produces the matching token indices.

    The vectorizer's vocabulary is learned from the training samples only.

    Parameters:
        train_samples: list of strings in the training dataset
        val_samples: list of strings in the validation dataset (accepted for
            interface compatibility; not used)
        embeddings_index: Python dictionary mapping a word to its embedding
            vector; all vectors are assumed to share one length

    Returns:
        embedding_matrix: array of shape (num_tokens, embedding_dim), where
            num_tokens is the size of the vectorizer vocabulary and
            embedding_dim is the length of the vectors in embeddings_index.
            Row i holds the vector of vocabulary entry i, or zeros when the
            entry has no pre-trained vector.
        vectorizer: the adapted TextVectorization layer

    Raises:
        ValueError: if embeddings_index is empty
    """
    if not embeddings_index:
        raise ValueError("embeddings_index is empty; cannot infer the embedding dimension.")

    # NOTE(review): max_tokens is hard-coded to 30000 while config.vocab_size
    # (20000) is never used - confirm which value is intended.
    vectorizer = tf.keras.layers.TextVectorization(max_tokens=30000, output_sequence_length=config.max_len)
    text_ds = tf.data.Dataset.from_tensor_slices(train_samples).batch(128)
    vectorizer.adapt(text_ds)

    voc = vectorizer.get_vocabulary()
    num_tokens = len(voc)

    # Take the dimension from any stored vector rather than relying on the
    # word 'the' being present in the index.
    embedding_dim = len(next(iter(embeddings_index.values())))

    # Start from all zeros: words missing from the embedding index keep a
    # zero row.
    embedding_matrix = np.zeros((num_tokens, embedding_dim))

    hits = 0
    misses = 0
    for i, word in enumerate(voc):
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            hits += 1
        else:
            misses += 1

    print(f"Converted {hits} words ({misses} misses).")

    return embedding_matrix, vectorizer

In [None]:
path_to_glove_file = '../timeseries_nlp/data/glove.6B.100d.txt'

# Maps each word to its pre-trained embedding vector (float32 array).
embeddings_index = {}

# The context manager closes the file even if parsing a line fails.
with open(path_to_glove_file, 'r', encoding='utf8') as glove_file:
    for line in glove_file:
        # Each line is "<word> <coef> <coef> ...": the first entry is the
        # word, the rest are the components of its embedding vector.
        word, *coefficients = line.rstrip().split(' ')
        if not coefficients:
            # Skip blank or malformed lines that carry no vector.
            continue
        embeddings_index[word] = np.asarray(coefficients, dtype='float32')
print("Glove data loaded! In total:",len(embeddings_index)," words.")

In [None]:

def initialize_lstm_nn(embedding_matrix):
    """Build the (uncompiled) LSTM + convolution binary classifier.

    Layer stack: frozen pre-trained embedding -> dropout (0.5) -> LSTM (128
    units, returning the full sequence) -> Conv1D (128 filters, width 3,
    ReLU) -> global max pooling -> dense (64, ReLU) -> dense (1, sigmoid).

    Args:
        embedding_matrix: 2-D array of shape (num_tokens, embedding_dim) used
            to initialise the non-trainable embedding layer.

    Returns:
        A Keras model mapping int64 token-id sequences to a single sigmoid
        output.
    """
    vocabulary_rows, vector_width = embedding_matrix.shape[0], embedding_matrix.shape[1]

    # Pre-trained vectors are loaded as constants and kept frozen.
    pretrained_embedding = Embedding(
        vocabulary_rows,
        vector_width,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=False,
    )

    token_ids = tf.keras.Input(shape=(None,), dtype="int64")
    hidden = pretrained_embedding(token_ids)
    hidden = layers.Dropout(0.5)(hidden)
    hidden = layers.LSTM(128, return_sequences=True)(hidden)
    hidden = layers.Conv1D(128, 3, activation='relu')(hidden)
    hidden = layers.GlobalMaxPooling1D()(hidden)
    hidden = layers.Dense(64, activation="relu")(hidden)
    visit_probability = layers.Dense(1, activation="sigmoid")(hidden)

    return keras.Model(token_ids, visit_probability)

In [None]:
def initialize_cnn_nn(embedding_matrix):
    """Build the (uncompiled) convolutional binary classifier.

    Layer stack: frozen pre-trained embedding -> dropout (0.5) -> Conv1D (128
    filters, width 3, ReLU) -> global max pooling -> dense (64, ReLU) ->
    dense (1, sigmoid).

    Args:
        embedding_matrix: 2-D array of shape (num_tokens, embedding_dim) used
            to initialise the non-trainable embedding layer.

    Returns:
        A Keras model mapping int64 token-id sequences to a single sigmoid
        output.
    """
    vocabulary_rows, vector_width = embedding_matrix.shape[0], embedding_matrix.shape[1]

    # Pre-trained vectors are loaded as constants and kept frozen.
    pretrained_embedding = Embedding(
        vocabulary_rows,
        vector_width,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=False,
    )

    token_ids = tf.keras.Input(shape=(None,), dtype="int64")
    hidden = pretrained_embedding(token_ids)
    hidden = layers.Dropout(0.5)(hidden)
    hidden = layers.Conv1D(128, 3, activation='relu')(hidden)
    hidden = layers.GlobalMaxPooling1D()(hidden)
    hidden = layers.Dense(64, activation="relu")(hidden)
    visit_probability = layers.Dense(1, activation="sigmoid")(hidden)

    return keras.Model(token_ids, visit_probability)

In [None]:
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D, Bidirectional
def initialize_bilstm_nn(embedding_matrix):
    """Build the (uncompiled) bidirectional-LSTM + convolution binary classifier.

    Layer stack: frozen pre-trained embedding -> dropout (0.5) ->
    bidirectional LSTM (128 units per direction, returning the full sequence)
    -> Conv1D (128 filters, width 3, ReLU) -> global max pooling -> dense
    (64, ReLU) -> dense (1, sigmoid).

    Args:
        embedding_matrix: 2-D array of shape (num_tokens, embedding_dim) used
            to initialise the non-trainable embedding layer.

    Returns:
        A Keras model mapping int64 token-id sequences to a single sigmoid
        output.
    """
    vocabulary_rows, vector_width = embedding_matrix.shape[0], embedding_matrix.shape[1]

    # Pre-trained vectors are loaded as constants and kept frozen.
    pretrained_embedding = Embedding(
        vocabulary_rows,
        vector_width,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=False,
    )

    token_ids = tf.keras.Input(shape=(None,), dtype="int64")
    hidden = pretrained_embedding(token_ids)
    hidden = layers.Dropout(0.5)(hidden)
    hidden = Bidirectional(layers.LSTM(128, return_sequences=True))(hidden)
    hidden = layers.Conv1D(128, 3, activation='relu')(hidden)
    hidden = layers.GlobalMaxPooling1D()(hidden)
    hidden = layers.Dense(64, activation="relu")(hidden)
    visit_probability = layers.Dense(1, activation="sigmoid")(hidden)

    return keras.Model(token_ids, visit_probability)

In [None]:
def initialize_bilstm_stacked_nn(embedding_matrix):
    """Build the (uncompiled) stacked bidirectional-LSTM binary classifier.

    Layer stack: frozen pre-trained embedding -> dropout (0.5) ->
    bidirectional LSTM (128 units per direction, returning the full sequence)
    -> bidirectional LSTM (128 units per direction) -> flatten -> dense (64,
    ReLU) -> dense (1, sigmoid).

    Args:
        embedding_matrix: 2-D array of shape (num_tokens, embedding_dim) used
            to initialise the non-trainable embedding layer.

    Returns:
        A Keras model mapping int64 token-id sequences to a single sigmoid
        output.
    """
    vocabulary_rows, vector_width = embedding_matrix.shape[0], embedding_matrix.shape[1]

    # Pre-trained vectors are loaded as constants and kept frozen.
    pretrained_embedding = Embedding(
        vocabulary_rows,
        vector_width,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=False,
    )

    token_ids = tf.keras.Input(shape=(None,), dtype="int64")
    hidden = pretrained_embedding(token_ids)
    hidden = layers.Dropout(0.5)(hidden)
    hidden = Bidirectional(layers.LSTM(128, return_sequences=True))(hidden)
    hidden = Bidirectional(layers.LSTM(128))(hidden)
    # NOTE(review): the second BiLSTM does not return sequences, so Flatten
    # presumably leaves the shape unchanged - confirm before removing it.
    hidden = layers.Flatten()(hidden)
    hidden = layers.Dense(64, activation="relu")(hidden)
    visit_probability = layers.Dense(1, activation="sigmoid")(hidden)

    return keras.Model(token_ids, visit_probability)

In [None]:
# Adapt the text vectorizer on the training samples and build the matching
# embedding matrix from the loaded GloVe vectors.
embedding_matrix, vectorizer = make_embedding_matrix(train_samples, val_samples, embeddings_index)

In [None]:
from keras.optimizers import Adam
def train_nn(model, train_samples, val_samples, train_labels, val_labels, vectorizer, stop = False, verbose=1):
    """Compile and fit a model on the vectorized training data.

    The model is compiled with binary cross-entropy loss, the Adam optimizer
    (learning rate 1e-3) and the "acc" metric. Batch size and the maximum
    number of epochs come from ``config``.

    Args:
        model: Keras model to train (compiled and fitted in place).
        train_samples: training texts.
        val_samples: validation texts.
        train_labels: training labels.
        val_labels: validation labels.
        vectorizer: callable turning an array of texts into token-id
            sequences (the adapted TextVectorization layer).
        stop: when true, stop early once val_loss has not improved for one
            epoch (patience=1).
        verbose: verbosity level forwarded to ``model.fit``.

    Returns:
        (model, history): the fitted model and the History object returned
        by ``model.fit``.
    """
    def _vectorize(samples):
        # One text per row, as a column of strings.
        return vectorizer(np.array([[text] for text in samples])).numpy()

    def _as_label_column(labels):
        return np.asarray(labels).astype('float32').reshape((-1,1))

    model.compile(loss="binary_crossentropy",
                  optimizer=Adam(learning_rate=1e-3),
                  metrics=["acc"])

    x_train, x_val = _vectorize(train_samples), _vectorize(val_samples)
    y_train, y_val = _as_label_column(train_labels), _as_label_column(val_labels)

    fit_options = dict(
        batch_size=config.batch_size,
        epochs=config.epochs,
        validation_data=(x_val, y_val),
        verbose=verbose,
    )
    if stop:
        fit_options["callbacks"] = [EarlyStopping(monitor='val_loss', patience=1)]

    history = model.fit(x_train, y_train, **fit_options)
    return model, history

In [None]:
# Every candidate architecture, stored as [model, display name] pairs.
models: list = [
    [initialize_cnn_nn(embedding_matrix), 'CNN'],
    [initialize_bilstm_stacked_nn(embedding_matrix), 'BILSTM_STACKED'],
    [initialize_bilstm_nn(embedding_matrix), 'BILSTM'],
    [initialize_lstm_nn(embedding_matrix), 'LSTM'],
]


In [None]:
# Train each architecture with early stopping and keep its training history
# as a [history, display name] pair for plotting.
histories: list = []
for network, network_name in models:
    print(f"Training model: {network_name}")
    print()
    trained_model, history = train_nn(network, train_samples, val_samples, train_labels, val_labels, vectorizer, stop=True)
    print()
    epochs_run = len(history.history['loss'])
    print(f"{network_name} trained for {epochs_run} epochs")
    print()
    histories.append([history, network_name])

In [None]:
# Helper that plots the training history of a Keras model.
def plot_history(history, model_name:str):
    """Plot accuracy and loss curves for one training run, side by side.

    Args:
        history: object whose ``history`` attribute is a dict with the keys
            'acc', 'val_acc', 'loss' and 'val_loss' (one value per epoch),
            such as the History returned by ``model.fit``.
        model_name: name shown in both plot titles.
    """
    recorded = history.history
    train_acc, valid_acc = recorded['acc'], recorded['val_acc']
    train_loss, valid_loss = recorded['loss'], recorded['val_loss']
    epochs_axis = range(1, len(train_acc) + 1)

    figure = plt.figure(figsize=(12,5))

    # Left panel: accuracy, on a fixed 0..1 scale.
    accuracy_axes = figure.add_subplot(121)
    accuracy_axes.plot(epochs_axis, train_acc, 'b', label='Training acc')
    accuracy_axes.plot(epochs_axis, valid_acc, 'r', label='Validation acc')
    accuracy_axes.set_title(f'Training and validation accuracy for {model_name}')
    accuracy_axes.set_ylim(0,1)
    accuracy_axes.legend()

    # Right panel: loss.
    loss_axes = figure.add_subplot(122)
    loss_axes.plot(epochs_axis, train_loss, 'b', label='Training loss')
    loss_axes.plot(epochs_axis, valid_loss, 'r', label='Validation loss')
    loss_axes.set_title(f'Training and validation loss {model_name}')
    loss_axes.legend()

In [None]:
# One figure per trained model.
for run_history, run_name in histories:
    plot_history(run_history, run_name)

In [None]:
def predict_nn(df, model):
    """Predict binary labels for the cleaned text column of a dataframe.

    The module-level ``vectorizer`` is chained in front of ``model`` so the
    combined model accepts raw strings.

    Args:
        df: DataFrame containing the ``config.cleaned_ehr_column`` column.
        model: trained Keras model that consumes token-id sequences.

    Returns:
        list: one int per row - 1 when the predicted probability is greater
        than 0.5, otherwise 0.
    """
    raw_text = keras.Input(shape=(1,), dtype="string")
    end_to_end_model = keras.Model(raw_text, model(vectorizer(raw_text)))

    probabilities = end_to_end_model.predict(df[config.cleaned_ehr_column])

    # The model has a single sigmoid output per sample; flatten to one value
    # per row before thresholding.
    above_threshold = np.asarray(probabilities).ravel() > 0.5
    return above_threshold.astype(int).tolist()

# Held-out test set in the dataframe layout predict_nn expects.
test_columns = {
    config.cleaned_ehr_column: test_samples,
    config.target_column: test_labels,
}
df1 = pd.DataFrame(test_columns)

# Report the accuracy of every trained model on the test set.
for network, network_name in models:
    accuracy_metric = tf.keras.metrics.Accuracy()
    accuracy_metric.update_state(test_labels, predict_nn(df1, network))
    print(f'{network_name} Test Model Accuracy: {accuracy_metric.result().numpy()}')