# Preeclampsia Patient Flow Analysis with Synthea Data 

Based on:  
https://github.com/synthetichealth/synthea  
https://github.com/Neo4jSolutions/patient-journey-model/tree/master/ingest  
https://github.com/ccdgui/Patient_Flows_Sankey  
  

In [2]:
import numpy as np
import re
import datetime

In [3]:
from py2neo import  Graph, Node
import pandas as pd

![](../synthea/schema.png)

In [4]:
import importlib
import secrets 

In [5]:
db = Graph(scheme="bolt", host=secrets.host, port=secrets.port, secure=False, 
                auth=(secrets.user, secrets.password))

In [6]:
query="""MATCH (c:Condition {description:"Preeclampsia"}) <-[:HAS_CONDITION]-(e)
return count(e)
"""
df=db.run(query).to_data_frame()
df

Unnamed: 0,count(e)
0,1140


In [7]:
query="""MATCH (c:Condition {description:"Preeclampsia"}) <-[:HAS_CONDITION]-(e)
with e
MATCH (patient)-[:HAS_ENCOUNTER]-(e)-[:NEXT*]->(e2)-[:HAS_CONDITION|:HAS_DRUG|:HAS_CARE_PLAN|:HAS_ALLERGY|:HAS_PROCEDURE]->(x) 
WHERE e2.date <= ( e.date + duration("P90D") )
OPTIONAL MATCH (e2)-[:HAS_END]->(end)
RETURN labels(x)[0] AS eventType, x.description AS name, 
     e2.date AS startDate,coalesce(end.date, "NA") AS endDate, id(patient) as patient, e2.isEnd as isEnd
     ORDER BY startDate
      """

df=db.run(query).to_data_frame()

In [8]:
df.loc[df['patient']==34947]

Unnamed: 0,eventType,name,startDate,endDate,patient,isEnd
607,Procedure,Evaluation of uterine fundal height,2012-03-19T20:41:33.000000000+00:00,,34947,False
608,Procedure,Streptococcus pneumoniae group B antigen test,2012-03-19T20:41:33.000000000+00:00,,34947,False
609,Procedure,Auscultation of the fetal heart,2012-03-19T20:41:33.000000000+00:00,,34947,False
634,Procedure,Auscultation of the fetal heart,2012-04-02T20:41:33.000000000+00:00,,34947,False
635,Procedure,Evaluation of uterine fundal height,2012-04-02T20:41:33.000000000+00:00,,34947,False
651,Procedure,Evaluation of uterine fundal height,2012-04-09T20:41:33.000000000+00:00,,34947,False
652,Procedure,Auscultation of the fetal heart,2012-04-09T20:41:33.000000000+00:00,,34947,False
655,Procedure,Auscultation of the fetal heart,2012-04-16T20:41:33.000000000+00:00,,34947,False
656,Procedure,Evaluation of uterine fundal height,2012-04-16T20:41:33.000000000+00:00,,34947,False
666,CarePlan,Routine antenatal care,2012-04-23T00:00:00.000000000+00:00,,34947,True


In [9]:
df=df.drop_duplicates()

In [10]:
df=df[df['eventType']=="Procedure"]


In [11]:
df[df['name']=="Preeclampsia"]

Unnamed: 0,eventType,name,startDate,endDate,patient,isEnd


In [12]:
df['startDate']=df['startDate'].apply(lambda x: pd.to_datetime(str(x.year)+"-"+str(x.month)+"-"+str(x.day)))

In [13]:
for p in df.patient.unique():
    df.loc[df['patient']==p,'delta']=df.loc[df['patient']==p,'startDate']-df.loc[df['patient']==p,'startDate'].shift(1)

In [14]:
df['delta']=df['delta'].apply(lambda x: pd.Timedelta(x).days)

In [15]:
df['event']=np.nan
for p in df.patient.unique():
    df.loc[df['patient']==p,'event']=np.where(df.loc[df['patient']==p,'delta']>90,1,0)
    df.loc[df['patient']==p,'event']=df.loc[df['patient']==p,'event'].cumsum()

In [16]:
df.loc[df['patient']==34947]

Unnamed: 0,eventType,name,startDate,endDate,patient,isEnd,delta,event
607,Procedure,Evaluation of uterine fundal height,2012-03-19,,34947,False,,0.0
608,Procedure,Streptococcus pneumoniae group B antigen test,2012-03-19,,34947,False,0.0,0.0
609,Procedure,Auscultation of the fetal heart,2012-03-19,,34947,False,0.0,0.0
634,Procedure,Auscultation of the fetal heart,2012-04-02,,34947,False,14.0,0.0
635,Procedure,Evaluation of uterine fundal height,2012-04-02,,34947,False,0.0,0.0
651,Procedure,Evaluation of uterine fundal height,2012-04-09,,34947,False,7.0,0.0
652,Procedure,Auscultation of the fetal heart,2012-04-09,,34947,False,0.0,0.0
655,Procedure,Auscultation of the fetal heart,2012-04-16,,34947,False,7.0,0.0
656,Procedure,Evaluation of uterine fundal height,2012-04-16,,34947,False,0.0,0.0
681,Procedure,Epidural anesthesia,2012-04-23,,34947,False,7.0,0.0


In [17]:
remove=['Insertion of subcutaneous contraceptive',
'Review of systems (procedure)',
'Extraction of wisdom tooth',
'Insertion of intrauterine contraceptive device',
'Throat culture (procedure)',
'Nasal sinus endoscopy (procedure)',
'Face mask (physical object)',
'Oxygen administration by mask (procedure)',
'Placing subject in prone position (procedure)',
'Plain chest X-ray (procedure)',
'Medication Reconciliation (procedure)',
'Bilateral tubal ligation',
'Movement therapy (regime/therapy)',
'Subcutaneous immunotherapy',
'Spirometry (procedure)',
'Appendectomy',
'Information gathering (procedure)',
'Bone immobilization',
'Cognitive and behavioral therapy (regime/therapy)',
'Brief general examination (procedure)',
'Admission to burn unit', 'Allergy screening test',
'Sputum examination (procedure)',
'Suture open wound',
'Exercise class',
'Measurement of respiratory function (procedure)',
'Kitchen practice']

In [18]:
df=df.loc[~df['name'].isin(remove)]

In [19]:
df.name.unique()

array(['Auscultation of the fetal heart',
       'Streptococcus pneumoniae group B antigen test',
       'Evaluation of uterine fundal height', 'Childbirth',
       'Physical examination following birth', 'Depression screening',
       'Instrumental delivery', 'Premature birth of newborn',
       'Epidural anesthesia', 'Episiotomy', 'Medical induction of labor',
       'Cesarean section', 'RhD passive immunization',
       'Intramuscular injection', 'Augmentation of labor',
       'Peripheral blood smear interpretation',
       'Admission to long stay hospital', 'Spontaneous breech delivery',
       'Monitoring of patient (regime/therapy)'], dtype=object)

In [20]:
df['event'].unique()

array([0., 1., 2.])

In [21]:
for p in df.patient.unique():
    
    for e in df[df['patient']==p]['event'].unique():
        new_date=(df.loc[(df['patient']==p) & (df['event']==e),"startDate"].head(1) - datetime.timedelta(30))

        df=df.append({"eventType":"Procedure",
                                          "name": "Preeclampsia",
                                          "startDate": new_date.item(),
                                          "endDate": np.nan, 
                                          "patient": p,
                                          "isEnd": "False",
                                        "delta": 0,
                                         "event": e},ignore_index=True)

In [22]:
df=df.sort_values(["patient","startDate","event"])

In [23]:
df.loc[df['patient']==34947]

Unnamed: 0,eventType,name,startDate,endDate,patient,isEnd,delta,event
5072,Procedure,Preeclampsia,2012-02-18,,34947,False,0.0,0.0
273,Procedure,Evaluation of uterine fundal height,2012-03-19,,34947,False,,0.0
274,Procedure,Streptococcus pneumoniae group B antigen test,2012-03-19,,34947,False,0.0,0.0
275,Procedure,Auscultation of the fetal heart,2012-03-19,,34947,False,0.0,0.0
291,Procedure,Auscultation of the fetal heart,2012-04-02,,34947,False,14.0,0.0
292,Procedure,Evaluation of uterine fundal height,2012-04-02,,34947,False,0.0,0.0
300,Procedure,Evaluation of uterine fundal height,2012-04-09,,34947,False,7.0,0.0
301,Procedure,Auscultation of the fetal heart,2012-04-09,,34947,False,0.0,0.0
304,Procedure,Auscultation of the fetal heart,2012-04-16,,34947,False,7.0,0.0
305,Procedure,Evaluation of uterine fundal height,2012-04-16,,34947,False,0.0,0.0


In [24]:
df['idx']=np.nan
for p in df.patient.unique():
    for e in  df.loc[df['patient']==p,"event"].unique():
        df.loc[(df['patient']==p) & (df['event']==e),'idx']=df.loc[(df['patient']==p ) & (df['event']==e)].groupby("startDate").ngroup()

In [25]:
df.loc[df['patient']==34947]

Unnamed: 0,eventType,name,startDate,endDate,patient,isEnd,delta,event,idx
5072,Procedure,Preeclampsia,2012-02-18,,34947,False,0.0,0.0,0.0
273,Procedure,Evaluation of uterine fundal height,2012-03-19,,34947,False,,0.0,1.0
274,Procedure,Streptococcus pneumoniae group B antigen test,2012-03-19,,34947,False,0.0,0.0,1.0
275,Procedure,Auscultation of the fetal heart,2012-03-19,,34947,False,0.0,0.0,1.0
291,Procedure,Auscultation of the fetal heart,2012-04-02,,34947,False,14.0,0.0,2.0
292,Procedure,Evaluation of uterine fundal height,2012-04-02,,34947,False,0.0,0.0,2.0
300,Procedure,Evaluation of uterine fundal height,2012-04-09,,34947,False,7.0,0.0,3.0
301,Procedure,Auscultation of the fetal heart,2012-04-09,,34947,False,0.0,0.0,3.0
304,Procedure,Auscultation of the fetal heart,2012-04-16,,34947,False,7.0,0.0,4.0
305,Procedure,Evaluation of uterine fundal height,2012-04-16,,34947,False,0.0,0.0,4.0


In [26]:
df.patient.nunique()


545

In [27]:
import itertools
from collections import defaultdict

In [28]:
mydict=defaultdict(list)
for p in df.patient.unique():
    for e in df.loc[df['patient']==p]['event'].unique():
        for i in df.idx.unique()[:-1]:
            pid=str(p)+"_"+str(e)
            mydict[pid] += list(itertools.product(
                df[(df['patient']==p) & (df['event']==e) & (df['idx']==i)]['name']+"_"+str(int(i)), 
              df[(df['patient']==p) & (df['event']==e) & (df['idx']==i+1)]['name']+"_"+str(int(i+1))))

In [29]:
len(mydict)

568

In [30]:
output_values=list(mydict.values())

In [31]:
len(output_values)

568

In [32]:
from collections import Counter

In [33]:
frequency = dict(Counter(x for xs in output_values for x in set(xs)))       

In [34]:
sankey = {"links": [], "nodes": []}
for i, y in frequency.items():     #links are created first, from items of frequency dictionary    
        link = dict(
            source = str(i[0]),
            target = str(i[1]),
            value = y, 
            )
        sankey["links"].append(link)     
        
        check_node = [link[x] for x in ['source', 'target']]     #nodes derived from links 'source' and 'target' 
        for x in check_node:        #append a new node, only if it does not already exists   
            if not any(d.get('name', None) == x for d in sankey["nodes"]): 
                name = dict(
                    name = x,
                    station = re.sub('[^a-zA-Z]+', '', x),
                    step = re.sub('[^0-9]+', '', x)
                )
                sankey["nodes"].append(name)  

In [35]:
sorted_nodes = sorted(sankey['nodes'], key=lambda k: (k['step']))

In [36]:
for w, node in enumerate(sorted_nodes):
    node['id'] = w
    node['color'] = 'rgba(31, 119, 180, 0.8)' 

In [37]:
len(set([x['station'] for x in sorted_nodes]))

20

In [38]:
cols=['rgb(215,48,39)','rgb(244,109,67)','rgb(253,174,97)','rgb(254,224,144)','rgb(255,255,191)','rgb(224,243,248)','rgb(171,217,233)','rgb(116,173,209)','rgb(69,117,180)',
     'rgb(197,27,125)','rgb(222,119,174)','rgb(241,182,218)','rgb(253,224,239)','rgb(247,247,247)','rgb(230,245,208)','rgb(184,225,134)','rgb(127,188,65)','rgb(77,146,33)',
      'rgb(255,247,236)','rgb(254,232,200)']#,'rgb(253,212,158)','rgb(253,187,132)','rgb(252,141,89)','rgb(239,101,72)','rgb(215,48,31)','rgb(179,0,0)','rgb(127,0,0)',
     #'rgb(178,24,43)','rgb(214,96,77)']#,'rgb(244,165,130)']#,'rgb(253,219,199)','rgb(247,247,247)','rgb(209,229,240)','rgb(146,197,222)','rgb(67,147,195)','rgb(33,102,172)']

In [39]:
len(cols)

20

In [40]:
color_dict=[{x[0]:x[1] } for x in list(zip(set([x['station'] for x in sorted_nodes]), cols))]

In [41]:
from collections import ChainMap

data = dict(ChainMap(*color_dict))

In [42]:
data['Preeclampsia']

'rgb(224,243,248)'

In [43]:
def id_lookup(node, sorted_list):
    for item in sorted_list: 
        if item['name'] == node['source']:
            return item['id']

In [44]:
for d in sankey['links']: 
    d['source_id'] = id_lookup(d, sorted_nodes)    

In [45]:
sorted_links = sorted(sankey['links'], key=lambda k: (k['source_id']))  

In [46]:
nodes = dict(
            label = [node['name'] for node in sorted_nodes],
            color = [data[node['station']] for node in sorted_nodes]
        )

In [47]:
link = dict(
            source = [nodes["label"].index(link['source']) for link in sorted_links ],
            target = [nodes["label"].index(link['target']) for link in sorted_links ],
            value = [link['value'] for link in sorted_links]
        )
            

In [48]:
data = dict(nodes=nodes,
        link=link)

In [49]:
import plotly as py
from plotly.offline import iplot

In [52]:

data_trace = dict(
    type='sankey',
    domain = dict(
      x =  [0,1],
      y =  [0,1]
    ),
    orientation = "h",
    valueformat = ".0f",
    valuesuffix = "Patients",
    node = dict(
      pad = 5,
      thickness = 10,
      line = dict(
        color = "black",
        width = 0.5
      ),
      label =  data["nodes"]["label"],
      color =  data["nodes"]["color"]
    ), 

    link = dict(
      source =  data["link"]["source"],
      target =  data["link"]["target"],
      value =  data["link"]["value"],
      label =  data["nodes"]["label"]
  )   

)

layout =  dict(
    title = "Patient Flow Analysis - Procedures 90 Days After Preeclampsia Diagnosis",
    width=1000,
    height=1000,
    font = dict(
      size = 10   )
)

fig = dict(data=[data_trace], layout=layout)
py.offline.iplot(fig, validate = False)




In [51]:
## add static png, interactive version doesn't show up on github

![](sankey.png)

In [53]:
import plotly.graph_objects as go

In [54]:
fig = go.Figure(data=[go.Sankey(
    node = dict(
      pad = 15,
      thickness = 20,
      line = dict(color = "black", width = 0.5),
      label =  data["nodes"]["label"],
      color =  data["nodes"]["color"]
    ),
    link = dict(
      source =  data["link"]["source"],
      target =  data["link"]["target"],
      value =  data["link"]["value"],
      label =  data["nodes"]["label"]
  ))])

fig.update_layout(title_text="Basic Sankey Diagram", font_size=10)
fig.show()