-
Notifications
You must be signed in to change notification settings - Fork 2
/
sankey.py
123 lines (106 loc) · 4.46 KB
/
sankey.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import pandas as pd
from collections import Counter
import json
import re
import plotly.plotly as py
# Load the two input tables used by the rest of the script.
raw_data = pd.read_csv('/home/StrobiHealth/PatientFlow/Raw Data/tbl_neu.csv')
bewegung = pd.read_csv('/home/StrobiHealth/PatientFlow/Raw Data/Bewegungsarten.csv')

# step 1: walk raw_data in row order and, for every pair of consecutive rows
# that share a case number ('FallNr'), record a (step, source, target) tuple.
# output maps case number -> list of such tuples. It starts empty (instead of
# being pre-seeded with case 1) so the first case in the file is registered
# whatever its number is.
output = {}
counter = 0  # step number inside the current case
previous_row = None  # no predecessor exists for the very first row
for _, current_row in raw_data.iterrows():
    case_number = current_row['FallNr']
    if previous_row is not None and case_number == previous_row['FallNr']:
        counter += 1  # same case as the row before: one step further
        # previous row = source, current row = target
        output[case_number].append(
            (counter, previous_row['Ort'], current_row['Ort'])
        )
    else:
        # first row of a new run of this case: restart the step count
        counter = 0
        output[case_number] = []
    previous_row = current_row
# step 2: pool the (Step, Source, Target) tuples of all cases and count how
# often each tuple occurs; duplicates inside a single case are collapsed first
output_values = list(output.values())
tuple_counts = Counter()
for case_steps in output_values:
    tuple_counts.update(set(case_steps))
frequency = dict(tuple_counts)
# step 3: build the Sankey structure from the frequency dictionary.
# Every (step, source, target) key becomes one link whose value is the tuple's
# frequency; node names have the form '<station>_<step>'.
MAX_LINK_STEP = 5  # only transitions with a step number below this are kept
sankey = {"links": [], "nodes": []}
seen_node_names = set()  # names already appended to sankey["nodes"]
for (step, source_station, target_station), count in frequency.items():
    if step >= MAX_LINK_STEP:
        continue
    # The source sits at this step, the target one step later.
    endpoints = [
        (str(source_station), str(step)),
        (str(target_station), str(step + 1)),
    ]
    names = [station + "_" + node_step for station, node_step in endpoints]
    sankey["links"].append(dict(source=names[0], target=names[1], value=count))
    # Nodes are derived from the link endpoints. 'station' and 'step' come
    # straight from the tuple instead of being parsed back out of the name,
    # so station names containing digits, spaces or non-ASCII letters survive.
    for name, (station, node_step) in zip(names, endpoints):
        if name in seen_node_names:
            continue  # append a node only the first time its name is seen
        seen_node_names.add(name)
        sankey["nodes"].append(dict(name=name, station=station, step=node_step))
# step 4: order the nodes (by step number, then by row position in the
# 'bewegung' table) and order the links by the id of their source node
def bewegung_index(station):
    """Return the index label of the first 'bewegung' row matching station.

    A row matches when its 'Bewgungsarten' value equals ``station``.
    Raises IndexError when no row matches.
    """
    is_match = bewegung['Bewgungsarten'] == station
    matching_rows = bewegung.loc[is_match]
    return matching_rows.index[0]
# Rank the nodes by (step, position of the station in 'bewegung'). The 'step'
# values are compared exactly as stored on the node. Each node is then stamped
# with its rank as 'id' and with the common node colour.
sorted_nodes = list(sankey['nodes'])
sorted_nodes.sort(
    key=lambda entry: (entry['step'], bewegung_index(entry['station']))
)
for rank, entry in enumerate(sorted_nodes):
    entry.update(id=rank, color='rgba(31, 119, 180, 0.8)')
def id_lookup(node, sorted_list):
    """Return the 'id' of the first item whose 'name' equals node['source'].

    Items are checked in list order; None is returned when nothing matches.
    """
    matching_ids = (
        entry['id'] for entry in sorted_list if entry['name'] == node['source']
    )
    return next(matching_ids, None)
# Tag every link with the id of its source node, then order the links by it.
for link_entry in sankey['links']:
    link_entry['source_id'] = id_lookup(link_entry, sorted_nodes)
sorted_links = list(sankey['links'])
sorted_links.sort(key=lambda entry: entry['source_id'])
# step 5: create data structure with node labels and link lists based on node index
# The label and colour lists are built before the dict(...) call: inside that
# call 'nodes' is only a keyword name, not a variable, so the link lists
# cannot look the labels up through it.
node_labels = [node['name'] for node in sorted_nodes]
node_colors = [node['color'] for node in sorted_nodes]

# Position of each label in node_labels (first occurrence, as list.index gives).
label_position = {}
for position, label in enumerate(node_labels):
    label_position.setdefault(label, position)

data = dict(
    nodes=dict(
        label=node_labels,
        color=node_colors,
    ),
    link=dict(
        # source/target are positions in the node label list
        source=[label_position[link['source']] for link in sorted_links],
        target=[label_position[link['target']] for link in sorted_links],
        value=[link['value'] for link in sorted_links],
    ),
)
# step 6: plot graph with plotly
# The credentials passed to sign_in are kept exactly as written in the script.
py.sign_in('xxx', 'xxxxxxx')

# Sankey trace: node and link lists are taken from the 'data' structure.
data_trace = {
    'type': 'sankey',
    'domain': {
        'x': [0, 1],
        'y': [0, 1],
    },
    'orientation': "h",
    'valueformat': ".0f",
    'valuesuffix': "Patients",
    'node': {
        'pad': 15,
        'thickness': 15,
        'line': {
            'color': "black",
            'width': 0.5,
        },
        'label': data["nodes"]["label"],
        'color': data["nodes"]["color"],
    },
    'link': {
        'source': data["link"]["source"],
        'target': data["link"]["target"],
        'value': data["link"]["value"],
        # NOTE(review): the link labels are the node label list, not a
        # per-link list - confirm this is intended.
        'label': data["nodes"]["label"],
    },
}

layout = {
    'title': "Patient Flow Analysis",
    'font': {
        'size': 10,
    },
}

fig = {'data': [data_trace], 'layout': layout}
py.iplot(fig, validate=False)