In [97]:
%pip install --upgrade pip
%pip install pandas
%pip install openpyxl
%pip install yfiles_jupyter_graphs
%pip install ipywidgets

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [98]:
import pandas as pd
import os
import json
from yfiles_jupyter_graphs import GraphWidget
import ipywidgets as widgets

In [99]:
EPOCH_ROW = 0
CYCLE_ROW = 1
CYCLE_START_ROW = 2
CYCLE_PERIOD_ROW = 3
CYCLE_END_RULE_ROW = 4
TIMING_ROW = 5
VISIT_LABEL_ROW = 6
VISIT_WINDOW_ROW = 7

HEADER_ROW = 8
FIRST_ACTIVITY_ROW = 9

ACTIVITY_COL = 0
CHILD_ACTIVITY_COL = 1
BC_COL = 2
FIRST_VISIT_COL = 3

In [100]:
def get_cycle_cell(df, row_index, col_index):
  is_null = pd.isnull(df.iloc[row_index, col_index])
  if is_null:
    return "", True
  else:
    value = str(df.iloc[row_index, col_index])
    if value.upper() == "-":
      return "", True
    else:
      return value, False

def previous_index(index):
  if index == 0:
    return 0
  else:
    return index - 1

def build_cycle_record(df, index, col_index, cycle):
  cycle_start_index = index
  cycle_start, is_null = get_cycle_cell(df, CYCLE_START_ROW, col_index)
  cycle_period, is_null = get_cycle_cell(df, CYCLE_PERIOD_ROW, col_index)
  cycle_end_rule, is_null = get_cycle_cell(df, CYCLE_END_RULE_ROW, col_index)
  return { 
    'start_index': cycle_start_index, 
    'cycle': cycle, 
    'start': cycle_start, 
    'period': cycle_period, 
    'end_rule': cycle_end_rule 
  }


In [101]:
def extract_cycles(df):
  cycles = []
  timepoint_index = -1
  cycle_start_index = None
  in_cycle = False
  prev_cycle = None
  for col_index in range(df.shape[1]):
    if col_index >= FIRST_VISIT_COL:
      timepoint_index += 1
      cycle, cycle_is_null = get_cycle_cell(df, CYCLE_ROW, col_index)
      if cycle_is_null:
        if in_cycle:
          cycle_record['end_index'] = previous_index(timepoint_index)
          cycles.append(cycle_record)
          in_cycle = False
        else:
          pass # Do nothing
      else:
        cycle = str(cycle)
        if not in_cycle:
          in_cycle = True
          cycle_record = build_cycle_record(df, timepoint_index, col_index, cycle)
        elif prev_cycle == cycle:
          pass # Do nothing
        else:
          cycle_record['end_index'] = previous_index(timepoint_index)
          cycles.append(cycle_record)
          cycle_record = build_cycle_record(df, timepoint_index, col_index, cycle)
      prev_cycle = cycle
  return cycles


In [102]:
def get_timing_cell(df, row_index, col_index):
  is_null = pd.isnull(df.iloc[row_index, col_index])
  if is_null:
    return "", True
  else:
    return df.iloc[row_index, col_index], False

def get_relative_ref(part):
  if len(part) > 1:
    return int(part[1:])
  else:
    return 1

def get_timing_type(df, col_index):
  timing_type = ""
  rel_ref = 0
  timing_value = ""
  timing_info, timing_info_is_null = get_timing_cell(df, TIMING_ROW, col_index)
  if not timing_info_is_null:
    timing_parts = timing_info.split(":")
    if timing_parts[0].upper()[0] == "A":
      timing_type = "anchor"
      rel_ref = 0
    if timing_parts[0].upper()[0] == "P":
      timing_type = "previous"
      rel_ref = get_relative_ref(timing_parts[0]) * -1
    elif timing_parts[0].upper()[0] == "N":
      timing_type = "next"
      rel_ref = get_relative_ref(timing_parts[0])
    elif timing_parts[0].upper()[0] == "C":
      timing_type = "cycle start"
      rel_ref = get_relative_ref(timing_parts[0])
    if len(timing_parts) == 2:
      timing_value = timing_parts[1].strip()
  #print("TIMING: col_index (%s) - FIRST_VISIT_COL (%s) + rel_ref (%s)" % (col_index, FIRST_VISIT_COL, rel_ref))
  return { 'type': timing_type, 'ref': col_index - FIRST_VISIT_COL + rel_ref, 'value': timing_value, 'cycle': None }

In [103]:
def extract_timepoints(df):
  timepoints = []
  for col_index in range(df.shape[1]):
    if col_index >= FIRST_VISIT_COL:
      record = get_timing_type(df, col_index)
      timepoints.append(record)
  return timepoints

In [104]:
def get_encounter_cell(df, row_index, col_index):
  is_null = pd.isnull(df.iloc[row_index, col_index])
  if is_null:
    return "", True
  else:
    return df.iloc[row_index, col_index], False

def get_encounter_details(df, col_index):
  label = ""
  window = ""
  label, label_is_null = get_encounter_cell(df, VISIT_LABEL_ROW, col_index)
  window, window_is_null = get_encounter_cell(df, VISIT_WINDOW_ROW, col_index)
  return { 'label': label, 'window': window }

In [105]:
def extract_encounters(df):
  encounters = []
  for col_index in range(df.shape[1]):
    if col_index >= FIRST_VISIT_COL:
      record = get_encounter_details(df, col_index)
      encounters.append(record)
  return encounters

In [106]:
def get_activity_cell(df, row_index, col_index):
  is_null = pd.isnull(df.iloc[row_index, col_index])
  if is_null:
    return "", True
  else:
    value = df.iloc[row_index, col_index]
    if value == '-':
      return "", True
    else:
      return df.iloc[row_index, col_index], False

def get_observation_cell(df, row_index, col_index):
  is_null = pd.isnull(df.iloc[row_index, col_index])
  if is_null:
    return "", "", True
  else:
    value = df.iloc[row_index, col_index]
    if value == '-':
      return "", "", True
    else:
      parts = value.split(':')
      if parts[0].lower() == "bc":
        return "bc", parts[1], False
      else:
        return "", "", True

In [107]:
class DDFVisual():

  def __init__(self):
    self.nodes = []
    self.edges = []
    self.add_edges = []
    self.node_index = 1
    self.edge_index = 1
    self.id_node_index_map = {}
    self.type_id_field_map = { 
      'Entry': 'entryId',
      'Exit': 'exitId',
      'Timeline': 'timelineId',
      'Timepoint': 'timepointId',
      'Timing': 'timingId',
      'Condition': 'timepointId',
      'CycleStart': 'cycleStartId',
      'StudyDesign': 'studyDesignId',
      'Activity': 'activityId',
      'Encounter': 'encounterId',
      'BCSurrogate': 'bcSurrogateId',
      'Study': 'studyId'
    }
    self.edge_attributes = [
      'relativeTo',
      'nextTimepointId',
      'cycleId',
      'timepointActivityIds',
      'timepointEncounterId',
      'activityBCSurrogates',
      'activityBCCategories',
      'activityBCs',
    ]
    
  def get_id_field_and_klass(self, node):
    klass = node['_type']
    return self.type_id_field_map[klass], klass

  def draw(self, json):
    self.process_node(json)
    for edge in self.add_edges:
      if edge['end'] in self.id_node_index_map:
        edge['id'] = self.edge_index
        edge['end'] = self.id_node_index_map[edge['end']]
        self.edges.append(edge)
        self.edge_index += 1
      else:
        print("***** %s -edge-> %s *****" % (edge['start'], edge['end']))
    return self.nodes, self.edges
  
  def process_node(self, node):
    if type(node) == list:
      result = []
      for item in node:
        indexes = self.process_node(item)
        result = result + indexes
      return result
    elif type(node) == dict:
      if node == {}:
        return []
      properties = {}
      id_field, klass = self.get_id_field_and_klass(node)
      if node[id_field] in self.id_node_index_map:
        return [self.id_node_index_map[node[id_field]]]
      this_node_index = self.node_index
      self.node_index += 1
      for key, value in node.items():
        if key in self.edge_attributes:
          if type(value) == list:
            for item in value:
              self.add_edges.append( { 'start': this_node_index, 'end': item, 'properties': {'label': key}})
          else:
            self.add_edges.append( { 'start': this_node_index, 'end': value, 'properties': {'label': key}})
        else:
          indexes = self.process_node(value)
          if indexes == []:
            properties[key] = value
          else:
            for index in indexes:
              self.edges.append( {'id': self.edge_index, 'start': this_node_index, 'end': index, 'properties': {'label': key}})
              self.edge_index += 1
      properties['node_type'] = klass
      properties['label'] = node[id_field]
      self.nodes.append({ 'id': this_node_index, 'properties': properties })
      self.id_node_index_map[properties[id_field]] = this_node_index
      return [this_node_index]
    else:
      return []

In [108]:
def extract_activities_and_bcs(df):
  activities = []
  activities_bc_map = {}
  row_activities_map = []
  prev_activity = None
  for row_index, col_def in df.iterrows():
    if row_index >= FIRST_ACTIVITY_ROW:
      activity, activity_is_null = get_activity_cell(df, row_index, CHILD_ACTIVITY_COL)
      if activity_is_null:
        if not prev_activity == None:
          row_activities_map.append(prev_activity)
          activity = prev_activity
      else:
        activities.append(activity)
        row_activities_map.append(activity)
      prev_activity = activity
      obs_type, obs_name, obs_is_null = get_observation_cell(df, row_index, BC_COL)
      if not obs_is_null:
        if obs_type == "bc":
          if not activity in activities_bc_map:
            activities_bc_map[activity] = { 'bc': [] }  
          activities_bc_map[activity]['bc'].append(obs_name)
  return { 
    'activities': activities,
    'activity_bc_map': activities_bc_map,
    'row_activities_map': row_activities_map
  }

def extract_timepoint_activities_map(df, timepoints, activities, row_activities_map):
  timepoint_activity_map = []
  activity_dict = {}
  for activity in activities:
    activity_dict[activity] = False
  for tp in timepoints:
    timepoint_activity_map.append(dict(activity_dict))
  for index in range(df.shape[1]):
    if index >= FIRST_VISIT_COL:
      column = df.iloc[:, index]
      row = 0
      for col in column:
        if row >= FIRST_ACTIVITY_ROW:
          if not pd.isnull(col):
            if col.upper() == "X":
              activity = row_activities_map[row - FIRST_ACTIVITY_ROW]
              tp_index = index - FIRST_VISIT_COL
              timepoint_activity_map[tp_index][activity] = True
        row += 1
  return timepoint_activity_map


In [109]:
class DDFJson():
  
  def __init__(self):
    self.id_index = { 'entry': 0, 'exit': 0, 'timepoint': 0, 'timeline': 0, 'timing': 0, 'study_design': 0, 'study': 0, 'activity': 0, 'encounter': 0, 'bc_surrogate': 0 }
    self.dicts = {}

  def increment_index(self, name):
    self.id_index[name] += 1

  def build_id(self, name):
    self.increment_index(name)
    return "%s_%s" % (name, self.id_index[name])

  def add_entry(self, description, timepoint_id):
    id = self.build_id('entry')
    result = { '_type': 'Entry', 'entryId': id, 'entryDescription': description, 'nextTimepointId': timepoint_id }
    self.dicts[id] = result
    return result

  def add_exit(self):
    id = self.build_id('exit')
    result = { '_type': 'Exit', 'exitId': id }
    self.dicts[id] = result
    return result

  def add_timepoint(self, previous_timepoint_id, timing, activities, encounter):
    id = self.build_id('timepoint')
    result = { '_type': 'Timepoint', 'timepointId': id, 'nextTimepointId': None, 'scheduledAt': timing, 'timepointActivityIds': activities, 'timepointEncounterId': encounter }
    self.dicts[id] = result
    if not previous_timepoint_id == None:
      self.dicts[previous_timepoint_id]['nextTimepointId'] = id
    return result

  def add_previous_timing(self, value, relative_to_from, window, to_id):
    id = self.build_id('timing')
    result = { '_type': 'Timing', 'timingId': id, 'type': "after", 'value': value, 'relativeToFrom': relative_to_from, 'window': window, 'relativeTo': to_id }
    self.dicts[id] = result
    return result

  def add_next_timing(self, value, relative_to_from, window, to_id):
    id = self.build_id('timing')
    result = { '_type': 'Timing', 'timingId': id, 'type': "next", 'value': value, 'relativeToFrom': relative_to_from, 'window': window, 'relativeTo': to_id }
    self.dicts[id] = result
    return result

  def add_anchor_timing(self, value, cycle=""):
    id = self.build_id('timing')
    result = { '_type': 'Timing', 'timingId': id, 'type': "anchor", 'value': value, 'cycle': cycle, 'relativeToFrom': None, 'window': None, 'relativeTo': None }
    self.dicts[id] = result
    return result

  def add_condition_timing(self, value, to_id):
    id = self.build_id('timing')
    result = { '_type': 'Condition', 'conditionId': id, 'type': "condition", 'value': value, 'relativeToFrom': None, 'window': None, 'relativeTo': to_id }
    self.dicts[id] = result
    return result

  def add_cycle_start_timing(self, value):
    id = self.build_id('timing')
    result = { '_type': 'CycleStart', 'cycleStartId': id, 'type': "cycle start", 'value': value, 'relativeToFrom': None, 'window': None, 'relativeTo': None }
    self.dicts[id] = result
    return result

  def add_timeline(self, entry, timepoints, exit):
    id = self.build_id('timeline')
    result = { '_type': 'Timeline', 'timelineId': id, 'timelineEntry': entry, 'timelineTimepoints': timepoints, 'timelineExit': exit }
    self.dicts['id'] = result
    return result
  
  def add_activity(self, name, description, conditional=False, conditional_reason="", surrogates=[]):
    id = self.build_id('activity')
    result = { '_type': 'Activity', 'activityId': id, 'activityName': name, 'activityDescription': description, 'activityIsConditional': conditional, 'activityConditionalReason': conditional_reason, 'bcSurrogates': surrogates }
    self.dicts['id'] = result
    return result

  def add_encounter(self, name, description, enc_type, env_setting, contact_modes):
    id = self.build_id('encounter')
    result = { '_type': 'Encounter', 'encounterId': id, 'encounterName': name, 'encounterDescription': description, 'encounterType': enc_type, 'encounterEnvironmentalSetting': env_setting, 'encounterContactMode': contact_modes }
    self.dicts['id'] = result
    return result

  def add_biomedical_concept_surrogate(self, name, description, reference):
    id = self.build_id('bc_surrogate')
    result = { '_type': 'BCSurrogate', 'bcSurrogateId': id, 'bcSurrogateName': name, 'bcSurrogateDescription': description, 'bcSurrogateReference': reference }
    self.dicts['id'] = result
    return result

  def add_study_design(self, intent, types, model, therapeutic_areas, cells, indications, objectives, populations, interventions, workflows, estimands, encounters, activities, surrogates):
    id = self.build_id('study_design')
    result = { '_type': 'StudyDesign', 'studyDesignId': id, 'studyWorkflows': workflows, 'activities': activities, 'encounters': encounters, 'bcSurrogates': surrogates }
    self.dicts['id'] = result
    return result
    
  def add_study(self, title, version, type, phase, ta, rationale, acronym, identifiers, protocols, designs):
    id = self.build_id('study')
    result = { '_type': 'Study', 'studyId': id, 'studyTitle': title, 'studyVersion': version, 'studyType': type, 'studyPhase': phase, 'businessTherapueticAreas': ta, 'studyRationale': rationale, 'studyAcronym': acronym, 'studyIdentifiers': identifiers, 'studyProtocolVersions': protocols, 'studyDesigns': designs }
    self.dicts['id'] = result
    return result

  def process_timepoints(self, timepoints, cycles, activities, tp_activities, encounters):
    tps = []
    acts = []
    encs = []
    bcs = []
    acts_map = {}
    timing = []
    cycle_offset = 0
    for index, timepoint in enumerate(timepoints):
      timepoint['activity_index'] = index
      timepoint['encounter_index'] = index
    for cycle in cycles:
      start_index = cycle['start_index'] + cycle_offset
      timepoints.insert(start_index, { 'type': 'anchor', 'ref': 0, 'value': cycle['start'], 'activity_index': None, 'encounter_index': None, 'cycle': cycle['cycle'] })
      cycle_offset += 1
      end_index = cycle['end_index'] + cycle_offset + 1
      timepoints.insert(end_index, { 'type': 'previous', 'ref': end_index - 1, 'value': cycle['period'], 'activity_index': None, 'encounter_index': None, 'cycle': None })
      cycle_offset += 1
      end_index = cycle['end_index'] + cycle_offset + 1
      timepoints.insert(end_index, { 'type': 'condition', 'ref': start_index , 'value': cycle['end_rule'], 'activity_index': None, 'encounter_index': None, 'cycle': None })
      cycle_offset += 1
    previous_tp_id = None
    for activity in activities['activities']:
      a_bcs = []
      if activity in activities['activity_bc_map']:
        for a in activities['activity_bc_map'][activity]['bc']:
          bc = self.add_biomedical_concept_surrogate(a, a, "")
          a_bcs.append(bc['bcSurrogateId'])
          bcs.append(bc)
      acts.append(self.add_activity(activity, activity, False, "", a_bcs))
      acts_map[activity] = acts[-1]['activityId']
    for encounter in encounters:
      encs.append(self.add_encounter(encounter['label'], encounter['label'], None, None, []))
    for timepoint in timepoints:
      activity_ids = []
      encounter_id = None
      if not timepoint['activity_index'] == None:
        source = tp_activities[timepoint['activity_index']]
        for k, v in source.items():
          if v:
            activity_ids.append(acts_map[k])
      if not timepoint['encounter_index'] == None:
        encounter_id = encs[timepoint['encounter_index']]['encounterId']
      tps.append(self.add_timepoint(previous_tp_id, None, activity_ids, encounter_id))
      previous_tp_id = tps[-1]['timepointId']
    for index, timepoint in enumerate(timepoints):
      if timepoint['type'] == 'condition':
        tps[index]['cycleId'] = tps[timepoint['ref']]['timepointId']
        tps[index]['_type'] = 'Condition'
    for timepoint in timepoints:
      if timepoint['type'] == 'next':
        timing.append(self.add_next_timing(timepoint['value'], 'StartToStart', None, tps[timepoint['ref']]['timepointId']))
      elif timepoint['type'] == 'previous':
        timing.append(self.add_previous_timing(timepoint['value'], 'StartToStart', None, tps[timepoint['ref']]['timepointId']))
      elif timepoint['type'] == 'anchor':
        timing.append(self.add_anchor_timing(timepoint['value'], timepoint['cycle']))
      elif timepoint['type'] == 'condition':
        #timing.append(self.add_condition_timing(timepoint['value']))
        timing.append({})
      elif timepoint['type'] == 'cycle start':
        timing.append(self.add_cycle_start_timing(timepoint['value']))
      elif timepoint['type'] == '':
        timing.append({})
    for index, tp in enumerate(tps):
      tp['scheduledAt'] = timing[index]
    entry = self.add_entry('Main timeline', tps[0]['timepointId'])
    exit = self.add_exit()
    tps[-1]['exit'] = exit
    timeline = self.add_timeline(entry, tps, exit)
    study_design = self.add_study_design(
      intent=None, 
      types=[], 
      model=None, 
      therapeutic_areas=[], 
      cells=[], 
      indications=[], 
      objectives=[], 
      populations=[], 
      interventions=[], 
      workflows=[timeline],
      estimands=[], 
      encounters=encs, 
      activities=acts,
      surrogates=bcs
    ) 
    return self.add_study("Berber 2", "1", None, None, [], "", "TEST", [], [], [study_design])
  
  def export(self, node):
    return self.export_node(node)

  def export_node(self, node):
    if type(node) == list:
      result = []
      for item in node:
        result.append(self.export_node(item))
      return result
    elif type(node) == dict:
      result = {}
      for key, value in node.items():
        if key.startswith('_'):
          continue
        result[key] = self.export_node(value)
      return result
    else:
      return node


In [110]:
def save_as_file(data, filename):
  with open('source_data/%s.json' % (filename), 'w', encoding='utf-8') as outfile:
    json.dump(data, outfile, indent=2)

In [111]:
widgets.Text(
    value='',
    placeholder='Type something',
    description='Example:',
    disabled=False
)

#study = "Roche Phase 3 NCT04320615"
#study = "cycles_1_v2"
#study = "simple_1"
study = "simple_2"

notebook_path = os.path.abspath("notebook.ipynb")
file_path = os.path.join(os.path.dirname(notebook_path), "source_data/%s.xlsx" % (study))
df = pd.read_excel(file_path, header=None)
df = df.fillna(method='ffill', axis=1)
cycles = extract_cycles(df)
timepoints = extract_timepoints(df)
encounters = extract_encounters(df)
activities = extract_activities_and_bcs(df)
tp_activities = extract_timepoint_activities_map(df, timepoints, activities['activities'], activities['row_activities_map'])
#print("CYCLES", cycles)
#print("TIMEPOINTS", timepoints)
#print("ENCOUNTERS", encounters)
#print("ACTIVITIES", activities)
#print("TP ACTIVITIES", tp_activities)

x = DDFJson()
node = x.process_timepoints(timepoints, cycles, activities, tp_activities, encounters)
save_as_file(x.export(node), study)

y = DDFVisual()
nodes, edges = y.draw(node)


***** 6 -edge-> None *****
***** 7 -edge-> None *****


In [112]:

def custom_node_color(index: int, node: dict):
  if 'node_type' in node['properties']:
    if node['properties']['node_type'] == 'Entry':
      return 'black'
    elif node['properties']['node_type'] == 'Exit':
      return 'black'
    elif node['properties']['node_type'] == 'Timeline':
      return '#3F6AFC'
    elif node['properties']['node_type'] == 'Condition':
      return '#ABB2B9'
    elif node['properties']['node_type'] == 'CycleStart':
      return '#3F6AFC'
    elif node['properties']['node_type'] == 'Timing':
      return '#6495ED'
    elif node['properties']['node_type'] == 'Timepoint':
      return '#ABB2B9'
    elif node['properties']['node_type'] == 'Activity':
      return '#1BA62F'
    elif node['properties']['node_type'] == 'BCSurrogate':
      return '#1BA62F'
    elif node['properties']['node_type'] == 'Encounter':
      return '#E53F2F'
    elif node['properties']['node_type'] == 'StudyDesign':
      return '#E52FDA'
    elif node['properties']['node_type'] == 'Study':
      return '#E52FDA'
    else:
      return 'white'
  else: 
    return 'white'

def custom_node_style(index: int, node: dict):
  if 'node_type' in node['properties']:
    if node['properties']['node_type'] == 'Entry' or node['properties']['node_type'] == 'Exit':
      return {'image': 'https://raw.githubusercontent.com/data4knowledge/timepoints/main/images/pill_black.svg'}
    elif node['properties']['node_type'] == 'Timeline':
      return { 'shape': 'ellipse' }
    elif node['properties']['node_type'] == 'Timing':
      return { 'shape': 'ellipse' }
    elif node['properties']['node_type'] == 'Condition':
      return { 'shape': 'diamond' }
    elif node['properties']['node_type'] == 'CycleStart':
      return { 'shape': 'hexagon2' }
    elif node['properties']['node_type'] == 'Timepoint':
      return { 'shape': 'ellipse' }
    elif node['properties']['node_type'] == 'Activity':
      return { 'shape': 'ellipse' }
    else:
      return { 'shape': 'ellipse' }
  else: 
    return { 'shape': 'ellipse' }

graph_widget = GraphWidget()
graph_widget.orthogonal_layout()
graph_widget.set_directed(True)

graph_widget.set_nodes(nodes)
graph_widget.set_edges(edges)
graph_widget.set_node_color_mapping(custom_node_color)
graph_widget.set_node_styles_mapping(custom_node_style)
graph_widget

GraphWidget(layout=Layout(height='500px', width='100%'))