# Patient EMPI Graph


## Convert Source Data to Neptune CSV


### Find the name of your S3 bucket

In [None]:
import os
import subprocess

# S3_SOURCE_BUCKET is looked up in a bash subshell that sources ~/.bashrc first,
# because the notebook kernel's own environment may not carry it.
# - bash is invoked explicitly, since `source` is not guaranteed to exist in /bin/sh.
# - Anything ~/.bashrc prints is discarded so stdout holds only the bucket name.
# - subprocess.run waits for the child and closes its pipe, unlike a bare os.popen().
bucket_lookup = subprocess.run(
    ["bash", "-c", "source ~/.bashrc >/dev/null 2>&1; printf '%s' \"$S3_SOURCE_BUCKET\""],
    stdout=subprocess.PIPE,
    universal_newlines=True,
)
S3_BUCKET = bucket_lookup.stdout.strip()

# Fail here rather than letting later cells run `aws s3 ...` against "s3://".
if not S3_BUCKET:
    raise RuntimeError(
        "S3_SOURCE_BUCKET is not set; export it in ~/.bashrc (or the kernel "
        "environment) before running the rest of this notebook."
    )

S3_BUCKET

### Grab local copy of S3 data

In [None]:
%%bash -s "$S3_BUCKET"

# Download the contents of the source bucket (passed as $1) into ./converter_dir.
# Abort on the first failing command: if `cd converter_dir` failed, the sync and
# the `rm` below would otherwise run in the notebook's own working directory.
set -euo pipefail

bucket="${1:-}"
if [ -z "$bucket" ]; then
    echo "S3 bucket name is empty; run the bucket lookup cell first" >&2
    exit 1
fi

echo "$bucket"
mkdir -p converter_dir   # -p keeps the cell re-runnable
cd converter_dir
aws s3 sync "s3://$bucket" .
rm -rf notebook # already have it
cd converter             # fails the cell if the converter was not downloaded


### Install NPM dependencies for converter

In [None]:
%%bash

# Install nvm, Node.js 16 and the converter's npm dependencies.
# Each critical step exits on failure so that the informational `ls` at the
# end cannot hide a failed install behind its own zero exit status.

cd converter_dir/converter || exit 1

# need nvm
# Pinned to a tagged release instead of the moving `master` branch;
# -f stops curl from piping an HTTP error page into bash.
curl -fsSL https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.7/install.sh | bash
export NVM_DIR="$HOME/.nvm"
[ -s "$NVM_DIR/nvm.sh" ] && \. "$NVM_DIR/nvm.sh"  # This loads nvm
[ -s "$NVM_DIR/bash_completion" ] && \. "$NVM_DIR/bash_completion"  # This loads nvm bash_completion

# need node 16
nvm install 16 || exit 1

# need dependencies
npm install || exit 1

# let's see which node versions we have
ls -l "$NVM_DIR/versions/node"


### Run the converter

In [None]:
%%bash

# Run the converter on the gzipped OpenEMPI patient dump.
# Arguments to process.js: the input file and the token `patients`.

cd converter_dir/converter || exit 1

# Load nvm and select Node 16 explicitly. A glob such as
# ~/.nvm/versions/node/*/bin/node expands to several paths when more than one
# Node version is installed, which would pass a second node binary as the script.
export NVM_DIR="$HOME/.nvm"
[ -s "$NVM_DIR/nvm.sh" ] && \. "$NVM_DIR/nvm.sh"
nvm use 16 || exit 1

node process.js ../data/openempi_patient_db.json.gz patients


### Move converted files to S3

In [None]:
%%bash -s "$S3_BUCKET"

# Upload the four converted Neptune CSV files to s3://<bucket>/data/converted/.
# Stop at the first failure: without this, only the exit status of the last
# `aws s3 cp` reaches the notebook and earlier failed uploads go unnoticed.
set -euo pipefail

bucket="${1:-}"
if [ -z "$bucket" ]; then
    echo "S3 bucket name is empty; run the bucket lookup cell first" >&2
    exit 1
fi

cd converter_dir/converter

for kind in identifier patient recordLink identifierEdge; do
    converted="neptune-patients-${kind}.csv.gz"
    aws s3 cp "$converted" "s3://${bucket}/data/converted/${converted}"
done


## Load data into Neptune

In [None]:
# Load from the data/converted prefix of the bucket, where the converted CSV files were uploaded.
# The result is stored in `loadres`; the load-status cell reads loadres['payload']['loadId'].
# NOTE(review): `--run` presumably submits the load immediately instead of showing the form; confirm against the graph-notebook docs.
%load -s s3://{S3_BUCKET}/data/converted --store-to loadres --run

## Check load status

In [None]:
# Query the status of the load whose id was stored in `loadres` by the load cell.
# NOTE(review): `--details` / `--errors` presumably add per-load detail and error records to the output; confirm against the graph-notebook docs.
%load_status {loadres['payload']['loadId']} --details --errors

## Query the Data in Neptune

### Find patient by SSN. Show patient details, identifiers, record links.

In [None]:
%%gremlin

// Look up patient vertices by SSN. For each match (at most 100) return:
//   patient     - the full element map of the patient vertex
//   identifiers - identifierDomainId / identifier of every vertex reached over an identifierEdge
//   links       - for every outgoing recordLink edge: 're' = weight, state, source of the edge,
//                 'rv' = id of the vertex the edge points to
g.V().has('patient', 'ssn', '460000320').
  project('patient', 'identifiers', 'links').
    by(elementMap()).
    by(out('identifierEdge').
       elementMap('identifierDomainId', 'identifier').
       fold()).
    by(outE('recordLink').
       project('re', 'rv').
         by(elementMap('weight', 'state', 'source')).
         by(inV().id()).
       fold()).
  limit(100)

### Show a listing of patients, their identifiers, and their links. Patient must have an identifier.

In [None]:
%%gremlin

// List up to 100 patients that have at least one outgoing identifierEdge.
// For each patient return a summary of the vertex, its identifiers, and its record links:
//   patient     - ssn, postalCode, city, state, givenName, familyName
//   identifiers - identifierDomainId / identifier of every identifier vertex
//   links       - 're' = weight, state, source of each recordLink edge, 'rv' = id of its target vertex
g.V().hasLabel('patient').
  where(outE('identifierEdge')).
  project('patient', 'identifiers', 'links').
    by(elementMap('ssn', 'postalCode', 'city', 'state', 'givenName', 'familyName')).
    by(out('identifierEdge').
       elementMap('identifierDomainId', 'identifier').
       fold()).
    by(outE('recordLink').
       project('re', 'rv').
         by(elementMap('weight', 'state', 'source')).
         by(inV().id()).
       fold()).
  limit(100)

### Show a listing of patients, their identifiers, and their links. Patient must have a record link.

In [None]:
%%gremlin

// List up to 100 patients that have at least one recordLink to another vertex.
// Output shape per patient:
//   patient     - ssn, postalCode, city, state, givenName, familyName
//   identifiers - identifierDomainId / identifier of every identifier vertex
//   links       - 're' = weight, state, source of each recordLink edge, 'rv' = id of its target vertex
g.V().hasLabel('patient').
  filter(out('recordLink')).
  project('patient', 'identifiers', 'links').
    by(elementMap('ssn', 'postalCode', 'city', 'state', 'givenName', 'familyName')).
    by(out('identifierEdge').
       elementMap('identifierDomainId', 'identifier').
       fold()).
    by(outE('recordLink').
       project('re', 'rv').
         by(elementMap('weight', 'state', 'source')).
         by(inV().id()).
       fold()).
  limit(100)

### Show a listing of patients with multiple record links.

In [None]:
%%gremlin

// List up to 100 patients that have two or more outgoing recordLink edges.
// Output shape per patient:
//   patient     - ssn, postalCode, city, state, givenName, familyName
//   identifiers - identifierDomainId / identifier of every identifier vertex
//   links       - 're' = weight, state, source of each recordLink edge, 'rv' = id of its target vertex
g.V().hasLabel('patient').
  where(outE('recordLink').count().is(gte(2))).
  project('patient', 'identifiers', 'links').
    by(elementMap('ssn', 'postalCode', 'city', 'state', 'givenName', 'familyName')).
    by(out('identifierEdge').
       elementMap('identifierDomainId', 'identifier').
       fold()).
    by(outE('recordLink').
       project('re', 'rv').
         by(elementMap('weight', 'state', 'source')).
         by(inV().id()).
       fold()).
  limit(100)


### Visualize one patient that has multiple record links.

In [None]:
%%gremlin -d T.id -de T.label

// Walk outward from one patient vertex and return the paths for visualization.
// - Each repeat iteration follows an outgoing identifierEdge or recordLink edge to its
//   in-vertex; outE().inV() (rather than out()) keeps the edge in the path.
// - simplePath() drops any path that revisits an element, so cycles are not followed.
// - emit() returns the path reached after every iteration; times(10) caps the walk at 10 hops.
// - path().by(elementMap()) renders every vertex and edge on the path as an element map.
// - At most 100 paths are returned.
// NOTE(review): `-d T.id -de T.label` presumably selects what is shown as the vertex / edge
// label in the graph view; confirm against the graph-notebook docs.
// start with patient
g.V('patients-25:3201').
  repeat(outE('identifierEdge', 'recordLink').inV().
    simplePath()).emit().times(10).path().
    by(elementMap()).
  limit(100)