# Patient EMPI Graph


## Loading data


### Reset/Clear DB if needed

In [None]:
# Line magic that resets/clears the database this notebook is connected to.
# Destructive: run it only when a clean slate is needed before loading data.
%db_reset

### Find name of your S3 Bucket

In [None]:
import os
import subprocess

# Read S3_SOURCE_BUCKET as defined by ~/.bashrc. The command is run under bash
# explicitly because `source` is a bash builtin that a plain POSIX /bin/sh
# (which os.popen would use) is not required to provide. subprocess.run also
# waits for and cleans up the child process, unlike an unclosed popen stream.
_completed = subprocess.run(
    ["bash", "-c", "source ~/.bashrc ; echo $S3_SOURCE_BUCKET"],
    stdout=subprocess.PIPE,
    text=True,
    check=False,  # a failing `source` must not abort the cell; echo still runs
)
# Keep only the first output line and drop stray surrounding whitespace.
# An empty string means the variable was not set.
S3_BUCKET = _completed.stdout.split("\n")[0].strip()

S3_BUCKET

### Grab local copy of S3 data

In [None]:
%%bash -s "$S3_BUCKET"

# $1 is the bucket name handed in from the notebook variable S3_BUCKET.
echo "$1"

# -p keeps the cell re-runnable: a plain mkdir errors once the directory exists.
mkdir -p converter_dir
# Stop if the directory is unusable so the sync never lands in the wrong place.
cd converter_dir || exit 1
aws s3 sync "s3://$1" . || exit 1
# The cell's exit status is that of this cd, so the cell fails visibly when
# the synced content has no converter/ folder.
cd converter


### Install NPM dependencies for converter

In [None]:
%%bash

# Work inside the converter checkout pulled down by the sync cell; bail out if
# it is missing so npm does not install into an unrelated directory.
cd converter_dir/converter || exit 1

# need nvm
# Install from a pinned release tag of the nvm-sh/nvm repository instead of
# piping whatever currently sits on a moving master branch into bash.
curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.7/install.sh | bash
export NVM_DIR="$HOME/.nvm"
[ -s "$NVM_DIR/nvm.sh" ] && \. "$NVM_DIR/nvm.sh"  # This loads nvm
[ -s "$NVM_DIR/bash_completion" ] && \. "$NVM_DIR/bash_completion"  # This loads nvm bash_completion

# need node 16
nvm install 16

# need dependencies
npm install

# let's see which node versions we have
# (listed via NVM_DIR so it matches the install location used above)
ls -l "$NVM_DIR/versions/node"


### Run the converter

In [None]:
%%bash

# The data was synced into converter_dir (not converter_env), so run from there.
cd converter_dir/converter || exit 1

# Select Node 16 through nvm. A node/*/bin/node glob expands to several
# binaries as soon as more than one Node version is installed, which would pass
# the second binary to the first one as the script to run.
export NVM_DIR="$HOME/.nvm"
[ -s "$NVM_DIR/nvm.sh" ] && \. "$NVM_DIR/nvm.sh"
nvm use 16 || exit 1

# Arguments: the gzipped OpenEMPI export and the tag "class01", which also
# appears in the neptune-class01-*.csv.gz file names uploaded afterwards.
node process.js ../data/openempi_patient_db.json.gz class01


### Move converted files to s3

In [None]:
%%bash -s "$S3_BUCKET"

# $1 is the bucket name handed in from the notebook variable S3_BUCKET.
# The converter output lives under converter_dir (not converter_env); stop if
# the directory is missing rather than copying from the wrong place.
cd converter_dir/converter || exit 1

# Files uploaded under the nodes/ prefix.
# NOTE(review): recordLink is traversed as an edge label in the queries of this
# notebook, yet its file is uploaded under nodes/ - confirm this is intended.
aws s3 cp neptune-class01-identifier.csv.gz "s3://$1/data/converted/nodes/neptune-class01-identifier.csv.gz"
aws s3 cp neptune-class01-patient.csv.gz "s3://$1/data/converted/nodes/neptune-class01-patient.csv.gz"
aws s3 cp neptune-class01-recordLink.csv.gz "s3://$1/data/converted/nodes/neptune-class01-recordLink.csv.gz"

# File uploaded under the edges/ prefix.
aws s3 cp neptune-class01-identifierEdge.csv.gz "s3://$1/data/converted/edges/neptune-class01-identifierEdge.csv.gz"


### Load each data set one by one.  Wait for each to complete. Then check status of each. Examine any errors.

In [None]:
# Load everything under the data/converted/nodes prefix of the bucket.
# {S3_BUCKET} is presumably expanded from the notebook variable of that name.
%load -s s3://{S3_BUCKET}/data/converted/nodes

In [None]:
# Check the status of a load job.
# NOTE(review): this id looks hard-coded from an earlier run - presumably it
# must be replaced with the id reported by the preceding %load cell.
%load_status b32323f9-0085-42af-9ce1-e96a2293995d

In [None]:
# Load everything under the data/converted/edges prefix of the bucket.
# {S3_BUCKET} is presumably expanded from the notebook variable of that name.
%load -s s3://{S3_BUCKET}/data/converted/edges

In [None]:
# Check the status of a load job.
# NOTE(review): this id looks hard-coded from an earlier run - presumably it
# must be replaced with the id reported by the preceding %load cell.
%load_status b2bcf828-305e-4cb2-82bb-e98a9cde817f

## Basic Exploratory Queries

In [None]:
%%gremlin

// Sanity check: fetch a single 'identifier' vertex (id, label and properties),
// wrapped in a list.
// Original note: will work once date is fixed.
g.V().hasLabel('identifier')
.limit(1)
.elementMap()
.fold()

In [None]:
%%gremlin

// Sanity check: show up to 100 'patient' vertices with id, label and properties.
g.V().hasLabel('patient')
.limit(100)
.elementMap()

In [None]:
%%gremlin

// Sanity check: fetch a single 'identifierEdge' edge, wrapped in a list.
g.E().hasLabel('identifierEdge')
.limit(1)
.elementMap()
.fold()

In [None]:
%%gremlin

// Sanity check: fetch a single 'recordLink' edge, wrapped in a list.
g.E().hasLabel('recordLink')
.limit(1)
.elementMap()
.fold()

## Viz

In [None]:
%%gremlin -d T.id -de T.id
// Patients together with their outgoing edges of ANY label (there is no
// edge-label filter in this query), as vertex -> edge -> vertex paths.
// At most 100 paths are returned.

g.V().hasLabel('patient')
.outE()
.inV()
.path().by(elementMap())
.limit(100)



In [None]:
%%gremlin -d T.id -de T.id
// Pairs of patients joined by one outgoing recordLink edge, as
// vertex -> edge -> vertex paths. At most 500 paths are returned.

g.V().hasLabel('patient')
.outE('recordLink')
.inV()
.path().by(elementMap())
.limit(500)




In [None]:
%%gremlin -d T.id -de T.id
// Chains of patients linked by recordLink: take one recordLink hop, then keep
// following outgoing recordLink edges until a vertex without any is reached.
// At most 1000 paths are returned.
// NOTE(review): there is no simplePath() or loop cap - if recordLink edges can
// form a cycle this may not terminate; confirm against the data.

// Earlier variant, kept for reference:
//g.V().hasLabel('patient').repeat(outE('recordLink').inV()).until(outE('recordLink').count().is(0)).path().by(elementMap()).limit(300)

g.V().hasLabel('patient')
.outE('recordLink').inV()
.repeat(outE('recordLink').inV())
.until(outE('recordLink').count().is(0))
.path().by(elementMap())
.limit(1000)


In [None]:
%%gremlin -p v,oute,inv

// -p path pattern (viz hint)  - vertex, out-edge, in-vertex

// clusters of patients linked by recordLink
// TODO - this returns clusters of patients. how to show the details in the viz
//
// The text of three other viz cells had been pasted into the middle of the
// 'recordLink' string literal, which made this query unparseable; the stray
// text is removed so the traversal reads as one statement again.
g.V().hasLabel('patient')
.repeat(outE('recordLink').inV())
.until(outE('recordLink').count().is(0))
.path().by(elementMap())
.limit(300)



## SSN Match

In [None]:
%%gremlin

//
// SSN Match Query (marked "pretty good" by the author)
//
// For each patient with the given SSN, return a map with:
//   patient     - id, label and the listed demographic properties
//   identifiers - per identifierEdge: the edge id ('ie') and the vertex it
//                 points to ('iv')
//   links       - per outgoing recordLink: the edge id ('re-id'), its weight
//                 ('re-wt') and the linked vertex's id, label and givenName ('rv')
//
// Earlier probe, kept for reference - here are patients sharing ssn
//g.V().hasLabel('patient').has('ssn', '672181714').elementMap()

// SSN to try
// 923456789
// 000000999

g.V().hasLabel('patient').has('ssn', '923456789').as('p')
.project('patient', 'identifiers', 'links')
.by(elementMap('ssn', 'postalCode', 'city', 'state', 'givenName', 'familyName2'))
.by(
  outE('identifierEdge').as('ie')
  .inV().as('iv')
  .select('ie', 'iv').by(id).by(elementMap())
  .fold()
)
.by(
  outE('recordLink').as('re-id', 're-wt')
  .inV().as('rv')
  .select('re-id', 're-wt', 'rv').by(id).by(valueMap('weight')).by(elementMap('givenName'))
  .fold()
)

// Nice to show which are already linked

In [None]:
%%gremlin

// Same SSN lookup, but with the complete patient element map plus its
// identifiers and outgoing record links.
//   identifiers - per identifierEdge: edge id ('ie') and target vertex ('iv')
//   links       - per recordLink: edge id ('re-id'), weight ('re-wt') and the
//                 linked vertex's id, label and givenName ('rv')
g.V().hasLabel('patient').has('ssn', '672181714')
.project('patient', 'identifiers', 'links')
.by(elementMap())
.by(
  outE('identifierEdge').as('ie')
  .inV().as('iv')
  .select('ie', 'iv').by(id).by(elementMap())
  .fold()
)
.by(
  outE('recordLink').as('re-id', 're-wt')
  .inV().as('rv')
  .select('re-id', 're-wt', 'rv').by(id).by(valueMap('weight')).by(elementMap('givenName'))
  .fold()
)

In [None]:
%%gremlin

// All patients, ordered by ssn, each with its identifiers and outgoing record
// links. Linked vertices are reported by id only. At most 100 rows.
g.V().hasLabel('patient')
.order().by('ssn')
.project('patient', 'identifiers', 'links')
.by(elementMap())
.by(
  outE('identifierEdge').as('ie')
  .inV().as('iv')
  .select('ie', 'iv').by(id).by(elementMap())
  .fold()
)
.by(
  outE('recordLink').as('re-id', 're-wt')
  .inV().as('rv')
  .select('re-id', 're-wt', 'rv').by(id).by(valueMap('weight')).by(id)
  .fold()
)
.limit(100)

## RecordQueryNotInIdentifierDomain -haslinks


//Query: select from patient let $id = out_identifierEdge 
// where dateVoided is null and ($id.size() = 0 or $id.in.identifierDomainId not contains '18') 
// and (in_recordLink is not null OR
// out_recordLink is not null) limit {limit}

In [None]:
%%gremlin

// RecordQueryNotInIdentifierDomain (has links). Source query being ported:
//   select from patient let $id = out_identifierEdge
//   where dateVoided is null
//     and ($id.size() = 0 or $id.in.identifierDomainId not contains '18')
//     and (in_recordLink is not null OR out_recordLink is not null)
//
// not(... has('identifierDomainId', '18')) keeps patients that have no
// identifier in domain 18, which also covers patients without any identifier.
// (A neq('18') filter with count > 0 would instead keep every patient holding
// at least one identifier from another domain, even if it also has one in
// domain 18, and would drop patients that have no identifiers.)
// bothE('recordLink') accepts a record link in either direction.
// NOTE(review): the "dateVoided is null" condition is not applied - confirm how
// voided patients are represented in the loaded data before adding it.
g.V().hasLabel('patient')
.not(out('identifierEdge').has('identifierDomainId', '18'))
.where(bothE('recordLink'))
.limit(100)
.elementMap()


In [None]:
%%gremlin

// RecordQueryNotInIdentifierDomain (has links), with a summary of identifiers
// and links per patient. Shows at most three record links per patient.
//
// Source query being ported:
//   select from patient let $id = out_identifierEdge
//   where dateVoided is null
//     and ($id.size() = 0 or $id.in.identifierDomainId not contains '18')
//     and (in_recordLink is not null OR out_recordLink is not null)
//
// not(... has('identifierDomainId', '18')) keeps patients with no identifier in
// domain 18 (including patients with no identifiers at all); a neq('18') filter
// with count > 0 does not express that. bothE('recordLink') accepts a record
// link in either direction.
// NOTE(review): 'links' below still lists outgoing recordLink edges only, so a
// patient matched through an incoming link shows an empty list - confirm
// whether incoming links should be listed too.
// NOTE(review): the "dateVoided is null" condition is not applied - confirm how
// voided patients are represented in the loaded data before adding it.
g.V().hasLabel('patient')
.not(out('identifierEdge').has('identifierDomainId', '18'))
.where(bothE('recordLink'))
.project('node', 'identifiers', 'links')
.by(elementMap())
.by(outE('identifierEdge').as('ie').inV().as('iv').select('ie', 'iv').by(id).by(elementMap()).fold())
.by(outE('recordLink').as('re-id', 're-wt').inV().as('rv')
  .select('re-id', 're-wt', 'rv').by(id).by(valueMap('weight')).by(id).limit(3).fold())
.limit(100)

In [None]:
%%gremlin

//
// SSN Match Query
//
// NOTE(review): this cell holds three separate traversals. Presumably only the
// result of the last one is returned, and the earlier ones may need a
// terminating .iterate() to run at all - confirm, or split into one cell per
// query.

// here are patients sharing ssn
g.V().hasLabel('patient').has('ssn', '672181714').elementMap()

// same but bring in identifiers and record links
g.V().hasLabel('patient').has('ssn', '672181714')
.project('patient', 'identifiers', 'links')
.by(elementMap())
.by(outE('identifierEdge').as('ie').inV().as('iv').select('ie', 'iv').by(id).by(elementMap()).fold())
.by(outE('recordLink').as('re-id', 're-wt').inV().as('rv')
  .select('re-id', 're-wt', 'rv').by(id).by(valueMap('weight')).by(elementMap()).fold())

// all patients
g.V().hasLabel('patient')
.order().by('ssn')
.project('patient', 'identifiers', 'links')
.by(elementMap())
.by(outE('identifierEdge').as('ie').inV().as('iv').select('ie', 'iv').by(id).by(elementMap()).fold())
.by(outE('recordLink').as('re-id', 're-wt').inV().as('rv')
  .select('re-id', 're-wt', 'rv').by(id).by(valueMap('weight')).by(elementMap()).fold())
.limit(10)


//g.V().hasLabel('patient')
//.order().by('ssn').as('patient')
//.select('patient', 'identifiers').by(elementMap()).by(outE('identifierEdge').as('ie').inV().as('iv').select('ie', 'iv').by(id).by(elementMap()).fold())
//.limit(100)


In [None]:
%%gremlin

//Query: select from patient let $id = out_identifierEdge where dateVoided is null and ($id.size() = 0 or $id.in.identifierDomainId not contains '18') and (in_recordLink is not null OR
//out_recordLink is not null) limit {limit}
//
// In the traversals below, not(... has('identifierDomainId', '18')) keeps
// patients with no identifier in domain 18 (including patients with no
// identifiers at all), and bothE('recordLink') accepts a record link in either
// direction, matching the query quoted above. A neq('18') filter with
// count > 0 does not express the "not contains '18'" condition.
// NOTE(review): the "dateVoided is null" condition is not applied - confirm how
// voided patients are represented in the loaded data before adding it.
// NOTE(review): this cell holds three separate traversals. Presumably only the
// result of the last one is returned - confirm, or split into separate cells.

g.V().hasLabel('patient')
.not(out('identifierEdge').has('identifierDomainId', '18'))
.where(bothE('recordLink'))
.limit(100)
.elementMap()

// same, but we list a summary of identifiers and links
// NOTE(review): the step labelled 'edge' is produced by out(), so it holds the
// adjacent vertices rather than the edges themselves.
g.V().hasLabel('patient').as('patient')
.not(out('identifierEdge').has('identifierDomainId', '18'))
.where(bothE('recordLink'))
.out('identifierEdge', 'recordLink').as('edge')
.select('patient', 'edge').by(elementMap())
.limit(100)

// bring in identifiers and record links
// Ordering is done on the patient vertices, before project(): the projected
// maps only have the keys 'node', 'identifiers' and 'links', so ordering them
// by 'ssn' afterwards has no value to sort on.
g.V().hasLabel('patient')
.order().by('ssn')
.project('node', 'identifiers', 'links')
.by(elementMap())
.by(outE('identifierEdge').as('ie').inV().as('iv').select('ie', 'iv').by(id).by(elementMap()).fold())
.by(outE('recordLink').as('re-id', 're-wt').inV().as('rv')
  .select('re-id', 're-wt', 'rv').by(id).by(valueMap('weight')).by(id).fold())
.limit(50)