In [1]:
from gensim.models import KeyedVectors
import pandas as pd
import numpy as np
import re

from functools import lru_cache

# Date bounds (YYYY-MM-DD strings) that are interpolated into data file names below:
# the raw-notes parquet path uses train_start_date + test_end_date, and the
# vectorized-notes pickle path uses train_start_date + train_end_date.
train_start_date = '2017-01-01'
train_end_date = '2019-02-28'
test_end_date = '2019-06-26'

In [2]:
# Load the raw progress notes from S3; the file name encodes train_start_date and
# test_end_date. The `NoteText` column of this frame is what gets vectorized below.
full_notes = pd.read_parquet(f's3://saiva-restricted-data/raw/avante_progress_notes_{train_start_date}_{test_end_date}.parquet')

In [3]:
def preprocess(strings):
    """Lazily tokenize each string in ``strings``.

    Args:
        strings: An iterable of note texts.

    Yields:
        The result of ``preprocess_one`` for each input, in input order.
    """
    yield from (preprocess_one(text) for text in strings)

def preprocess_one(s):
    """Lower-case a note and split it into whitespace-delimited tokens.

    Args:
        s: The note text. ``None`` and float NaN (a common pandas representation
            of a missing string) are treated as an empty note.

    Returns:
        A tuple of the non-empty, lower-cased tokens in their original order;
        the empty tuple when ``s`` is missing, empty, or whitespace only.
    """
    # A missing value would otherwise raise on .lower() and abort an entire
    # batch run; returning no tokens lets the caller use its empty-note path.
    if s is None or (isinstance(s, float) and s != s):
        return ()
    s = s.lower()
    # Splitting on single whitespace characters produces empty strings for
    # consecutive/leading/trailing whitespace, so those are filtered out.
    tokens = re.split(r'\s', s)
    return tuple(t for t in tokens if len(t) > 0)

In [4]:
# Load the saved gensim KeyedVectors from local disk; `kv[token]` is used below to
# look up a token's embedding. The file name suggests fastText-trained vectors — TODO confirm.
kv = KeyedVectors.load('/code/data/fasttext_keyed_vectors.kv')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [7]:
# Mean of kv.vectors taken along axis 0. Used below as the fallback embedding for
# tokens that raise KeyError on lookup and for notes that yield no tokens.
avg_vector = np.mean(kv.vectors, axis=0)

In [9]:
@lru_cache(maxsize=100000)
def vector_lookup(t):
    """Return the embedding for token ``t``, memoized per token.

    Args:
        t: A token string (must be hashable, since results are cached).

    Returns:
        ``kv[t]`` when the lookup succeeds, otherwise the module-level
        ``avg_vector`` fallback.
    """
    try:
        embedding = kv[t]
    except KeyError:
        # The lookup rejected the token: fall back to the mean embedding.
        embedding = avg_vector
    return embedding

def vectorize_note(s):
    """Turn one note text into a single vector.

    Args:
        s: The note text; it is tokenized with ``preprocess_one``.

    Returns:
        The element-wise sum (not the mean) of the token vectors returned by
        ``vector_lookup``, or ``avg_vector`` when the note has no tokens.
    """
    tokens = preprocess_one(s)
    if not tokens:
        return avg_vector
    token_vectors = [vector_lookup(t) for t in tokens]
    return np.sum(token_vectors, axis=0)

In [13]:
from multiprocessing import Pool

In [14]:
import os

In [15]:
# Use all but two of the available cores, but never fewer than one worker:
# os.cpu_count() may return None, and Pool() raises ValueError for a count < 1
# (which the plain `cpu_count() - 2` would produce on machines with <= 2 CPUs).
pool = Pool(max(1, (os.cpu_count() or 1) - 2))

In [16]:
# Vectorize every note text in parallel on the worker pool. pool.map returns a list
# in input order, so element i corresponds to row i of full_notes['NoteText'].
# NOTE(review): this pool is never closed/joined in the visible cells — TODO confirm.
vectors = pool.map(vectorize_note, full_notes['NoteText'])

In [18]:
# One row per note vector; columns start out as the positional labels from from_records.
vectors_df = pd.DataFrame.from_records(vectors)

# Prefix every column label with "e_" (assigned in place, so the frame is not copied).
vectors_df.columns = vectors_df.columns.map(lambda n: f'e_{n}')

In [19]:
# Attach the e_* embedding columns to the notes. reset_index() swaps the existing index
# for a default integer one (old index levels become columns), presumably so both frames
# align row-for-row on axis=1 — vectors_df was built without an explicit index.
# NOTE(review): this rebinds full_notes, so re-running the cell is not idempotent.
full_notes = pd.concat([full_notes.reset_index(), vectors_df], axis=1)

In [20]:
# Drop the name for the standalone embedding frame now that it has been concatenated
# into full_notes.
del vectors_df

In [21]:
import pickle

In [22]:
# Persist the notes + embedding columns to local disk. Pickle protocol 4 is requested
# explicitly; it is the protocol version that added support for very large objects.
with open('/code/data/processed/avante_notes.pickle', 'wb') as f:
    pickle.dump(full_notes, f, protocol=4)

In [25]:
# Grab the text of the last row for ad-hoc inspection.
# NOTE(review): test_note is not referenced by any other visible cell.
test_note = full_notes.iloc[-1]['NoteText']

In [34]:
# Row count of the notes frame (sanity check).
len(full_notes)

13886119

In [33]:
# Count rows per value of index level 7; per the head() output shown in this notebook
# that level is `Section`.
# NOTE(review): this needs the original multi-level index, i.e. a full_notes that has
# not been through the reset_index() cell — the execution counts are out of order.
full_notes.index.get_level_values(7).value_counts()

Note Text                                                                              8945068
Sx / Lavage (Yes or No)                                                                 150129
FiO2                                                                                    150129
Trach Care (Yes or No) with Narrative                                                   150129
Resp. Rate (Pre / Post)                                                                 150129
Heart Rate (Pre - Post)                                                                 150129
SAN Treatment Time with Narrative                                                       150129
Sa02 (Pre / Post)                                                                       150129
Ambu Bag / O2 Tank / Spare Trach in place (Yes or No)                                   147592
Narrative Note                                                                          147592
Measured /Spontaneous Parameters: Sa02            

In [28]:
# Preview the first rows of the notes frame.
# NOTE(review): the saved output shows a single array-valued `vector` column, which no
# visible cell creates — it appears to come from an earlier kernel state.
full_notes.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,NoteText,vector
provider,ProgressNoteID,PatientID,FacilityID,ProgressNoteType,CreatedDate,SectionSequence,Section,Unnamed: 8_level_1,Unnamed: 9_level_1
avante,17221143,366254,21,eMAR-Medication Administration Note,2017-01-01 00:08:01.257,1,Note Text,Ketoconazole Cream 2 %\n\tApply to bilateral ...,"[-34.967876, -10.116984, -35.538002, 67.07896,..."
avante,17221144,140573,3,Weekly Nurses Skin Observation Note,2017-01-01 00:37:48.507,1,Note Text,"A skin observation was completed on, SCOTT STR...","[-77.20341, 258.98798, -72.93674, 116.10251, 1..."
avante,17221174,409644,18,* Skilled Nursing Note,2017-01-01 00:11:47.527,1,Note Text,98.7-66-18-132/72 continue plan of care.,"[-0.06227255, 1.1612256, -1.2654214, 9.444483,..."
avante,17221233,179253,18,* Skilled Nursing Note,2017-01-01 00:01:52.583,1,Note Text,no c/o voiced this tour.resident takes meds we...,"[-44.405064, -69.823814, -119.374466, 32.02920..."
avante,17221234,307860,13,* General NURSING Note,2017-01-01 00:04:25.697,1,Note Text,"Resident A&O times 3, able to make needs known...","[-61.400497, -155.34035, -54.70516, 8.950726, ..."


In [19]:
# Show the rows whose note text is exactly the string 'Yes'.
full_notes.loc[full_notes['NoteText'] == 'Yes']

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,NoteText,vector
provider,ProgressNoteID,PatientID,FacilityID,ProgressNoteType,CreatedDate,SectionSequence,Section,Unnamed: 8_level_1,Unnamed: 9_level_1
avante,17221706,388038,10,z R.T. Shift Note (7pm - 7am),2017-01-01 00:33:01.013,2,Ambu Bag / O2 Tank / Spare Trach in place (Yes or No),Yes,"[0.34187862, -6.979161, 14.907242, 0.09937014,..."
avante,17221787,379157,10,z R.T. Shift Note (7pm - 7am),2017-01-01 00:46:10.743,2,Ambu Bag / O2 Tank / Spare Trach in place (Yes or No),Yes,"[0.34187862, -6.979161, 14.907242, 0.09937014,..."
avante,17221881,411592,10,z R.T. Shift Note (7pm - 7am),2017-01-01 01:16:22.807,2,Ambu Bag / O2 Tank / Spare Trach in place (Yes or No),Yes,"[0.34187862, -6.979161, 14.907242, 0.09937014,..."
avante,17222024,404249,6,z R.T. Shift Note (7pm - 7am),2017-01-01 02:14:33.447,2,Ambu Bag / O2 Tank / Spare Trach in place (Yes or No),Yes,"[0.34187862, -6.979161, 14.907242, 0.09937014,..."
avante,17222487,369080,10,z R.T. Shift Note (7pm - 7am),2017-01-01 03:02:09.643,2,Ambu Bag / O2 Tank / Spare Trach in place (Yes or No),Yes,"[0.34187862, -6.979161, 14.907242, 0.09937014,..."
avante,17222764,409771,6,z R.T. Shift Note (7pm - 7am),2017-01-01 05:21:27.153,2,Ambu Bag / O2 Tank / Spare Trach in place (Yes or No),Yes,"[0.34187862, -6.979161, 14.907242, 0.09937014,..."
avante,17222830,391867,10,z R.T. Shift Note (7pm - 7am),2017-01-01 04:24:02.447,2,Ambu Bag / O2 Tank / Spare Trach in place (Yes or No),Yes,"[0.34187862, -6.979161, 14.907242, 0.09937014,..."
avante,17222959,381108,6,z R.T. Shift Note (7pm - 7am),2017-01-01 05:25:20.060,2,Ambu Bag / O2 Tank / Spare Trach in place (Yes or No),Yes,"[0.34187862, -6.979161, 14.907242, 0.09937014,..."
avante,17222974,402836,6,z R.T. Shift Note (7pm - 7am),2017-01-01 05:26:51.357,2,Ambu Bag / O2 Tank / Spare Trach in place (Yes or No),Yes,"[0.34187862, -6.979161, 14.907242, 0.09937014,..."
avante,17223054,393306,6,z R.T. Shift Note (7pm - 7am),2017-01-01 05:33:42.010,2,Ambu Bag / O2 Tank / Spare Trach in place (Yes or No),Yes,"[0.34187862, -6.979161, 14.907242, 0.09937014,..."


In [70]:
# Preview on the first rows only: expand the array-valued `vector` column into one
# column per component and show it next to the original columns.
# NOTE(review): relies on a `vector` column that no visible cell creates — TODO confirm.
pd.concat([full_notes.head(), full_notes.head()['vector'].apply(pd.Series)], axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,NoteText,vector,0,1,2,3,4,5,6,7,...,290,291,292,293,294,295,296,297,298,299
provider,ProgressNoteID,PatientID,FacilityID,ProgressNoteType,CreatedDate,SectionSequence,Section,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
avante,17221143,366254,21,eMAR-Medication Administration Note,2017-01-01 00:08:01.257,1,Note Text,Ketoconazole Cream 2 %\n\tApply to bilateral ...,"[-34.967876, -10.116984, -35.538002, 67.07896,...",-34.967876,-10.116984,-35.538002,67.078957,-108.616386,100.519669,6.509571,-25.575071,...,27.300659,18.002102,-43.263775,44.420994,-60.039364,84.271309,-54.068474,-39.837219,48.489815,-87.552116
avante,17221144,140573,3,Weekly Nurses Skin Observation Note,2017-01-01 00:37:48.507,1,Note Text,"A skin observation was completed on, SCOTT STR...","[-77.20341, 258.98798, -72.93674, 116.10251, 1...",-77.203407,258.987976,-72.936737,116.102509,135.238235,105.968681,-97.621643,-68.923195,...,-151.77536,-116.886086,127.423866,-217.448608,-75.750168,53.329865,-114.793877,-149.095291,74.077911,-108.858856
avante,17221174,409644,18,* Skilled Nursing Note,2017-01-01 00:11:47.527,1,Note Text,98.7-66-18-132/72 continue plan of care.,"[-0.06227255, 1.1612256, -1.2654214, 9.444483,...",-0.062273,1.161226,-1.265421,9.444483,18.556273,1.997604,-10.535656,37.203003,...,4.844214,-4.187387,-6.266688,16.259949,-27.57954,-0.664464,6.138464,-1.90341,1.459381,8.054781
avante,17221233,179253,18,* Skilled Nursing Note,2017-01-01 00:01:52.583,1,Note Text,no c/o voiced this tour.resident takes meds we...,"[-44.405064, -69.823814, -119.374466, 32.02920...",-44.405064,-69.823814,-119.374466,32.029205,-171.118866,-25.77891,0.263687,96.029793,...,-48.214283,-29.323418,-44.39954,31.991673,-66.514771,72.265961,63.012627,29.912403,-16.558504,-40.953609
avante,17221234,307860,13,* General NURSING Note,2017-01-01 00:04:25.697,1,Note Text,"Resident A&O times 3, able to make needs known...","[-61.400497, -155.34035, -54.70516, 8.950726, ...",-61.400497,-155.340347,-54.705158,8.950726,-123.336006,72.488274,123.8592,-61.055389,...,74.040321,5.359493,-1.47936,206.780945,-163.656525,-96.320419,67.976273,47.83926,49.563931,-24.667557


In [72]:
def seriesify(x):
    """Wrap ``x`` in a ``pandas.Series``.

    Defined as a named module-level function (rather than passing ``pd.Series``
    inline) and handed to ``pool.map`` in a later cell.

    Args:
        x: Any value accepted by the ``pd.Series`` constructor.

    Returns:
        A new ``pd.Series`` built from ``x``.
    """
    as_series = pd.Series(x)
    return as_series

In [73]:
# Create a worker pool with a hard-coded 12 processes.
# NOTE(review): this rebinds `pool` without closing/joining the pool created earlier in
# the notebook; the saved output below is worker-process tracebacks — TODO clean up.
pool = Pool(processes=12)

Process ForkPoolWorker-28:
Process ForkPoolWorker-36:
Process ForkPoolWorker-32:
Process ForkPoolWorker-26:
Process ForkPoolWorker-35:
Process ForkPoolWorker-31:
Process ForkPoolWorker-29:
Process ForkPoolWorker-34:
Process ForkPoolWorker-30:
Process ForkPoolWorker-27:
Process ForkPoolWorker-25:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.7/multiprocessing/p

In [78]:
# Convert each entry of the `vector` column to a pd.Series in parallel.
# NOTE(review): the saved output shows this run ended with KeyboardInterrupt, and the
# `vector` column is not created by any visible cell — leftover experiment?
embeddings = pool.map(seriesify, full_notes['vector'])

KeyboardInterrupt: 

In [77]:
# Display `resp` as a DataFrame.
# NOTE(review): `resp` is not defined in any visible cell, so this fails on a fresh
# kernel — TODO confirm where it came from or remove.
pd.DataFrame(resp)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-34.967876,-10.116984,-35.538002,67.078957,-108.616386,100.519669,6.509571,-25.575071,-52.944805,-12.982584,...,27.300659,18.002102,-43.263775,44.420994,-60.039364,84.271309,-54.068474,-39.837219,48.489815,-87.552116
1,-77.203407,258.987976,-72.936737,116.102509,135.238235,105.968681,-97.621643,-68.923195,-37.388309,114.722565,...,-151.77536,-116.886086,127.423866,-217.448608,-75.750168,53.329865,-114.793877,-149.095291,74.077911,-108.858856
2,-0.062273,1.161226,-1.265421,9.444483,18.556273,1.997604,-10.535656,37.203003,-2.420051,8.862009,...,4.844214,-4.187387,-6.266688,16.259949,-27.57954,-0.664464,6.138464,-1.90341,1.459381,8.054781
3,-44.405064,-69.823814,-119.374466,32.029205,-171.118866,-25.77891,0.263687,96.029793,22.707964,23.610312,...,-48.214283,-29.323418,-44.39954,31.991673,-66.514771,72.265961,63.012627,29.912403,-16.558504,-40.953609
4,-61.400497,-155.340347,-54.705158,8.950726,-123.336006,72.488274,123.8592,-61.055389,28.173901,-186.609329,...,74.040321,5.359493,-1.47936,206.780945,-163.656525,-96.320419,67.976273,47.83926,49.563931,-24.667557


In [71]:
# Expand the array-valued `vector` column into one column per component, row by row.
# NOTE(review): depends on a `vector` column that no visible cell creates; embedded_notes
# is not used by any other visible cell.
embedded_notes = full_notes['vector'].apply(pd.Series)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
provider,ProgressNoteID,PatientID,FacilityID,ProgressNoteType,CreatedDate,SectionSequence,Section,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
avante,17221143,366254,21,eMAR-Medication Administration Note,2017-01-01 00:08:01.257,1,Note Text,-34.967876,-10.116984,-35.538002,67.078957,-108.616386,100.519669,6.509571,-25.575071,-52.944805,-12.982584,...,27.300659,18.002102,-43.263775,44.420994,-60.039364,84.271309,-54.068474,-39.837219,48.489815,-87.552116
avante,17221144,140573,3,Weekly Nurses Skin Observation Note,2017-01-01 00:37:48.507,1,Note Text,-77.203407,258.987976,-72.936737,116.102509,135.238235,105.968681,-97.621643,-68.923195,-37.388309,114.722565,...,-151.77536,-116.886086,127.423866,-217.448608,-75.750168,53.329865,-114.793877,-149.095291,74.077911,-108.858856
avante,17221174,409644,18,* Skilled Nursing Note,2017-01-01 00:11:47.527,1,Note Text,-0.062273,1.161226,-1.265421,9.444483,18.556273,1.997604,-10.535656,37.203003,-2.420051,8.862009,...,4.844214,-4.187387,-6.266688,16.259949,-27.57954,-0.664464,6.138464,-1.90341,1.459381,8.054781
avante,17221233,179253,18,* Skilled Nursing Note,2017-01-01 00:01:52.583,1,Note Text,-44.405064,-69.823814,-119.374466,32.029205,-171.118866,-25.77891,0.263687,96.029793,22.707964,23.610312,...,-48.214283,-29.323418,-44.39954,31.991673,-66.514771,72.265961,63.012627,29.912403,-16.558504,-40.953609
avante,17221234,307860,13,* General NURSING Note,2017-01-01 00:04:25.697,1,Note Text,-61.400497,-155.340347,-54.705158,8.950726,-123.336006,72.488274,123.8592,-61.055389,28.173901,-186.609329,...,74.040321,5.359493,-1.47936,206.780945,-163.656525,-96.320419,67.976273,47.83926,49.563931,-24.667557


In [64]:
# Attempt to show the first rows next to their vectors expanded into e_0..e_299 columns.
# NOTE(review): the saved output for this cell is a ValueError (buffer dtype mismatch);
# the new frame gets a default integer index while full_notes.head() keeps its own —
# possibly the cause, TODO confirm or remove this cell.
pd.concat([full_notes.head(),pd.DataFrame(data=full_notes['vector'].head().apply(lambda x: x.tolist()).tolist(), columns=[f'e_{n}' for n in range(0,300)])], axis=1)

ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'

In [38]:
# Write the notes frame to a local pickle whose name encodes train_start_date and
# train_end_date.
# NOTE(review): the source parquet is named with test_end_date, not train_end_date —
# confirm the date range in this file name matches the data actually saved.
full_notes.to_pickle(f'/code/data/full_notes_with_vectors_{train_start_date}_{train_end_date}.p')

In [41]:
import s3fs

In [44]:
import boto3

In [45]:
# S3 resource handle used for the upload below; no credentials are passed explicitly here.
s3 = boto3.resource('s3')

In [47]:
# Upload the local full_notes_with_vectors pickle to the saiva-restricted-data bucket
# under the processed/ prefix, keeping the same file name.
s3.Bucket('saiva-restricted-data').upload_file(f'/code/data/full_notes_with_vectors_{train_start_date}_{train_end_date}.p', f'processed/full_notes_with_vectors_{train_start_date}_{train_end_date}.p')