In [1]:
import os
import re
import shutil
from functools import lru_cache
from multiprocessing import Pool

import pandas as pd
from gensim.models.phrases import Phrases, Phraser

In [2]:
# Read the Greystone progress notes and tag every row with its source client.
greystone_notes = (
    pd.read_parquet('/data/progress_notes/greystone_progress_notes_all_time.parquet')
    .assign(client='greystone')
)

In [3]:
# Read the Avante progress notes.
# NOTE(review): unlike the Greystone frame, no 'client' column is added here; the
# later sort on 'client' presumably relies on the parquet file already carrying
# one -- confirm.
avante_notes = pd.read_parquet(
    path='/data/progress_notes/avante_notes_valid_types_only.parquet'
)

In [4]:
# Keep only the columns both sources share. The intersection is materialised as a
# sorted list because (a) pandas no longer accepts a set as a column indexer and
# (b) set iteration order would make the column order differ between kernels.
note_columns = sorted(set(avante_notes.columns) & set(greystone_notes.columns))

# Stack the two sources. pd.concat replaces DataFrame.append, which was removed in
# pandas 2.0; it already returns a new frame, so no extra .copy() is needed.
# Original row indexes are preserved, as they were with append.
all_notes = pd.concat([greystone_notes[note_columns], avante_notes[note_columns]])

# Order rows so the parts of each note section appear in SectionSequence order;
# the grouping cell concatenates NoteText in row order.
all_notes = all_notes.sort_values(
    ['client', 'CreatedDate', 'ProgressNoteID', 'Section', 'SectionSequence']
)

In [5]:
# Columns that together identify one section of one progress note.
grp_cols = [
    'client',
    'FacilityID',
    'PatientID',
    'CreatedDate',
    'ProgressNoteID',
    'ProgressNoteType',
    'Section',
]

# Collapse each group to a single row by concatenating its NoteText fragments
# (no separator) in their current row order.
all_notes_grouped = (
    all_notes
    .groupby(grp_cols)['NoteText']
    .agg(''.join)
    .reset_index()
)

In [6]:
def preprocess(strings):
    """Lazily tokenize an iterable of strings.

    Yields the ``preprocess_one`` result for each element of ``strings``,
    in input order.
    """
    yield from map(preprocess_one, strings)

@lru_cache(maxsize=500000)
def preprocess_one(s):
    """Lower-case ``s`` and split it into tokens at word boundaries.

    Args:
        s: The text to tokenize. Must be a ``str`` (and therefore hashable, as
            required by ``lru_cache``).

    Returns:
        A tuple of tokens: alternating runs of word characters and non-word
        characters, with empty and whitespace-only runs removed. A tuple is
        returned so the cached value is immutable.

    Note:
        Non-word runs that mix punctuation and whitespace (e.g. ``', '``) are
        kept verbatim, including the whitespace.
    """
    s = s.lower()
    # Splitting on the zero-width \b pattern requires Python 3.7+.
    tokens = re.split(r'\b', s)
    # Drop empty pieces and *any* whitespace-only piece (double spaces, newlines,
    # tabs), not just the single-space token.
    return tuple(t for t in tokens if t and not t.isspace())

In [7]:
# Use all but two of the available cores, but never fewer than one worker:
# os.cpu_count() can return None, and on machines with <= 2 cores the plain
# subtraction would hand Pool() a non-positive process count (ValueError).
pool = Pool(max(1, (os.cpu_count() or 1) - 2))

In [8]:
# Tokenize every grouped note in the worker pool. pool.map returns results in
# input order, so they line up with the rows of all_notes_grouped.
tokens = pool.map(
    func=preprocess_one,
    iterable=all_notes_grouped['NoteText'].values,
)

In [9]:
# Attach the token tuples as a new 'tokens' column (mutates all_notes_grouped in
# place); `tokens` is positionally aligned with the frame's rows.
all_notes_grouped['tokens'] = tokens

In [10]:
# Persist the full tokenized frame (all note types) as a pickle.
all_notes_grouped.to_pickle(
    path='/data/progress_notes/all_notes_tokenized.pickle'
)

In [11]:
# Boolean mask: True for rows whose note type is one of the two eMAR types.
emar_note_types = [
    'eMAR-Medication Administration Note',
    'eMar - General Note from eRecord',
]
is_emar = all_notes_grouped['ProgressNoteType'].isin(emar_note_types)

In [12]:
# Partition the grouped notes into eMAR and non-eMAR rows using the mask.
emar_notes = all_notes_grouped[is_emar]
other_notes = all_notes_grouped[~is_emar]

# Sanity check: the two partitions together account for every row.
assert len(all_notes_grouped) == len(emar_notes) + len(other_notes)

In [13]:
# Write each partition to its own pickle under /data.
for frame, destination in (
    (emar_notes, '/data/emar_notes_tokenized.pickle'),
    (other_notes, '/data/non_emar_notes_tokenized.pickle'),
):
    frame.to_pickle(destination)

In [14]:
# Move the eMAR pickle into the progress_notes directory. shutil.move raises if
# the move fails, whereas a failing `!mv` shell escape lets the notebook carry on
# silently.
shutil.move(
    '/data/emar_notes_tokenized.pickle',
    '/data/progress_notes/emar_notes_tokenized.pickle',
)

In [15]:
# Move the non-eMAR pickle into the progress_notes directory. shutil.move raises
# if the move fails, whereas a failing `!mv` shell escape lets the notebook carry
# on silently.
shutil.move(
    '/data/non_emar_notes_tokenized.pickle',
    '/data/progress_notes/non_emar_notes_tokenized.pickle',
)