# Progress Notes
## Build Word Vector Models
Run the tokenizer and phrase detection, save the corpora, and train fastText embeddings. Save the tokenized notes dataset.

In [None]:
import os
import sys
import datetime
import pandas as pd
import pickle as pkl
import numpy as np
import scipy
import gc
import re

from gensim.utils import simple_preprocess
from gensim.models import FastText

%load_ext autoreload
%autoreload 2

In [None]:
# Put /src on the module search path so the project-local `shared` package
# can be imported below.
sys.path.append('/src')
from shared import data
from shared import notes

In [None]:
# Load the raw data from /data/raw; the result is indexed by dataset name
# (the 'progress_notes' entry is used in the next cell).
data_dict = data.load_raw_data_from_files('/data/raw')

In [None]:
# Pull out the progress-note table. Each row appears to hold one fragment of a
# note's text (fragments are concatenated per note further down) -- confirm
# against the loader.
note_parts = data_dict['progress_notes']
note_parts.shape  # displayed as the cell output

In [None]:
# Order the fragments so that, inside every group built below, the text pieces
# are concatenated in NoteTextOrder (groupby keeps the row order within a group).
sort_cols = ['MasterPatientID', 'ProgressNoteID', 'SectionSequence', 'NoteTextOrder']
note_parts = note_parts.sort_values(sort_cols).reset_index(drop=True)

# One stitched row is produced per distinct combination of these columns.
grp_cols = [
    'MasterPatientID', 'FacilityID', 'ProgressNoteID',
    'ProgressNoteType', 'Section', 'SectionSequence',
    'CreatedDate',
]

# Concatenate the NoteText fragments of each group without any separator.
grouped_parts = note_parts.groupby(grp_cols)
stitched_notes = grouped_parts.agg({'NoteText': ''.join}).reset_index()
stitched_notes.shape

In [None]:
# Split the stitched notes into eMAR notes and all remaining progress notes.

print("Splitting notes into eMAR and other notes...")

# NOTE(review): `match` anchors at the start of the type name, so a type that
# merely *contains* "emar" is treated as a regular progress note -- confirm
# that this is intended.
emar_pattern = re.compile(r'emar', re.IGNORECASE)

# Work on the distinct note types only, compared as strings.
note_types = np.array(list(map(str, stitched_notes.ProgressNoteType.unique())))
emar_sel = np.array([emar_pattern.match(note_type) is not None
                     for note_type in note_types])
emar_note_types = note_types[emar_sel]

# Rows whose type is one of the eMAR types go to `emar_notes`, the rest to
# `progress_notes`.
is_emar = stitched_notes['ProgressNoteType'].isin(emar_note_types)
emar_notes, progress_notes = stitched_notes.loc[is_emar], stitched_notes.loc[~is_emar]

In [None]:
# Drop every note whose stitched text is exactly the empty string.
# (Only '' is removed; whitespace-only or missing text is left untouched.)

emar_notes = emar_notes[(emar_notes.NoteText != '').values]
progress_notes = progress_notes[(progress_notes.NoteText != '').values]

In [None]:
# Persist the stitched, non-empty notes. This happens before the date filter
# in the next cell, so the parquet files contain notes from all dates.

print("Saving stitched eMAR and progress notes...")
for notes_df, out_path in [(emar_notes, '/data/raw/emar_notes.parquet'),
                           (progress_notes, '/data/raw/pn_notes.parquet')]:
    notes_df.to_parquet(out_path)

In [None]:
# Keep only notes created up to the train end date (= validation start date),
# printing each table's shape before and after the filter.
# NOTE(review): the cut-off is 2019-07-31 00:00:00, so if CreatedDate carries a
# time of day, notes written later on 2019-07-31 are dropped -- confirm intent.

train_end_date = pd.Timestamp('2019-07-31')

print(emar_notes.shape)
emar_notes = emar_notes.loc[emar_notes.CreatedDate <= train_end_date]
print(emar_notes.shape)

print(progress_notes.shape)
progress_notes = progress_notes.loc[progress_notes.CreatedDate <= train_end_date]
print(progress_notes.shape)

In [None]:
# Build the corpora: tokenize every note in parallel. (No phrase generation is done in this cell; the corpora are written to disk in the next cell.)

from multiprocessing import Pool

def _processNote(note_text):
    """Tokenize one note and return its tokens as a single space-separated string.

    Tokenization is delegated to gensim's ``simple_preprocess`` with
    ``max_len=25``; every other option is left at the gensim default.

    Args:
        note_text: The raw text of one note.

    Returns:
        The tokens joined by single spaces (an empty string when
        ``simple_preprocess`` yields no tokens).
    """
    return " ".join(simple_preprocess(note_text, max_len=25))

# Tokenize all notes in parallel with a process pool.
# Four cores are left free for the rest of the machine, but the pool never
# gets fewer than one worker: os.cpu_count() may return None, and on machines
# with four or fewer cores `cpu_count() - 4` would be <= 0, which Pool rejects
# with a ValueError.
num_workers = max(1, (os.cpu_count() or 1) - 4)

# One pool serves both corpora; results keep the order of the input notes.
with Pool(num_workers) as pool:
    print(f'Processing {len(progress_notes)} progress notes')
    pn_sentences = pool.map(_processNote, progress_notes.NoteText.values)

    print(f'Processing {len(emar_notes)} eMAR notes')
    emar_sentences = pool.map(_processNote, emar_notes.NoteText.values)

print('Done...')

In [None]:
# Write corpora to file, one tokenized note per line.
# The encoding is fixed to UTF-8 instead of the platform default: the tokens
# may contain non-ASCII characters, and the files are later handed to gensim
# as `corpus_file`, which reads them as UTF-8.

print("Writing progress note corpus...")
pn_corpus_file = "/data/raw/pn_corpus.txt"
with open(pn_corpus_file, "w", encoding="utf-8") as f_out:
    f_out.write("\n".join(pn_sentences))

print("Writing emar note corpus...")
emar_corpus_file = "/data/raw/emar_corpus.txt"
with open(emar_corpus_file, "w", encoding="utf-8") as f_out:
    f_out.write("\n".join(emar_sentences))

print('Done...')

In [None]:
from gensim.models import FastText

# Leave eight cores free, but always train with at least one worker thread:
# os.cpu_count() may return None, and on machines with eight or fewer cores
# `cpu_count() - 8` would be zero or negative.
num_threads = max(1, (os.cpu_count() or 1) - 8)

# Fit fasttext embeddings to these...
embed_dim = 200
num_iter = 3

# Hyper-parameters shared by both models: skip-gram (sg=1) with 10 negative
# samples.
# NOTE(review): `size` / `iter` are the gensim 3.x argument names (gensim 4.x
# renamed them to `vector_size` / `epochs`) -- confirm the installed version.
ft_params = dict(size=embed_dim,
                 sg=1,
                 iter=num_iter,
                 negative=10,
                 workers=num_threads)

print("Fitting fasttext for pn phrases")
pn_model = FastText(corpus_file='/data/raw/pn_corpus.txt', **ft_params)
pn_model.save(f"/data/models/ft_progress_notes_d{embed_dim}.model")

print("Fitting fasttext for emar phrases")
emar_model = FastText(corpus_file='/data/raw/emar_corpus.txt', **ft_params)
emar_model.save(f"/data/models/ft_emar_notes_d{embed_dim}.model")