In [1]:
import pandas as pd
import s3fs

import re

from gensim.models import FastText

# Date bounds (yyyy-mm-dd strings) that are interpolated into the data and model
# file names used throughout this notebook.
train_start_date = '2017-01-01'
train_end_date = '2019-02-28'
test_end_date = '2019-06-26'

# Create Full Note Set

In [2]:
# Read the Avante progress-notes parquet for the configured date range from S3.
avante_df = pd.read_parquet(f's3://saiva-restricted-data/raw/avante_progress_notes_{train_start_date}_{test_end_date}.parquet')

In [3]:
# Read the Greystone progress-notes parquet for the configured date range from S3.
grey_df = pd.read_parquet(f's3://saiva-restricted-data/raw/greystone_progress_notes_{train_start_date}_{test_end_date}.parquet')

In [4]:
# Tag every row with its source so rows stay attributable once both frames are
# combined; 'provider' is also one of the grouping keys used later.
avante_df['provider'] = 'avante'
grey_df['provider'] = 'greystone'

In [5]:
# Stack both providers' rows and sort by ProgressNoteID, SectionSequence and
# NoteTextOrder so that a later in-order join of NoteText within each group
# reassembles the text in sequence.
# pd.concat is used instead of DataFrame.append, which was deprecated in
# pandas 1.4 and removed in 2.0; like append, it keeps each frame's index labels.
full_df = pd.concat([avante_df, grey_df]).sort_values(
    ['ProgressNoteID', 'SectionSequence', 'NoteTextOrder']
)

In [6]:
# Key columns for the groupby below: rows that share all of these values have
# their NoteText fragments merged into a single string.
grp_columns = [
    'provider',
    'ProgressNoteID',
    'PatientID',
    'FacilityID',
    'ProgressNoteType',
    'CreatedDate',
    'SectionSequence',
    'Section',
]

In [7]:
# Group the sorted rows by the key columns; row order inside each group follows
# the sort applied when full_df was built.
# NOTE(review): pandas' groupby drops rows whose key contains NaN by default —
# confirm none of the grouping columns (e.g. Section) can be missing.
grp = full_df.groupby(grp_columns)

In [8]:
# Concatenate each group's NoteText fragments (no separator) into one string,
# then turn the group keys back into ordinary columns.
full_notes = (
    grp['NoteText']
    .agg(''.join)
    .reset_index()
)

In [9]:
# Persist the assembled notes to S3; the "Load and process" section reads this
# same file back.
full_notes.to_parquet(f's3://saiva-restricted-data/raw/full_note_text_{train_start_date}_{test_end_date}.parquet')

# Load and process

In [2]:
# Reload the assembled notes from the parquet file written at the end of the
# previous section, so the steps below can start from the stored file.
full_notes = pd.read_parquet(f's3://saiva-restricted-data/raw/full_note_text_{train_start_date}_{test_end_date}.parquet')

In [10]:
def preprocess(strings):
    """Lazily tokenize an iterable of strings.

    Yields one token tuple per input string, in input order, each produced
    by ``preprocess_one``.
    """
    yield from map(preprocess_one, strings)

def preprocess_one(s):
    """Lower-case ``s`` and return its whitespace-separated tokens as a tuple.

    The text is split on every single whitespace character, so consecutive
    whitespace produces empty pieces; those are discarded. An empty or
    all-whitespace string therefore yields ``()``.
    """
    pieces = re.split(r'\s', s.lower())
    # filter(None, ...) keeps only the non-empty strings.
    return tuple(filter(None, pieces))

In [11]:
from multiprocessing import Pool
import os

In [12]:
# Leave two cores free when the machine has them, but always use at least one
# worker: os.cpu_count() can return None, and `count - 2` is zero or negative
# on machines with two cores or fewer, which Pool rejects.
n_workers = max(1, (os.cpu_count() or 1) - 2)
pool = Pool(n_workers)

In [13]:
# Tokenize in parallel across the worker pool, keeping only notes whose text
# is at least 100 characters long.
long_enough = full_notes['NoteText'].str.len() >= 100
tokens = pool.map(preprocess_one, full_notes.loc[long_enough, 'NoteText'])

In [14]:
# Number of notes that passed the length filter and were tokenized.
len(tokens)

7432246

In [14]:
# NOTE(review): duplicate of the preceding cell with a different recorded
# output — presumably from a different run; confirm which count matches the
# saved model and drop the other cell.
len(tokens)

14959658

In [15]:
%%time
# Leave two cores free when possible, but never request fewer than one worker:
# os.cpu_count() can return None, and `count - 2` is zero or negative on
# machines with two cores or fewer.
n_train_workers = max(1, (os.cpu_count() or 1) - 2)
# 300-dimensional vectors with a 5-token context window; per the gensim docs,
# words occurring fewer than min_count (50) times are ignored.
model = FastText(size=300, window=5, workers=n_train_workers, min_count=50)
# Build the vocabulary from the tokenized notes, then train for 20 epochs.
model.build_vocab(tokens)
model.train(sentences=tokens, total_examples=len(tokens), epochs=20)

CPU times: user 4d 6h 52min 14s, sys: 2min 30s, total: 4d 6h 54min 45s
Wall time: 2h 27min 18s


In [16]:
# Save the trained model to local disk; the file name records the 100-character
# filter and the date range.
model.save(f'/code/data/fasttext_model_min_100_chars_{train_start_date}_{test_end_date}.model')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


# Copy to S3

In [17]:
import glob

In [18]:
# Collect the saved model file plus every companion file that shares its path
# prefix (the trailing * matches the additional .npy files listed in the upload output).
model_files = glob.glob(f'/code/data/fasttext_model_min_100_chars_{train_start_date}_{test_end_date}.model*')

In [19]:
import boto3

In [20]:
# boto3 S3 resource handle, used below to upload the model files.
s3 = boto3.resource('s3')

In [21]:
import os

In [22]:
# Preview the file name that will be appended to the S3 key prefix.
# NOTE(review): the recorded output lacks the `min_100_chars` infix used by the
# glob pattern above — it looks stale; re-run to confirm.
os.path.basename(model_files[0])

'fasttext_model_2017-01-01_2019-06-26.model.wv.vectors.npy'

In [22]:
# Upload every model file under the models/meta/ prefix, keeping its file name,
# and echo each local path once it has been sent.
bucket = s3.Bucket('saiva-restricted-data')
for fp in model_files:
    s3_key = os.path.join('models/meta/', os.path.basename(fp))
    bucket.upload_file(fp, s3_key)
    print(fp)

/code/data/fasttext_model_min_100_chars_2017-01-01_2019-06-26.model.trainables.vectors_vocab_lockf.npy
/code/data/fasttext_model_min_100_chars_2017-01-01_2019-06-26.model.trainables.vectors_ngrams_lockf.npy
/code/data/fasttext_model_min_100_chars_2017-01-01_2019-06-26.model
/code/data/fasttext_model_min_100_chars_2017-01-01_2019-06-26.model.wv.vectors_ngrams.npy
/code/data/fasttext_model_min_100_chars_2017-01-01_2019-06-26.model.trainables.syn1neg.npy
/code/data/fasttext_model_min_100_chars_2017-01-01_2019-06-26.model.wv.vectors_vocab.npy
/code/data/fasttext_model_min_100_chars_2017-01-01_2019-06-26.model.wv.vectors.npy


# Load from disk

In [4]:
# Load a previously saved FastText model from local disk.
# NOTE(review): this path differs from the one the model was saved to above
# (no `min_100_chars` infix, and `train_end_date` instead of `test_end_date`),
# so it does not load the file written by the save cell — confirm this is the
# intended model.
model = FastText.load(f'/code/data/fasttext_model_{train_start_date}_{train_end_date}.model')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [17]:
# Keep only the model's `wv` attribute (its word vectors) for export.
kv = model.wv

In [18]:
# Drop the reference to the full model — presumably so its remaining state can
# be garbage-collected while only the vectors are kept.
model = None

In [19]:
# Write the word vectors to their own file on local disk.
kv.save('/code/data/fasttext_keyed_vectors.kv')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
