# Experiment 1 - LM with All Data, with FastAI tokenization

In [None]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [None]:
from fastai.text import *
import html
import pickle

In [None]:
# Marker tokens that get_texts() prepends to every document before tokenization.
BOS = 'xbos'  # beginning-of-sentence tag
FLD = 'xfld'  # data field tag

# PATH is the raw-data directory; LM_PATH receives every file this notebook writes.
PATH=Path('/home/paperspace/data/mimic-iii')
LM_PATH=PATH/'lm_word_level'

# exist_ok=True lets the cell be re-run; PATH itself must already exist
# because parents are not created.
LM_PATH.mkdir(exist_ok=True)

In [None]:
# Load the whole gzip-compressed NOTEEVENTS table into a single DataFrame.
df = pd.read_csv(PATH/'NOTEEVENTS.csv.gz')

In [None]:
# Only the free-text column is used from here on; .values yields a NumPy array.
notes = df.TEXT.values

In [None]:
# Seed NumPy's global RNG so the in-place shuffle below is repeatable.
np.random.seed(42)
# NOTE(review): `notes` may share memory with df.TEXT, in which case this
# also reorders the column inside `df` -- confirm before reusing `df`.
np.random.shuffle(notes)

In [None]:
# Hold out 10% of the notes for validation. No random_state is passed, so the
# split draws from NumPy's global RNG (seeded in the shuffle cell when the
# notebook is run top to bottom).
trn_texts, val_texts = sklearn.model_selection.train_test_split(
    notes, test_size=0.1)

# Language-model data carries no real target; a constant 0 fills the
# 'labels' column so the CSV has the labels,text layout.
df_trn = pd.DataFrame({'text': trn_texts, 'labels': [0]*len(trn_texts)}, columns=['labels', 'text'])
df_val = pd.DataFrame({'text': val_texts, 'labels': [0]*len(val_texts)}, columns=['labels', 'text'])

# Flatten every note onto one line. Newlines are replaced by a space rather
# than removed: deleting them glued the last word of each line to the first
# word of the next ("pain\nPatient" -> "painPatient"), corrupting tokens.
# fixup() later collapses any resulting runs of spaces.
df_trn['text'] = df_trn.text.str.replace('\n', ' ')
df_val['text'] = df_val.text.str.replace('\n', ' ')

df_trn.to_csv(LM_PATH/'train.csv', header=True, index=False)
df_val.to_csv(LM_PATH/'test.csv', header=True, index=False)

In [None]:
# Rows per chunk when the CSVs are re-read below; get_all() tokenizes and
# saves each chunk separately.
chunksize=20000

In [None]:
# Matches any run of two or more spaces.
re1 = re.compile(r'  +')

# (old, new) pairs applied strictly in this order; later pairs see the output
# of earlier ones, so the order is part of the behaviour.
_FIXUP_REPLACEMENTS = (
    ('#39;', "'"),
    ('amp;', '&'),
    ('#146;', "'"),
    ('nbsp;', ' '),
    ('#36;', '$'),
    ('\\n', "\n"),
    ('quot;', "'"),
    ('<br />', "\n"),
    ('\\"', '"'),
    ('<unk>', 'u_n'),
    (' @.@ ', '.'),
    (' @-@ ', '-'),
    ('\\', ' \\ '),
)


def fixup(x):
    """Clean one raw document string before tokenization.

    Applies the literal substitutions in ``_FIXUP_REPLACEMENTS`` one after
    another, decodes any remaining HTML entities with ``html.unescape`` and
    finally collapses every run of two or more spaces into a single space.

    Args:
        x: the text to clean (``str``).

    Returns:
        The cleaned text.
    """
    cleaned = x
    for old, new in _FIXUP_REPLACEMENTS:
        cleaned = cleaned.replace(old, new)
    unescaped = html.unescape(cleaned)
    return re1.sub(' ', unescaped)


In [None]:
# With chunksize set, read_csv returns an iterator that yields DataFrames of
# up to `chunksize` rows instead of one DataFrame. df_trn / df_val are rebound
# to these single-use iterators; re-run this cell before iterating them again.
df_trn = pd.read_csv(LM_PATH/'train.csv', chunksize=chunksize, engine='python')
df_val = pd.read_csv(LM_PATH/'test.csv', chunksize=chunksize, engine='python')

In [None]:
def get_texts(df):
    """Tokenize the ``text`` column of one DataFrame chunk.

    Every document is cast to ``str``, prefixed with a newline followed by
    the BOS and FLD markers and the field number 1, cleaned with ``fixup``
    and handed to fastai's multi-process tokenizer.

    Args:
        df: a DataFrame exposing a ``text`` column.

    Returns:
        Whatever ``Tokenizer.proc_all_mp`` returns for the cleaned documents.
    """
    prefix = f'\n{BOS} {FLD} 1 '
    tagged_docs = prefix + df.text.astype(str)
    cleaned_docs = tagged_docs.apply(fixup).values.astype(str)
    per_core_batches = partition_by_cores(cleaned_docs)
    return Tokenizer.proc_all_mp(per_core_batches, lang='en')

def get_all(df, name):
    """Tokenize every chunk of ``df`` and save each result to its own file.

    Chunk ``i`` is written to ``LM_PATH/'{name}_tok{i}.npy'``. The partial
    results are kept as separate files instead of being regrouped into one
    big array.

    Args:
        df: an iterable of DataFrame chunks (e.g. the reader returned by
            ``pd.read_csv(..., chunksize=...)``); it is consumed once.
        name: file-name prefix for the saved token files.
    """
    for i, r in enumerate(df):
        print(i)  # progress indicator: index of the chunk being processed
        tok_ = get_texts(r)
        # Documents have different token counts, so the result is ragged.
        # Build a 1-D object array explicitly: letting np.save infer the
        # array from a ragged list is an error on newer NumPy, and this also
        # keeps the saved layout the same when all lengths happen to match.
        tok_arr = np.empty(len(tok_), dtype=object)
        for j, sent in enumerate(tok_):
            tok_arr[j] = sent
        np.save(LM_PATH/f'{name}_tok{i}.npy', tok_arr)

In [None]:
# Tokenize every training chunk; writes trn_tok{i}.npy files under LM_PATH.
get_all(df_trn,'trn')

In [None]:
# Tokenize every validation chunk; writes tst_tok{i}.npy files under LM_PATH.
get_all(df_val,'tst')

In [None]:
def count_them_all(names):
    """Count token occurrences across all saved token files for ``names``.

    Args:
        names: iterable of file-name prefixes; every file matching
            ``LM_PATH/'{name}_tok*'`` is loaded and counted.

    Returns:
        A ``Counter`` mapping each token to its total number of occurrences.
    """
    cnt = Counter()
    for name in names:
        # sorted(): glob yields files in arbitrary order, and the Counter's
        # insertion order decides how most_common() breaks ties.
        for file in sorted(LM_PATH.glob(f'{name}_tok*')):
            # The token files are object arrays (one token list per document),
            # which NumPy pickles; recent NumPy refuses to load them unless
            # allow_pickle=True. Only load files written by this notebook.
            tok = np.load(file, allow_pickle=True)
            cnt.update(word for sent in tok for word in sent)
    return cnt

In [None]:
# Token frequencies are computed from the training token files only.
cnt = count_them_all(['trn'])

In [None]:
# Display the 500 most frequent tokens with their counts.
cnt.most_common(n=500)

In [None]:
# Vocabulary limits: at most max_vocab candidate tokens are considered, and a
# token is kept only if its count is strictly greater than min_freq (the
# filter that builds itos uses `c > min_freq`).
max_vocab = 60000
min_freq = 5

In [None]:
# Index-to-string vocabulary: id 0 is '_unk_', id 1 is '_pad_', followed by
# the most frequent tokens (up to max_vocab) whose count exceeds min_freq.
itos = ['_unk_', '_pad_']
itos.extend(token for token, freq in cnt.most_common(max_vocab) if freq > min_freq)

In [None]:
# Vocabulary size, including the two special tokens.
len(itos)

In [None]:
# String-to-index lookup. defaultdict(int) returns 0 for any token that is
# not in itos, and 0 is the id of '_unk_'.
stoi = collections.defaultdict(int)
stoi.update((token, idx) for idx, token in enumerate(itos))

In [None]:
# Reload a previously saved string-to-index mapping, replacing the one built
# above. NOTE(review): stoi.pickle is only written by a later cell of this
# notebook, so on a first top-to-bottom run the file may not exist yet --
# this cell looks intended for resuming a session.
# pickle.load can execute arbitrary code; only load files this notebook wrote.
with open(LM_PATH/'stoi.pickle','rb') as f:
    stoi = pickle.load(f)

In [None]:
def _tok_chunk_key(path):
    """Sort key ordering token files by the integer chunk suffix in their name."""
    suffix = path.stem.rsplit('_tok', 1)[-1]
    if suffix.isdigit():
        return (0, int(suffix), path.name)
    # Unexpected names sort after the numbered chunks, alphabetically.
    return (1, 0, path.name)


def numericalize(name, partial=True):
    """Convert saved token files into arrays of vocabulary ids.

    Loads the files matching ``LM_PATH/'{name}_tok*'`` in chunk-number order
    and maps every token through ``stoi``.

    Args:
        name: file-name prefix of the token files (e.g. ``'trn'``).
        partial: when true (the default), only the first 11 files are
            processed; pass ``False`` to process every file.

    Returns:
        A 1-D object array with one list of ids per document, concatenated
        over all processed files.

    Raises:
        ValueError: if no token file matches ``name``.
    """
    # glob order is arbitrary; sorting makes both the `partial` subset and
    # the order of the output reproducible.
    files = sorted(LM_PATH.glob(f'{name}_tok*'), key=_tok_chunk_key)
    if partial:
        files = files[:11]
    if not files:
        raise ValueError(f'no token files matching {name}_tok* in {LM_PATH}')

    results = []
    for index, file in enumerate(files):
        print(index)  # progress indicator
        # Token files are pickled object arrays; recent NumPy needs
        # allow_pickle=True. Only load files written by this notebook.
        tok = np.load(file, allow_pickle=True)
        # Documents differ in length, so fill a 1-D object array explicitly;
        # np.array() on a ragged list is an error on newer NumPy.
        ids = np.empty(len(tok), dtype=object)
        for row, sent in enumerate(tok):
            ids[row] = [stoi[word] for word in sent]
        results.append(ids)

    return np.concatenate(results)

In [None]:
# numericalize is called with its default partial=True, so only the first 11
# training token files it visits are converted. Pass partial=False to use all.
trn_ids = numericalize('trn')
np.save(LM_PATH/'trn_ids.npy', trn_ids)

In [None]:
# Same conversion for the validation token files ('tst' prefix), again with
# the default partial=True limit of 11 files.
val_ids = numericalize('tst')
np.save(LM_PATH/'val_ids.npy', val_ids)

In [None]:
# Persist the index-to-string vocabulary so ids can be mapped back to tokens later.
with open(LM_PATH/'itos.pickle', 'wb') as handle:
    pickle.dump(itos, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Persist the string-to-index mapping. If stoi is still the defaultdict built
# above, every unknown token looked up during numericalize() was added to it
# with value 0, so the saved mapping can hold more keys than itos has entries.
with open(LM_PATH/'stoi.pickle', 'wb') as handle:
    pickle.dump(stoi, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# trn_ids.npy is an object array holding one id list per document, so NumPy
# stored it with pickle; recent NumPy refuses to load it unless
# allow_pickle=True. Only load files written by this notebook.
trn_ids = np.load(LM_PATH/'trn_ids.npy', allow_pickle=True)
# Join all documents into one flat id stream for language-model training.
np.save(LM_PATH/'trn_ids_concat.npy', np.concatenate(trn_ids))

In [None]:
# val_ids.npy is an object array holding one id list per document, so NumPy
# stored it with pickle; recent NumPy refuses to load it unless
# allow_pickle=True. Only load files written by this notebook.
val_ids = np.load(LM_PATH/'val_ids.npy', allow_pickle=True)
# Join all documents into one flat id stream for validation.
np.save(LM_PATH/'val_ids_concat.npy', np.concatenate(val_ids))