# Experiment 4 - Word-level LM with Sampled Data, Preprocessed

Using mimic3-utils (https://github.com/sudarshan85/mimic3-utils/), the data has been preprocessed to replace commonly redacted information with placeholder tokens.

The following notebook produces the same processed data that is loaded below:
https://github.com/sudarshan85/mimic3-utils/blob/master/mimic-notes-preprocess-from-file.ipynb

In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and use mode 2, which re-imports modules
# before each cell executes so edits to library code are picked up.
%reload_ext autoreload
%autoreload 2

In [2]:
from fastai.text import *
import html
import pickle

  from numpy.core.umath_tests import inner1d


In [3]:
# Filesystem layout: PATH is the data root, LM_PATH the working directory of
# this experiment (CSV inputs are read from it, token/id arrays and vocab
# pickles are written to it).
PATH = Path('/home/paperspace/data/mimic-iii')
LM_PATH = PATH / 'exp-4'
LM_PATH.mkdir(exist_ok=True)  # no error if the directory already exists

# Marker tokens inserted into the text before tokenization.
BOS = 'xbos'  # beginning-of-sentence tag
FLD = 'xfld'  # data field tag

In [5]:
# Number of CSV rows per chunk handed out by pd.read_csv.
chunksize = 20_000

In [6]:
# Because `chunksize` is passed, each call returns an iterator that yields
# DataFrame chunks of up to `chunksize` rows rather than one DataFrame; the
# chunks are consumed by the loop in `get_all`.
# NOTE(review): these readers are presumably single-pass -- re-run this cell
# before calling `get_all` a second time.
df_trn = pd.read_csv(LM_PATH/'train.csv', chunksize=chunksize, engine='python')
df_val = pd.read_csv(LM_PATH/'val.csv', chunksize=chunksize, engine='python')

In [7]:
def get_texts(df):
    """Tokenize the ``proc_text`` column of one DataFrame chunk.

    Every entry is cast to ``str`` and prefixed with a newline, the ``BOS``
    marker and a space before being tokenized.

    Args:
        df: DataFrame (chunk) exposing a ``proc_text`` column.

    Returns:
        The result of ``Tokenizer.proc_all_mp`` on the prefixed texts
        (presumably one token list per row -- confirm against fastai).
    """
    prefix = f'\n{BOS} '
    notes = df.proc_text.astype(str)
    # Split the texts into per-core batches for the multiprocess tokenizer.
    batches = partition_by_cores(prefix + notes)
    return Tokenizer.proc_all_mp(batches, lang='en')

def get_all(df, name):
    """Tokenize an iterable of DataFrame chunks, saving one file per chunk.

    Args:
        df: Iterable of DataFrame chunks (e.g. a chunked ``pd.read_csv``).
        name: Prefix for the output files; chunk ``i`` is written to
            ``LM_PATH/{name}_tok{i}.npy``.

    Prints the index of each chunk as it is processed. Returns ``None``.
    """
    for chunk_no, chunk in enumerate(df):
        print(chunk_no)
        tokens = get_texts(chunk)
        # Each chunk's tokens go to their own file rather than being
        # regrouped into one big in-memory array.
        out_file = LM_PATH/f'{name}_tok{chunk_no}.npy'
        np.save(out_file, tokens)

In [8]:
# Tokenize the training chunks; writes trn_tok<i>.npy files under LM_PATH and
# prints each chunk index as progress.
get_all(df_trn,'trn')

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66


In [9]:
# Tokenize the validation chunks; writes val_tok<i>.npy files under LM_PATH and
# prints each chunk index as progress.
get_all(df_val,'val')

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16


In [10]:
def count_them_all(names):
    """Count token frequencies over all saved token files of the given splits.

    Args:
        names: Iterable of file-name prefixes (e.g. ``['trn']``). For each
            prefix every file matching ``{name}_tok*`` under ``LM_PATH`` is
            read.

    Returns:
        A ``Counter`` mapping each token to its total number of occurrences.
    """
    cnt = Counter()
    for name in names:
        # glob() yields files in arbitrary order. Sorting (shorter names first,
        # then lexicographic, i.e. tok2 before tok10) fixes the order in which
        # tokens are first seen, which is what breaks ties in most_common().
        files = sorted(LM_PATH.glob(f'{name}_tok*'),
                       key=lambda p: (len(p.name), p.name))
        for file in files:
            # The token files hold pickled object arrays (ragged token lists);
            # NumPy >= 1.16.3 refuses to load those unless allow_pickle=True.
            # They are written by this notebook, so unpickling is trusted.
            tok = np.load(file, allow_pickle=True)
            # update() adds counts in place without rescanning the whole
            # counter, unlike `cnt += Counter(...)`.
            cnt.update(word for sent in tok for word in sent)
    return cnt

In [11]:
# Token frequencies are computed from the training split only.
cnt = count_them_all(['trn'])

In [12]:
# Inspect the 50 most frequent tokens as (token, count) pairs.
cnt.most_common(n=50)

[('\n ', 34164786),
 ('t_up', 30174417),
 ('.', 24853159),
 (':', 17153030),
 (',', 14379149),
 ('\n', 12111097),
 ('/', 9688055),
 ('and', 6589993),
 ('to', 5991476),
 ('the', 5908185),
 ('-', 5541930),
 ('of', 5245754),
 ('with', 4603970),
 (')', 4480676),
 ('(', 3975311),
 ('for', 3420196),
 ('in', 3383370),
 ('xxdate', 3099099),
 ('is', 2993496),
 ('on', 2913167),
 ('no', 2875740),
 ('patient', 2621691),
 ('\n\n ', 2423244),
 ('a', 2193193),
 ('\n\n', 2170884),
 ('was', 1947243),
 ('at', 1759823),
 ('mg', 1480230),
 ('left', 1378568),
 ('%', 1347992),
 ('xbos', 1328353),
 ('this', 1313900),
 ('as', 1291006),
 ('#', 1264849),
 ('right', 1264339),
 ('ml', 1185811),
 ('not', 1044926),
 ('xxmmdd', 989587),
 ('there', 987744),
 ('are', 985142),
 ('or', 984127),
 ('1', 982599),
 ('2', 972217),
 ('s', 935815),
 ('tk_rep', 929212),
 (';', 917933),
 ('p', 880212),
 ('*', 869060),
 ('from', 861849),
 ('chest', 815991)]

In [13]:
# Vocabulary limits: only the `max_vocab` most frequent tokens are considered,
# and a token is kept only if it occurs strictly more than `min_freq` times.
max_vocab = 60_000
min_freq = 5

In [14]:
# Index-to-string vocabulary: id 0 is the unknown token, id 1 the padding
# token, followed by the frequent tokens in descending order of count.
itos = ['_unk_', '_pad_']
itos.extend(tok for tok, freq in cnt.most_common(max_vocab) if freq > min_freq)

In [15]:
# Vocabulary size, including the two special tokens '_unk_' and '_pad_'.
len(itos)

60002

In [16]:
# String-to-index lookup, the inverse of `itos`. The int default factory makes
# any token missing from the vocabulary resolve to 0, the id of '_unk_'.
stoi = collections.defaultdict(int, zip(itos, range(len(itos))))

In [17]:
#with open(LM_PATH/'stoi.pickle','rb') as f:
#    stoi = pickle.load(f)

In [18]:
def numericalize(name, partial=True):
    """Convert the saved token files of one split to integer ids via ``stoi``.

    Args:
        name: File-name prefix of the split (e.g. ``'trn'``); files matching
            ``{name}_tok*`` under ``LM_PATH`` are processed in chunk order.
        partial: When true, stop after the first 11 files (indices 0-10) so
            that only a sample of the split is numericalized.

    Returns:
        A 1-D object ndarray holding one list of ids per document, in file
        order.

    Raises:
        ValueError: Raised by ``np.concatenate`` when no file matches ``name``.
    """
    # glob() yields files in arbitrary order; without sorting, `partial` would
    # pick a different, non-reproducible subset on each machine or run. The key
    # orders shorter names first so tok2 precedes tok10.
    files = sorted(LM_PATH.glob(f'{name}_tok*'),
                   key=lambda p: (len(p.name), p.name))

    results = []
    for index, file in enumerate(files):
        print(index)
        # Token files are pickled object arrays; NumPy >= 1.16.3 needs
        # allow_pickle=True to read them. The files are written by this
        # notebook, so unpickling them is trusted.
        tok = np.load(file, allow_pickle=True)

        # Fill an explicit 1-D object array: np.array() on ragged lists raises
        # on recent NumPy, and would produce a 2-D int array whenever all
        # documents in a file happen to have the same length.
        # NOTE: stoi is a defaultdict, so looking up an unseen word also
        # inserts it into stoi with value 0.
        ids = np.empty(len(tok), dtype=object)
        for row, sent in enumerate(tok):
            ids[row] = [stoi[word] for word in sent]
        results.append(ids)

        if partial and index == 10:
            break

    return np.concatenate(results)

In [19]:
# Numericalize the training tokens and persist them. `partial` is left at its
# default (True), so processing stops after the file with index 10.
trn_ids = numericalize('trn')
np.save(LM_PATH/'trn_ids.npy', trn_ids)

0
1
2
3
4
5
6
7
8
9
10


In [20]:
# Numericalize the validation tokens and persist them. `partial` is left at its
# default (True), so processing stops after the file with index 10.
val_ids = numericalize('val')
np.save(LM_PATH/'val_ids.npy', val_ids)

0
1
2
3
4
5
6
7
8
9
10


In [21]:
# Persist the index-to-string vocabulary so the ids can be decoded later.
with (LM_PATH/'itos.pickle').open('wb') as handle:
    pickle.dump(itos, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [22]:
# Persist the string-to-index mapping. stoi is a defaultdict(int), so the
# pickle also contains any entries added by earlier lookups of unseen words.
with (LM_PATH/'stoi.pickle').open('wb') as handle:
    pickle.dump(stoi, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [23]:
# Reload the per-document training ids and flatten them into one 1-D array.
# trn_ids.npy normally holds an object array (one id list per document), which
# NumPy >= 1.16.3 refuses to load unless allow_pickle=True. The file was written
# by this notebook, so unpickling it is trusted.
trn_ids = np.load(LM_PATH/'trn_ids.npy', allow_pickle=True)
np.save(LM_PATH/'trn_ids_concat.npy', np.concatenate(trn_ids))

In [24]:
# Reload the per-document validation ids and flatten them into one 1-D array.
# val_ids.npy normally holds an object array (one id list per document), which
# NumPy >= 1.16.3 refuses to load unless allow_pickle=True. The file was written
# by this notebook, so unpickling it is trusted.
val_ids = np.load(LM_PATH/'val_ids.npy', allow_pickle=True)
np.save(LM_PATH/'val_ids_concat.npy', np.concatenate(val_ids))