# Experiment 2 - LM with Sampled Data, Lowercased with BOS/EOS

In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
from fastai.text import *
import html
import pickle

  from numpy.core.umath_tests import inner1d


In [4]:
BOS = 'xbos'  # beginning-of-sentence tag
EOS = 'xeos'  # data field tag

PATH=Path('/home/paperspace/data/mimic-iii')
LM_PATH=PATH/'exp-2'

LM_PATH.mkdir(exist_ok=True)

In [5]:
df = pd.read_csv(PATH/'NOTEEVENTS.csv.gz')

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
notes = df.TEXT.str.lower().values

In [7]:
np.random.seed(42)
np.random.shuffle(notes)

In [8]:
trn_texts,val_texts = sklearn.model_selection.train_test_split(
    notes, test_size=0.1)

df_trn = pd.DataFrame({'text':trn_texts, 'labels':[0]*len(trn_texts)}, columns=['labels','text'])
df_val = pd.DataFrame({'text':val_texts, 'labels':[0]*len(val_texts)}, columns=['labels','text'])

df_trn['text'] = df_trn.text.str.replace('\n','')
df_val['text'] = df_val.text.str.replace('\n','')

df_trn.to_csv(LM_PATH/'train.csv', header=True, index=False)
df_val.to_csv(LM_PATH/'test.csv', header=True, index=False)

In [9]:
chunksize=20000

In [10]:
df_trn = pd.read_csv(LM_PATH/'train.csv', chunksize=chunksize, engine='python')
df_val = pd.read_csv(LM_PATH/'test.csv', chunksize=chunksize, engine='python')

In [23]:
def get_texts(df):
    texts = f'\n{BOS} ' + df.text.astype(str) + f' {EOS}'
    tok = Tokenizer.proc_all_mp(partition_by_cores(texts), lang='en')
    return tok

def get_all(df, name):
    for i, r in enumerate(df):
        print(i)
        tok_  = get_texts(r)
        #save the partial tokens instead of regrouping them in one big array.
        np.save(LM_PATH/f'{name}_tok{i}.npy', tok_)

In [24]:
get_all(df_trn,'trn')

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83


In [25]:
get_all(df_val,'tst')

0
1
2
3
4
5
6
7
8
9
10


In [26]:
def count_them_all(names):
    cnt = Counter()
    for name in names:
        for file in LM_PATH.glob(f'{name}_tok*'):
            tok = np.load(file)
            cnt_tok = Counter(word for sent in tok for word in sent)
            cnt += cnt_tok
    return cnt

In [27]:
cnt = count_them_all(['trn'])

In [28]:
cnt.most_common(n=50)

[('*', 38382786),
 ('.', 27018846),
 (':', 20976834),
 (',', 18188128),
 ('-', 16487644),
 ('/', 12633658),
 (']', 9740260),
 ('[', 9400807),
 ('and', 7999520),
 (')', 7949283),
 ('(', 7537086),
 ('to', 7340443),
 ('the', 7096614),
 ('of', 6447552),
 ('with', 5605332),
 ('for', 4195069),
 ('in', 4139239),
 ('is', 3669299),
 ('on', 3496859),
 ('no', 3449699),
 ('a', 2521034),
 ('was', 2271430),
 ('pt', 2207854),
 ('at', 2149678),
 ('2', 2006607),
 ('1', 1905333),
 ('am', 1879485),
 ('mg', 1873696),
 ('\n', 1676109),
 ('xbos', 1674985),
 ('xeos', 1674985),
 ('%', 1636825),
 ('left', 1600425),
 ('#', 1598717),
 ('this', 1593269),
 ('as', 1583053),
 ('name', 1569413),
 ('ml', 1511484),
 ('right', 1471611),
 ('3', 1331350),
 ('not', 1289241),
 ('patient', 1239536),
 ('there', 1204960),
 ('4', 1197226),
 ('last', 1194902),
 ('are', 1188600),
 ('tk_rep', 1181862),
 ('5', 1178812),
 ('or', 1176187),
 ('pm', 1159586)]

In [29]:
max_vocab = 60000
min_freq = 5

In [30]:
itos = [o for o,c in cnt.most_common(max_vocab) if c > min_freq]
itos.insert(0,'_pad_')
itos.insert(0,'_unk_')

In [31]:
len(itos)

60002

In [32]:
stoi = collections.defaultdict(int,{s:i for (i,s) in enumerate(itos)})

In [33]:
def numericalize(name, partial=True):
    results = []
    for index, file in enumerate(LM_PATH.glob(f'{name}_tok*')):
        print(index)
        tok = np.load(file)
        results.append(np.array([[stoi[word] for word in sent] for sent in tok]))
        
        if (index == 10) and (partial==True):
            break

    return np.concatenate(results)

In [34]:
trn_ids = numericalize('trn')
np.save(LM_PATH/'trn_ids.npy', trn_ids)

0
1
2
3
4
5
6
7
8
9
10


In [35]:
val_ids = numericalize('tst')
np.save(LM_PATH/'val_ids.npy', val_ids)

0
1
2
3
4
5
6
7
8
9
10


In [36]:
with open(LM_PATH/'itos.pickle', 'wb') as handle:
    pickle.dump(itos, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [37]:
with open(LM_PATH/'stoi.pickle', 'wb') as handle:
    pickle.dump(stoi, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [38]:
trn_ids = np.load(LM_PATH/'trn_ids.npy')
np.save(LM_PATH/'trn_ids_concat.npy', np.concatenate(trn_ids))

In [39]:
val_ids = np.load(LM_PATH/'val_ids.npy')
np.save(LM_PATH/'val_ids_concat.npy', np.concatenate(val_ids))