In [18]:
import torch
from torch import nn
import gensim
import fasttext
from gensim.models import FastText
import random
import operator
from nltk.tokenize import word_tokenize
import nltk

import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import pickle
import joblib

In [20]:
class Batch:
    """
    Struct containing batches info
    """

    def __init__(self):
        self.encoderSeqs = []
        self.encoder_lens = []
        self.label = []
        self.decoderSeqs = []
        self.targetSeqs = []
        self.decoder_lens = []
        self.rationals = []
        self.raw = []


class TextDataMimic:
    """Dataset class
    Warning: No vocabulary limit
    """

    def __init__(self, corpusname, datadir, taskname, trainLM=False, test_phase = True, big_emb = False, new_emb = True):

        """Load all conversations
        Args:
            args: parameters of the model
        """

        # Path variables
        if corpusname == 'cail':
            self.tokenizer = lambda x: list(jieba.cut(x))
        elif corpusname == 'mimic':
#             self.tokenizer = word_tokenize
            self.tokenizer = nltk.RegexpTokenizer(r"\w+").tokenize
            print("self tokenizer is: ", self.tokenizer)

        self.datadir = datadir
        self.taskname = taskname
        self.basedir = './data/mimic3/'
        
        #set embedding files
        self.big_emb = big_emb
        self.new_emb = new_emb
        print("self.new emb in textData is: ", self.new_emb)
        print("self.big emb in textData is: ", self.big_emb)
        if self.new_emb:
            print("using new strict embeddings")
            self.embfile = "./data/mimic3/new_mimic_word2vec_200_strict.model"

        else:
            print("using original embeddings")
            self.embfile = "../clinicalBERT/word2vec+fastText/word2vec+fastText/word2vec.model"
        # self.embfile = embedding_file
        print(f"using this embedding model:{self.embfile} ")
        # self.embfile = "./data/mimic3/new_mimic_word2vec.model"
        self.big_embfile = "../clinicalBERT/word2vec+fastText/BioWordVec_PubMed_MIMICIII_d200.vec.bin"
        if test_phase:
            self.test_phase = True
        else:
            self.test_phase = False
            
        

        self.trainingSamples = []  # 2d array containing each question and his answer [[input,target]]
        if not trainLM:
            self.datasets = self.loadCorpus_Mimic3()
        else:
            # TODO need to adapt below to load mimic3 data in a way ready for the language modelling (if we end up using)
            self.datasets = self.load_all_mimic()

        print('set')
        # Plot some stats:
        self._printStats(corpusname)

        #         if args['playDataset']:
        #             self.playDataset()

        self.batches = {}

    def loadCorpus_Mimic3(self):
        """
        Load/create the mimic 3 dataset
        """

        #         self.datadir = '../clinicalBERT/data/discharge/'
        self.corpus_file_train = self.datadir + self.taskname + '/train.csv'
        self.corpus_file_dev = self.datadir + self.taskname + '/val.csv'
        self.corpus_file_test = self.datadir + self.taskname + '/test.csv'
        
        if self.big_emb:
            self.data_dump_path = f"{self.basedir}/mimic3_processed_bigembed_{self.taskname}.pkl"
        elif self.new_emb:
            self.data_dump_path = f"{self.basedir}/mimic3_processed_new200strict_{self.taskname}.pkl"
        else:  
            self.data_dump_path = f"{self.basedir}/mimic3_processed_originalembs_{self.taskname}.pkl"
#             self.data_dump_path = f"{self.basedir}/mimic3_processed_{self.taskname}.pkl"

        print(self.data_dump_path)
        datasetExist = os.path.isfile(self.data_dump_path)

        if not datasetExist:  # First time we load the database: creating all files
            print('Training data not found. Creating dataset...')

            total_words = []

            # index2word, index2vec, word2vec, key2index
            if self.big_emb:
                print("using big boy embeddings!")
                self.org_index2word, self.org_index2vec, self.org_word2vec, self.org_key2index = self.get_word2vec_from_pretrained(
                    self.big_embfile)
            if self.new_emb:
                print("using new 200d embeddings from: ", self.embfile)
                self.org_index2word, self.org_index2vec, self.org_word2vec, self.org_key2index = self.get_word2vec_from_pretrained(
                self.embfile)
                
            else:
                print("using original embeddings! from: ", self.embfile)
                self.org_index2word, self.org_index2vec, self.org_word2vec, self.org_key2index = self.get_word2vec_from_pretrained(
                self.embfile)
                

            # need to re order these to have special tokens same as beer dataset
            # ord_word2index, ord_index2word, ord_index2vector
            self.word2index, self.index2word, self.index2vector = self.rearrange_word2vec(self.org_index2word,
                                                                                          self.org_index2vec,
                                                                                          self.org_word2vec,
                                                                                          self.org_key2index)
            # get the set of these index2words - words in index position essentially
            self.index2word_set = set(self.index2word)

            #             print("self index 2 word :", self.index2word)
            #             print("self word 2 index : ", self.word2index)

            datasets = self.format_mimic_datasets()

            print(len(datasets['train']), len(datasets['dev']), len(datasets['test']))

            #             # self.raw_sentences = copy.deepcopy(dataset)
            #             for setname in ['train', 'dev', 'test']:
            #                 dataset[setname] = [(self.TurnWordID(sen), y, sen, rational) for sen, y, rational in tqdm(dataset[setname])]

            # Saving
            print('Saving dataset...')
            self.save_mimic_datasets(datasets, self.data_dump_path)  # Saving tf samples
        else:
            print(f"Found already saved data at {self.data_dump_path}! Loading that instead")
            datasets = self.loadDataset(self.data_dump_path)
            print('loaded')

        return datasets

    def loadDataset(self, filename):
        """
        Load samples from file
        Args:
            filename (str): pickle filename

        """
        dataset_path = os.path.join(filename)
        print('Loading dataset from {}'.format(dataset_path))
        with open(dataset_path, 'rb') as handle:
            data = pickle.load(handle)  # Warning: If adding something here, also modifying saveDataset
            self.word2index = data['word2index']
            self.index2word = data['index2word']
            self.index2vector = data['index2vector']
            self.org_index2word = data['org_index2word']
            self.org_index2vector = data['org_index2vector']
            self.org_key2index = data['org_key2index']
            self.org_word2vec = data['org_word2vec']

            datasets = data['datasets']
        if self.test_phase:
            test_datasets = {}
            test_datasets['train'] = datasets['train'][0:500]
            test_datasets['dev'] = datasets['dev'][0:500]
            test_datasets['test'] = datasets['test'][0:500]
            self.index2word_set = set(self.index2word)
            print('training: \t', len(test_datasets['train']))
            print('dev: \t', len(test_datasets['dev']))
            print('testing: \t', len(test_datasets['test']))
            self.index2word_set = set(self.index2word)
            print('w2i shape: ', len(self.word2index))
            print('i2w shape: ', len(self.index2word))
            print('embeding shape: ', self.index2vector.shape)
            
            return test_datasets
        else:   
            print('training: \t', len(datasets['train']))
            print('dev: \t', len(datasets['dev']))
            print('testing: \t', len(datasets['test']))
            self.index2word_set = set(self.index2word)
            print('w2i shape: ', len(self.word2index))
            print('i2w shape: ', len(self.index2word))
            print('embeding shape: ', self.index2vector.shape)
            return datasets

    def get_word2vec_from_pretrained(self, filename):
        
        if self.big_emb:
            model = gensim.models.KeyedVectors.load_word2vec_format(f'{filename}', binary=True)
            # load in gensim format
            weights = model

            # convert to tensor for pytorch
            weights = torch.FloatTensor(model.vectors)
            print("weights vector shape: ", weights.shape)

            # convert to embedding layer
            embedding = nn.Embedding.from_pretrained(weights)

            print("embedding tensor shape: ", embedding)

            words = model.index_to_key
            # get the word2vec dictionary {'word':vector}
            word2vec = {word: model[word] for word in words}

            # get the vec2index array - essentially the vector arrays are in the index position corresponding to the word in word2vec
            index2vec = model.vectors

            # index to words - just a list of words in correct index position

            index2word = words
            print("length of indices to words: ", len(index2word))

            key2index = model.key_to_index
            #         self.index2word = index2word
            #         self.index2vec = index2vec
            #         self.word2vec = word2vec
            return index2word, index2vec, word2vec, key2index
        else:
            model = gensim.models.KeyedVectors.load(f'{filename}')

      


            # load in gensim format
            weights = model.wv

            # convert to tensor for pytorch
            weights = torch.FloatTensor(model.wv.vectors)
            print("weights vector shape: ", weights.shape)

            # convert to embedding layer
            embedding = nn.Embedding.from_pretrained(weights)

            print("embedding tensor shape: ", embedding)

            words = model.wv.index_to_key
            # get the word2vec dictionary {'word':vector}
            word2vec = {word: model.wv[word] for word in words}

            # get the vec2index array - essentially the vector arrays are in the index position corresponding to the word in word2vec
            index2vec = model.wv[model.wv.index_to_key]

            # index to words - just a list of words in correct index position

            index2word = words
            print("length of indices to words: ", len(index2word))
            
            

            key2index = model.wv.key_to_index
            #         self.index2word = index2word
            #         self.index2vec = index2vec
            #         self.word2vec = word2vec
            return index2word, index2vec, word2vec, key2index

    def rearrange_word2vec(self, index2word, index2vec, word2vec, key2index):

        word2index = dict()
        #if using the bigger embeddings - we already have tokens for fullstops - so only want to specify these special tokens
        if self.big_emb:
            print("re-arranging the big embeddings")
            word2index['PAD'] = 0
            word2index['START_TOKEN'] = 1
            word2index['END_TOKEN'] = 2
            word2index['UNK'] = 3            

            #start the counter/embedding ID at 1 more than the newly added special tokens
            cnt = 4
            index2vector = []
            for word in word2vec:
                index2vector.append(word2vec[word])

                word2index[word] = cnt
                #             print(word, cnt)
                cnt += 1
            vectordim = len(word2vec[word])
            print('before add special token:', len(index2vector))
            index2vector = [np.random.normal(size=[vectordim]).astype('float32') for _ in range(4)] + index2vector
            print('after add special token:', len(index2vector))
            index2vector = np.asarray(index2vector)
            index2word = [w for w, n in word2index.items()]
            print(len(word2index), cnt)
            print('Dictionary Got!')
            return word2index, index2word, index2vector
        elif self.new_emb:
            print("re-arranging the newer embeddings")
            # for the smaller embeddings file - it did not seem to know fullstops - so we explicitly add it here
            word2index['PAD'] = 0
            word2index['START_TOKEN'] = 1
            word2index['END_TOKEN'] = 2
            word2index['UNK'] = 3
#             word2index['.'] = 4
            # word2index['PAD'] = 1
            # word2index['UNK'] = 0

            cnt = 4
            index2vector = []
            for word in word2vec:
                index2vector.append(word2vec[word])

                word2index[word] = cnt
                #             print(word, cnt)
                cnt += 1
            vectordim = len(word2vec[word])
            print('before add special token:', len(index2vector))
            index2vector = [np.random.normal(size=[vectordim]).astype('float32') for _ in range(4)] + index2vector
            print('after add special token:', len(index2vector))
            index2vector = np.asarray(index2vector)
            index2word = [w for w, n in word2index.items()]
            print(len(word2index), cnt)
            print('Dictionary Got!')
            return word2index, index2word, index2vector
        else:
            print("re-arranging the smaller embeddings")
            # for the smaller embeddings file - it did not seem to know fullstops - so we explicitly add it here
            word2index['PAD'] = 0
            word2index['START_TOKEN'] = 1
            word2index['END_TOKEN'] = 2
            word2index['UNK'] = 3
            word2index['.'] = 4
            # word2index['PAD'] = 1
            # word2index['UNK'] = 0

            cnt = 5
            index2vector = []
            for word in word2vec:
                index2vector.append(word2vec[word])

                word2index[word] = cnt
                #             print(word, cnt)
                cnt += 1
            vectordim = len(word2vec[word])
            print('before add special token:', len(index2vector))
            index2vector = [np.random.normal(size=[vectordim]).astype('float32') for _ in range(5)] + index2vector
            print('after add special token:', len(index2vector))
            index2vector = np.asarray(index2vector)
            index2word = [w for w, n in word2index.items()]
            print(len(word2index), cnt)
            print('Dictionary Got!')
            return word2index, index2word, index2vector
            

    # ord_word2index, ord_index2word, ord_index2vector = rearrange_word2vec(index2word, index2vec, word2vec, key2index)

    def get_word_ids(self, text):

#         print("getting word ids for each token in provided sentences!")

        #         index2word_set = set(self.index2word)
        #     print(text)
        res = []
        for token in text:
            #         print(token)
            if token in self.index2word_set:
                #             print(token)
                #             print(ord_word2index[token])
                word_id = self.word2index[token]
                res.append(word_id)
            else:
                res.append(self.word2index["UNK"])

        return res

    # test_data["word_ids"] = test_data['tokenized_text'].apply(get_word_ids)

    def format_mimic_datasets(self):

        dataset = dict()

        dataset["train"] = pd.read_csv(self.corpus_file_train, index_col=0)
        #     train_df["word_ids"] =

        dataset["dev"] = pd.read_csv(self.corpus_file_dev, index_col=0)

        dataset["test"] = pd.read_csv(self.corpus_file_test, index_col=0)

        setnames = ["train", "dev", "test"]

        for setname in setnames:
            print("working on: ", setname)
            df = dataset[setname]
            print(df.head())
            df["tokenized_text"] = df.TEXT.apply(self.tokenizer)
            df["word_ids"] = df["tokenized_text"].apply(self.get_word_ids)
            df["rational"] = -1
            dataset[setname] = np.asarray(df[["word_ids", 'Label', 'tokenized_text', "rational"]])

        print("training data size: ", dataset["train"].shape)
        print("dev data size: ", dataset["dev"].shape)
        print("test data size: ", dataset["test"].shape)
        return dataset

    def save_mimic_datasets(self, datasets, dump_path):

        all_data = {}
        all_data["word2index"] = self.word2index
        all_data["index2word"] = self.index2word
        all_data["index2vector"] = self.index2vector
        all_data["org_index2word"] = self.org_index2word
        all_data["org_index2vector"] = self.org_index2vec
        all_data["org_key2index"] = self.org_key2index
        all_data["org_word2vec"] = self.org_word2vec

        all_data["datasets"] = datasets

        #         # Create output directory if needed
        #         if not os.path.exists(self.):
        #             os.makedirs(output_dir)
        
        if self.big_emb:
            with open(f'{dump_path}', 'wb') as handle:
                joblib.dump(all_data, handle)
        else:
            with open(f'{dump_path}', 'wb') as handle:
                pickle.dump(all_data, handle, -1)

    def load_all_mimic(self):
        
        if self.new_emb:
            self.data_dump_path1 = self.basedir + '/mimic3_processed_new200strict_3days.pkl'
            self.data_dump_path2 = self.basedir + '/mimic3_processed_new200strict_discharge.pkl'
            self.data_dump_all_path = self.basedir + '/mimic3_processed_new200strict_all.pkl'
            
        else:
            self.data_dump_path1 = self.basedir + '/mimic3_processed_originalembs_3days.pkl'
            self.data_dump_path2 = self.basedir + '/mimic3_processed_originalembs_discharge.pkl'

            self.data_dump_all_path = self.basedir + '/mimic3_processed_originalembs_all.pkl'




        datasetExist = os.path.isfile(self.data_dump_all_path)

        if not datasetExist:  # First time we load the database: creating all files
            print('Already processed combined data not found. Creating dataset of all combined...')

            data = {'train': [], 'dev': [], 'test': []}
            d1 = self.loadDataset(self.data_dump_path1)
            d2 = self.loadDataset(self.data_dump_path2)

            # index2word, index2vec, word2vec, key2index
            self.org_index2word, self.org_index2vec, self.org_word2vec, self.org_key2index = self.get_word2vec_from_pretrained(
                self.embfile)

            # need to re order these to have special tokens same as beer dataset
            # ord_word2index, ord_index2word, ord_index2vector
            self.word2index, self.index2word, self.index2vector = self.rearrange_word2vec(self.org_index2word,
                                                                                          self.org_index2vec,
                                                                                          self.org_word2vec,
                                                                                          self.org_key2index)
            # get the set of these index2words - words in index position essentially
            self.index2word_set = set(self.index2word)

            data['train'] = np.asarray(pd.concat([pd.DataFrame(d1['train']), pd.DataFrame(d2['train'])]))
            data['dev'] = np.asarray(pd.concat([pd.DataFrame(d1['dev']), pd.DataFrame(d2['dev'])]))
            data['test'] = np.asarray(pd.concat([pd.DataFrame(d1['test']), pd.DataFrame(d2['test'])]))

            # save
            print("saving the combined mimic dataset")
            self.save_mimic_datasets(data, self.data_dump_all_path)
        else:
            print("already combined them so loading that instead! ")
            data = self.loadDataset(self.data_dump_all_path)

        return data

    def _printStats(self, corpusname):
        print('Loaded {}: {} words, {} QA'.format(corpusname, len(self.word2index), len(self.trainingSamples)))

    def shuffle(self):
        """Shuffle the training samples
        """
        print('Shuffling the dataset...')
        random.shuffle(self.datasets['train'])

    def getSampleSize(self, setname='train'):
        """Return the size of the dataset
        Return:
            int: Number of training samples
        """
        return len(self.datasets[setname])

    def getVocabularySize(self):
        """Return the number of words present in the dataset
        Return:
            int: Number of word on the loader corpus
        """
        return len(self.word2index)

    def sequence2str(self, sequence, clean=False, reverse=False):
        """Convert a list of integer into a human readable string
        Args:
            sequence (list<int>): the sentence to print
            clean (Bool): if set, remove the <go>, <pad> and <eos> tokens
            reverse (Bool): for the input, option to restore the standard order
        Return:
            str: the sentence
        """

        if not sequence:
            return ''

        if not clean:
            return ' '.join([self.index2word[idx] for idx in sequence])

        sentence = []
        for wordId in sequence:
            if wordId == self.word2index['END_TOKEN']:  # End of generated sentence
                break
            elif wordId != self.word2index['PAD'] and wordId != self.word2index['START_TOKEN']:
                sentence.append(self.index2word[wordId])

        if reverse:  # Reverse means input so no <eos> (otherwise pb with previous early stop)
            sentence.reverse()

        return self.detokenize(sentence)

    def detokenize(self, tokens):
        """Slightly cleaner version of joining with spaces.
        Args:
            tokens (list<string>): the sentence to print
        Return:
            str: the sentence
        """
        return ''.join([
            ' ' + t if not t.startswith('\'') and
                       t not in string.punctuation
            else t
            for t in tokens]).strip().capitalize()

    def batchSeq2str(self, batchSeq, seqId=0, **kwargs):
        """Convert a list of integer into a human readable string.
        The difference between the previous function is that on a batch object, the values have been reorganized as
        batch instead of sentence.
        Args:
            batchSeq (list<list<int>>): the sentence(s) to print
            seqId (int): the position of the sequence inside the batch
            kwargs: the formatting options( See sequence2str() )
        Return:
            str: the sentence
        """
        sequence = []
        for i in range(len(batchSeq)):  # Sequence length
            sequence.append(batchSeq[i][seqId])
        return self.sequence2str(sequence, **kwargs)

    def sentence2batch(self, sentence):
        """Encode a sequence and return a batch as an input for the model
        Return:
            Batch: a batch object containing the sentence, or none if something went wrong
        """
        print("the sentence was: ", sentence)
        if sentence == '':
            return None

        # First step: Divide the sentence in token
        tokens = nltk.word_tokenize(sentence)
        if len(tokens) > 512:
            return None
        
        print("tokens are: ", tokens)
        # Second step: Convert the token in word ids
#         wordIds = []
#         for token in tokens:
#             print(f"converting token {token} to word ID")
#             wordIds.append(self.get_word_ids(token))  # Create the vocabulary and the training sentences

        wordIds = self.get_word_ids(tokens)
        
        print("wordIds after gettitng ids: ", wordIds)
        # Third step: creating the batch (add padding, reverse)
#         batch = self._createBatch([[wordIds, []]])  # Mono batch, no target output
        batch = Batch()
        
        batch.encoderSeqs.append(wordIds)
        batch.encoder_lens.append(len(wordIds))
        batch.label.append([0])
        batch.rationals.append([-1])
        batch.raw.append(tokens)
        
        maxlen_enc = 512
        batch.encoderSeqs[0] = batch.encoderSeqs[0] + [self.word2index['PAD']] * (
                maxlen_enc - len(batch.encoderSeqs[0]))

        return batch

    def deco2sentence(self, decoderOutputs):
        """Decode the output of the decoder and return a human friendly sentence
        decoderOutputs (list<np.array>):
        """
        sequence = []

        # Choose the words with the highest prediction score
        for out in decoderOutputs:
            sequence.append(np.argmax(out))  # Adding each predicted word ids

        return sequence  # We return the raw sentence. Let the caller do some cleaning eventually

    def _createBatch(self, samples):
        """Create a single batch from the list of sample. The batch size is automatically defined by the number of
        samples given.
        The inputs should already be inverted. The target should already have <go> and <eos>
        Warning: This function should not make direct calls to args['batchSize'] !!!
        Args:
            samples (list<Obj>): a list of samples, each sample being on the form [input, target]
        Return:
            Batch: a batch object en
        """

        batch = Batch()
        batchSize = len(samples)

        # Create the batch tensor
        for i in range(batchSize):
            # Unpack the sample
            sen_ids, y, raw_sen, rational = samples[i]

            if len(sen_ids) > 512:
                sen_ids = sen_ids[:512]

            batch.encoderSeqs.append(sen_ids)
            batch.encoder_lens.append(len(batch.encoderSeqs[i]))
            batch.label.append(y)
            batch.rationals.append(rational)
            batch.raw.append(raw_sen)
            # print(y)

        maxlen_enc = max(batch.encoder_lens)

        for i in range(batchSize):
            batch.encoderSeqs[i] = batch.encoderSeqs[i] + [self.word2index['PAD']] * (
                        maxlen_enc - len(batch.encoderSeqs[i]))

        return batch

    def getBatches(self, setname='train'):
        """Prepare the batches for the current epoch
        Return:
            list<Batch>: Get a list of the batches for the next epoch
        """
        print("getting batches for :", setname)
        if setname not in self.batches:
            # self.shuffle()

            batches = []
            print(setname, 'size:', len(self.datasets[setname]))

            def genNextSamples():
                """ Generator over the mini-batch training samples
                """
                for i in range(0, self.getSampleSize(setname), 512):
                    yield self.datasets[setname][i:min(i + 512, self.getSampleSize(setname))]

            # TODO: Should replace that by generator (better: by tf.queue)

            for index, samples in enumerate(genNextSamples()):
                # print([self.index2word[id] for id in samples[5][0]], samples[5][2])
                batch = self._createBatch(samples)
                batches.append(batch)

            self.batches[setname] = batches

        # print([self.index2word[id] for id in batches[2].encoderSeqs[5]], batches[2].raws[5])
        return self.batches[setname]


    def _createBatch_forLM(self, samples):
        """Create a single batch from the list of sample. The batch size is automatically defined by the number of
        samples given.
        The inputs should already be inverted. The target should already have <go> and <eos>
        Warning: This function should not make direct calls to args['batchSize'] !!!
        Args:
            samples (list<Obj>): a list of samples, each sample being on the form [input, target]
        Return:
            Batch: a batch object en
        """

        batch = Batch()
        batchSize = len(samples)

        # Create the batch tensor
        for i in range(batchSize):
            # Unpack the sample
            sen_ids = samples[i]
            if len(sen_ids) > args['maxLengthEnco']:
                sen_ids = sen_ids[:args['maxLengthEnco']]
            batch.decoderSeqs.append([self.word2index['START_TOKEN']] + sen_ids)
            batch.decoder_lens.append(len(batch.decoderSeqs[i]))
            batch.targetSeqs.append(sen_ids + [self.word2index['END_TOKEN']])

        # print(batch.decoderSeqs)
        # print(batch.decoder_lens)
        maxlen_dec = max(batch.decoder_lens)
        maxlen_dec = min(maxlen_dec, args['maxLengthEnco'])

        for i in range(batchSize):
            batch.decoderSeqs[i] = batch.decoderSeqs[i] + [self.word2index['PAD']] * (maxlen_dec - len(batch.decoderSeqs[i]))
            batch.targetSeqs[i] = batch.targetSeqs[i] + [self.word2index['PAD']] * (maxlen_dec - len(batch.targetSeqs[i]))

        return batch

    def getBatches_forLM(self, setname='train'):
        """Prepare the batches for the current epoch
        Return:
            list<Batch>: Get a list of the batches for the next epoch
        """
        if setname not in self.batches:
            self.shuffle()

            dataset_sen = self.paragraph2sentence(self.datasets[setname])
            sennum = len(dataset_sen)
            print(sennum)

            batches = []
            print(len(self.datasets[setname]))

            def genNextSamples():
                """ Generator over the mini-batch training samples
                """
                for i in range(0, sennum, args['batchSize']):
                    yield dataset_sen[i:min(i + args['batchSize'], sennum)]

            # TODO: Should replace that by generator (better: by tf.queue)

            for index, samples in enumerate(genNextSamples()):
                # print([self.index2word[id] for id in samples[5][0]], samples[5][2])
                batch = self._createBatch_forLM(samples)
                batches.append(batch)

            self.batches[setname] = batches

        # print([self.index2word[id] for id in batches[2].encoderSeqs[5]], batches[2].raws[5])
        return self.batches[setname]

    def paragraph2sentence(self, doclist):
        split_tokens = [self.word2index['.']]
        print("split tokens: ", split_tokens)
        sen_list = []
        for sen_ids, y, raw_sen, rational in doclist:
            start = 0
            for ind, w in enumerate(sen_ids):
                if w in split_tokens:
                    sen_list.append(sen_ids[start:ind + 1])
                    start = ind + 1

            if start < len(sen_ids) - 1:
                sen_list.append(sen_ids[start:])

        return sen_list

In [None]:
# dataset_2day = TextDataMimic("mimic", "../clinicalBERT/data/", "2days", trainLM = False, test_phase = False,big_emb = False)

In [11]:
dataset_3day = TextDataMimic("mimic", "../clinicalBERT/data/", "3days", "./data/mimic3/new_mimic_word2vec_200.model",trainLM = False, test_phase = False,big_emb = False, new_emb = True)

self tokenizer is:  <bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='\\w+', gaps=False, discard_empty=True, flags=<RegexFlag.UNICODE|DOTALL|MULTILINE: 56>)>
using this embedding model: ./data/mimic3/new_mimic_word2vec_200.model
using this word2vec model: ./data/mimic3/new_mimic_word2vec_200.model
./data/mimic3//mimic3_processed_new200_3days.pkl
Training data not found. Creating dataset...
using new 200d embeddings from:  ./data/mimic3/new_mimic_word2vec_200.model
weights vector shape:  torch.Size([337754, 200])
embedding tensor shape:  Embedding(337754, 200)
length of indices to words:  337754
re-arranging the newer embeddings
before add special token: 337754
after add special token: 337758
337758 337758
Dictionary Got!
working on:  train
         ID                                               TEXT  Label
0  129458.0  have a significant family history of cancer. o...    0.0
1  104352.0  per protocol. social: wife & family in to visi...    0.0
2  198360.0  9/ her bil

In [22]:
dataset_discharge = TextDataMimic("mimic", "../clinicalBERT/data/", "discharge",trainLM = False, test_phase = False,big_emb = False, new_emb = True)




self tokenizer is:  <bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='\\w+', gaps=False, discard_empty=True, flags=<RegexFlag.UNICODE|DOTALL|MULTILINE: 56>)>
self.new emb in textData is:  True
self.big emb in textData is:  False
using new strict embeddings
using this embedding model:./data/mimic3/new_mimic_word2vec_200_strict.model 
./data/mimic3//mimic3_processed_new200strict_discharge.pkl
Found already saved data at ./data/mimic3//mimic3_processed_new200strict_discharge.pkl! Loading that instead
Loading dataset from ./data/mimic3//mimic3_processed_new200strict_discharge.pkl
training: 	 26245
dev: 	 3037
testing: 	 3063
w2i shape:  35790
i2w shape:  35790
embeding shape:  (35790, 200)
loaded
set
Loaded mimic: 35790 words, 0 QA


In [13]:
dataset_all =  TextDataMimic("mimic", "../clinicalBERT/data/", "discharge", "./data/mimic3/new_mimic_word2vec_200.model",trainLM = True, test_phase = False,big_emb = False, new_emb = True)

self tokenizer is:  <bound method RegexpTokenizer.tokenize of RegexpTokenizer(pattern='\\w+', gaps=False, discard_empty=True, flags=<RegexFlag.UNICODE|DOTALL|MULTILINE: 56>)>
using this embedding model: ./data/mimic3/new_mimic_word2vec_200.model
using this word2vec model: ./data/mimic3/new_mimic_word2vec_200.model
Already processed combined data not found. Creating dataset of all combined...
Loading dataset from ./data/mimic3//mimic3_processed_new200_3days.pkl
training: 	 47793
dev: 	 5774
testing: 	 5441
w2i shape:  337758
i2w shape:  337758
embeding shape:  (337758, 200)
Loading dataset from ./data/mimic3//mimic3_processed_new200_discharge.pkl
training: 	 26245
dev: 	 3037
testing: 	 3063
w2i shape:  337758
i2w shape:  337758
embeding shape:  (337758, 200)
weights vector shape:  torch.Size([337754, 200])
embedding tensor shape:  Embedding(337754, 200)
length of indices to words:  337754
re-arranging the newer embeddings
before add special token: 337754
after add special token: 337758

In [55]:
sample_words = dataset_discharge.sentence2enco("they were in pain")

the sentence was:  they were in pain
tokens are:  ['they', 'were', 'in', 'pain']
getting word ids for each token in provided sentences!
wordIds after gettitng ids:  [778, 54, 10, 61]


In [56]:
sample_words.encoderSeqs

[[778,
  54,
  10,
  61,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,


In [5]:
pd.DataFrame(dataset_discharge.datasets["test"])

Unnamed: 0,0,1,2,3
0,"[115, 8, 438, 487, 3, 211, 670, 146, 1658, 475...",1.0,"[date, of, birth, sex, f, service, medicine, a...",-1
1,"[335, 329, 5, 18, 126, 14, 21, 12, 82, 18, 44,...",1.0,"[pneumonia, echo, the, left, atrium, is, norma...",-1
2,"[219, 1667, 696, 161, 151, 22, 70, 499, 121, 9...",1.0,"[may, recommend, lisinopril, low, dose, as, an...",-1
3,"[342, 2306, 185, 81, 43, 557, 8, 275, 444, 613...",1.0,"[should, develop, any, chest, pain, shortness,...",-1
4,"[115, 8, 438, 487, 3, 211, 670, 146, 1475, 146...",1.0,"[date, of, birth, sex, f, service, medicine, a...",-1
...,...,...,...,...
3058,"[7, 5, 138, 9, 336, 3069, 112, 97, 1515, 3896,...",1.0,"[to, the, hospital, with, multiple, injuries, ...",-1
3059,"[115, 8, 438, 487, 3, 211, 1765, 146, 11, 310,...",1.0,"[date, of, birth, sex, m, service, cardiothora...",-1
3060,"[352, 3, 361, 3, 3, 3, 460, 3, 3, 3, 580, 3, 3...",1.0,"[creat, 5, na, 137, k, 8, cl, 98, hco3, 30, an...",-1
3061,"[15, 41, 1535, 6, 5, 3820, 26, 10, 13, 19, 107...",1.0,"[for, his, cardiomyopathy, and, the, torsemide...",-1


In [72]:
dataset_discharge.org_index2vec[125]

array([-2.28249654e-01, -4.68891412e-02,  2.26994658e+00, -2.89653873e+00,
        1.67675793e+00, -2.79883528e+00,  1.78039038e+00,  5.36802053e+00,
        1.21413946e-01, -2.51195669e+00, -1.35765421e+00,  2.31679964e+00,
        2.20072556e+00, -3.69672465e+00, -5.31100130e+00,  1.21236646e+00,
       -6.59844995e-01,  2.64934945e+00,  3.82705092e-01,  5.35959065e-01,
        1.39937353e+00, -5.05955279e-01, -3.97054739e-02,  2.11853743e+00,
       -4.08448744e+00, -5.45036614e-01, -2.35644531e+00, -6.69783688e+00,
       -5.02659619e-01, -1.01595497e+00, -1.59731817e+00, -7.83136606e-01,
       -4.45203114e+00, -2.17383671e+00,  1.32144475e+00, -1.00786495e+00,
       -1.98650372e+00, -3.11862379e-01, -3.72263336e+00, -3.23598576e+00,
        5.90170026e-01, -7.41553247e-01, -9.54673350e-01, -1.22892344e+00,
        1.37275016e+00, -7.92469203e-01,  1.99579740e+00,  7.79799521e-01,
        3.45572519e+00,  1.38466156e+00, -1.09354055e+00,  1.50134194e+00,
       -1.23756599e+00, -

In [6]:
dataset_discharge.org_key2index["lung"]

193

In [7]:
dataset_discharge.org_word2vec['lung']

array([ 3.7347727 ,  3.5503638 ,  0.8790663 ,  3.4034002 , 10.375045  ,
       -0.8688268 ,  7.0506315 ,  2.4350076 , -4.912821  , -2.6350112 ,
        1.9429346 , -2.1650126 , -0.36405718,  6.9815807 ,  2.3783052 ,
       -5.794677  , -0.80958456, -3.042608  ,  3.1388557 , -4.7098594 ,
       -2.8478985 , -8.231066  ,  5.86091   ,  3.0265362 ,  3.0509253 ,
       -1.4302505 ,  6.2332954 ,  1.5747749 , -4.6630526 , -2.8885405 ,
       -2.690876  , -4.5650444 ,  1.6220841 ,  2.5244405 ,  1.8935099 ,
       -3.645891  , -4.105331  , -6.1624126 , -5.758234  ,  1.8119708 ,
        1.677489  ,  4.726636  ,  4.959354  ,  0.585234  , -7.4941664 ,
       -3.3074071 , -4.722119  , -1.0718637 , -1.1930306 ,  1.9402883 ,
       12.724307  ,  8.109781  , -5.707921  , -3.5645418 ,  2.7685587 ,
       -1.9491826 , -0.98713636,  5.3647323 ,  7.825179  ,  3.7673612 ,
        1.82831   ,  4.2955847 ,  1.4217446 , -2.3034778 ,  0.20594694,
        5.561591  ,  4.3498645 ,  0.5490263 ,  6.6431026 ,  6.54

In [74]:
dataset_discharge.word2vec

AttributeError: 'TextDataMimic' object has no attribute 'word2vec'

In [62]:
dataset_discharge.index2vector[198]

array([ 0.9222753 ,  4.4364567 , -0.87041205, -0.07489344,  1.1852257 ,
        2.874645  , -0.3279756 ,  1.1363229 ,  3.9213395 , -1.688579  ,
       -1.6212999 ,  3.3523107 ,  0.1367919 , -2.5868292 , -2.3092573 ,
        0.3351141 ,  1.9077681 ,  0.30085286,  1.4929311 ,  0.943036  ,
        0.4660683 ,  2.3497021 , -2.335803  , -2.6909966 ,  1.5033249 ,
       -1.22547   ,  1.2248974 , -3.9842522 ,  0.258253  ,  3.541843  ,
       -0.81235725, -0.24457139,  2.9789517 , -0.04412545, -2.085483  ,
       -0.4433359 ,  1.775197  , -0.805354  , -0.47864437,  1.5223753 ,
       -1.999852  , -1.4827038 , -0.7262382 ,  0.98203486,  2.2530682 ,
        1.2219807 , -1.3446138 , -0.09033704,  1.797854  ,  0.1976881 ,
        0.9246443 ,  0.30661616,  2.7923555 ,  1.4555554 ,  1.0013268 ,
        0.79890704, -1.0372248 , -2.8516705 ,  1.0711282 ,  0.2797041 ,
        3.0681953 , -1.4881992 , -0.12178833, -0.979855  ,  1.491791  ,
       -1.8726189 , -1.9570696 , -0.21535124, -1.1120571 ,  1.05

In [8]:
# test_dataset_discharge =TextDataMimic("mimic", "../clinicalBERT/data/", "discharge", trainLM = False, test_phase = True,big_emb = False)


In [23]:
test_discharge_df = pd.DataFrame(dataset_discharge.datasets["test"])


In [24]:
dataset_discharge.word2index["lung"]

129

In [26]:
dataset_discharge.index2vector[129]

array([-3.40139657e-01, -1.82405722e+00, -9.42534059e-02, -1.80250990e+00,
       -1.46985918e-01, -1.35256743e+00,  1.58724701e+00,  5.03400660e+00,
       -2.96955109e+00, -2.96375394e+00, -5.53072810e-01,  4.18898487e+00,
        3.61255145e+00, -3.75301504e+00, -3.83935142e+00,  1.09365499e+00,
       -6.73142135e-01,  2.66327405e+00,  8.57334137e-01,  6.34738803e-01,
        3.66407298e-02, -6.18511856e-01,  1.29695928e+00,  1.43249619e+00,
       -5.00627470e+00,  7.02718258e-01, -1.89690971e+00, -3.01608419e+00,
       -2.92607403e+00, -3.78944397e-01, -7.53323317e-01, -8.69365752e-01,
       -4.04480886e+00,  5.38093388e-01,  1.19405186e+00, -3.65213975e-02,
       -1.20833075e+00,  5.19232094e-01, -1.84757626e+00, -3.55335450e+00,
        1.89481831e+00,  2.65279865e+00, -2.07250118e+00, -5.35999417e-01,
        9.20227170e-01, -1.57229030e+00,  1.90771341e+00,  7.36952245e-01,
        5.36041927e+00, -1.50776482e+00, -1.74127829e+00, -9.15465772e-01,
        3.34077567e-01,  

In [25]:
dataset_discharge.org_word2vec["lung"]

array([-3.40139657e-01, -1.82405722e+00, -9.42534059e-02, -1.80250990e+00,
       -1.46985918e-01, -1.35256743e+00,  1.58724701e+00,  5.03400660e+00,
       -2.96955109e+00, -2.96375394e+00, -5.53072810e-01,  4.18898487e+00,
        3.61255145e+00, -3.75301504e+00, -3.83935142e+00,  1.09365499e+00,
       -6.73142135e-01,  2.66327405e+00,  8.57334137e-01,  6.34738803e-01,
        3.66407298e-02, -6.18511856e-01,  1.29695928e+00,  1.43249619e+00,
       -5.00627470e+00,  7.02718258e-01, -1.89690971e+00, -3.01608419e+00,
       -2.92607403e+00, -3.78944397e-01, -7.53323317e-01, -8.69365752e-01,
       -4.04480886e+00,  5.38093388e-01,  1.19405186e+00, -3.65213975e-02,
       -1.20833075e+00,  5.19232094e-01, -1.84757626e+00, -3.55335450e+00,
        1.89481831e+00,  2.65279865e+00, -2.07250118e+00, -5.35999417e-01,
        9.20227170e-01, -1.57229030e+00,  1.90771341e+00,  7.36952245e-01,
        5.36041927e+00, -1.50776482e+00, -1.74127829e+00, -9.15465772e-01,
        3.34077567e-01,  

'of'

In [82]:
test_discharge_df[0][0]

[114,
 7,
 437,
 3,
 486,
 3,
 3,
 210,
 3,
 669,
 145,
 3,
 1657,
 474,
 3,
 277,
 268,
 3,
 4904,
 614,
 3,
 2596,
 3,
 3,
 536,
 4,
 528,
 308,
 27,
 359,
 235,
 3,
 194,
 37,
 7,
 87,
 406,
 3,
 734,
 560,
 108,
 238,
 1619,
 5,
 1608,
 4,
 853,
 3,
 3,
 8,
 115,
 230,
 499,
 243,
 41,
 5435,
 111,
 47,
 4164,
 5425,
 107,
 66,
 32,
 9,
 1892,
 4,
 740,
 54,
 365,
 530,
 6,
 1777,
 3,
 29,
 157,
 4,
 4,
 16,
 9,
 107,
 6,
 72,
 3,
 511,
 885,
 113,
 156,
 4,
 1031,
 54,
 3548,
 560,
 5,
 54,
 21,
 1024,
 3,
 93,
 3,
 3,
 261,
 3218,
 3,
 3,
 3,
 68,
 3,
 3,
 3,
 600,
 3,
 3,
 12,
 3,
 767,
 4,
 4,
 4,
 16,
 9,
 188,
 6,
 641,
 32,
 9,
 107,
 6,
 72,
 3,
 941,
 7,
 3,
 3,
 68,
 3,
 5,
 484,
 3,
 4,
 569,
 9,
 18694,
 14,
 3,
 3052,
 1072,
 259,
 4,
 4,
 16,
 223,
 6120,
 3,
 5,
 231,
 3,
 8,
 10,
 791,
 4,
 32,
 9,
 5280,
 699,
 58,
 61,
 4,
 47,
 3,
 9,
 301,
 18,
 3,
 3,
 662,
 9,
 3,
 5,
 2367,
 3,
 3,
 11,
 4,
 373,
 7,
 123,
 1429,
 4,
 3,
 3,
 9,
 134,
 14,
 3,
 848,
 4,
 4,
 

In [24]:
converted_sentence = []
for w, t in zip(test_discharge_df.loc[0][0], test_discharge_df.loc[0][2]):
#     print(test_dataset_discharge.word2index[w])
    converted_sentence.append(dataset_discharge.index2word[w])
    print(dataset_discharge.index2word[w])
    print(t)
            
    
# print(converted_sentence)
# print(test_discharge_df.loc[0][2])

date
date
of
of
birth
birth
sex
sex
f
f
service
service
medicine
medicine
allergies
allergies
haldol
haldol
attending
attending
chief
chief
complaint
complaint
delta
delta
ms
ms
lethargy
lethargy
sepsis
sepsis
major
major
surgical
surgical
or
or
invasive
invasive
procedure
procedure
none
none
history
history
of
of
present
present
illness
illness
hx
hx
obtained
obtained
per
per
ed
ed
notes
notes
and
and
sister
sister
hpi
hpi
35f
35f
with
with
disease
disease
who
who
presented
presented
today
today
from
from
daycare
daycare
after
after
her
her
healthcare
healthcare
providers
providers
noted
noted
that
that
she
she
was
was
lethargic
lethargic
they
they
were
were
initially
initially
unable
unable
to
to
obtain
obtain
a
a
blood
blood
pressure
pressure
the
the
patient
patient
was
was
noted
noted
to
to
have
have
a
a
very
very
rapid
rapid
heart
heart
rate
rate
vitals
vitals
were
were
finally
finally
obtained
obtained
and
and
were
were
as
as
follows
follows
bp
bp
70
70
50
50
baseline
baseline
sb

In [16]:
test_3day = pd.DataFrame(dataset_3day.datasets["test"])
converted_sentence = []
for w, t in zip(test_3day.loc[0][0], test_3day.loc[0][2]):
#     print(test_dataset_discharge.word2index[w])
    converted_sentence.append(dataset_3day.index2word[w])
    print(dataset_3day.index2word[w])
    print(t)

sinus
sinus
tachycardia
tachycardia
generalized
generalized
low
low
voltage
voltage
delayed
delayed
r
r
wave
wave
progression
progression
with
with
late
late
precordial
precordial
qrs
qrs
transition
transition
findings
findings
are
are
non
non
specific
specific
clinical
clinical
correlation
correlation
is
is
suggested
suggested
since
since
the
the
previous
previous
tracing
tracing
of
of
same
same
date
date
sinus
sinus
tachycardia
tachycardia
rate
rate
is
is
faster
faster
tracing
tracing
1
1
sinus
sinus
rhythm
rhythm
delayed
delayed
r
r
wave
wave
progression
progression
with
with
late
late
precordial
precordial
qrs
qrs
transition
transition
generalized
generalized
low
low
qrs
qrs
voltage
voltage
findings
findings
are
are
non
non
specific
specific
clinical
clinical
correlation
correlation
is
is
suggested
suggested
since
since
the
the
previous
previous
tracing
tracing
of
of
the
the
rate
rate
is
is
faster
faster
and
and
voltage
voltage
is
is
lower
lower
title
title
45
45
y
y
o
o
man
man
wi

In [17]:
test_all = pd.DataFrame(dataset_all.datasets["test"])
converted_sentence = []
for w, t in zip(test_all.loc[0][0], test_all.loc[0][2]):
#     print(test_dataset_discharge.word2index[w])
    converted_sentence.append(dataset_all.index2word[w])
    print(dataset_all.index2word[w])
    print(t)

sinus
sinus
tachycardia
tachycardia
generalized
generalized
low
low
voltage
voltage
delayed
delayed
r
r
wave
wave
progression
progression
with
with
late
late
precordial
precordial
qrs
qrs
transition
transition
findings
findings
are
are
non
non
specific
specific
clinical
clinical
correlation
correlation
is
is
suggested
suggested
since
since
the
the
previous
previous
tracing
tracing
of
of
same
same
date
date
sinus
sinus
tachycardia
tachycardia
rate
rate
is
is
faster
faster
tracing
tracing
1
1
sinus
sinus
rhythm
rhythm
delayed
delayed
r
r
wave
wave
progression
progression
with
with
late
late
precordial
precordial
qrs
qrs
transition
transition
generalized
generalized
low
low
qrs
qrs
voltage
voltage
findings
findings
are
are
non
non
specific
specific
clinical
clinical
correlation
correlation
is
is
suggested
suggested
since
since
the
the
previous
previous
tracing
tracing
of
of
the
the
rate
rate
is
is
faster
faster
and
and
voltage
voltage
is
is
lower
lower
title
title
45
45
y
y
o
o
man
man
wi

In [66]:
"echo" in test_dataset_discharge.datasets["test"][83][2]

False

In [27]:
dataset_3day = TextDataMimic("mimic", "../clinicalBERT/data/", "3days")

./data/mimic3//mimic3_processed_3days.pkl
Training data not found. Creating dataset...
weights vector shape:  torch.Size([44082, 100])
embedding tensor shape:  Embedding(44082, 100)
length of indices to words:  44082
before add special token: 44082
after add special token: 44087
44087 44086
Dictionary Got!
working on:  train
         ID                                               TEXT  Label
0  129458.0  have a significant family history of cancer. o...    0.0
1  104352.0  per protocol. social: wife & family in to visi...    0.0
2  198360.0  9/ her bilirubin on was 0/8; four hours later ...    0.0
3  121738.0  to have re-current hemoptysis and dyspnea whil...    0.0
4  116798.0  patient/test information: indication: pericard...    1.0
working on:  dev
            ID                                               TEXT  Label
699   100184.0  10:11 pm chest (pre-op pa & lat) clip # reason...    0.0
700   100184.0  effusion. conclusions: the left atrium is mild...    0.0
1615  100456.0  c

In [16]:
data_all = TextDataMimic("mimic", "../clinicalBERT/data/", "3days", trainLM=True, test_phase=True)

already combined them so loading that instead! 
Loading dataset from ./data/mimic3//mimic3_procssed_all.pkl
training: 	 500
dev: 	 500
testing: 	 500
w2i shape:  44087
i2w shape:  44087
embeding shape:  (44087, 100)
set
Loaded mimic: 44087 words, 0 QA


IndentationError: expected an indented block (<ipython-input-43-0bc07ad4ef16>, line 360)

In [6]:
pd.DataFrame(data_all.datasets['train']).head(15)

Unnamed: 0,0,1,2,3
0,"[72, 3, 198, 128, 37, 7, 632, 4, 2104, 3, 845,...",0,"[have, a, significant, family, history, of, ca...",-1
1,"[108, 1457, 4, 356, 3, 859, 3, 128, 11, 6, 193...",0,"[per, protocol, ., social, :, wife, &, family,...",-1
2,"[3, 47, 1275, 12, 9, 3, 3, 755, 90, 1088, 3, 5...",0,"[9/, her, bilirubin, on, was, 0/8, ;, four, ho...",-1
3,"[6, 72, 3, 2105, 5, 744, 336, 12, 1595, 546, 1...",0,"[to, have, re-current, hemoptysis, and, dyspne...",-1
4,"[3, 442, 3, 466, 3, 185, 127, 4, 93, 3, 206, 5...",1,"[patient/test, information, :, indication, :, ...",-1
5,"[19, 4, 130, 3, 1157, 6, 1134, 3, 2521, 150, 2...",1,"[am, ., action, :, reported, to, ccu, team-, f...",-1
6,"[151, 3, 5, 4700, 720, 2882, 3943, 1683, 41, 4...",1,"[dilated, ,, and, periventricular, white, matt...",-1
7,"[3, 1003, 14, 1273, 155, 7, 814, 3, 36, 4, 22,...",0,"[,, order, for, short, course, of, prednisone,...",-1
8,"[1488, 217, 3, 629, 14, 142, 3256, 1047, 4, 13...",1,"[ngt, placed, ;, consulted, for, tube, feed, r...",-1
9,"[866, 45, 2250, 4, 4, 1712, 1572, 3, 11142, 45...",1,"[assess, this, finding, ., ., ruq, us, :, hete...",-1


In [33]:
data_all.word2index['END_TOKEN']

2

In [None]:
genism.models.word2vec.load


In [10]:
from gensim.models import KeyedVectors
# test_model = "../clinicalBERT/word2vec+fastText/word2vec+fastText/word2vec.model"

model = KeyedVectors.load_word2vec_format("../clinicalBERT/word2vec+fastText/word2vec+fastText/BioWordVec_PubMed_MIMICIII_d200.vec.bin", binary=True)
# test_model = gensim.models.keyedvectors.load_word2vec_format("../clinicalBERT/word2vec+fastText/word2vec+fastText/BioWordVec_PubMed_MIMICIII_d200.vec.bin")

In [29]:

# load in gensim format


# convert to tensor for pytorch
weights = torch.FloatTensor(model.vectors)
print("weights vector shape: ", weights.shape)

# convert to embedding layer
embedding = nn.Embedding.from_pretrained(weights)

print("embedding tensor shape: ", embedding)

words = model.index_to_key
# get the word2vec dictionary {'word':vector}
word2vec = {word: model[word] for word in words}

# get the vec2index array - essentially the vector arrays are in the index position corresponding to the word in word2vec
index2vec = model.vectors

key2index = model.key_to_index

weights vector shape:  torch.Size([16545452, 200])
embedding tensor shape:  Embedding(16545452, 200)


In [25]:

key2index = model.key_to_index
key2index

{'</s>': 0,
 '.': 1,
 'the': 2,
 'of': 3,
 ',': 4,
 'and': 5,
 'in': 6,
 ')': 7,
 '(': 8,
 'to': 9,
 'a': 10,
 'with': 11,
 ':': 12,
 'for': 13,
 'was': 14,
 'is': 15,
 'were': 16,
 'by': 17,
 'that': 18,
 'on': 19,
 '%': 20,
 'as': 21,
 'from': 22,
 'this': 23,
 'patients': 24,
 'at': 25,
 'or': 26,
 'are': 27,
 'an': 28,
 'be': 29,
 'we': 30,
 'not': 31,
 'study': 32,
 'results': 33,
 'these': 34,
 ';': 35,
 'cells': 36,
 'no': 37,
 'after': 38,
 'have': 39,
 'which': 40,
 'between': 41,
 '[': 42,
 ']': 43,
 'than': 44,
 'treatment': 45,
 '2': 46,
 'p': 47,
 'has': 48,
 'using': 49,
 'but': 50,
 '=': 51,
 'been': 52,
 'cell': 53,
 '1': 54,
 'during': 55,
 'group': 56,
 'may': 57,
 'it': 58,
 'two': 59,
 'both': 60,
 'also': 61,
 'had': 62,
 'their': 63,
 'can': 64,
 'all': 65,
 'more': 66,
 'disease': 67,
 'there': 68,
 'clinical': 69,
 'used': 70,
 'data': 71,
 'activity': 72,
 'analysis': 73,
 'increased': 74,
 'other': 75,
 'methods': 76,
 'effect': 77,
 'expression': 78,
 'compar

KeyError: "Key '</e>' not present"

In [28]:
model[11]

array([-9.5962e-02,  3.5165e-01, -1.7944e-01,  6.8773e-02, -3.8509e-01,
       -1.3111e-01, -4.3584e-01,  2.4559e-01,  6.8108e-01,  2.0460e-01,
       -7.3689e-01, -1.8587e-01,  2.8830e-01,  1.8371e-01,  7.9782e-02,
       -4.6849e-01, -3.8748e-01, -5.3304e-02,  1.8023e-01,  2.3767e-01,
       -3.9502e-02, -2.9111e-03, -4.5306e-01, -2.1011e-02,  3.9069e-01,
        2.1327e-01,  2.5107e-01,  3.3728e-01,  8.6563e-02,  1.9804e-01,
       -1.9827e-01,  2.5077e-01,  6.5796e-03, -3.3160e-01, -4.6570e-01,
       -3.2295e-01,  2.8121e-01,  1.0264e-01, -1.4415e-02,  3.4652e-01,
        1.4001e-01,  3.3041e-01, -7.8776e-02,  3.4361e-01,  2.7447e-01,
       -2.3958e-01,  1.8811e-02,  3.7658e-01, -1.3231e-01, -8.1214e-03,
       -4.9452e-02, -1.8819e-01, -2.5379e-01, -1.6240e-01, -8.5260e-01,
        2.9469e-02,  9.8296e-02, -2.7601e-01, -3.9249e-01, -2.3555e-01,
       -1.8017e-01,  2.4202e-01,  9.4171e-03, -1.3208e-01,  3.6413e-01,
       -5.6937e-02, -8.8756e-02, -3.9359e-01,  2.9157e-01, -3.50

In [23]:
model.vectors[""]

(16545452, 200)

In [None]:
gensim.models.keyedvectors.load_word2vec_format

In [21]:
test_batches = data_all.getBatches('dev')


getting batches for : dev


In [20]:
for index, batch in enumerate(test_batches):
    print("current batch number is: ", index)
#     print(batch)
    x={}
#     x['enc_input'] = batch.encoderSeqs
#     x['enc_len'] = batch.encoder_lens
#     x['labels'] = batch.label

    x['dec_input'] = batch.decoderSeqs
    x['dec_len'] = batch.decoder_lens
    x['dec_target'] = batch.targetSeqs
    print(pd.DataFrame(x))
    break
    
    
    

current batch number is:  0
Empty DataFrame
Columns: [dec_input, dec_len, dec_target]
Index: []


In [22]:
test_lm_batches = data_all.getBatches_forLM()

for index, batch in enumerate(test_lm_batches):
    print("current batch number is: ", index)
#     print(batch)
    x={}
    x['enc_input'] = batch.encoderSeqs
    x['enc_len'] = batch.encoder_lens
    x['labels'] = batch.label
    x['dec_input'] = batch.decoderSeqs
    x['dec_len'] = batch.decoder_lens
    x['dec_target'] = batch.targetSeqs
    print(pd.DataFrame(x))
    break

Shuffling the dataset...
split tokens:  [4]
13402
500


NameError: name 'args' is not defined

In [24]:
x = torch.tensor([[1.0]])
# x.item()

1

In [44]:
y = torch.range(1, 4)
y_long = y.long().numpy()

  """Entry point for launching an IPython kernel.


In [45]:
y_long

array([1, 2, 3, 4], dtype=int64)

In [41]:
y_long.numpy()

array([1, 2, 3, 4], dtype=int64)

In [37]:
y.numpy()

AttributeError: 'builtin_function_or_method' object has no attribute 'numpy'

In [34]:
test = np.asarray([1,2,3,4])
test


array([1, 2, 3, 4])

In [27]:
import collections
# test_dict = {}
if "training_stats" in test_dict:
    test_dict["training_stats"].append(input_dict2)
else:
    test_dict["training_stats"] = [input_dict]

print(test_dict)

{'training_stats': [{'a': 2, 'b': 3, 'c': 4}, {'a': 5, 'b': 6, 'c': 7}]}


In [3]:
input_dict = {'a':2,'b':3, 'c': 4}
input_dict2 = {'a':5,'b':6, 'c': 7}


In [13]:
test_dict['training_stats']= [input_dict]

In [14]:
test_dict['training_stats']

{'a': 2, 'b': 3, 'c': 4}

In [15]:
test_dict['training_stats'].append(input_dict2)

AttributeError: 'dict' object has no attribute 'append'

In [30]:
from datetime import date
now = date.today()
str(now)

'2021-05-04'

In [21]:
# a_dict = {}
if "a" in a_dict:
    print("rabble")
    a_dict["a"].append("hello")
else:
    a_dict["a"] = ["hello"]
print(a_dict)

rabble
{'a': ['hello', 'hello']}


In [4]:
# test logging

In [11]:
# import logging
# import os
# import errno
# def log_anything(path, file):
#     """[Create a log file to record the experiment's logs]

#     Arguments:
#         path {string} -- path to the directory
#         file {string} -- file name

#     Returns:
#         [obj] -- [logger that record logs]
#     """

#     # check if the file exist
#     log_file = os.path.join(path, file)
#     if not os.path.exists(os.path.dirname(log_file)):
#         try:
#             os.makedirs(os.path.dirname(log_file))
#         except OSError as exc:  # Guard against race condition
#             if exc.errno != errno.EEXIST:
#                 raise

#     if not os.path.isfile(log_file):
#         open(log_file, "w+").close()

#     console_logging_format = "%(levelname)s %(message)s"
#     file_logging_format = "%(levelname)s: %(asctime)s: %(message)s"

#     # configure logger
#     logging.basicConfig(level=logging.INFO, format=console_logging_format)
#     logger = logging.getLogger()

#     # create a file handler for output file
#     handler = logging.FileHandler(log_file)

#     # set the logging level for log file
#     handler.setLevel(logging.INFO)

#     # create a logging format
#     formatter = logging.Formatter(file_logging_format)
#     handler.setFormatter(formatter)

#     # add the handlers to the logger
#     logger.addHandler(handler)

#     return logger

In [6]:
# logger = log_anything(path="./artifacts/logs/" , file = "testing_log.logs")

In [21]:
# test_batches = test_dataset_discharge.getBatches('train')

In [20]:
# logger.info("first message")
# accuracy = 10
# for index, batch in enumerate(test_batches):
#     print("current batch number is: ", index)
# #     print(batch)
#     x={}
#     x['enc_input'] = batch.encoderSeqs
#     x['enc_len'] = batch.encoder_lens
#     x['labels'] = batch.label

#     print(pd.DataFrame(x))
    
#     logger.info(f"X shape is :  {len(x['enc_input'])}")
#     logger.info(f"accuracy is: {accuracy}")
#     logger.info(f"x is: {x} ")
#     logger.info(f"x again is: {x}")
            
#     break

INFO first message
INFO X shape is :  500
INFO accuracy is: 10
INFO x is: {'enc_input': [[255, 183, 81, 4, 20, 389, 255, 74, 122, 4, 2007, 289, 3, 3, 3, 4, 3, 714, 4, 10, 2480, 4, 28, 190, 3, 20, 380, 338, 81, 5, 192, 85, 181, 4, 163, 3, 89, 151, 429, 163, 4, 44, 30, 3, 395, 44, 30, 76, 4, 462, 149, 44, 30, 76, 4, 65, 21, 3, 674, 3, 3, 4, 685, 745, 4, 56, 30, 3, 89, 149, 56, 30, 76, 4, 222, 56, 1271, 1154, 4, 1724, 2623, 614, 154, 6, 5918, 4, 65, 6, 132, 3, 3, 3, 3, 187, 30, 3, 89, 149, 187, 30, 76, 4, 10, 3293, 4, 65, 586, 4, 1354, 647, 74, 157, 4, 374, 3, 112, 3, 10, 741, 4, 592, 3, 10, 185, 127, 4, 508, 4, 17, 125, 13, 20, 11, 81, 4, 10, 95, 553, 790, 13, 63, 46, 3, 27, 264, 119, 4, 4, 895, 28, 95, 157, 13, 3, 284, 4, 38, 13, 65, 393, 17, 43, 427, 4, 4, 17, 43, 183, 81, 13, 20, 4, 389, 17, 43, 85, 181, 13, 20, 4, 17, 43, 74, 122, 13, 2007, 3, 643, 3, 3, 3, 3, 4, 3, 3, 714, 13, 1075, 4, 38, 13, 10, 43, 553, 790, 4, 28, 43, 338, 81, 5, 192, 85, 181, 34, 20, 4, 4, 429, 163, 13, 89, 151

INFO x again is: {'enc_input': [[255, 183, 81, 4, 20, 389, 255, 74, 122, 4, 2007, 289, 3, 3, 3, 4, 3, 714, 4, 10, 2480, 4, 28, 190, 3, 20, 380, 338, 81, 5, 192, 85, 181, 4, 163, 3, 89, 151, 429, 163, 4, 44, 30, 3, 395, 44, 30, 76, 4, 462, 149, 44, 30, 76, 4, 65, 21, 3, 674, 3, 3, 4, 685, 745, 4, 56, 30, 3, 89, 149, 56, 30, 76, 4, 222, 56, 1271, 1154, 4, 1724, 2623, 614, 154, 6, 5918, 4, 65, 6, 132, 3, 3, 3, 3, 187, 30, 3, 89, 149, 187, 30, 76, 4, 10, 3293, 4, 65, 586, 4, 1354, 647, 74, 157, 4, 374, 3, 112, 3, 10, 741, 4, 592, 3, 10, 185, 127, 4, 508, 4, 17, 125, 13, 20, 11, 81, 4, 10, 95, 553, 790, 13, 63, 46, 3, 27, 264, 119, 4, 4, 895, 28, 95, 157, 13, 3, 284, 4, 38, 13, 65, 393, 17, 43, 427, 4, 4, 17, 43, 183, 81, 13, 20, 4, 389, 17, 43, 85, 181, 13, 20, 4, 17, 43, 74, 122, 13, 2007, 3, 643, 3, 3, 3, 3, 4, 3, 3, 714, 13, 1075, 4, 38, 13, 10, 43, 553, 790, 4, 28, 43, 338, 81, 5, 192, 85, 181, 34, 20, 4, 4, 429, 163, 13, 89, 151, 4, 38, 34, 395, 44, 30, 76, 4, 4, 44, 30, 76, 34, 462, 

current batch number is:  0
                                             enc_input  enc_len  labels
0    [255, 183, 81, 4, 20, 389, 255, 74, 122, 4, 20...      403     1.0
1    [6, 3, 3, 3, 55, 231, 469, 2312, 291, 699, 4, ...      352     0.0
2    [3512, 12, 67, 54, 3, 720, 532, 3, 3, 621, 3, ...      358     1.0
3    [3, 15, 26, 3, 2547, 49, 3, 48, 3, 3, 3, 26, 2...      377     1.0
4    [96, 153, 1412, 4, 3, 3, 473, 129, 79, 12, 3, ...      168     1.0
5    [114, 7, 437, 3, 486, 3, 3, 210, 3, 37, 7, 87,...      378     1.0
6    [12, 2281, 3, 18, 3, 3, 175, 358, 6, 1383, 69,...       24     0.0
7    [463, 84, 887, 3, 903, 73, 6, 229, 14, 184, 11...       73     0.0
8    [43, 1624, 9, 217, 8, 295, 5375, 4, 25, 9, 771...      359     0.0
9    [279, 1493, 1626, 3, 1306, 12, 3, 3, 1637, 3, ...       53     0.0
10   [114, 7, 437, 3, 486, 3, 3, 210, 3, 1764, 145,...      423     0.0
11   [29, 1362, 3, 83, 79, 3, 9774, 4, 3568, 5, 483...      197     1.0
12   [3, 15, 26, 49, 3, 3, 890, 23, 

In [41]:
# try import weird model format
from Hyperparameters import args

import argparse
import pandas as pd
from datetime import date

parser = argparse.ArgumentParser()
parser.add_argument('--gpu', '-g')
parser.add_argument('--modelarch', '-m')
parser.add_argument('--choose', '-c')
parser.add_argument('--use_big_emb', '-be')
parser.add_argument('--date', '-d')
cmdargs = parser.parse_args()

# usegpu = True

# if cmdargs.gpu is None:
#     usegpu = False
# else:
#     usegpu = True
#     args['device'] = 'cuda:' + str(cmdargs.gpu)

# if cmdargs.modelarch is None:
#     args['model_arch'] = 'lstm'
# else:
#     args['model_arch'] = cmdargs.modelarch


# if cmdargs.choose is None:
#     args['choose'] = 0
# else:
#     args['choose'] = int(cmdargs.choose)

# if cmdargs.use_big_emb:
#     args['big_emb'] = True
# else:
#     args['big_emb'] = False

# if cmdargs.date is None:
#     args['date'] = str(date.today())

# model_pretrained = torch.load("./artifacts/LSTM_IB_GAN_be_mimic3_2021-05-04.pt")

usage: ipykernel_launcher.py [-h] [--gpu GPU] [--modelarch MODELARCH]
                             [--choose CHOOSE] [--use_big_emb USE_BIG_EMB]
                             [--date DATE]
ipykernel_launcher.py: error: unrecognized arguments: -f C:\Users\ntaylor\AppData\Roaming\jupyter\runtime\kernel-8d67b017-51e6-4299-8121-0b2d3f36314d.json


SystemExit: 2