In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the annotated discharge CSV, bring the three upper-case column names in
# line with the dotted naming of the remaining columns, and keep only the
# text, identifier and label columns used below.
fp = "../../data/annotated_discharge.csv"
cols_to_keep = [
    'text', 'Hospital.Admission.ID', 'subject.id', 'chart.time', 'cohort',
    'Obesity', 'Non.Adherence', 'Developmental.Delay.Retardation',
    'Advanced.Heart.Disease', 'Advanced.Lung.Disease',
    'Schizophrenia.and.other.Psychiatric.Disorders', 'Alcohol.Abuse',
    'Other.Substance.Abuse', 'Chronic.Pain.Fibromyalgia',
    'Chronic.Neurological.Dystrophies', 'Advanced.Cancer', 'Depression',
    'Dementia', 'Unsure',
]
df = pd.read_csv(fp).rename(
    columns={
        "SUBJECT_ID": "subject.id",
        "HADM_ID": "Hospital.Admission.ID",
        "TEXT": "text",
    }
)[cols_to_keep]
print(df.shape)
df.head()

(56839, 19)


Unnamed: 0,text,Hospital.Admission.ID,subject.id,chart.time,cohort,Obesity,Non.Adherence,Developmental.Delay.Retardation,Advanced.Heart.Disease,Advanced.Lung.Disease,Schizophrenia.and.other.Psychiatric.Disorders,Alcohol.Abuse,Other.Substance.Abuse,Chronic.Pain.Fibromyalgia,Chronic.Neurological.Dystrophies,Advanced.Cancer,Depression,Dementia,Unsure
0,Admission Date: [**2200-4-7**] Discharge ...,118003.0,3644,118003,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0
1,Tracing is of improved quality. Sinus tachycar...,118003.0,3644,118003,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0
2,Technically difficult study. P waves are atypi...,118003.0,3644,118003,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0
3,Sinus rhythm. Compared to the previous tracing...,118003.0,3644,118003,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0
4,Sinus rhythm. Since the previous tracing of [*...,118003.0,3644,118003,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0


In [3]:
# total number of missing values over every column of the frame
df.isna().sum().sum()

0

In [4]:
# distribution of note length, measured in characters
df["text"].str.len().describe()

count    56839.000000
mean      2077.733071
std       2780.655499
min          3.000000
25%        526.000000
50%       1223.000000
75%       2081.000000
max      31151.000000
Name: text, dtype: float64

In [5]:
# rows whose text is shorter than 20 characters (first element of shape is the row count)
df[df['text'].str.len() < 20].shape

(36, 19)

In [6]:
# length distribution of the notes that are NOT "short": use >= 20 so this is
# the exact complement of the `< 20` count and matches the filter applied to
# df later on (a note of exactly 20 characters is kept, not dropped)
df.loc[df['text'].str.len() >= 20, 'text'].str.len().describe()

count    56803.000000
mean      2079.043096
std       2781.049400
min         21.000000
25%        528.000000
50%       1223.000000
75%       2083.000000
max      31151.000000
Name: text, dtype: float64

In [59]:
# length distribution restricted to texts longer than a single character
df.loc[df["text"].str.len() > 1, "text"].str.len().describe()

count    56834.000000
mean      1713.293082
std       2272.476521
min          3.000000
25%        477.000000
50%       1065.500000
75%       1844.750000
max      28682.000000
Name: text, dtype: float64

In [67]:
# drop rows with (almost) no discharge text: keep notes of at least 20 characters.
# .copy() makes the result an independent frame, so the column assignments made
# on df afterwards do not raise SettingWithCopyWarning or write into a temporary.
df = df[df["text"].str.len() >= 20].copy()

In [68]:
from gensim.models import Word2Vec
import nltk
from nltk.corpus import stopwords
import re

In [27]:
# may need to download resource from nltk:
#   'punkt'     - tokenizer models used by nltk.sent_tokenize / nltk.word_tokenize
#   'stopwords' - word lists read through nltk.corpus.stopwords
# NOTE(review): newer NLTK releases may additionally need 'punkt_tab' - confirm
# against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\colle\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\colle\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [6]:
def preprocess_text(hosp_adm_text):
    """Normalise a note to lower-case, space-separated alphanumeric tokens.

    The text is split on whitespace; every non-word character (anything not
    matched by the regex class ``\\w``) is removed from each token, tokens that
    end up empty are dropped, and the rest are joined with single spaces.
    Because punctuation is deleted rather than replaced, pieces of a token are
    glued together (e.g. ``x*tmart?`` becomes ``xtmart``).

    Args:
        hosp_adm_text: raw text (str).

    Returns:
        str: cleaned, lower-cased text with no leading, trailing or repeated
        spaces; an empty string if nothing but punctuation/whitespace was given.
    """
    # https://www.h2kinfosys.com/blog/word-embeddings-with-word2vec-tutorial-all-you-need-to-know/
    # remove extra characters from each whitespace-delimited token
    cleaned_tokens = (re.sub(r'\W+', '', token) for token in hosp_adm_text.split())
    # skipping empty tokens avoids the stray spaces that punctuation-only
    # tokens (e.g. a leading "--" or trailing "**") would otherwise leave behind
    processed_text = " ".join(token for token in cleaned_tokens if token)
    # convert all letters to lowercase
    return processed_text.lower()

# quick sanity check on a made-up string containing stray symbols and repeated spaces
txt = "I w$ent to   the 7462 x*tmart?"
preprocess_text(txt)

'i went to the 7462 xtmart'

In [69]:
# clean every note; the raw text column is overwritten with the cleaned version
df['text'] = df['text'].apply(preprocess_text)
df['text'].head()

0    admission date 220047 discharge date 2200410 d...
1    tracing is of improved quality sinus tachycard...
2    technically difficult study p waves are atypic...
3    sinus rhythm compared to the previous tracing ...
4    sinus rhythm since the previous tracing of 220...
Name: text, dtype: object

In [70]:
def tokenize_and_remove_stop_words(preproc_text):
    """Tokenize text into sentences and words, dropping English stop words.

    Args:
        preproc_text: text to tokenize (str).

    Returns:
        list[list[str]]: one inner list per sentence found by
        ``nltk.sent_tokenize``, holding that sentence's word tokens (from
        ``nltk.word_tokenize``) that are not in NLTK's English stop-word list.
        The comparison is exact, so stop words are only removed when the token
        matches the list entry's casing.
    """
    # define the english stopwords; a set gives constant-time membership tests,
    # whereas the list returned by stopwords.words() is scanned for every token
    stop_words = set(stopwords.words('english'))

    # tokenize the text to sentences, each sentence to words, and filter the
    # stop words out in a single pass
    return [
        [word for word in nltk.word_tokenize(sentence) if word not in stop_words]
        for sentence in nltk.sent_tokenize(preproc_text)
    ]

In [41]:
txt = "I went to the $tore. I bou&T some Milk!"
# on raw text (punctuation still present) sent_tokenize splits into two sentences
nltk.sent_tokenize(txt)

['I went to the $tore.', 'I bou&T some Milk!']

In [71]:
# One flat token list per document. The cleaned text has had its punctuation
# removed, so the tokenizer is expected to return a single "sentence";
# flattening (instead of indexing [0]) gives the same result in that case, but
# also yields [] for text with no sentences rather than raising IndexError, and
# keeps the tokens of any additional sentences instead of silently dropping them.
df.loc[:, "tokenized_text"] = df['text'].apply(
    lambda x: [word for sentence in tokenize_and_remove_stop_words(x) for word in sentence]
)
df[["text", "tokenized_text"]].head()

Unnamed: 0,text,tokenized_text
0,admission date 220047 discharge date 2200410 d...,"[admission, date, 220047, discharge, date, 220..."
1,tracing is of improved quality sinus tachycard...,"[tracing, improved, quality, sinus, tachycardi..."
2,technically difficult study p waves are atypic...,"[technically, difficult, study, p, waves, atyp..."
3,sinus rhythm compared to the previous tracing ...,"[sinus, rhythm, compared, previous, tracing, 2..."
4,sinus rhythm since the previous tracing of 220...,"[sinus, rhythm, since, previous, tracing, 2200..."


In [72]:
# how many tokens per text are left after stop-word removal?
# (.str.len() on a column of lists returns the length of each list)
df["tokenized_text"].str.len().describe()

count    56801.000000
mean       211.313252
std        281.552996
min          3.000000
25%         59.000000
50%        123.000000
75%        228.000000
max       3439.000000
Name: tokenized_text, dtype: float64

In [74]:
# words occurring fewer than this many times are left out of the vocabulary
min_word_freq = 10
# train embeddings on the per-document token lists; every other
# hyperparameter is left at its gensim default
model = Word2Vec(sentences=df["tokenized_text"], min_count=min_word_freq)

In [85]:
# vocabulary the model actually kept (iterating the mapping yields its keys)
learned_words = list(model.wv.key_to_index)
total_words = len(learned_words)
print(f'total learned words: {total_words}')

total learned words: 30008


In [100]:
# look at random word and get embedding
# NOTE: no random seed is set in this cell, so a different word is picked on each run
rand_word_idx = np.random.randint(0, total_words)  # upper bound is exclusive, so the index is always valid
rand_word = learned_words[rand_word_idx]
# show the word's vector and its shape
print(f"embedding vector for {rand_word} (dim={model.wv[rand_word].shape}):\n{model.wv[rand_word]}")
# show the vocabulary entries most similar to the word
print(f"most similar for {rand_word}:\n{model.wv.most_similar(rand_word)}")

embedding vector for retroperitoneum (dim=(100,)):
[-0.3234327  -0.08714876 -0.41889495 -0.12784195  0.4299063   0.1409734
 -0.1086438  -0.5961283  -0.05520626 -0.29020107 -0.14015035  0.02042352
  0.52164894 -0.3623354  -0.28166422  0.4036731  -0.19226323 -0.93970543
  0.04085512 -0.03299803 -0.6076594  -0.2963688  -0.29249665 -0.34483388
 -0.9593344   0.7390165   0.04934819  0.300522   -0.38194293  0.09530936
  0.5140394   0.04930574 -0.25044787  0.22476153  0.24438101  0.15467462
  0.4605872  -0.55806136 -0.33568698 -0.21682875  0.4284733   0.45965236
  0.06284796 -0.19029985  0.1246502  -0.30755588  0.14003193 -0.37856874
 -0.08485511 -0.10150181  1.4410158  -0.04163815  0.36326548  0.64727193
 -0.02619361 -0.03520259  0.33081475  0.41524503 -1.5393274   0.81891334
  0.4595149   0.18262264 -0.0494491   0.2358015  -0.39127144  0.2672655
 -0.5457325  -0.10353922 -0.28408605  0.58578855  0.05212955  0.05095126
 -0.02580983 -0.06410851 -0.15833084 -0.10264172  0.5214524   0.05891827
 -

In [102]:
# # save trained word vectors
# model.wv.save_word2vec_format("w2v.txt", binary=False, write_header=False)

# CountVectorizer

In [3]:
# https://www.h2kinfosys.com/blog/word-embeddings-with-word2vec-tutorial-all-you-need-to-know/
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

# define the corpus
corpus = df['text']

# fit and transform the vectorizer on the corpus
transformed_corpus = vectorizer.fit_transform(corpus)

# print a small preview in matrix form. fit_transform returns a sparse matrix;
# calling .toarray() on all of it would materialise a dense
# (documents x vocabulary) array - billions of cells for this corpus - so only
# the first few rows are densified for display.
print(transformed_corpus[:5].toarray())


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [4]:
# transformed corpus is a document-term count matrix (bag of words), not a
# one-hot encoding: each entry is how many times the word occurs in the sample
# one row for each sample in dataset,
# one column for each word in vocabulary
transformed_corpus.shape

(56839, 85059)

In [5]:
# text of the first row, displayed in full
ex = df['text'].iloc[0]
ex

"Admission Date:  [**2200-4-7**]     Discharge Date:  [**2200-4-10**]\n\nDate of Birth:   [**2146-9-21**]     Sex:  F\n\nService:  CARDIAC INTENSIVE CARE MEDICINE\n\nCHIEF COMPLAINT:  The patient was admitted to the Cardiac\nIntensive Care Unit Medicine Service on [**2200-4-7**], with the\nchief complaint of acute myocardial infarction and fever.\n\nHISTORY OF PRESENT ILLNESS:  The patient is a 53 year old\nwhite female with a history of coronary artery disease,\nhypertension, hypercholesterolemia and two pack per day\ntobacco use with previous coronary artery bypass graft\nsurgery presenting to an outside hospital on [**2200-4-6**], with a\ntwo day history of fevers and confusion.  The patient had a\nCT scan of the chest at that time which revealed pneumonia by\nreport in the left lower lobe.\n\nWhile in the outside hospital Emergency Department, the\npatient complained of chest pain.  The patient states that\nshe has had this pain for approximately two weeks with no\nrelief.  She was