# LDA and pyLDAvis with medical transcriptions and AWS SageMaker

* Gratefully indebted to my sources, from whom I have lifted quite a bit of the code in this notebook:
* https://github.com/priya-dwivedi/Deep-Learning/blob/master/topic_modeling/LDA_Newsgroup.ipynb
* https://github.com/XuanX111/Friends_text_generator/blob/master/Friends_LDAvis_Xuan_Qi.ipynb

In [None]:
# ! pip install gensim

In [None]:
# ! pip install pyLDAvis

In [1]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import models
from gensim.corpora import Dictionary, MmCorpus

In [2]:
'''
Loading nltk libraries
'''

import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

from nltk.corpus import stopwords


In [3]:
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
import pandas as pd
import numpy as np
np.random.seed(400)
np.set_printoptions(precision=3, suppress=True)

In [5]:
# accessing the SageMaker Python SDK
import boto3
import sagemaker
from sagemaker.amazon.common import numpy_to_record_serializer
from sagemaker.predictor import csv_serializer, json_deserializer
from sagemaker import get_execution_role

In [7]:
from __future__ import print_function
from IPython.core.debugger import set_trace

import tempfile
import string
# import funcy as fp
import sys
import logging
import os
import pickle
import re

In [None]:
import pyLDAvis.gensim as gensimvis
import pyLDAvis

In [8]:
import logging
logging.basicConfig(filename='gensim.log',
                    format="%(asctime)s:%(levelname)s:%(message)s",
                    level=logging.INFO)

## Step 1. Getting the data

In [9]:
df = pd.read_csv('mtsamples.csv')
print(df.columns)
df.head()

Index(['Unnamed: 0', 'description', 'medical_specialty', 'sample_name',
       'transcription', 'keywords'],
      dtype='object')


Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


In [12]:
surgery = df[df['medical_specialty']==" Surgery"]['transcription']

In [13]:
surgery.dropna(inplace=True)
surgery.isnull().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(result)


0

In [14]:
surgery.iloc[0, ][:50]

'PREOPERATIVE DIAGNOSES:,1.  Hallux rigidus, left f'

## Step 2: Data Preprocessing
We will perform the following steps:

* Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
* Words that have fewer than 3 characters are removed.
* All stopwords are removed.
* Words are lemmatized - words in third person are changed to first person and verbs in past and future tenses are changed into present.
* Words are stemmed - words are reduced to their root form.

In [11]:
'''
Write a function to perform the pre processing steps on the entire dataset
'''

# Tokenize 
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(token.lower())           
    return result

In [15]:
processed_docs = []
for doc in surgery:
    processed_docs.append(preprocess(doc))
print(len(processed_docs))

1088


### Step 3. Corpus and dictionary

In [16]:
stop = set(stopwords.words('english'))

In [17]:
# prepare the corpus

def nltk_stopwords():
    return set(nltk.corpus.stopwords.words('english'))

def prep_corpus(docs, additional_stopwords=set(), no_below=3, no_above=0.5):
    print('Building dictionary...')
    dictionary = Dictionary(docs)
#     print(dictionary)
    stopwords = nltk_stopwords().union(additional_stopwords)
#     print(stopwords)
    stopword_ids = map(dictionary.token2id.get, stopwords)
#     print(stopword_ids)
    dictionary.filter_tokens(stopword_ids)
#     print(dictionary)
#     dictionary.compactify()
# #     print(dictionary)
#     dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)

    
    print('Building corpus...')
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    
    return dictionary, corpus

In [18]:
dictionary, corpus = prep_corpus(processed_docs)
print(len(dictionary))
print(len(corpus))

Building dictionary...
Building corpus...
11434
1088


In [19]:
dictionary[100]

'hypertrophic'

In [20]:
len(corpus[50])

135

### Step 4. Modeling

In [21]:
role = get_execution_role()
bucket = 'sagemaker-us-east-1-443501626368'
prefix = 'lda-medical'

print('Training input/output will be stored in {}/{}'.format(bucket, prefix))
print('\nIAM Role: {}'.format(role))

Training input/output will be stored in sagemaker-us-east-1-443501626368/lda-medical

IAM Role: arn:aws:iam::443501626368:role/service-role/AmazonSageMaker-ExecutionRole-20200806T142735


In [22]:
from sagemaker.amazon.amazon_estimator import get_image_uri
# select the algorithm container based on this notebook's current location

region_name = boto3.Session().region_name
container = get_image_uri(region_name, 'lda')

print('Using SageMaker LDA container: {} ({})'.format(container, region_name))

Using SageMaker LDA container: 766337827248.dkr.ecr.us-east-1.amazonaws.com/lda:1 (us-east-1)


In [23]:
session = sagemaker.Session()

In [None]:
# specify general training job information
lda = sagemaker.estimator.Estimator(
    container,
    role,
    output_path='s3://{}/{}/output'.format(bucket, prefix),
    train_instance_count=1,
    train_instance_type='ml.c4.2xlarge',
    sagemaker_session=session,
)

In [None]:
# set algorithm-specific hyperparameters
num_topics = 10
vocabulary_size=len(dictionary)
num_documents=int(0.9*len(corpus))

lda.set_hyperparameters(
    num_topics=num_topics,
    feature_dim=vocabulary_size,
    mini_batch_size=num_documents,
    alpha0=1.0,
)

In [None]:
len(corpus[100])

In [None]:
# run the training job on input data stored in S3
lda.fit({'train': surgery.iloc[0]})

### Step 5. visualize

In [None]:
vis_data1 = gensimvis.prepare(lda_model, corpus, dictionary)

In [None]:
surgery_lda = open('surgery.html', 'w')
pyLDAvis.save_html(vis_data1, surgery_lda)