# LDA and pyLDAvis with medical transcriptions and AWS SageMaker

* Gratefully indebted to my sources, from whom I have lifted quite a bit of the code in this notebook:
* https://github.com/priya-dwivedi/Deep-Learning/blob/master/topic_modeling/LDA_Newsgroup.ipynb
* https://github.com/XuanX111/Friends_text_generator/blob/master/Friends_LDAvis_Xuan_Qi.ipynb

In [1]:
! pip install gensim

Collecting gensim
  Using cached gensim-3.8.3-cp36-cp36m-manylinux1_x86_64.whl (24.2 MB)
Processing /home/ec2-user/.cache/pip/wheels/88/2a/d4/f2e9023989d4d4b3574f268657cb6cd23994665a038803f547/smart_open-3.0.0-py3-none-any.whl
Installing collected packages: smart-open, gensim
Successfully installed gensim-3.8.3 smart-open-3.0.0
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/mxnet_p36/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
! pip install pyLDAvis

Processing /home/ec2-user/.cache/pip/wheels/57/de/11/0a038be70c2c212ce45fa0f4f9da165bb5dd87de1288394dc3/pyLDAvis-2.1.2-py2.py3-none-any.whl
Collecting funcy
  Using cached funcy-1.15-py2.py3-none-any.whl (32 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-1.15 pyLDAvis-2.1.2
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/mxnet_p36/bin/python -m pip install --upgrade pip' command.[0m


In [3]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import models
from gensim.corpora import Dictionary, MmCorpus

In [4]:
'''
Loading nltk libraries
'''

import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

from nltk.corpus import stopwords


In [5]:
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
import pandas as pd
import numpy as np
np.random.seed(400)
np.set_printoptions(precision=3, suppress=True)

In [7]:
# accessing the SageMaker Python SDK
import boto3
import sagemaker
from sagemaker.amazon.common import numpy_to_record_serializer
from sagemaker.predictor import csv_serializer, json_deserializer
from sagemaker import get_execution_role

In [8]:
from __future__ import print_function
from IPython.core.debugger import set_trace

import tempfile
import string
# import funcy as fp
import sys
import logging
import os
import pickle
import re

In [9]:
import pyLDAvis.gensim as gensimvis
import pyLDAvis

In [10]:
import logging
logging.basicConfig(filename='gensim.log',
                    format="%(asctime)s:%(levelname)s:%(message)s",
                    level=logging.INFO)

## Step 1. Getting the data

In [11]:
df = pd.read_csv('mtsamples.csv')
print(df.columns)
df.head()

Index(['Unnamed: 0', 'description', 'medical_specialty', 'sample_name',
       'transcription', 'keywords'],
      dtype='object')


Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


In [12]:
surgery = df[df['medical_specialty']==" Surgery"]['transcription']

In [13]:
surgery.dropna(inplace=True)
surgery.isnull().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(result)


0

In [14]:
surgery.iloc[0, ][:50]

'PREOPERATIVE DIAGNOSES:,1.  Hallux rigidus, left f'

## Step 2: Data Preprocessing
We will perform the following steps:

* Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
* Words that have fewer than 3 characters are removed.
* All stopwords are removed.
* Words are lemmatized - words in third person are changed to first person and verbs in past and future tenses are changed into present.
* Words are stemmed - words are reduced to their root form.

In [15]:
'''
Write a function to perform the pre processing steps on the entire dataset
'''

# Tokenize 
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(token.lower())           
    return result

In [16]:
processed_docs = []
for doc in surgery:
    processed_docs.append(preprocess(doc))
print(len(processed_docs))

1088


### Step 3. Corpus and dictionary

In [17]:
stop = set(stopwords.words('english'))

In [18]:
# prepare the corpus

def nltk_stopwords():
    return set(nltk.corpus.stopwords.words('english'))

# he function doc2bow() simply counts the number of occurrences 
# of each distinct word, converts the word to its integer word id and returns the result as a sparse vector. 
def prep_corpus(docs, additional_stopwords=set(), no_below=3, no_above=0.5):
    print('Building dictionary...')
    dictionary = Dictionary(docs)
#     print(dictionary)
    stopwords = nltk_stopwords().union(additional_stopwords)
#     print(stopwords)
    stopword_ids = map(dictionary.token2id.get, stopwords)
#     print(stopword_ids)
    dictionary.filter_tokens(stopword_ids)
#     print(dictionary)
#     dictionary.compactify()
# #     print(dictionary)
#     dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)

    
    print('Building corpus...')
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    
    return dictionary, corpus

In [19]:
# calculate a word co-occurrence matrix with sklearn
from sklearn.feature_extraction.text import CountVectorizer
docs = ['this this this book',
        'this cat good',
        'cat good dog']
count_model = CountVectorizer(ngram_range=(1,1)) # default unigram model
X = count_model.fit_transform(docs)
# X[X > 0] = 1 # run this line if you don't want extra within-text cooccurence 
Xc = (X.T * X) # this is co-occurrence matrix in sparse csr format
Xc.setdiag(0) # sometimes you want to fill same word cooccurence to 0
print(Xc.todense()) # print out matrix in dense format
print(count_model.vocabulary_)
# https://stackoverflow.com/questions/35562789/how-do-i-calculate-a-word-word-co-occurrence-matrix-with-sklearn

[[0 0 0 0 3]
 [0 0 1 2 1]
 [0 1 0 1 0]
 [0 2 1 0 1]
 [3 1 0 1 0]]
{'this': 4, 'book': 0, 'cat': 1, 'good': 3, 'dog': 2}


In [20]:
Xc

<5x5 sparse matrix of type '<class 'numpy.longlong'>'
	with 17 stored elements in Compressed Sparse Column format>

In [21]:
# need to create a word co-occurence matrix that is 1088x11434

dictionary, corpus = prep_corpus(processed_docs)
print(len(dictionary))
print(len(processed_docs))
print(len(corpus))

Building dictionary...
Building corpus...
11434
1088
1088


In [22]:
print(dictionary[100])
print(len(dictionary))

hypertrophic
11434


In [23]:
# each row of the corpus is sparse
print(len(corpus[1]))
print(corpus[1])
print(len(corpus))

39
[(23, 4), (154, 3), (155, 1), (164, 1), (165, 1), (171, 2), (187, 3), (238, 1), (258, 1), (259, 1), (260, 1), (261, 1), (262, 2), (263, 1), (264, 2), (265, 1), (266, 1), (267, 1), (268, 1), (269, 1), (270, 1), (271, 1), (272, 3), (273, 1), (274, 3), (275, 1), (276, 1), (277, 1), (278, 1), (279, 1), (280, 1), (281, 1), (282, 1), (283, 1), (284, 1), (285, 3), (286, 1), (287, 1), (288, 1)]
1088


In [24]:
dense_matrix = gensim.matutils.corpus2dense(corpus, num_terms=len(dictionary)).T
print(len(dense_matrix))

1088


In [25]:
print(type(dense_matrix))
print(dense_matrix[:10])
print(len(dense_matrix))
print(len(dense_matrix[0]))

<class 'numpy.ndarray'>
[[1. 1. 2. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
1088
11434


In [26]:
# this is what we need to convert to recordio protobuf
pd.DataFrame(dense_matrix).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11424,11425,11426,11427,11428,11429,11430,11431,11432,11433
0,1.0,1.0,2.0,4.0,3.0,1.0,5.0,1.0,1.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,1.0,4.0,6.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
#import module in terms of dealing with various types of I/O
import io
#import sagemaker common library
import sagemaker.amazon.common as smac 
#converts the data in numpy array format to RecordIO format
buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, dense_matrix)
#reset in-memory byte arrays to zero
buf.seek(0)


0

In [30]:
bucket = 'sagemaker-us-east-1-443501626368'
prefix = 'lda-medical'

print(prefix)
print(bucket)

lda-medical
sagemaker-us-east-1-443501626368


In [31]:
# Now that we've created our recordIO-wrapped protobuf, 
# we'll need to upload it to S3, so that Amazon SageMaker training can use it.
#import module
import os
 
#Key refers to the name of the file    
key = 'lda-data'
#uploads the data in record-io format to S3 bucket
boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, key)).upload_fileobj(buf)
#training data location in s3
s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)
print('uploaded training data location: {}'.format(s3_train_data))

uploaded training data location: s3://sagemaker-us-east-1-443501626368/lda-medical/train/lda-data


In [32]:
# convert documents_training to Protobuf RecordIO format
recordio_protobuf_serializer = numpy_to_record_serializer()
fbuffer = recordio_protobuf_serializer(dense_matrix)

In [33]:
# upload to S3 in bucket/prefix/train
fname = 'lda.data'
s3_object = os.path.join(prefix, 'train', fname)
boto3.Session().resource('s3').Bucket(bucket).Object(s3_object).upload_fileobj(fbuffer)

s3_train_data = 's3://{}/{}'.format(bucket, s3_object)
print('Uploaded data to S3: {}'.format(s3_train_data))

Uploaded data to S3: s3://sagemaker-us-east-1-443501626368/lda-medical/train/lda.data


### Step 4. Modeling

In [34]:
role = get_execution_role()
bucket = 'sagemaker-us-east-1-443501626368'
prefix = 'lda-medical'

print('Training input/output will be stored in {}/{}'.format(bucket, prefix))
print('\nIAM Role: {}'.format(role))

Training input/output will be stored in sagemaker-us-east-1-443501626368/lda-medical

IAM Role: arn:aws:iam::443501626368:role/service-role/AmazonSageMaker-ExecutionRole-20200806T142735


In [35]:
from sagemaker.amazon.amazon_estimator import get_image_uri
# select the algorithm container based on this notebook's current location

region_name = boto3.Session().region_name
container = get_image_uri(region_name, 'lda')

print('Using SageMaker LDA container: {} ({})'.format(container, region_name))

Using SageMaker LDA container: 766337827248.dkr.ecr.us-east-1.amazonaws.com/lda:1 (us-east-1)


In [36]:
session = sagemaker.Session()

In [37]:
# specify general training job information
lda = sagemaker.estimator.Estimator(
    container,
    role,
    output_path='s3://{}/{}/output'.format(bucket, prefix),
    train_instance_count=1,
    train_instance_type='ml.c4.2xlarge',
    sagemaker_session=session,
)

In [38]:
# set algorithm-specific hyperparameters
num_topics = 10
vocabulary_size=len(dictionary)
num_documents=int(0.9*len(corpus))

lda.set_hyperparameters(
    num_topics=num_topics,
    feature_dim=vocabulary_size,
    mini_batch_size=num_documents,
    alpha0=1.0,
)

In [39]:
print(s3_train_data)

s3://sagemaker-us-east-1-443501626368/lda-medical/train/lda.data


In [40]:
# run the training job on input data stored in S3
lda.fit({'train': s3_train_data})

2020-11-03 19:13:56 Starting - Starting the training job...
2020-11-03 19:13:59 Starting - Launching requested ML instances.........
2020-11-03 19:15:30 Starting - Preparing the instances for training...
2020-11-03 19:16:27 Downloading - Downloading input data
2020-11-03 19:16:27 Training - Downloading the training image......
2020-11-03 19:17:10 Training - Training image download completed. Training in progress.[34mDocker entrypoint called with argument(s): train[0m
[34mUsing mxnet backend.[0m
[34m[11/03/2020 19:17:12 INFO 140679425541952] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'alpha0': u'1.0', u'max_restarts': u'10', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'allow_svd_init': u'true', u'epochs': u'1', u'tol': u'1e-8', u'_kvstore': u'local', u'max_iterations': u'1000'}[0m
[34m[11/03/2020 19:17:12 INFO 140679425541952] Reading provided configuration from /opt/ml/input/config/hyperparam

[34m[11/03/2020 19:17:20 INFO 140679425541952] [CPDecomp] Using line search: error_diff=7.59959220886e-07[0m
[34m[11/03/2020 19:17:21 INFO 140679425541952] [CPDecomp] Using line search: error_diff=9.23871994019e-07[0m
[34m[11/03/2020 19:17:21 INFO 140679425541952] [CPDecomp] Using line search: error_diff=1.23679637909e-06[0m
[34m[11/03/2020 19:17:21 INFO 140679425541952] [CPDecomp] Using line search: error_diff=4.39584255219e-07[0m
[34m[11/03/2020 19:17:21 INFO 140679425541952] [CPDecomp] Using line search: error_diff=8.12113285065e-07[0m
[34m[11/03/2020 19:17:21 INFO 140679425541952] [CPDecomp] Using line search: error_diff=4.39584255219e-07[0m
[34m[11/03/2020 19:17:21 INFO 140679425541952] [CPDecomp] Using line search: error_diff=1.87754631042e-06[0m
[34m[11/03/2020 19:17:21 INFO 140679425541952] Restart 2/10[0m
[34m[11/03/2020 19:17:21 INFO 140679425541952] [CPDecomp] Starting CPDecomp...[0m
[34m[11/03/2020 19:17:21 INFO 140679425541952] [CPDecomp] Configuration: 

[34m[11/03/2020 19:17:35 INFO 140679425541952] [CPDecomp] Using line search: error_diff=6.0647726059e-06[0m
[34m[11/03/2020 19:17:35 INFO 140679425541952] [CPDecomp] Using line search: error_diff=5.76674938202e-06[0m
[34m[11/03/2020 19:17:36 INFO 140679425541952] [CPDecomp] Using line search: error_diff=5.87850809097e-06[0m
[34m[11/03/2020 19:17:36 INFO 140679425541952] [CPDecomp] Using line search: error_diff=4.97698783875e-06[0m
[34m[11/03/2020 19:17:36 INFO 140679425541952] [CPDecomp] Using line search: error_diff=2.23517417908e-06[0m
[34m[11/03/2020 19:17:36 INFO 140679425541952] [CPDecomp] Using line search: error_diff=5.89340925217e-06[0m
[34m[11/03/2020 19:17:36 INFO 140679425541952] [CPDecomp] Using line search: error_diff=5.25265932083e-06[0m
[34m[11/03/2020 19:17:36 INFO 140679425541952] [CPDecomp] Using line search: error_diff=4.41074371338e-06[0m
[34m[11/03/2020 19:17:36 INFO 140679425541952] [CPDecomp] Using line search: error_diff=3.53902578354e-06[0m
[

[34m[11/03/2020 19:17:40 INFO 140679425541952] [CPDecomp] Using line search: error_diff=3.50177288055e-07[0m
[34m[11/03/2020 19:17:40 INFO 140679425541952] [CPDecomp] Using line search: error_diff=3.23355197906e-06[0m
[34m[11/03/2020 19:17:41 INFO 140679425541952] [CPDecomp] Using line search: error_diff=2.28732824326e-06[0m
[34m[11/03/2020 19:17:41 INFO 140679425541952] [CPDecomp] Using line search: error_diff=3.12924385071e-07[0m
[34m[11/03/2020 19:17:41 INFO 140679425541952] [CPDecomp] Using line search: error_diff=1.31875276566e-06[0m
[34m[11/03/2020 19:17:41 INFO 140679425541952] [CPDecomp] Using line search: error_diff=8.79168510437e-07[0m
[34m[11/03/2020 19:17:41 INFO 140679425541952] [CPDecomp] Using line search: error_diff=1.05053186417e-06[0m
[34m[11/03/2020 19:17:41 INFO 140679425541952] [CPDecomp] Using line search: error_diff=8.34465026855e-07[0m
[34m[11/03/2020 19:17:41 INFO 140679425541952] [CPDecomp] Using line search: error_diff=3.79979610443e-07[0m


[34m[11/03/2020 19:17:55 INFO 140679425541952] [CPDecomp] Using line search: error_diff=1.49011611938e-07[0m
[34m[11/03/2020 19:17:56 INFO 140679425541952] [CPDecomp] Using line search: error_diff=2.23517417908e-07[0m
[34m[11/03/2020 19:17:56 INFO 140679425541952] [CPDecomp] Using line search: error_diff=1.56462192535e-07[0m
[34m[11/03/2020 19:17:56 INFO 140679425541952] [CPDecomp] Using line search: error_diff=3.57627868652e-07[0m
[34m[11/03/2020 19:17:56 INFO 140679425541952] [CPDecomp] Using line search: error_diff=4.09781932831e-07[0m
[34m[11/03/2020 19:17:56 INFO 140679425541952] [CPDecomp] Using line search: error_diff=3.65078449249e-07[0m
[34m[11/03/2020 19:17:56 INFO 140679425541952] [CPDecomp] Using line search: error_diff=2.98023223877e-08[0m
[34m[11/03/2020 19:17:56 INFO 140679425541952] [CPDecomp] Using line search: error_diff=2.38418579102e-07[0m
[34m[11/03/2020 19:17:56 INFO 140679425541952] [CPDecomp] Using line search: error_diff=2.9057264328e-07[0m
[

[34m[11/03/2020 19:18:05 INFO 140679425541952] [CPDecomp] Using line search: error_diff=1.65030360222e-06[0m
[34m[11/03/2020 19:18:06 INFO 140679425541952] [CPDecomp] Using line search: error_diff=1.60932540894e-06[0m
[34m[11/03/2020 19:18:06 INFO 140679425541952] [CPDecomp] Using line search: error_diff=1.02818012238e-06[0m
[34m[11/03/2020 19:18:06 INFO 140679425541952] [CPDecomp] Using line search: error_diff=1.81421637535e-06[0m
[34m[11/03/2020 19:18:06 INFO 140679425541952] [CPDecomp] Using line search: error_diff=1.11013650894e-06[0m
[34m[11/03/2020 19:18:06 INFO 140679425541952] [CPDecomp] Using line search: error_diff=4.65661287308e-07[0m
[34m[11/03/2020 19:18:06 INFO 140679425541952] [CPDecomp] Using line search: error_diff=2.57045030594e-07[0m
[34m[11/03/2020 19:18:06 INFO 140679425541952] [CPDecomp] Using line search: error_diff=8.38190317154e-07[0m
[34m[11/03/2020 19:18:06 INFO 140679425541952] [CPDecomp] Using line search: error_diff=1.07660889626e-06[0m



2020-11-03 19:18:18 Uploading - Uploading generated training model
2020-11-03 19:18:18 Completed - Training job completed
Training seconds: 123
Billable seconds: 123


In [41]:
print('Training job name: {}'.format(lda.latest_training_job.job_name))

Training job name: lda-2020-11-03-19-13-55-904


In [43]:

training_job_name = lda.latest_training_job.job_name

print('Training job name: {}'.format(training_job_name))

Training job name: lda-2020-11-03-19-13-55-904


In [45]:
import tarfile
# https://github.com/alex9311/alex9311.github.io/blob/master/code-projects/game-of-thrones-lda/train_lda.py
# https://alexandersimes.com/unsupervised/machine/learning/nlp/sagemaker/2019/09/01/got.html
model_fname = 'model.tar.gz'
model_object = os.path.join(prefix, 'output', training_job_name, 'output', model_fname)
boto3.Session().resource('s3').Bucket(bucket).Object(model_object).download_file(fname)
with tarfile.open(fname) as tar:
    tar.extractall()
print('Downloaded and extracted model tarball: {}'.format(model_object))


Downloaded and extracted model tarball: lda-medical/output/lda-2020-11-03-19-13-55-904/output/model.tar.gz


In [46]:
import mxnet as mx
# obtain the model file
model_list = [fname for fname in os.listdir('.') if fname.startswith('model_')]
model_fname = model_list[0]
print('Found model file: {}'.format(model_fname))


Found model file: model_algo-1


In [48]:

# get the model from the model file and store in Numpy arrays
alpha, beta = mx.ndarray.load(model_fname)
learned_alpha_permuted = alpha.asnumpy()
learned_beta_permuted = beta.asnumpy()

topic_distributions = learned_beta_permuted.tolist()

topic_word_weights_list = []
for topic_distribution in topic_distributions:
    this_topic_word_weights = {}
    for word_index, weight in enumerate(topic_distribution):
        this_topic_word_weights[dictionary[word_index]] = weight
    topic_word_weights_list.append(this_topic_word_weights)

top_words_in_topics = []
for topic_word_weights in topic_word_weights_list:
    top_words_in_topics.append(
        sorted(topic_word_weights, key=topic_word_weights.get, reverse=True)[:10]
    )
for index, top_words_in_topic in enumerate(top_words_in_topics):
    print('topic', index)
    for word in top_words_in_topic:
        print('\t', word, ':', topic_word_weights_list[index][word])

topic 0
	 right : 0.04034285247325897
	 patient : 0.023245779797434807
	 procedure : 0.016281088814139366
	 catheter : 0.012204998172819614
	 placed : 0.010959035716950893
	 artery : 0.006760246120393276
	 chest : 0.00606955960392952
	 tube : 0.006054094061255455
	 left : 0.00579813914373517
	 lobe : 0.0057295020669698715
topic 1
	 left : 0.058137327432632446
	 artery : 0.043930165469646454
	 coronary : 0.030304359272122383
	 right : 0.028467867523431778
	 catheter : 0.02059132233262062
	 french : 0.01325968187302351
	 disease : 0.011443668976426125
	 femoral : 0.011177356354892254
	 stenosis : 0.01075015403330326
	 patient : 0.010716749355196953
topic 2
	 normal : 0.03300194814801216
	 colon : 0.028911029919981956
	 procedure : 0.025585666298866272
	 scope : 0.022896697744727135
	 patient : 0.016566522419452667
	 cecum : 0.01419156976044178
	 colonoscopy : 0.013177668675780296
	 rectum : 0.013089579530060291
	 esophagus : 0.012220272794365883
	 withdrawn : 0.010903608985245228
topic 3

In [None]:
# from other notebook

In [49]:
# download and extract the model file from S3
job_name = lda.latest_training_job.job_name
model_fname = 'model.tar.gz'
model_object = os.path.join(prefix, 'output', job_name, 'output', model_fname)
boto3.Session().resource('s3').Bucket(bucket).Object(model_object).download_file(fname)
with tarfile.open(fname) as tar:
    tar.extractall()
print('Downloaded and extracted model tarball: {}'.format(model_object))

# obtain the model file
model_list = [fname for fname in os.listdir('.') if fname.startswith('model_')]
model_fname = model_list[0]
print('Found model file: {}'.format(model_fname))

# get the model from the model file and store in Numpy arrays
alpha, beta = mx.ndarray.load(model_fname)
learned_alpha_permuted = alpha.asnumpy()
learned_beta_permuted = beta.asnumpy()

print('\nLearned alpha.shape = {}'.format(learned_alpha_permuted.shape))
print('Learned beta.shape = {}'.format(learned_beta_permuted.shape))

Downloaded and extracted model tarball: lda-medical/output/lda-2020-11-03-19-13-55-904/output/model.tar.gz
Found model file: model_algo-1

Learned alpha.shape = (10,)
Learned beta.shape = (10, 11434)


In [52]:
# permutation, learned_beta = match_estimated_topics(known_beta, learned_beta_permuted)
# learned_alpha = learned_alpha_permuted[permutation]

fig = plot_lda(np.vstack([known_beta, learned_beta]), 2, 10)
fig.set_dpi(160)
fig.suptitle('Known vs. Found Topic-Word Probability Distributions')
fig.set_figheight(3)

beta_error = np.linalg.norm(known_beta - learned_beta, 1)
alpha_error = np.linalg.norm(known_alpha - learned_alpha, 1)
print('L1-error (beta) = {}'.format(beta_error))
print('L1-error (alpha) = {}'.format(alpha_error))

NameError: name 'plot_lda' is not defined