# Medical dataset EDA

Data source:
* https://www.kaggle.com/tboyle10/medicaltranscriptions
* Medical transcription data scraped from mtsamples.com

In [1]:
! pip install gensim

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
! pip install plotly

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [3]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import gensim

## Step 1. Getting the data

In [4]:
df = pd.read_csv('mtsamples.csv').drop(['Unnamed: 0'], axis=1)
print(df.columns)
df.head()

Index(['description', 'medical_specialty', 'sample_name', 'transcription',
       'keywords'],
      dtype='object')


Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


In [5]:
df['medical_specialty']=df['medical_specialty'].str.strip()

In [6]:
df['medical_specialty'].value_counts(dropna=False).head(10)

Surgery                          1103
Consult - History and Phy.        516
Cardiovascular / Pulmonary        372
Orthopedic                        355
Radiology                         273
General Medicine                  259
Gastroenterology                  230
Neurology                         223
SOAP / Chart / Progress Notes     166
Obstetrics / Gynecology           160
Name: medical_specialty, dtype: int64

In [7]:
df['medical_specialty'].value_counts(dropna=False).head().index.to_list()

['Surgery',
 'Consult - History and Phy.',
 'Cardiovascular / Pulmonary',
 'Orthopedic',
 'Radiology']

In [8]:
df['specialty']='Other'
df.loc[df['medical_specialty']==' Surgery', 'specialty']='Surgery'
df.loc[df['medical_specialty']==' Consult - History and Phy.', 'specialty']='Consultation'
df.loc[df['medical_specialty']==' Cardiovascular / Pulmonary', 'specialty']='Heart/Lungs'
df['specialty'].value_counts()

Other    4999
Name: specialty, dtype: int64

In [9]:
bar = df['medical_specialty'].value_counts(dropna=False).head(10)
bar.values

array([1103,  516,  372,  355,  273,  259,  230,  223,  166,  160])

In [13]:
mycolors=['#A7226E',   '#EC2049', '#16697a', '#db6400', '#ffa62b']
data=[go.Bar(
    x=bar.index, 
    y=bar.values, 
    marker_color=mycolors[0]
)]
layout=go.Layout(title='Number of transcripts, by medical specialty',
                 xaxis=dict(title='Medical Specialty'),
                 yaxis=dict(title='Frequency'),
    )
fig = go.Figure(data, layout)
fig.update_xaxes(tickangle = 45)
fig.show()
fig.write_html("cat10.html")

## Missing data

In [14]:
df['transcription'].isnull().sum()
df=df.dropna(subset=['transcription']).copy()
df['transcription'].isnull().sum()

0

In [15]:
print(df['keywords'].isnull().sum())
for value in bar.index:
    print(value, df[df['medical_specialty']==value]['keywords'].isnull().sum())

1068
Surgery 67
Consult - History and Phy. 282
Cardiovascular / Pulmonary 91
Orthopedic 52
Radiology 22
General Medicine 113
Gastroenterology 29
Neurology 55
SOAP / Chart / Progress Notes 24
Obstetrics / Gynecology 25


## Keywords

In [16]:
df['keywords'].nunique()

3816

In [17]:
df['keywords'].iloc[0]

'allergy / immunology, allergic rhinitis, allergies, asthma, nasal sprays, rhinitis, nasal, erythematous, allegra, sprays, allergic,'

In [18]:
# Tokenize 
def preprocess(result, text):
    
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(token.lower())           
    return result
result=[]
preprocess(result, df['keywords'].iloc[0])

['allergy',
 'immunology',
 'allergic',
 'rhinitis',
 'allergies',
 'asthma',
 'nasal',
 'sprays',
 'rhinitis',
 'nasal',
 'erythematous',
 'allegra',
 'sprays',
 'allergic']

In [19]:
processed_docs = {}
for key in bar.index:
    result=[]
    for doc in df[(df['medical_specialty']==key)&(df['keywords'].notnull())]['keywords']:
        processed_docs[key]=preprocess(result, doc)

In [20]:
key=bar.index[0]
print(key, processed_docs[key][:100])
print(len(processed_docs[key])) # number of tokens
print(len(set(processed_docs[key]))) # number of unique tokens

Surgery ['surgery', 'hallux', 'rigidus', 'metatarsal', 'youngswick', 'bunionectomy', 'screw', 'fixation', 'ankle', 'tourniquet', 'joint', 'biopro', 'implant', 'proximal', 'phalanx', 'foot', 'austin', 'anesthesia', 'osteotomy', 'screw', 'surgery', 'abraham', 'capsulotomy', 'laser', 'capsulotomy', 'capsulotomy', 'laser', 'membrane', 'capsular', 'surgery', 'hallux', 'limitus', 'deformity', 'joint', 'plantar', 'youngswick', 'osteotomy', 'dorsal', 'cuts', 'ankle', 'tourniquet', 'proximal', 'phalanx', 'anesthesia', 'tourniquet', 'youngswick', 'phalanx', 'proximal', 'metatarsal', 'dorsal', 'osteotomy', 'surgery', 'open', 'wound', 'prosthetic', 'vascular', 'graft', 'closure', 'wound', 'surgisis', 'peripheral', 'vascular', 'disease', 'wound', 'debridement', 'subcutaneous', 'tissue', 'insertion', 'wound', 'betadine', 'debridement', 'xenograft', 'insertion', 'surgery', 'capsule', 'opacity', 'ophthalmic', 'laser', 'posterior', 'capsulotomy', 'capsulotomy', 'opacity', 'laser', 'visually', 'anesthes

In [21]:
keywords_dict = {}
for key in bar.index:
    keywords_dict[key]=pd.DataFrame(processed_docs[key], columns=['tokens'])

In [22]:
keywords_dict['Surgery']

Unnamed: 0,tokens
0,surgery
1,hallux
2,rigidus
3,metatarsal
4,youngswick
...,...
23096,carmine
23097,vault
23098,prolapse
23099,sacrocolpopexy


In [23]:
for item in keywords_dict.keys():
    print(item, '\n', keywords_dict[item]['tokens'].value_counts().head(), '\n')

Surgery 
 surgery     1023
reports      171
sample       171
artery       165
catheter     124
Name: tokens, dtype: int64 

Consult - History and Phy. 
 history    230
consult    224
weight      45
loss        41
pain        35
Name: tokens, dtype: int64 

Cardiovascular / Pulmonary 
 pulmonary         364
cardiovascular    280
artery            174
coronary          150
chest             117
Name: tokens, dtype: int64 

Orthopedic 
 orthopedic    296
cervical      124
carpal         97
fracture       84
anterior       76
Name: tokens, dtype: int64 

Radiology 
 radiology    248
artery        65
stress        55
brain         53
carotid       50
Name: tokens, dtype: int64 

General Medicine 
 medicine       139
general        138
respiratory     25
blood           24
disease         20
Name: tokens, dtype: int64 

Gastroenterology 
 colon              68
laparoscopic       66
colonoscopy        55
abdomen            53
cholecystectomy    45
Name: tokens, dtype: int64 

Neurology 
 neur

In [24]:
# top 10 tokens: surgery
item=bar.index[0]
print(item)
surgery_toks=pd.DataFrame(keywords_dict[item]['tokens'].value_counts()).iloc[3:13]
surgery_toks

Surgery


Unnamed: 0,tokens
artery,165
catheter,124
anterior,120
coronary,119
medical,114
transcription,114
cervical,107
incision,106
laparoscopic,100
carpal,97


In [25]:
# top 10 tokens: consult
item=bar.index[1]
print(item)
consult_toks=pd.DataFrame(keywords_dict[item]['tokens'].value_counts()).iloc[2:12]
consult_toks

Consult - History and Phy.


Unnamed: 0,tokens
weight,45
loss,41
pain,35
disease,23
blood,23
cancer,22
lumbar,22
dietary,21
physical,21
bypass,21


In [26]:
# top 10 tokens: consult
item=bar.index[2]
print(item)
cardio_toks=pd.DataFrame(keywords_dict[item]['tokens'].value_counts()).iloc[2:12]
cardio_toks

Cardiovascular / Pulmonary


Unnamed: 0,tokens
artery,174
coronary,150
chest,117
heart,85
stress,70
ventricular,62
angiography,61
catheterization,60
carotid,55
disease,53


In [27]:
# top 10 tokens: consult
item=bar.index[3]
print(item)
ortho_toks=pd.DataFrame(keywords_dict[item]['tokens'].value_counts()).iloc[2:12]
ortho_toks

Orthopedic


Unnamed: 0,tokens
carpal,97
fracture,84
anterior,76
tunnel,74
tendon,69
discectomy,68
knee,67
joint,67
lumbar,66
ligament,62


In [28]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

mycolors=['#2F9599',   '#EC2049', '#16697a', '#db6400', '#ffa62b']
fig = make_subplots(rows=2, cols=2, subplot_titles=("Surgery","Consultations", "Cardiovascular/Pulmonary", "Orthopedic"))

group=surgery_toks
fig.append_trace(go.Bar(
        x=group.index, 
        y=np.array([item[0] for item in group.values]), 
        marker_color=mycolors[0]
), row=1, col=1)

group=consult_toks
fig.append_trace(go.Bar(
        x=consult_toks.index, 
        y=np.array([item[0] for item in group.values]), 
        marker_color=mycolors[2]
), row=2, col=1)

group=cardio_toks
fig.append_trace(go.Bar(
        x=cardio_toks.index, 
        y=np.array([item[0] for item in group.values]), 
        marker_color=mycolors[3]
), row=1, col=2)


group=cardio_toks
fig.append_trace(go.Bar(
        x=ortho_toks.index, 
        y=np.array([item[0] for item in group.values]), 
        marker_color=mycolors[4]
), row=2, col=2)

fig.update_layout(height=1000, title_text="Top 10 Keywords, by Medical Specialty", showlegend=False,)
fig.update_xaxes(tickangle = 45)
fig.show()
fig.write_html("keywords.html")

## Average length of transcription

In [29]:
df.loc[0, 'transcription']

'SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies when she lived in Seattle but she thinks they are worse here.  In the past, she has tried Claritin, and Zyrtec.  Both worked for short time but then seemed to lose effectiveness.  She has used Allegra also.  She used that last summer and she began using it again two weeks ago.  It does not appear to be working very well.  She has used over-the-counter sprays but no prescription nasal sprays.  She does have asthma but doest not require daily medication for this and does not think it is flaring up.,MEDICATIONS: , Her only medication currently is Ortho Tri-Cyclen and the Allegra.,ALLERGIES: , She has no known medicine allergies.,OBJECTIVE:,Vitals:  Weight was 130 pounds and blood pressure 124/78.,HEENT:  Her throat was mildly erythematous without exudate.  Nasal mucosa was erythematous and swollen.  Only clear drainage was seen.  TMs were clear.,Neck:  Supple without adenopathy.,

In [30]:
df['length']=df['transcription'].apply(lambda x: len(x))
df.head()

Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords,specialty,length
0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller...",Other,1331
1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh...",Other,2431
2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart...",Other,4422
3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple...",Other,495
4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo...",Other,1618


In [31]:
small=df.loc[df['medical_specialty'].isin(bar.index)]
small.sample(10)

Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords,specialty,length
4788,Left heart catheterization and bilateral sele...,Cardiovascular / Pulmonary,Heart Cath & Coronary Angiography,"PROCEDURE PERFORMED:,1. Left heart catheteriz...","cardiovascular / pulmonary, left heart cathete...",Other,4920
241,Tracheotomy for patient with respiratory fail...,Surgery,Tracheotomy - 1,"PREOPERATIVE DIAGNOSIS: , Respiratory failure....","surgery, bivona tube, duoderm, tracheotomy tub...",Other,2123
1321,A 65-year-old man with chronic prostatitis re...,SOAP / Chart / Progress Notes,Prostatitis - Recheck,"SUBJECTIVE:, The patient is a 65-year-old man...",,Other,4238
4689,Obstructive sleep apnea syndrome. Loud snori...,Cardiovascular / Pulmonary,Pulmonary Consultation - 1,"CHIEF COMPLAINT:, Rule out obstructive sleep ...",,Other,2378
351,Open septorhinoplasty with placement of bilat...,Surgery,Septorhinoplasty,"PREOPERATIVE DIAGNOSIS: , Acquired nasal septa...",,Other,8527
2547,A white female who presents for complete phys...,Obstetrics / Gynecology,Physical Exam and Pap -2,"SUBJECTIVE:, The patient is a 68-year-old whi...",,Other,3250
2627,Delivery is a normal spontaneous vaginal deli...,Obstetrics / Gynecology,Delivery Note,"HISTORY: , This patient with prenatal care in ...","obstetrics / gynecology, decreased fetal movem...",Other,4239
2083,Pain management for post-laminectomy low back...,Orthopedic,Pain Management Consult - 1,Mr. XYZ forgot his hearing aids at home today ...,"orthopedic, pain management, opioid dependence...",Other,10434
4583,Possible free air under the diaphragm. On a ...,Consult - History and Phy.,Air Under Diaphragm - Consult,"REASON FOR CONSULTATION: , Possible free air u...",,Other,2026
1532,Complex Regional Pain Syndrome Type I. Stell...,Radiology,Radiofrequency Thermocoagulation - 2,"PREOPERATIVE DIAGNOSIS:, Complex Regional Pai...","radiology, sheath, vertebral body, regional pa...",Other,3223


In [32]:
import plotly.express as px
mycolors=['#2F9599',   '#EC2049', '#16697a', '#db6400', '#ffa62b']

fig = px.box(small, x='medical_specialty', y='length', title='Average Length of Medical Transcripts',
            color_discrete_sequence=[mycolors[1]]
            )
fig.update_layout(xaxis={'categoryorder':'array', 'categoryarray':bar.index})
fig.update_layout(yaxis_title='Average number of characters', xaxis_title='Medical Specialty')
fig.update_xaxes(tickangle = 45)
fig.show()
fig.write_html("boxplots.html")