In [4]:
import re
import string

import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import spacy

In [5]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adimyth/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

**Loading Data**

In [6]:
df = pd.read_csv("../data/raw/Case_Study_Data.csv")

In [7]:
df.shape

(4999, 3)

**Drop NAN rows**

In [8]:
df = df.loc[df["Medical_Description"].notna()]

**Lower Casing**

In [9]:
df["Medical_Description"] = df["Medical_Description"].str.lower()

In [10]:
df["Sample"] = df["Sample"].str.lower()

In [11]:
df.sample(n=5)

Unnamed: 0,Package,Sample,Medical_Description
2157,Orthopedic,mcbride bunionectomy & wedge osteotomy,"preoperative diagnosis:, right hallux abducto..."
274,Surgery,thyroidectomy - 1,"preoperative diagnosis: , thyroid goiter.,post..."
1783,Psychiatry / Psychology,psych consult - pain meds,"reason for consultation: , management of pain ..."
450,Surgery,phacoemulsification & lens implantation,"preoperative diagnosis:, cataract, nuclear sc..."
41,Urology,"spermatocelectomy, epididymectomy, & vasectomy","preoperative diagnoses:,1. left spermatocele...."


**Remove Punctuations**

In [12]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [13]:
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

In [14]:
df["Medical_Description"] = df["Medical_Description"].apply(lambda text: remove_punctuation(text))
df["Sample"] = df["Sample"].apply(lambda text: remove_punctuation(text))

In [15]:
df.sample(n=5)

Unnamed: 0,Package,Sample,Medical_Description
1978,Pain Management,lumbar epidural steroid injection 1,operation lumbar epidural steroid injection i...
2738,Neurosurgery,anterior cervical discectomy fusion 2,preoperative diagnosis herniated nucleus pulp...
1089,Surgery,cardioversion direct current,procedure direct current cardioversionreason ...
4617,Cardiovascular / Pulmonary,transthoracic echocardiography,reason for exam coronary artery bypass surger...
784,Surgery,flexor carpi radialis palmaris longus repair,preoperative diagnosis right wrist laceration...


**Remove Stopwords**

In [16]:
# https://cs.stanford.edu/people/sonal/gupta14jamia_supl.pdf
medical_stopwords_list1 = ["disease", "diseases", "disorder", "symptom", "symptoms", "drug", "drugs", "problems", "problem",
                           "prob", "probs", "med", "meds", "pill", "pills", "medicine", "medicines", "medication", "medications", 
                           "treatment", "treatments", "caps", "capsules", "capsule", "tablet", "tablets", "tabs", "doctor", 
                           "dr", "dr.", "doc", "physician", "physicians", "test", "tests", "testing", "specialist", 
                           "specialists", "side-effect", "side-effects", "pharmaceutical", "pharmaceuticals", "pharma", 
                           "diagnosis", "diagnose", "diagnosed", "exam", "challenge", "device", "condition", "conditions", 
                           "suffer", "suffering", "suffered", "feel", "feeling", "prescription", "prescribe",
                           "prescribed", "over-the-counter", "otc", "contain", "contains"]


In [17]:
medical_stopwords_list2 = pd.read_csv("https://raw.githubusercontent.com/kavgan/clinical-concepts/master/clinical-stopwords.txt")["#regular stop words with clinical"].tolist()

In [18]:
medical_stopwords = medical_stopwords_list1+medical_stopwords_list2

In [19]:
english_stopwords = ", ".join(stopwords.words('english'))

In [20]:
english_stopwords = english_stopwords.split(",")

In [21]:
total_stopwords = medical_stopwords+english_stopwords

In [22]:
len(total_stopwords), len(medical_stopwords), len(english_stopwords)

(1053, 874, 179)

In [23]:
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in total_stopwords])

In [24]:
df["Medical_Description"] = df["Medical_Description"].apply(lambda text: remove_stopwords(text))
df["Sample"] = df["Sample"].apply(lambda text: remove_stopwords(text))

In [25]:
df.head()

Unnamed: 0,Package,Sample,Medical_Description
0,Allergy / Immunology,allergic rhinitis,subjective 23yearold white female presents com...
1,Bariatrics,laparoscopic gastric bypass consult 2,past history difficulty climbing stairs diffic...
2,Bariatrics,laparoscopic gastric bypass consult 1,history present illness seen abc today pleasan...
3,Cardiovascular / Pulmonary,2d echocardiogram 1,2d mmode 1 left atrial enlargement left atrial...
4,Cardiovascular / Pulmonary,2d echocardiogram 2,1 left ventricular cavity size wall thickness ...


**Add Length**

In [26]:
df["DescriptionLength"] = df["Medical_Description"].str.len()

**Save Processed Data**

In [27]:
df.to_csv("../data/processed/processed_data.csv", index=False)