In [14]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Abdoi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Cleaning Indiana Dataset

In [14]:
import spacy
import language_tool_python

# Load spaCy model for tokenization and sentence segmentation
nlp = spacy.load("en_core_web_sm")

def add_punctuation_and_correct_grammar(text):
    # Use spaCy to process the text
    doc = nlp(text)
    
    # Initialize list to hold the corrected sentences
    corrected_sentences = []
    
    for sent in doc.sents:
        tokens = [token.text for token in sent]
        corrected_sentence = ""
        
        # Add commas and conjunctions where needed
        for i, token in enumerate(tokens):
            corrected_sentence += token
            if i < len(tokens) - 1:
                next_token = tokens[i + 1]
                
                # Add a comma before conjunctions or coordinating conjunctions
                if next_token.lower() in {"and", "or", "but", "nor", "for", "so", "yet"}:
                    corrected_sentence += ","
                
                # Add a comma after specific medical terms
                if token.lower() in {"emphysema", "opacity", "pneumothorax", "nodule", "granuloma", "cardiomegaly", "atelectasis"}:
                    corrected_sentence += ","
                
                corrected_sentence += " "
        
        # Add period at the end of the sentence if not present
        if not corrected_sentence.strip().endswith("."):
            corrected_sentence = corrected_sentence.strip() + "."
        corrected_sentences.append(corrected_sentence)
    
    # Join corrected sentences into a single text
    corrected_text = ' '.join(corrected_sentences)
    
    return corrected_text





# Initialize the LanguageTool object
tool = language_tool_python.LanguageTool('en-US')

def correct_grammar(text):
    # Check the text for errors
    matches = tool.check(text)
    
    # Correct the text based on the matches found
    corrected_text = language_tool_python.utils.correct(text, matches)
    
    return corrected_text


text = "emphysema without focal airspace opacity pneumothorax or nodule is no acute pulmonary granuloma are clear of the heart size and mild cardiomegaly with chronic interstitial opacities atelectasis"
corrected_text = add_punctuation_and_correct_grammar(text)
print(corrected_text)



# text = "emphysema without focal airspace opacity pneumothorax or nodule is no acute pulmonary granuloma are clear of the heart size and mild cardiomegaly with chronic interstitial opacities atelectasis"
corrected_text = correct_grammar(corrected_text)
print(corrected_text)





emphysema, without focal airspace opacity, pneumothorax,, or nodule, is no acute pulmonary granuloma, are clear of the heart size, and mild cardiomegaly, with chronic interstitial opacities atelectasis.
Emphysema, without focal airspace opacity, pneumothorax, or nodule, is no acute pulmonary granuloma, are clear of the heart size, and mild cardiomegaly, with chronic interstitial opacities' atelectasis.


In [1]:
tags = ['cardiac monitor',
 'lymphatic diseases',
 'pulmonary disease',
 'osteophytes',
 'foreign body', #ttshal
 'dish', #ttshal
 'aorta, thoracic',
 'atherosclerosis',
 'histoplasmosis',
 'hypoventilation',
 'catheterization, central venous',
 'pleural effusions',
 'pleural effusion',
 'callus',
 'sternotomy',
 'lymph nodes',
 'tortuous aorta',
 'stent', #ttshal
 'interstitial pulmonary edema',
 'cholecystectomies', #ttshal
 'neoplasm', #ttshal
 'central venous catheter', #catheter olyla awy bs venous kteera
 'pneumothorax',
 'metastatic disease',
 'vena cava, superior',
 'cholecystectomy',
 'scoliosis',
 'subcutaneous emphysema',
 'thoracolumbar scoliosis',
 'spinal osteophytosis',
 'pulmonary fibroses',
 'rib fractures',
 'sarcoidosis',
 'eventration',
 'fibrosis',
 'spine',
 'obstructive lung disease',
 'pneumonitis',
 'osteopenia',
 'air trapping',
 'demineralization',
 'mass lesion',
 'pulmonary hypertension',
 'pleural diseases',
 'pleural thickening',
 'calcifications of the aorta',
 'calcinosis',
 'cystic fibrosis',
 'empyema',
 'catheter',
 'lymph',
 'pericardial effusion',
 'lung cancer',
 'rib fracture',
 'granulomatous disease',
 'chronic obstructive pulmonary disease',
 'rib',
 'clip',
 'aortic ectasia',
 'shoulder',
 'scarring',
 'scleroses',
 'adenopathy',
 'emphysemas',
 'pneumonectomy',
 'infection',
 'aspiration',
 'bilateral pleural effusion',
 'bulla',
 'lumbar vertebrae',
 'lung neoplasms',
 'lymphadenopathy',
 'hyperexpansion',
 'ectasia',
 'bronchiectasis',
 'nodule',
 'pneumonia',
 'right-sided pleural effusion',
 'osteoarthritis',
 'thoracic spondylosis',
 'picc',
 'cervical fusion',
 'tracheostomies',
 'fusion',
 'thoracic vertebrae',
 'catheters',
 'emphysema',
 'trachea',
 'surgery',
 'cervical spine fusion',
 'hypertension, pulmonary',
 'pneumoperitoneum',
 'scar',
 'atheroscleroses',
 'aortic calcifications',
 'volume overload',
 'right upper lobe pneumonia',
 'apical granuloma',
 'diaphragms',
 'copd',
 'kyphoses',
 'spinal fractures',
 'fracture',
 'clavicle',
 'focal atelectasis',
 'collapse',
 'thoracotomies',
 'congestive heart failure',
 'calcified lymph nodes',
 'edema',
 'degenerative disc diseases',
 'cervical vertebrae',
 'diaphragm',
 'humerus',
 'heart failure',
 'normal',
 'coronary artery bypass',
 'pulmonary atelectasis',
 'lung diseases,interstitial',
 'pulmonary disease,chronic obstructive',
 'opacity',
 'deformity',
 'chronic disease',
 'pleura',
 'aorta',
 'tuberculoses',
 'hiatal hernia',
 'scolioses',
 'pleural fluid',
 'malignancy',
 'kyphosis',
 'bronchiectases',
 'congestion',
 'discoid atelectasis',
 'nipple',
 'bronchitis',
 'pulmonary artery',
 'cardiomegaly',
 'thoracic aorta',
 'arthritic changes',
 'pulmonary edema',
 'vascular calcification',
 'sclerotic',
 'central venous catheters',
 'catheterization',
 'hydropneumothorax',
 'aortic valve',
 'hyperinflation',
 'prostheses',
 'pacemaker,artificial',
 'bypass grafts',
 'pulmonary fibrosis',
 'multiple myeloma',
 'postoperative period',
 'cabg',
 'right lower lobe pneumonia',
 'granuloma',
 'degenerative change',
 'atelectasis',
 'inflammation',
 'effusion',
 'cicatrix',
 'tracheostomy',
 'aortic diseases',
 'sarcoidoses',
 'granulomas',
 'interstitial lung disease',
 'infiltrates',
 'displaced fractures',
 'chronic lung disease',
 'picc line',
 'intubation,gastrointestinal',
 'lung diseases',
 'multiple pulmonary nodules',
 'intervertebral disc degeneration',
 'pulmonary emphysema',
 'spine curvature',
 'fibroses',
 'chronic granulomatous disease',
 'degenerative disease',
 'atelectases',
 'ribs',
 'pulmonary arterial hypertension',
 'edemas',
 'pectus excavatum',
 'lung granuloma',
 'plate-like atelectasis',
 'enlarged heart',
 'hilar calcification',
 'heart valve prosthesis',
 'tuberculosis',
 'old injury',
 'patchy atelectasis',
 'histoplasmoses',
 'exostoses',
 'mastectomies',
 'right atrium',
 'large hiatal hernia',
 'hernia, hiatal',
 'aortic aneurysm',
 'lobectomy',
 'spinal fusion',
 'spondylosis',
 'ascending aorta',
 'granulomatous infection',
 'fractures, bone',
 'calcified granuloma',
 'degenerative joint disease',
 'intubation, intratracheal',
 'others']

print(len(tags))

210


In [41]:
#read indiana dataset 
df = pd.read_csv('./indiana_reports.csv')

#to capture
measurement = re.compile(r'(\d+(.\d+)?)( )?((cm|mm)?( )?(x) (\d+(.\d+)?) )?(cm|mm)')
ratio = re.compile(r'(\d+(.\d+)\/)')
rankNumbers = re.compile(r'[0-9](st|nd|rd|th)', re.I)
words = re.compile(r'(day|film|recommend|prior|comparison|compare|image|T6|T8|T11|T12|low|right|left|mid|status|patient|exam|please|unchanged)', re.I)
intact = re.compile(r'((?<= )( )?(is|are) intact)|((?<=  )(is|are) unremarkable)')
#create a new dataframe df2
df2 = pd.DataFrame()
# df2['uid'] = df['uid']

df['findings'] = df['findings'].str.replace(r'XXXX', '',regex = True)
df['impression'] = df['impression'].str.replace(r'(XXXX\.|XXXX)', '',regex = True)

#remove any list numbu2. 3. and so on
df['findings'] = df['findings'].str.replace(r'([0-9](\.))|(^[0-9](\.))', '',regex=True)
df['impression'] = df['impression'].str.replace(r'([0-9](\.))|(^[0-9](\.))', '',regex=True) 

#remove comas
df['findings'] = df['findings'].str.replace(r',|-|:|<>|\\', '',regex=True)
df['impression'] = df['impression'].str.replace(r',|-', '',regex=True)

#split each to list of sentences 
df['findings'] = df['findings'].map(lambda x: str(x).lower().split('.'))
df['impression'] = df['impression'].map(lambda x: str(x).lower().split('.'))


df['findings'] = df['findings'].apply(lambda x: [sentence for sentence in x if not (intact.search(sentence) or words.search(sentence) or rankNumbers.search(sentence) or measurement.search(sentence) or ratio.search(sentence))])
df['impression'] = df['impression'].apply(lambda x: [sentence for sentence in x if not (intact.search(sentence) or words.search(sentence) or rankNumbers.search(sentence) or measurement.search(sentence) or ratio.search(sentence))])

#loop on the each senctence in the list of sentences and remove any remaining numbers from the sentence 
df['findings'] = df['findings'].apply(lambda x: [re.sub(r'\d+', '', sentence) for sentence in x])
df['impression'] = df['impression'].apply(lambda x: [re.sub(r'\d+', '', sentence) for sentence in x])

#remove any empty sentences
df['findings'] = df['findings'].apply(lambda x: [sentence for sentence in x if sentence.strip()])
df['impression'] = df['impression'].apply(lambda x: [sentence for sentence in x if sentence.strip()])

df2['imgID']= df['uid']

df2['captions'] = df['findings'] + df['impression']

df2['captions'] = df2['captions'].apply(lambda x: [sentence.split() for sentence in x if sentence != 'nan'])
df2['captions'] = df2['captions'].apply(lambda x: [sentence for sentence in x if len(sentence) >2])
# remove row with empty list
df2 = df2[df2['captions'].map(len) > 0]

df2.to_csv('indiana_reports_cleaned3.csv', index=False)



In [42]:
df3 = df2.explode("captions")
# df3.explode("captions")
# #save df2 in csv 
# # split sentence and remove any row that has words that are <= 2
# df3 = df3['captions'].map(lambda x: str(x).split())
# # remove any row of size <= 2 
# df3 = df3[df3.map(len) > 2]

df3.to_csv('indiana_reports_cleaned.csv', index=False)
# print(df3.isna().sum())

In [None]:
import pandas as pd
# read
df = pd.read_parquet('D:/GAM3A/5-Senior02/GP/KENGIC/MIMIC-medical-report/data/train-00000-of-00001-0dc3c7ebb0311aec.parquet')
formatted_df = pd.DataFrame()
#split the text given in to sentences
#remove the following from findings and impression
# any ___
formatted_df['FINDINGS'] = df['FINDINGS'].str.replace(r'___', '', regex = True)
formatted_df['IMPRESSION'] = df['IMPRESSION'].str.replace(r'___', '', regex = True)

# any Dr.
formatted_df['FINDINGS'] = formatted_df['FINDINGS'].str.replace(r'Dr.', '', regex = True)
formatted_df['IMPRESSION'] = formatted_df['IMPRESSION'].str.replace(r'Dr.', '', regex = True)

# any time formats ex: at 12:00 / at floating numbers
formatted_df['FINDINGS'] = formatted_df['FINDINGS'].str.replace(r'(at \d{1,2}:\d{1,2})|(\d{1,2}:\d{1,2})', '', regex = True)
formatted_df['IMPRESSION'] = formatted_df['IMPRESSION'].str.replace(r'(at \d{1,2}:\d{1,2})|(\d{1,2}:\d{1,2})', '', regex = True)

# any p.m/a.m/am/pm
formatted_df['FINDINGS'] = formatted_df['FINDINGS'].str.replace(r'( am )|( pm )|( p\.m)|( a\.m)', '', regex = True)
formatted_df['IMPRESSION'] = formatted_df['IMPRESSION'].str.replace(r'( am )|( pm )|( p\.m)|( a\.m)', '', regex = True)

# remove floating numbers followed by measurements ex: 12.5
formatted_df['FINDINGS'] = formatted_df['FINDINGS'].str.replace(r'\d+\.\d+', '', regex = True)
formatted_df['FINDINGS'] = formatted_df['FINDINGS'].str.replace(r'\d+\.', '', regex = True)
formatted_df['IMPRESSION'] = formatted_df['IMPRESSION'].str.replace(r'\d+\.\d+', '', regex = True)
formatted_df['IMPRESSION'] = formatted_df['IMPRESSION'].str.replace(r'\d+\.', '', regex = True)

#remove any cm mm inch
formatted_df['FINDINGS'] = formatted_df['FINDINGS'].str.replace(r'( cm)|( mm)', '', regex = True)
formatted_df['IMPRESSION'] = formatted_df['IMPRESSION'].str.replace(r'( cm)|( mm)', '', regex = True)

# remove any 1.,2.,3.,etc.
#done in the above step

# remove , =
formatted_df['FINDINGS'] = formatted_df['FINDINGS'].str.replace(r',|=', '', regex = True)
formatted_df['IMPRESSION'] = formatted_df['IMPRESSION'].str.replace(r',|=', '', regex = True)

#remove any numbers
formatted_df['FINDINGS'] = formatted_df['FINDINGS'].str.replace(r'\d+', '', regex = True)
formatted_df['IMPRESSION'] = formatted_df['IMPRESSION'].str.replace(r'\d+', '', regex = True)

#remove any \n
formatted_df['FINDINGS'] = formatted_df['FINDINGS'].str.replace(r'\n', '', regex = True)
formatted_df['IMPRESSION'] = formatted_df['IMPRESSION'].str.replace(r'\n', '', regex = True)

#split each paragraph on .
formatted_df['FINDINGS'] = formatted_df['FINDINGS'].map(lambda x: str(x).split('.'))
formatted_df['IMPRESSION'] = formatted_df['IMPRESSION'].map(lambda x: str(x).split('.'))

#remove empty strings
formatted_df['FINDINGS'] = formatted_df['FINDINGS'].map(lambda x: [i.split() for i in x if i != ''])
formatted_df['IMPRESSION'] = formatted_df['IMPRESSION'].map(lambda x: [i.split() for i in x if i != ''])

#check for since, through, by, on,
#make every token a lower case 
formatted_df['FINDINGS'] = formatted_df['FINDINGS'].apply(lambda x: [[word.lower() for word in sentence] for sentence in x])
formatted_df['IMPRESSION'] = formatted_df['IMPRESSION'].apply(lambda x: [[word.lower() for word in sentence] for sentence in x])


# #remove at ; however, new, from the sentence 
toRemove = ['at', 'however', 'new', 'from',';']
formatted_df['FINDINGS'] = formatted_df['FINDINGS'].apply(lambda x: [[word for word in sentence if word not in toRemove] for sentence in x])
formatted_df['IMPRESSION'] = formatted_df['IMPRESSION'].apply(lambda x: [[word for word in sentence if word not in toRemove] for sentence in x])


#remove sentence with through, since, submitted, unchanged, compared, comparison, previous, prior,increase, decrease,increased, decreased,
#findings, film, PICC, yesterday, today, SVC, tube,  
toRemoveSentence = ['through', 'since', 'submitted', 'unchanged', 'compared', 'comparison', 'previous', 'prior', 'increase', 'decrease', 'increased', 'decreased', 'findings', 'film', 'picc', 'yesterday', 'today', 'svc', 'tubes']
formatted_df['FINDINGS'] = formatted_df['FINDINGS'].apply(lambda x: [sentence for sentence in x if not any(word in sentence for word in toRemoveSentence)])
formatted_df['IMPRESSION'] = formatted_df['IMPRESSION'].apply(lambda x: [sentence for sentence in x if not any(word in sentence for word in toRemoveSentence)])

finalDf = pd.DataFrame()
finalDf['captions'] = formatted_df['FINDINGS'] + formatted_df['IMPRESSION']

# remove ['as','above'],['status','quo']
toRemoveSentence = ['above', 'quo']
finalDf['captions'] = finalDf['captions'].apply(lambda x: [sentence for sentence in x if (not any(word in sentence for word in toRemoveSentence) and len(sentence) > 2)])

In [31]:
#create a dataframe with list of list of words in row
temp = [
   [ [["bla", "bla", "bla"],
    ["bla2", "bla2", "bla2"],
    ["bla3", "bla3", "bla3"]]]
]
temp = pd.DataFrame(temp)
print(temp.head())

                                                   0
0  [[bla, bla, bla], [bla2, bla2, bla2], [bla3, b...


In [39]:
#split lists to row
# new = finalDf.explode('captions')
newList = df3['captions'].tolist()
print(len(newList))
# Convert each inner list to a tuple and add them to a set
# print(type(newList))
unique_ref = set()
for x in newList:
    if isinstance(x, list):
        t = tuple(x)
        unique_ref.add(t)

16763


In [27]:
sw = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

In [43]:
#convert from set of tuple to list of list
unique_ref = [list(x) for x in unique_ref]
print(len(unique_ref))
print(unique_ref[0])
#wrie to csv

# sw = stopwords.words('english')
#remove stopwords
#convert list of lists to dataframe
for i in range(len(unique_ref)):
    x = unique_ref[i]
    y = []
    for j in x:
        if j not in sw:
            y.append(j)
    unique_ref[i] = y
    
print(unique_ref[0])

with open('mimic_reports_cleaned.csv', 'w') as f:
    for item in unique_ref:
        f.write("%s\n" % item)
    
# df = pd.DataFrame(unique_ref, columns=['captions'], index=None)


4642
['soft', 'tissues', 'within', 'normal', 'limits']
['soft', 'tissues', 'within', 'normal', 'limits']
