In [1]:
!pip3 install pyspellchecker

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
import pandas as pd
from spellchecker import SpellChecker
import re

In [3]:
# Load the CT description table; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/inputs/ct_desc.csv')
# Preview the raw 'Hashtag' column (the recorded output shows some rows are NaN).
df['Hashtag']

0        [unchanged]_intramural_hematoma, ascending_aor...
1        [unchange]_[right]_pleural effusion, [decrease...
2        [negx]_pulmonary_embolism, [decrease]_[bilater...
3                                       [negex]_metastasis
4           [left]_[supraclavicular]_LN, [risk]_metastasis
5                                       [negex]_metastasis
6        [negx]_pulmonary_embolism, [improved]_Erdheim_...
7                                       [negex]_metastasis
8        [negex]_pulmonary_embolism, [bilateral]_[moder...
9        [negex]_metastasis, [RML]_[3mm]_pulmonary_nodu...
10                                                     NaN
11       [negex]_pulmonary_embolism, consolidation, [ri...
12                           [left]_[6mm]_pulmonary_nodule
13                  [unchanged]_[RLL]_[4mm]_pleural_nodule
14                [unchanged]_[RLL]_[3mm]_pulmonary_nodule
15                                                     NaN
16       [increase]_[RLL]_[6mm]_pulmonary_nodule, [risk.

In [4]:
def process_hashtag(tags):
    """Split a hashtag string into its bracketed descriptors and its entities.

    `tags` is a string of tags separated by ', '. Each tag is a run of
    '_'-separated pieces: leading pieces that start with '[' are collected as
    descriptors (brackets kept), and everything from the first piece that does
    not start with '[' onward is re-joined with '_' and collected as one entity.

    Empty pieces (from an empty tag, or a leading/trailing/doubled underscore
    in the descriptor prefix) are skipped instead of raising IndexError.

    Known limitation: a bracketed descriptor that itself contains '_' is cut at
    that underscore, so its tail ends up at the start of the entity.

    Parameters
    ----------
    tags : str
        Comma-and-space separated tags.

    Returns
    -------
    tuple[set, set]
        (descriptors, entities)
    """
    descriptors = set()
    entities = set()

    for tag in tags.split(', '):
        pieces = tag.split('_')

        for i, piece in enumerate(pieces):
            if not piece:
                # Nothing to classify; indexing piece[0] here would raise IndexError.
                continue
            if piece.startswith('['):
                descriptors.add(piece)
            else:
                # First non-descriptor piece: the rest of the tag is the entity.
                entities.add("_".join(pieces[i:]))
                break

    return descriptors, entities

In [7]:
# Accumulate every descriptor and entity seen anywhere in the 'Hashtag' column.
uncleaned_descriptors = set()
uncleaned_entities = set()

# Only string cells are parsed; anything else (e.g. NaN) is ignored.
string_hashtags = (cell for cell in df['Hashtag'] if isinstance(cell, str))
for found_descriptors, found_entities in map(process_hashtag, string_hashtags):
    uncleaned_descriptors |= found_descriptors
    uncleaned_entities |= found_entities

In [11]:
def descriptor_cleaner(desc):
    """Return the set of descriptors with every non-letter character removed.

    Each item of the iterable `desc` has all characters outside a-z / A-Z
    deleted (brackets, digits, punctuation, whitespace). An item with no
    letters at all becomes the empty string, which is kept in the result.
    """
    non_letters = re.compile(r'[^a-zA-Z]')
    return {non_letters.sub("", raw) for raw in desc}

# Letters-only form of every collected descriptor (brackets, digits and punctuation stripped).
cleaned_descriptors = descriptor_cleaner(uncleaned_descriptors)

In [12]:
# Inspect the cleaned descriptors; '' appears when a descriptor contained no letters.
cleaned_descriptors 

{'',
 'AP',
 'BLL',
 'BML',
 'BUL',
 'GGN',
 'Gortex',
 'L',
 'LLL',
 'LLLcm',
 'LUL',
 'RBLL',
 'RISK',
 'RLL',
 'RML',
 'RUL',
 'SVC',
 'T',
 'TT',
 'acute',
 'age',
 'aggravated',
 'anterior',
 'aortopulmonary',
 'apex',
 'apical',
 'appandage',
 'arch',
 'atypical',
 'axillary',
 'azygoesophageal',
 'basilar',
 'bilateral',
 'biltaeral',
 'borderline',
 'brachicephalic',
 'brachiocephalic',
 'breast',
 'bronchus',
 'calcified',
 'cannular',
 'cardiomegaly',
 'cardiophrenic',
 'category',
 'catheter',
 'cavitary',
 'chest',
 'chronic',
 'clavicle',
 'cm',
 'comminuted',
 'common',
 'confirmed',
 'costophrenic',
 'cystic',
 'decrease',
 'decreased',
 'decrese',
 'descending',
 'diaphragm',
 'diffuse',
 'distal',
 'enhancing',
 'esophagus',
 'extrapleural',
 'exudative',
 'fatty',
 'fissural',
 'fissure',
 'focal',
 'glenoid',
 'groud',
 'ground',
 'hilar',
 'humerus',
 'hyperenhancing',
 'improved',
 'increase',
 'indeterminate',
 'indrease',
 'infeior',
 'inferior',
 'infrahilar',
 

In [14]:
# Inspect the raw entities; entries containing ']' are leftovers of descriptors that were split on '_'.
uncleaned_entities

{'2]_CPAM',
 '2]_Lung_RADs',
 '2]_endoleak',
 '3]_Lung_RADs',
 '3]_endoleak',
 '4]_Lung_RADs',
 'AAH',
 'ADH',
 'AIS',
 'AML_related',
 'A]_aortic_dissection',
 'B]_aortic_dissection',
 'CABG',
 'Erdheim_Chester disease',
 'IPF',
 'LN',
 'LN_metastasis',
 'LV_enlargement',
 'Langerhans_cell_histiocytosis',
 'MAC',
 'MIA',
 'Morgagni_hernia',
 'NISP',
 'NSIP',
 'NTM',
 'NTM_infection',
 'Port_A_Cath_clot',
 'RB_ILD',
 'SVC_narrowing',
 'SVC_stent',
 'SVC_thrombosis',
 'UIP',
 ']mtuliple]_[subpleural]_pulmonary_nodule',
 'abdominal_aorta_dilatation',
 'abnormality',
 'achalasia',
 'acute_aortic_syndrome',
 'acute_disease',
 'acute_injury',
 'acute_lung_injury',
 'acute_rejection',
 'adenmatous_hyperplasia',
 'adrenal_gland_nodule',
 'adrenal_nodule',
 'aggravated_disease',
 'aggravated_metastasis',
 'air_trapping',
 'airway_impaction',
 'airway_infection',
 'airway_lesion',
 'airway_obstruction',
 'airway_thickening',
 'allergic_bronchopulmonary_aspergillosis',
 'alveolar_hemorrhage',
 '

In [16]:
# USE NAIVE SPELL CHECKER
# SpellChecker() is used with its default dictionary; the recorded output shows it
# also rewrites acronyms and domain terms (e.g. 'LLL' -> 'all'), so this is only a rough pass.
spell = SpellChecker()

naive_checked_descriptors = set()

for w in cleaned_descriptors:
    # Newer pyspellchecker releases may return None when no correction is found;
    # fall back to the original word so None never ends up in the result set.
    corrected = spell.correction(w) or w
    print('original: {}, predicted: {}'.format(w, corrected))
    naive_checked_descriptors.add(corrected)


# Show the resulting set as the cell output.
naive_checked_descriptors

original: , predicted: a
original: segmental, predicted: segmental
original: scleroci, predicted: sclerosis
original: perigraft, predicted: perigraft
original: teralogy, predicted: trilogy
original: brachiocephalic, predicted: brachiocephalic
original: common, predicted: common
original: anterior, predicted: anterior
original: multifocal, predicted: multifocal
original: sternal, predicted: sternal
original: unhanged, predicted: unchanged
original: rib, predicted: rib
original: fatty, predicted: fatty
original: infeior, predicted: inferior
original: two, predicted: two
original: left, predicted: left
original: bronchus, predicted: bronchus
original: LLL, predicted: all
original: subcarainal, predicted: subcarainal
original: humerus, predicted: humerus
original: uchanged, predicted: changed
original: unchanged, predicted: unchanged
original: patchy, predicted: patchy
original: unchaged, predicted: unchanged
original: atypical, predicted: atypical
original: upper, predicted: upper
origina

{'AP',
 'BML',
 'L',
 'LUL',
 'RISK',
 'RLL',
 'T',
 'TT',
 'a',
 'acute',
 'age',
 'aggravated',
 'all',
 'anterior',
 'apex',
 'apical',
 'appendage',
 'arch',
 'atypical',
 'axillary',
 'azygoesophageal',
 'basil',
 'bilateral',
 'borderline',
 'brachicephalic',
 'brachiocephalic',
 'breast',
 'bronchus',
 'but',
 'calcified',
 'cannula',
 'cardiomegaly',
 'cardiophrenic',
 'category',
 'catheter',
 'cavity',
 'changed',
 'chest',
 'chronic',
 'clavicle',
 'cm',
 'committed',
 'common',
 'confirmed',
 'cortex',
 'costophrenic',
 'created',
 'cystic',
 'decrease',
 'decreased',
 'descending',
 'diaphragm',
 'diffuse',
 'distal',
 'educative',
 'enhancing',
 'esophagus',
 'extrapleural',
 'fatty',
 'fissure',
 'fissures',
 'focal',
 'gon',
 'ground',
 'group',
 'hilary',
 'humerus',
 'hyperenhancing',
 'improved',
 'increase',
 'indeterminate',
 'inferior',
 'infrahilar',
 'intercostal',
 'internal',
 'isthmus',
 'large',
 'left',
 'lelc',
 'leonid',
 'located',
 'lower',
 'lung',
 'l

In [17]:
# Partition the raw entities: those still carrying a '[' or ']' need extra
# cleaning, the rest are usable as they are.
with_brackets_entities = {
    candidate for candidate in uncleaned_entities
    if '[' in candidate or ']' in candidate
}
without_brackets_entities = uncleaned_entities - with_brackets_entities

In [18]:
# Inspect the entities that contain no bracket characters.
without_brackets_entities

{'AAH',
 'ADH',
 'AIS',
 'AML_related',
 'CABG',
 'Erdheim_Chester disease',
 'IPF',
 'LN',
 'LN_metastasis',
 'LV_enlargement',
 'Langerhans_cell_histiocytosis',
 'MAC',
 'MIA',
 'Morgagni_hernia',
 'NISP',
 'NSIP',
 'NTM',
 'NTM_infection',
 'Port_A_Cath_clot',
 'RB_ILD',
 'SVC_narrowing',
 'SVC_stent',
 'SVC_thrombosis',
 'UIP',
 'abdominal_aorta_dilatation',
 'abnormality',
 'achalasia',
 'acute_aortic_syndrome',
 'acute_disease',
 'acute_injury',
 'acute_lung_injury',
 'acute_rejection',
 'adenmatous_hyperplasia',
 'adrenal_gland_nodule',
 'adrenal_nodule',
 'aggravated_disease',
 'aggravated_metastasis',
 'air_trapping',
 'airway_impaction',
 'airway_infection',
 'airway_lesion',
 'airway_obstruction',
 'airway_thickening',
 'allergic_bronchopulmonary_aspergillosis',
 'alveolar_hemorrhage',
 'aorta_pseudoaneurysm',
 'aortic_abnormality',
 'aortic_arch_dilatation',
 'aortic_dissection',
 'aortic_stent',
 'aortic_stent_graft',
 'aortic_valve_calcification',
 'aplastic_anemia',
 'ar

In [21]:
# WE NEED TO MANUALLY ADD THE DESCRIPTORS
# Keep only the text after the last ']_' of each bracketed entity; an entity
# without ']_' is kept unchanged. The discarded prefix is not recorded anywhere.
cleaned_with_brackets_entities = {
    bracketed.rpartition(']_')[-1] for bracketed in with_brackets_entities
}

In [22]:
# Inspect the entities recovered from the bracket-containing strings.
cleaned_with_brackets_entities

{'CPAM',
 'LN',
 'Lung_RADs',
 'air_collection',
 'airway_obstruction',
 'aortic_dissection',
 'artrial_thrombosis',
 'atelectasis',
 'bone_metastasis',
 'bronchial_narrowing',
 'bronchial_obstruction',
 'cardiac_infarct',
 'centrilobular_pulmonary_nodules',
 'compression_fracture',
 'consolidation',
 'coronary_artery_dilatation',
 'coronary_disease',
 'coronary_mixed_plaque',
 'coronary_stenosis',
 'endoleak',
 'epidural_soft_tissue_lesion',
 'expiratory_tracheal_narrowoing',
 'fluid_collection',
 'ground_glass_opacity',
 'hyperdense_lesion',
 'infection',
 'intramural_hematoma',
 'low_attenuated_lesion',
 'lung_cancer',
 'lymphdenopathy',
 'lymphoproliferative_disease',
 'malignancy',
 'metastasis',
 'myocardiac_infarction_sequalae',
 'myocardial_bridging',
 'nodular_consolidation',
 'postoperative_change',
 'pulmonary nodule',
 'pulmonary_centrilobular_nodules',
 'pulmonary_nodule',
 'scar',
 'soft_tissue_lesion',
 'stump_thrombosis',
 'thrombus',
 'thyroid_goiter',
 'treatment_rela

In [29]:
# Final entity vocabulary: bracket-free entities plus those recovered from bracketed ones.
all_entities = without_brackets_entities | cleaned_with_brackets_entities
print(all_entities)

{'postoperative_change', 'bronchitis', 'viral_pneumonia', 'pleural_nodular_thickening', 'hydropneumothorax', 'empyema', 'myocardial_bridging', 'pleural_nodule', 'hematologic_malignancy', 'hyperdense_lesion', 'ascites', 'supleural_nodule', 'coronary_calcium', 'wedge_resection', 'delayed_myocardial_enhancement', 'septal_thickening', 'rejection', 'intubation', 'malignant_pleural_effusion', 'ascending_aorta_graft_repair', 'fibrosis', 'pulmonary_venooclusive_disease', 'degenerative_arthritis', 'cardiovascular_disease_evaluation', 'masslike_consolidation', 'irregular_nodular_opacity', 'breast_prosthesis_calcification', 'mucous_impaction', 'humerus_fracture', 'aggravated_disease', 'coronary_stenosis', 'polyspenia_syndrome', 'CABG', 'fluid_collection', 'MAC', 'ventricular_infection_sequelae', 'neoplasm', 'primary_malignancy', 'epidural_soft_tissue_lesion', 'pneumomediastinum', 'breat_mass', 'lymphocele', 'aortic_arch_dilatation', 'vascular_abnormality', 'Erdheim_Chester disease', 'venous_narro

In [28]:
# TODO: also union in the descriptors that have to be extracted manually from with_brackets_entities.
# NOTE: this binds a second name to the same set object as cleaned_descriptors (no copy is made).
all_descriptors = cleaned_descriptors
print(all_descriptors)

{'', 'segmental', 'scleroci', 'perigraft', 'teralogy', 'brachiocephalic', 'common', 'anterior', 'multifocal', 'sternal', 'unhanged', 'rib', 'fatty', 'infeior', 'two', 'left', 'bronchus', 'LLL', 'subcarainal', 'humerus', 'uchanged', 'unchanged', 'patchy', 'unchaged', 'atypical', 'upper', 'subpectoral', 'indrease', 'thoracic', 'sp', 'increase', 'indeterminate', 'subclavian', 'decrease', 'negx', 'unchanaged', 'cannular', 'BUL', 'cystic', 'Gortex', 'RML', 'trace', 'type', 'necrptic', 'spleen', 'partial', 'basilar', 'diffuse', 'old', 'right', 'new', 'major', 'subcarinal', 'rd', 'diaphragm', 'subsegmental', 'fissure', 'enhancing', 'BLL', 'tree', 'mandible', 'lobulated', 'inferior', 'tiny', 'thoraic', 'probable', 'focal', 'noncalcified', 'severe', 'pathologic', 'aortopulmonary', 'small', 'medistinal', 'solid', 'splenic', 'scapular', 'biltaeral', 'sternoclavicular', 'large', 'multile', 'lingular', 'risk', 'descending', 'postoperative', 'cm', 'apex', 'rigk', 'mediastinal', 'TT', 'infrahilar', '