In [1]:
import numpy as np
import re
import nlpaug.augmenter.word as naw
import pandas as pd
from tqdm import tqdm
import os

In [2]:
file_path = ""

In [3]:
df = pd.read_csv(os.path.join(file_path, 'mtsamples.csv'))
try:
    df.drop('Unnamed: 0', axis=1, inplace=True)
except Exception:
    pass

In [4]:
df.head(5)

Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


In [5]:
def replace_side(s):
    side = re.search(r'\bleft\b|\bright\b', s, flags=re.IGNORECASE)
    if side:
        side = side.group().lower()

        if side == 'left':
            return re.sub(r'\bleft\b', 'right', s, flags=re.IGNORECASE)

        elif side == 'right':
            return re.sub(r'\bright\b', 'left', s, flags=re.IGNORECASE)

    return s


def replace_age(s):
    res = re.search(r'\d{2}\b-(year|month)-old\b', s)
    if res:
        res = res.group()

        age, restofstr = int(res[:2]), res[2:]
        age += np.random.randint(1, 10) * np.random.choice([-1, 1])

        return re.sub(r'\d{2}\b-(year|month)-old\b', str(age)+restofstr, s, flags=re.IGNORECASE)
    else:
        return s

def replace_gen(s):
    genkeys_f = ['female', 'girl', 'she', 'her', 'woman', 'lady']
    genkeys_m = ['male', 'boy', 'he', 'him', 'gentleman', 'man', 'his']

    allgenkeys = genkeys_f.copy()
    allgenkeys.extend(genkeys_m)

    pattern = '|'.join([f'\\b{w}\\b' for w in allgenkeys])
    gen = re.search(pattern, s, flags=re.IGNORECASE | re.MULTILINE)

    if gen:
        gen = gen.group().lower()

        if gen in genkeys_f:
            s = re.sub(r'\bfemale\b', 'male', s, flags=re.IGNORECASE)
            s = re.sub(r'\bshe\b', 'he', s, flags=re.IGNORECASE)
            s = re.sub(r'\bher\b', 'him', s, flags=re.IGNORECASE)
            s = re.sub(r'\bgirl\b', 'boy', s, flags=re.IGNORECASE)
            s = re.sub(r'\blady\b|\bwoman\b', 'man', s, flags=re.IGNORECASE)

        elif gen in genkeys_m:
            s = re.sub(r'\bmale\b', 'female', s, flags=re.IGNORECASE)
            s = re.sub(r'\bhe\b', 'she', s, flags=re.IGNORECASE)
            s = re.sub(r'\bhim\b|\bhis\b', 'her', s, flags=re.IGNORECASE)
            s = re.sub(r'\bboy\b', 'girl', s, flags=re.IGNORECASE)
            s = re.sub(r'\bman\b|\bgentleman\b', 'woman', s, flags=re.IGNORECASE)

    return s

def random_key(keywords: list):
    if keywords:
        return np.random.choice(np.array(keywords))

def replace_disease(s, keyword_str):
    try:
        keywords = re.split(r', +', keyword_str.rstrip(','))
        pattern = '|'.join(f'\\b{w}\\b' for w in keywords)
        return re.sub(pattern, lambda m: random_key(keywords), s, flags=re.IGNORECASE | re.MULTILINE)
    except:
        return s


def change_num(m):
    if m.isdigit():
        try:
            m = int(m)
            if m > 10:
                m += np.random.randint(1, 10) * np.random.choice([-1, 1])
            else:
                subexp = np.random.randint(1, 5) * np.random.choice([-1, 1])
                if m + subexp > 0:
                    m += subexp
        except:
            return m
    else:
        try:
            m = float(m)
            if m <= 1.0:
                subexp = np.random.rand() * np.random.choice([-1, 1])
                if m + subexp > 0.0:
                    m += subexp

            elif 1.0 < m <= 10.0:
                subexp = np.random.uniform(1.0, 3.1) * np.random.choice([-1, 1])
                if m + subexp > 0.0:
                    m += subexp

            else:
                m += np.random.uniform(1.0, 5.0) * np.random.choice([-1, 1])
        except:
            return m

    return str(np.round(m, decimals=2))


def replace_num(s):
    try:
        return re.sub(r'\d*\.?\d+', lambda m: change_num(m.group()), s)
    except:
        return s

In [6]:
aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action='substitute')

In [7]:
data = []

In [8]:
skipped_rec = []
for idx, row in tqdm(df.iterrows()):   
 
    desc = row[0]
    trans = row[3]
    keywords = row[4]

    if not (pd.isnull(desc) or pd.isnull(trans)):

        # re aug-----------------------------
        
        desc = replace_num(desc)        
        desc = replace_gen(desc)
        desc = replace_side(desc)

        if not pd.isnull(keywords):
            desc = replace_disease(desc, keywords)
            trans = replace_disease(trans, keywords)

        trans = replace_gen(trans)
        trans = replace_num(trans)
        trans = replace_side(trans)
        
        # --------------------comb 1--------------------------

        data.append([desc, row[1], row[2], trans, keywords])

        # # ----------------bert aug---------------------------

        trans = aug.augment(trans)
        desc = aug.augment(desc)

        # # ---------------------comb 2--------------------------

        data.append([desc, row[1], row[2], trans, keywords])

    else:
        skipped_rec.append(idx)



4999it [00:59, 83.34it/s]


In [9]:
df2 = pd.DataFrame(data, columns=list(df.columns))
df2.head(5)

Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,A 15-year-old white male presents with compla...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 29-year-old white male pres...","allergy / immunology, allergic rhinitis, aller..."
1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, she has difficulty clim...","bariatrics, laparoscopic gastric bypass, weigh..."
2,Consult for sleep study.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,3-D M-Mode. pericardial effusion.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"pericardial effusion: , ,1. right regurgitati...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,aortic root atrium,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,2. The right echocardiogram 2-d size and wall...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


In [10]:
df2.to_csv(os.path.join(file_path, 're_augmented_mtsamples.csv'), sep=',')

In [11]:
print(skipped_rec), len(skipped_rec)

[97, 116, 205, 263, 459, 622, 628, 680, 729, 871, 879, 983, 984, 985, 987, 1017, 1055, 2016, 2455, 2498, 2529, 2585, 2588, 2650, 3582, 3588, 3621, 3626, 3629, 3632, 3725, 3771, 4649]


(None, 33)