In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from shutil import copyfile
from eda import eda
import os

In [2]:
# Source tree holding the experiment outputs, and the local working directory
# that the rest of this notebook reads from and writes to.
DATA = Path('/mimic/data/')
NEW_DATA = Path('data')

# Full resource
ORIGINAL = DATA.joinpath('preprocessed/tgt-test.txt')
TRANSFORMER = DATA.joinpath('t2t_experiments/transformer/full_context/output/transformer_decoded/tgt-test.001.txt')
GPT2 = DATA.joinpath('gpt2/test-output-text.txt')

# Low resource
ORIGINAL_LOW = DATA.joinpath('preprocessed/low_resource/tgt-test.txt')
TRANSFORMER_LOW = DATA.joinpath('t2t_experiments/transformer/low_resource/full_context/output/transformer_decoded/tgt-test.001.txt')
GPT2_LOW = DATA.joinpath('gpt2/low_resource/test-output-text.txt')

# Create the working directory; exist_ok makes re-running this cell harmless.
NEW_DATA.mkdir(exist_ok=True)

In [None]:
# Copy each source file into the working directory under a short, flat name.
# The pairs are processed in order, so a missing source stops the cell at the
# same point as a sequence of individual copyfile() calls would.
for source_path, target_name in (
    (ORIGINAL, 'original.txt'),
    (ORIGINAL_LOW, 'original-lowresource.txt'),
    (TRANSFORMER, 'transformer.txt'),
    (TRANSFORMER_LOW, 'transformer-lowresource.txt'),
    (GPT2, 'gpt2.txt'),
    (GPT2_LOW, 'gpt2-lowresource.txt'),
):
    copyfile(source_path, NEW_DATA/target_name)

In [3]:
# Verify we have the right number of lines
# (prints the line count of every entry under data/, plus a total).
# NOTE(review): the saved output below already lists the *.combined.txt files
# that the following `cat` cell creates, so it appears to come from an earlier
# run — re-run from a fresh kernel to confirm the counts.

!wc -l data/*

wc: data/data: Is a directory
        0 data/data
     3692 data/gpt2-lowresource.combined.txt
     1846 data/gpt2-lowresource.txt
    11454 data/gpt2.combined.txt
     5727 data/gpt2.txt
     1846 data/original-lowresource.txt
     5727 data/original.txt
     3692 data/transformer-lowresource.combined.txt
     1846 data/transformer-lowresource.txt
    11454 data/transformer.combined.txt
     5727 data/transformer.txt
    53011 total


In [4]:
# Build one combined file per model and setting: the original text followed by
# that model's output. The `>` redirect overwrites any existing combined file.
!cat data/original.txt data/transformer.txt > data/transformer.combined.txt
!cat data/original-lowresource.txt data/transformer-lowresource.txt > data/transformer-lowresource.combined.txt
!cat data/original.txt data/gpt2.txt > data/gpt2.combined.txt
!cat data/original-lowresource.txt data/gpt2-lowresource.txt > data/gpt2-lowresource.combined.txt

## EDA (Easy Data Augmentation)

In [8]:
# Load both copied reference files into single-column ('text') frames, one row
# per line. readlines() keeps each line's trailing newline in the values.
with (NEW_DATA/'original.txt').open('r') as full_handle:
    original = pd.DataFrame({'text': full_handle.readlines()})

with (NEW_DATA/'original-lowresource.txt').open('r') as low_handle:
    original_low = pd.DataFrame({'text': low_handle.readlines()})

Full resource

In [9]:
# The same value is passed for all four eda.eda() rate parameters.
alpha = 0.1
num_aug = 1  # number of augmented sentences per original sentence

# One eda.eda() result per row of the full-resource text, in row order.
aug = [
    eda.eda(sentence, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=num_aug)
    for sentence in original['text']
]

In [10]:
# Take the first sentence of each eda.eda() result as that row's augmentation.
# NOTE(review): assumes eda.eda() lists its augmented sentence(s) first —
# confirm against the eda module.
aug_text = pd.Series([v[0] for v in aug])

# Build the augmented frame as a NEW object. `df_eda = original` would only
# alias `original`, so assigning the 'text' column would overwrite the original
# sentences and the saved file would contain the augmented text twice.
df_aug = pd.DataFrame({'text': aug_text})

# The original lines were read with readlines() and still end in a newline;
# strip it (without mutating `original`) so each row is written as one line.
df_original = pd.DataFrame({'text': original['text'].str.rstrip('\r\n')})

# Originals first, then their augmentations. pd.concat is used because
# DataFrame.append was removed in pandas 2.0.
df_eda = pd.concat([df_original, df_aug], ignore_index=True)
assert len(df_eda) == 2 * len(original), "expected one augmented sentence per original line"

# Plain text-mode write: "\n" is translated to the platform line separator and
# the file uses the same default encoding the originals were read with.
with open(NEW_DATA/'eda.txt', 'w') as f:
    f.writelines(f"{line}\n" for line in df_eda['text'])

Low resource

In [None]:
# The same value is passed for all four eda.eda() rate parameters.
alpha = 0.1
num_aug = 1  # number of augmented sentences per original sentence

# One eda.eda() result per row of the low-resource text, in row order.
aug = [
    eda.eda(sentence, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=num_aug)
    for sentence in original_low['text']
]

In [None]:
# Take the first sentence of each eda.eda() result as that row's augmentation.
# NOTE(review): assumes eda.eda() lists its augmented sentence(s) first —
# confirm against the eda module.
aug_text = pd.Series([v[0] for v in aug])

# Build the augmented frame as a NEW object. `eda_low = original_low` would
# only alias `original_low`, so assigning the 'text' column would overwrite the
# original sentences and the saved file would contain the augmented text twice.
aug_low = pd.DataFrame({'text': aug_text})

# The original lines were read with readlines() and still end in a newline;
# strip it (without mutating `original_low`) so each row is written as one line.
original_low_text = pd.DataFrame({'text': original_low['text'].str.rstrip('\r\n')})

# Originals first, then their augmentations. pd.concat is used because
# DataFrame.append was removed in pandas 2.0.
eda_low = pd.concat([original_low_text, aug_low], ignore_index=True)
assert len(eda_low) == 2 * len(original_low), "expected one augmented sentence per original line"

# Plain text-mode write: "\n" is translated to the platform line separator and
# the file uses the same default encoding the originals were read with.
with open(NEW_DATA/'eda-lowresource.txt', 'w') as f:
    f.writelines(f"{line}\n" for line in eda_low['text'])

#### Final verification

In [None]:
# Re-count the lines of every entry under data/; the listing should now also
# include eda.txt and eda-lowresource.txt written by the EDA cells.
!wc -l data/*