In [1]:
import pandas as pd
from collections import Counter
import numpy as np
from copy import deepcopy
import random
from functools import reduce
from itertools import chain, zip_longest
import pickle
import re
from sklearn.model_selection import train_test_split

In [2]:
# Multi-label intents (labels joined by '#') are collapsed to the single
# intent given as the value.
train_replace_dict = {
    'atis_flight#atis_airfare': 'atis_airfare',
    'atis_ground_service#atis_ground_fare': 'atis_ground_fare',
    'atis_aircraft#atis_flight#atis_flight_no': 'atis_flight_no',
    'atis_airfare#atis_flight_time': 'atis_flight_time',
    'atis_airline#atis_flight_no': 'atis_flight_no',
    'atis_flight_no#atis_airline': 'atis_flight_no',
    'atis_airfare#atis_flight': 'atis_airfare',
    'atis_flight#atis_airline': 'atis_airline',
}

# Intent name -> integer index; the index is the row position of the intent
# in intent_list.csv (single column, no header row).
intent_list = pd.read_csv(
    '../../data/multiATIS/intent_list.csv',
    sep=',',
    header=None,
    names=['INTENT'],
).INTENT.values.tolist()
intent2idx = dict(zip(intent_list, range(len(intent_list))))

# Slot label -> integer index, built the same way from slots_list.csv,
# plus the inverse lookup (index -> slot label).
final_slots = pd.read_csv(
    '../../data/multiATIS/slots_list.csv',
    sep=',',
    header=None,
    names=['SLOTS'],
).SLOTS.values.tolist()
slots2idx = dict(zip(final_slots, range(len(final_slots))))
idx2slots = {index: label for label, index in slots2idx.items()}

In [3]:
# Read the background-noise phrase file; every line becomes one phrase.
with open('../../data/BG_Noise_Phrase.txt') as f:
    content = f.readlines()
# Drop surrounding whitespace (including the trailing newline) from each line.
phrase = list(map(str.strip, content))

In [4]:
def mapping2idx(df):
    """Add integer-encoded ``INTENT_ID`` and ``SLOTS_ID`` columns to ``df``.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide an ``INTENT`` column (one label per row) and a ``SLOTS``
        column (whitespace-separated slot labels per row).

    Returns
    -------
    pandas.DataFrame
        The same object as ``df`` with two extra columns:
        ``INTENT_ID`` - ``intent2idx[intent]``, or ``intent2idx['OOI']`` when
        the intent is not a key of ``intent2idx``;
        ``SLOTS_ID`` - the slot indices joined by single spaces, with
        ``slots2idx['O']`` substituted for labels missing from ``slots2idx``.

    Notes
    -----
    Every unknown intent / slot label is printed so that gaps in the label
    lists stay visible. Only ``KeyError`` (a missing label) triggers the
    fallback; any other exception is propagated instead of being swallowed.
    """
    # adding intent encoding
    encoded_intent = []
    for intent in list(df.INTENT):
        try:
            encoded_intent.append(intent2idx[intent])
        except KeyError:
            # Unknown intent: report it and fall back to the 'OOI' index.
            print(intent)
            encoded_intent.append(intent2idx['OOI'])

    df['INTENT_ID'] = encoded_intent

    # adding mapped slot column
    slots_ID = []
    for annotations in list(df.SLOTS):
        slot_ids = []
        for tokens in annotations.split():
            try:
                slot_ids.append(str(slots2idx[tokens]))
            except KeyError:
                # Unknown slot label: report it and fall back to the 'O' index.
                print('token', tokens)
                slot_ids.append(str(slots2idx['O']))
        slots_ID.append(" ".join(slot_ids))

    df['SLOTS_ID'] = slots_ID

    return df

In [5]:
def merge_text_label(text,slot):
    """Pair every element of ``text`` with the element of ``slot`` at the same position.

    Returns a list of two-element lists ``[text[i], slot[i]]`` with one entry
    per element of ``text``. Elements of ``slot`` beyond ``len(text)`` are
    ignored; if ``slot`` is shorter than ``text`` an ``IndexError`` is raised.
    """
    return [[word, slot[position]] for position, word in enumerate(text)]

def twolists(l1, l2 , prob):
    """Randomly interleave two sequences of ``[token, label]`` pairs.

    While both sequences still have unused items, the next output item is
    taken from ``l1`` with probability ``prob`` and from ``l2`` otherwise.
    The relative order inside each sequence is preserved. Once one sequence
    is exhausted, the remainder of the other is appended unchanged.

    Parameters
    ----------
    l1, l2 : list
        Lists of two-element items ``[token, label]`` (both strings).
        Neither list is modified.
    prob : float
        Probability of drawing the next item from ``l1``. ``0`` always
        prefers ``l2`` and ``1`` always prefers ``l1``.

    Returns
    -------
    (str, str)
        ``text`` and ``slot``: the tokens and the labels of the merged
        sequence. Every token/label is followed by one space, so non-empty
        results end with a trailing space.
    """
    pos1, pos2 = 0, 0
    final = []
    # Walk both lists with cursors instead of pop(0): the caller's lists are
    # left intact and each step is O(1).
    while pos1 < len(l1) and pos2 < len(l2):
        # random.random() is in [0.0, 1.0), so prob is honoured exactly
        # (no rounding to a 1/1000 grid).
        if random.random() < prob:
            final.append(l1[pos1])
            pos1 += 1
        else:
            final.append(l2[pos2])
            pos2 += 1
    # At most one of these remainders is non-empty.
    final.extend(l1[pos1:])
    final.extend(l2[pos2:])

    text = ''.join(token[0] + ' ' for token in final)
    slot = ''.join(token[1] + ' ' for token in final)
    return text,slot

In [6]:
def BG_Noise(data, prob):
    """Build a noisy copy of ``data`` by mixing a random phrase into every utterance.

    For each row one entry of the module-level ``phrase`` list is drawn at
    random, all of its tokens are labelled ``'O'``, and the result is
    interleaved with the row's own tokens/labels by ``twolists``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows are read positionally: column 1 is the utterance text, column 2
        the whitespace-separated slot labels, column 3 the intent.
    prob : float
        Forwarded to ``twolists`` as the probability of taking the next
        token from the noise phrase rather than from the utterance.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID``, ``TEXT``, ``SLOTS``, ``INTENT`` with one row per input
        row. ``ID`` is a fresh 0-based counter; ``INTENT`` is copied unchanged.
    """
    augINTENT, augSLOTS, augTEXT, augID = [], [], [], []

    for cnt, sample in enumerate(data.values.tolist()):

        TEXT, SLOTS = sample[1], sample[2]

        # split() (no argument) collapses runs of spaces and ignores leading /
        # trailing whitespace, so no empty tokens are produced and the
        # token/label lists stay aligned.
        bg_tokens = random.choice(phrase).split()
        noisyData = merge_text_label(bg_tokens, ['O'] * len(bg_tokens))
        cleanData = merge_text_label(TEXT.split(), SLOTS.split())

        augText, augSlots = twolists(noisyData, cleanData, prob)

        augINTENT.append(sample[3])
        augTEXT.append(augText)
        augSLOTS.append(augSlots)
        augID.append(cnt)

    augPD = pd.DataFrame([augID,augTEXT,augSLOTS,augINTENT],index=['ID','TEXT','SLOTS','INTENT']).T

    return augPD
    

In [7]:
def ASRN_aug(data_path,out_path,fn,prob):
    """Write one background-noise-augmented, shuffled copy of a TSV dataset.

    NOTE: a later cell of this notebook defines ``ASRN_aug`` again with a
    different meaning for the fourth argument; whichever definition was
    executed last is the one that runs.

    Parameters
    ----------
    data_path : str
        Tab-separated input file. Its first row is treated as a header and
        replaced by the column names ``ID``, ``TEXT``, ``SLOTS``, ``INTENT``.
    out_path : str
        Output location prefix; it is concatenated with ``fn`` as-is.
    fn : str
        Output file name appended to ``out_path``.
    prob : float
        Noise probability handed to ``BG_Noise``.

    The function returns nothing; its result is the file ``out_path + fn``.
    """
    column_names = ['ID','TEXT','SLOTS','INTENT']
    data = pd.read_csv(data_path, sep='\t', header=0, names=column_names)

    # Lower-case the utterances and collapse multi-label intents.
    data = data.assign(
        TEXT=lambda frame: frame['TEXT'].str.lower(),
        INTENT=lambda frame: frame['INTENT'].replace(train_replace_dict),
    )

    # Inject noise, then attach the integer-encoded intent / slot columns.
    noisy = mapping2idx(BG_Noise(data, prob))

    # Shuffle the rows and write them out without the index column.
    shuffled = noisy.sample(frac=1).reset_index(drop=True)
    shuffled.to_csv(out_path + fn, sep='\t', index=False)
    

In [16]:
# Write five independently sampled noisy copies of the English test set for
# each noise probability (output directory, probability), in this order.
test_source = '../../data/multiATIS/raw/train_dev_test/test_EN.tsv'
for noise_dir, noise_prob in (
    ('../../data/multiATIS/split/test/BG_Noise/0_25/', 0.25),
    ('../../data/multiATIS/split/test/BG_Noise/0_75/', 0.75),
    ('../../data/multiATIS/split/test/BG_Noise/0_5/', 0.50),
):
    for run in range(1, 6):
        # File names are test_EN_01.tsv ... test_EN_05.tsv
        ASRN_aug(test_source, noise_dir, 'test_EN_{:02d}.tsv'.format(run), noise_prob)



atis_day_name
atis_day_name
token B-compartment
token B-stoploc.airport_code
token B-booking_class
token B-flight
atis_day_name
atis_day_name
token B-compartment
token B-stoploc.airport_code
token B-booking_class
token B-flight
atis_day_name
atis_day_name
token B-compartment
token B-stoploc.airport_code
token B-booking_class
token B-flight
atis_day_name
atis_day_name
token B-compartment
token B-stoploc.airport_code
token B-booking_class
token B-flight
atis_day_name
atis_day_name
token B-compartment
token B-stoploc.airport_code
token B-booking_class
token B-flight
atis_day_name
atis_day_name
token B-compartment
token B-stoploc.airport_code
token B-booking_class
token B-flight
atis_day_name
atis_day_name
token B-compartment
token B-stoploc.airport_code
token B-booking_class
token B-flight
atis_day_name
atis_day_name
token B-compartment
token B-stoploc.airport_code
token B-booking_class
token B-flight
atis_day_name
atis_day_name
token B-compartment
token B-stoploc.airport_code
token B-boo

In [11]:
def ASRN_aug(data_path,out_path,fn,ratio,base_size=4450):
    """Write the original dataset plus an intent-stratified sample of noisy copies.

    NOTE: this rebinds the name ``ASRN_aug`` used by an earlier cell of the
    notebook; here the fourth argument is a sampling ratio, not a noise
    probability.

    Steps: load and normalise the TSV, create three noisy copies with
    ``BG_Noise`` (probabilities 0.25, 0.50 and 0.75), draw about
    ``int(base_size * ratio)`` rows from their union while keeping each
    intent's share of that union, append the original rows, shuffle, and
    write the result to ``out_path + fn``.

    Parameters
    ----------
    data_path : str
        Tab-separated input file. Its first row is treated as a header and
        replaced by the column names ``ID``, ``TEXT``, ``SLOTS``, ``INTENT``.
    out_path : str
        Output location prefix; it is concatenated with ``fn`` as-is.
    fn : str
        Output file name appended to ``out_path``.
    ratio : float
        Multiplier applied to ``base_size`` to get the number of augmented
        rows to keep.
    base_size : int, optional
        Reference row count that ``ratio`` scales. Defaults to 4450, the
        value that used to be hard-coded (presumably the approximate size of
        the training set - TODO confirm).

    Raises
    ------
    ValueError
        From ``DataFrame.sample`` if an intent's quota exceeds the number of
        noisy rows available for that intent.
    """
    # loading dataset
    data = pd.read_csv(data_path,sep='\t',header=0,names=['ID','TEXT','SLOTS','INTENT'])

    # lowerCasing the TEXT column
    data['TEXT'] = data['TEXT'].str.lower()

    # handling multi-label instances
    data['INTENT'] = data['INTENT'].replace(train_replace_dict)

    # One noisy copy of the whole dataset per noise probability.
    aug_data = pd.concat([BG_Noise(data, noise_prob) for noise_prob in (0.25, 0.50, 0.75)])

    aug_data = mapping2idx(aug_data)
    data = mapping2idx(data)

    # Stratified down-sampling: every intent keeps its share of the noisy pool.
    # Groups are sampled explicitly rather than through groupby().apply(),
    # whose handling of the grouping column changed across pandas versions;
    # iterating keeps the INTENT column in the result on all of them.
    target_rows = int(base_size * ratio)
    pool_rows = len(aug_data)
    sampled_groups = [
        group.sample(int(np.rint(target_rows * len(group) / pool_rows)))
        for _, group in aug_data.groupby('INTENT')
    ]
    if sampled_groups:
        aug_data = pd.concat(sampled_groups)
    else:
        # No groups (empty input): keep an empty frame with the same columns.
        aug_data = aug_data.iloc[0:0]
    aug_data = aug_data.sample(frac=1).reset_index(drop=True)

    augmented_data = pd.concat([aug_data,data])

    augmented_data = augmented_data.sample(frac=1).reset_index(drop=True)
    augmented_data.to_csv(out_path+fn,sep='\t',index=False)
    

In [12]:
# Write five independently sampled augmented training sets for each
# augmentation ratio (output directory, ratio), in this order.
train_source = '../../data/multiATIS/raw/train_dev_test/train_EN.tsv'
for ratio_dir, aug_ratio in (
    ('../../data/multiATIS/split/train/ASRN_augmented/0_25/', 0.25),
    ('../../data/multiATIS/split/train/ASRN_augmented/0_50/', 0.50),
    ('../../data/multiATIS/split/train/ASRN_augmented/0_75/', 0.75),
    ('../../data/multiATIS/split/train/ASRN_augmented/1_0/', 1.0),
):
    for run in range(1, 6):
        # File names are train_EN_01.tsv ... train_EN_05.tsv
        ASRN_aug(train_source, ratio_dir, 'train_EN_{:02d}.tsv'.format(run), aug_ratio)