In [2]:
import pandas as pd
from collections import Counter
import numpy as np
from copy import deepcopy
import random
from functools import reduce
import pickle
from sklearn.model_selection import train_test_split
random.seed(42)

In [17]:
# Multi-label intents (labels joined with '#') are collapsed onto one label each.
train_replace_dict = {
    'atis_flight#atis_airfare': 'atis_airfare',
    'atis_ground_service#atis_ground_fare': 'atis_ground_fare',
    'atis_aircraft#atis_flight#atis_flight_no': 'atis_flight_no',
    'atis_airfare#atis_flight_time': 'atis_flight_time',
    'atis_airline#atis_flight_no': 'atis_flight_no',
    'atis_flight_no#atis_airline': 'atis_flight_no',
    'atis_airfare#atis_flight': 'atis_airfare',
    'atis_flight#atis_airline': 'atis_airline',
}

# Intent vocabulary: the file is read without a header row into a single
# INTENT column; a label's position in the file becomes its integer id.
intent_list = pd.read_csv(
    '../../data/ATIS/intent_list.tsv',
    sep='\t',
    header=None,
    names=['INTENT'],
)['INTENT'].values.tolist()
intent2idx = dict(zip(intent_list, range(len(intent_list))))

# Slot vocabulary, built the same way, plus the reverse (id -> tag) lookup.
final_slots = pd.read_csv(
    '../../data/ATIS/slot_intent.tsv',
    sep='\t',
    header=None,
    names=['SLOTS'],
)['SLOTS'].values.tolist()
slots2idx = dict(zip(final_slots, range(len(final_slots))))
idx2slots = {position: tag for tag, position in slots2idx.items()}

In [18]:
def mapping2idx(df, intent_map=None, slot_map=None):
    """Add integer-encoded ``INTENT_ID`` and ``SLOTS_ID`` columns to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an ``INTENT`` column (one label per row) and a ``SLOTS`` column
        (whitespace-separated slot tags per row). The frame is modified in
        place.
    intent_map : dict, optional
        Intent label -> id lookup. Defaults to the module-level
        ``intent2idx``. Unknown labels are printed and encoded with the id
        stored under ``'UNK'``.
    slot_map : dict, optional
        Slot tag -> id lookup. Defaults to the module-level ``slots2idx``.
        Unknown tags are printed and encoded with the id stored under ``'O'``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``INTENT_ID`` (one id per row) and
        ``SLOTS_ID`` (space-separated ids, one per tag) added.

    Raises
    ------
    KeyError
        If an unknown label/tag is met and the fallback key (``'UNK'`` or
        ``'O'``) is itself missing from the corresponding lookup.
    """
    if intent_map is None:
        intent_map = intent2idx
    if slot_map is None:
        slot_map = slots2idx

    # Intent encoding; only a missing key triggers the fallback, so unrelated
    # errors (and Ctrl-C) are no longer silently swallowed.
    intent_ids = []
    for intent in df['INTENT']:
        try:
            intent_ids.append(intent_map[intent])
        except KeyError:
            print(intent)
            intent_ids.append(intent_map['UNK'])
    df['INTENT_ID'] = intent_ids

    # Slot encoding: one id per tag, re-joined with single spaces.
    slot_id_strings = []
    for annotations in df['SLOTS']:
        tag_ids = []
        for tag in annotations.split():
            try:
                tag_ids.append(str(slot_map[tag]))
            except KeyError:
                print('token', tag)
                tag_ids.append(str(slot_map['O']))
        slot_id_strings.append(" ".join(tag_ids))
    df['SLOTS_ID'] = slot_id_strings

    return df

In [19]:
def get_carrier_phrase_length(annotations):
    """Return the positions of the 'O' tags in a slot-annotation string.

    ``annotations`` is split on single spaces and the index of every token
    equal to ``'O'`` is collected, in order. Despite the function's name the
    result is the list of indices, not a count.
    """
    tags = annotations.split(' ')
    return [position for position, tag in enumerate(tags) if tag == 'O']

In [31]:
def carrier_aug(data,tau,rng=None):
    """Randomly delete carrier-phrase ('O'-tagged) tokens from every utterance.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows are read positionally: column 1 is used as the text, column 2 as
        the intent and column 3 as the slot string.
        NOTE(review): presumably the columns are ID, TEXT, INTENT, SLOTS in
        that order -- confirm against the raw TSV files.
        The frame itself is not modified.
    tau : float
        Fraction of 'O' tokens to delete when an utterance has more than five
        of them. Utterances with 3-5 'O' tokens always lose half of them
        (rounded down) regardless of ``tau``; utterances with two or fewer
        are copied unchanged.
    rng : object with a ``sample(population, k)`` method, optional
        Source of randomness, e.g. a ``random.Random`` instance. Defaults to
        the global ``random`` module, so module-level seeding keeps working.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID``, ``TEXT``, ``INTENT``, ``SLOTS`` with one row per input
        row. ``ID`` is the 0-based row position, not the input's own id.
        Text and slot strings have the same positions removed.

    Raises
    ------
    ValueError
        Propagated from ``sample`` when the requested number of deletions is
        negative or exceeds the number of 'O' tokens (``tau`` outside [0, 1]).
    """
    if rng is None:
        rng = random

    aug_ids, aug_texts, aug_intents, aug_slots = [], [], [], []

    for row_id, sample in enumerate(data.values.tolist()):
        text, intent, slots = sample[1], sample[2], sample[3]

        carrier_idx = get_carrier_phrase_length(slots)
        carrier_len = len(carrier_idx)

        # Only utterances with more than two carrier tokens are perturbed.
        if carrier_len > 2:
            if carrier_len <= 5:
                drop_count = carrier_len // 2
            else:
                drop_count = int(tau * carrier_len)

            # A set gives O(1) membership tests while filtering the tokens.
            dropped = set(rng.sample(carrier_idx, drop_count))

            text = ' '.join(
                tok for pos, tok in enumerate(text.split(' ')) if pos not in dropped
            )
            slots = ' '.join(
                tag for pos, tag in enumerate(slots.split(' ')) if pos not in dropped
            )

        aug_ids.append(row_id)
        aug_texts.append(text)
        aug_intents.append(intent)
        aug_slots.append(slots)

    augPD = pd.DataFrame(
        [aug_ids, aug_texts, aug_intents, aug_slots],
        index=['ID', 'TEXT', 'INTENT', 'SLOTS'],
    ).T

    return augPD

In [36]:
def OOC_aug(data_path,out_path,fn,tau):
    """Write a carrier-phrase-perturbed, index-encoded copy of a TSV dataset.

    Steps: read the tab-separated file at ``data_path`` (first row is the
    header), lowercase ``TEXT``, remap ``INTENT`` through
    ``train_replace_dict``, perturb every row with ``carrier_aug`` at the
    given ``tau``, add id columns with ``mapping2idx``, shuffle the rows and
    write the result as TSV.

    The destination is the plain concatenation ``out_path + fn``, so
    ``out_path`` has to end with a path separator. Nothing is returned.
    """
    raw = pd.read_csv(data_path, sep='\t', header=0)

    # Normalise the text and collapse the multi-label intents.
    raw['TEXT'] = raw['TEXT'].str.lower()
    raw['INTENT'] = raw['INTENT'].replace(train_replace_dict)

    noisy = mapping2idx(carrier_aug(raw, tau=tau))

    # Shuffle rows (uses pandas' default random state) and renumber the index.
    shuffled = noisy.sample(frac=1).reset_index(drop=True)
    shuffled.to_csv(out_path + fn, sep='\t', index=False)

In [37]:
# Five independently perturbed copies of the test set per noise level;
# each level is written to its own '<NN>per/' folder as test_01..test_05.
for noise_folder, noise_level in (('20per', 0.20), ('40per', 0.40), ('60per', 0.60)):
    for run_no in range(1, 6):
        OOC_aug(
            '../../data/ATIS/raw/test/test.tsv',
            '../../data/ATIS/experiment/test/NoiseOOC/' + noise_folder + '/',
            'test_%02d.tsv' % run_no,
            tau=noise_level,
        )

atis_day_name
atis_day_name
token I-flight_number
token B-compartment
token B-stoploc.airport_code
token I-state_name
token B-booking_class
token B-flight
atis_day_name
atis_day_name
token I-flight_number
token B-compartment
token B-stoploc.airport_code
token I-state_name
token B-booking_class
token B-flight
atis_day_name
atis_day_name
token I-flight_number
token B-compartment
token B-stoploc.airport_code
token I-state_name
token B-booking_class
token B-flight
atis_day_name
atis_day_name
token I-flight_number
token B-compartment
token B-stoploc.airport_code
token I-state_name
token B-booking_class
token B-flight
atis_day_name
atis_day_name
token I-flight_number
token B-compartment
token B-stoploc.airport_code
token I-state_name
token B-booking_class
token B-flight
atis_day_name
atis_day_name
token I-flight_number
token B-compartment
token B-stoploc.airport_code
token I-state_name
token B-booking_class
token B-flight
atis_day_name
atis_day_name
token I-flight_number
token B-compartment


# OOC-augmented training set generation

In [9]:
def OOC_aug(data_path,out_path,fn,ratio,base_size=4450):
    """Write the dataset plus a stratified sample of carrier-perturbed rows.

    NOTE: this definition replaces the earlier ``OOC_aug(..., tau)`` defined
    in a previous cell; after this cell runs, the name refers to this variant.

    Parameters
    ----------
    data_path : str
        Tab-separated input file whose first row is the header; it must
        provide ``TEXT`` and ``INTENT`` columns, plus whatever
        ``carrier_aug`` and ``mapping2idx`` require.
    out_path, fn : str
        The output is written to ``out_path + fn`` (plain concatenation, so
        ``out_path`` has to end with a path separator).
    ratio : float
        The number of augmented rows aimed for is ``int(base_size * ratio)``,
        distributed over the intents in proportion to their share of the
        augmented pool.
    base_size : int, optional
        Reference dataset size used to turn ``ratio`` into a row count.
        Defaults to 4450, the value that was previously hard-coded here.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        Propagated from ``DataFrame.sample`` if an intent would need more
        rows than the augmented pool holds for it.
    """
    # loading dataset
    data = pd.read_csv(data_path,sep='\t',header=0)

    # lowerCasing the TEXT column
    data['TEXT'] = data['TEXT'].str.lower()

    # handling multi-label instances
    data['INTENT'] = data['INTENT'].replace(train_replace_dict)

    # Augmentation pool: one perturbed copy of the data per noise level,
    # generated in the same 0.20 -> 0.40 -> 0.60 order as before.
    aug_pool = pd.concat([carrier_aug(data, tau=tau) for tau in (0.20, 0.40, 0.60)])

    aug_pool = mapping2idx(aug_pool)
    data = mapping2idx(data)

    # Stratified sampling by intent. Iterating over the groups (instead of
    # groupby.apply) keeps the INTENT column regardless of how the installed
    # pandas version treats grouping columns inside apply.
    target_total = int(base_size * ratio)
    pool_size = len(aug_pool)
    per_intent = [
        group.sample(int(np.rint(target_total * len(group) / pool_size)))
        for _, group in aug_pool.groupby('INTENT')
    ]
    # An empty pool yields no groups; keep an empty frame with the right columns.
    aug_data = pd.concat(per_intent) if per_intent else aug_pool.iloc[0:0]
    aug_data = aug_data.sample(frac=1).reset_index(drop=True)

    augmented_data = pd.concat([aug_data,data])

    # NOTE(review): the sampling and shuffling here use pandas' default
    # random state, which random.seed() does not control.
    augmented_data = augmented_data.sample(frac=1).reset_index(drop=True)
    augmented_data.to_csv(out_path+fn,sep='\t',index=False)
    

In [38]:
# Five independently sampled training files per augmentation ratio;
# each ratio is written to its own '<NN>per/' folder as train_01..train_05.
for ratio_folder, aug_ratio in (('25per', 0.25), ('50per', 0.50), ('75per', 0.75), ('100per', 1.0)):
    for run_no in range(1, 6):
        OOC_aug(
            '../../data/ATIS/raw/train/train.tsv',
            '../../data/ATIS/experiment/train/augmentedOOC/' + ratio_folder + '/',
            'train_%02d.tsv' % run_no,
            aug_ratio,
        )