In [1]:
import pickle
import random
import re
from collections import Counter
from copy import deepcopy
from functools import reduce
from itertools import chain, zip_longest

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Seed Python's `random` module so the random.sample draws made by the
# augmentation functions below are repeatable across runs.
random.seed(42)

In [2]:
# Maps compound ('#'-joined) intent labels onto a single intent label.
train_replace_dict = {
    'atis_flight#atis_airfare': 'atis_airfare',
    'atis_ground_service#atis_ground_fare': 'atis_ground_fare',
    'atis_aircraft#atis_flight#atis_flight_no': 'atis_flight_no',
    'atis_airfare#atis_flight_time': 'atis_flight_time',
    'atis_airline#atis_flight_no': 'atis_flight_no',
    'atis_flight_no#atis_airline': 'atis_flight_no',
    'atis_airfare#atis_flight': 'atis_airfare',
    'atis_flight#atis_airline': 'atis_airline',
}

# Intent label -> integer id, numbered by row order in intent_list.csv.
intent_list = pd.read_csv(
    '../data/multiATIS/intent_list.csv', sep=',', header=None, names=['INTENT']
)['INTENT'].values.tolist()
intent2idx = dict(zip(intent_list, range(len(intent_list))))

# Slot label -> integer id (row order in slots_list.csv), plus the inverse lookup.
final_slots = pd.read_csv(
    '../data/multiATIS/slots_list.csv', sep=',', header=None, names=['SLOTS']
)['SLOTS'].values.tolist()
slots2idx = dict(zip(final_slots, range(len(final_slots))))
idx2slots = {index: label for label, index in slots2idx.items()}

In [3]:
def mapping2idx(df, intent_map=None, slot_map=None):
    """Add integer-encoded ``INTENT_ID`` and ``SLOTS_ID`` columns to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide an ``INTENT`` column (one label per row) and a ``SLOTS``
        column (whitespace-separated slot labels per row).
    intent_map : dict, optional
        Intent label -> id. Defaults to the module-level ``intent2idx``.
        Must contain the fallback key ``'OOI'`` if unknown intents can occur.
    slot_map : dict, optional
        Slot label -> id. Defaults to the module-level ``slots2idx``.
        Must contain the fallback key ``'O'`` if unknown slot labels can occur.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; the two columns are added in place.

    Notes
    -----
    Unknown intents are printed and encoded with the id of ``'OOI'``; unknown
    slot labels are printed (prefixed with ``'token'``) and encoded with the
    id of ``'O'``. Only ``KeyError`` is treated as "unknown label" so that
    unrelated failures (or a keyboard interrupt) are not silently swallowed.
    """
    if intent_map is None:
        intent_map = intent2idx
    if slot_map is None:
        slot_map = slots2idx

    encoded_intent = []
    for intent in list(df.INTENT):
        try:
            encoded_intent.append(intent_map[intent])
        except KeyError:
            print(intent)
            encoded_intent.append(intent_map['OOI'])

    df['INTENT_ID'] = encoded_intent

    # adding mapped slot column
    slots_ID = []
    for annotations in list(df.SLOTS):
        encoded_tokens = []
        for token in annotations.split():
            try:
                encoded_tokens.append(str(slot_map[token]))
            except KeyError:
                print('token', token)
                encoded_tokens.append(str(slot_map['O']))
        slots_ID.append(" ".join(encoded_tokens))

    df['SLOTS_ID'] = slots_ID

    return df

In [4]:
def get_carrier_phrase_length(annotations):
    """Return the positions of the 'O' labels in a space-separated label string.

    Despite the name, the result is a list of token indices, not a count;
    callers take ``len()`` of it. The string is split on single spaces, so
    consecutive spaces produce empty tokens that still occupy a position.
    """
    return [
        position
        for position, label in enumerate(annotations.split(' '))
        if label == 'O'
    ]

def twolists(l1, l2):
    """Interleave two sequences: ``l1[0], l2[0], l1[1], l2[1], ...``.

    When one sequence is longer, its remaining elements are appended in order.
    A private sentinel is used as the ``zip_longest`` padding so that genuine
    ``None`` elements in either input are kept rather than silently dropped.
    """
    padding = object()
    interleaved = chain.from_iterable(zip_longest(l1, l2, fillvalue=padding))
    return [item for item in interleaved if item is not padding]


def BG_Noise(data, phrases=None, max_tokens=20):
    """Interleave each utterance with a randomly drawn background phrase.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows are read positionally: column 1 is the space-separated text,
        column 2 the space-separated slot labels, column 3 the intent.
    phrases : sequence of str, optional
        Pool of background phrases to draw from. Defaults to the module-level
        ``phrase`` (a NameError is raised if that global is not defined).
    max_tokens : int, optional
        When an utterance has at least this many 'O' labels, its text and
        slot labels are both truncated to their first ``max_tokens`` tokens.
        NOTE(review): the original sliced the raw strings by character, which
        misaligned text and labels; token-level truncation is assumed to be
        the intent - confirm.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID`` (running counter), ``TEXT``, ``SLOTS``, ``INTENT``.
        Every background token is labelled 'O'.
    """
    if phrases is None:
        phrases = phrase  # module-level pool; not defined in the visible cells

    augINTENT, augSLOTS, augTEXT, augID = [], [], [], []

    for cnt, sample in enumerate(data.values.tolist()):
        text_tokens = sample[1].split(' ')
        slot_tokens = sample[2].split(' ')

        # Truncate token-wise so text and labels stay aligned.
        if len(get_carrier_phrase_length(sample[2])) >= max_tokens:
            text_tokens = text_tokens[:max_tokens]
            slot_tokens = slot_tokens[:max_tokens]

        # random.sample (not random.choice) is kept so seeded runs draw the
        # same phrases as before.
        bg_tokens = random.sample(phrases, 1)[0].split(' ')
        bg_slots = ['O'] * len(bg_tokens)

        noisyTEXT = twolists(text_tokens, bg_tokens)
        noisySLOTS = twolists(slot_tokens, bg_slots)

        augINTENT.append(sample[3])
        augTEXT.append(' '.join(noisyTEXT))
        # Collapse runs of spaces caused by empty label tokens in the input.
        augSLOTS.append(re.sub(' +', ' ', ' '.join(noisySLOTS)))
        augID.append(cnt)

    augPD = pd.DataFrame([augID, augTEXT, augSLOTS, augINTENT], index=['ID', 'TEXT', 'SLOTS', 'INTENT']).T

    return augPD
    

In [5]:
def get_phrase_length(text):
    """Split ``text`` on single spaces and return the resulting token list.

    Despite the name, this returns the tokens themselves; callers apply
    ``len()`` to obtain the phrase length.
    """
    tokens = text.split(" ")
    return tokens

def carrier_aug(data, tau):
    """Create a token-deletion view of every sentence in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows are read positionally: column 0 is the sentence id, column 1 the
        space-separated text, column 2 an iterable of per-token ids aligned
        with the text tokens.
    tau : float
        Fraction of tokens to delete from sentences longer than five tokens
        (``int(tau * token_count)`` tokens are removed).

    Returns
    -------
    list of [text, id, token_ids]
        One entry per input row, in input order. ``token_ids`` is always a
        space-joined string of the ids of the tokens that were kept.

    Notes
    -----
    * Sentences with at most two tokens are returned unchanged.
    * Sentences with three to five tokens lose half their tokens (rounded
      down), regardless of ``tau``.
    * ``random.sample`` raises ``ValueError`` if ``tau`` asks for more tokens
      than the sentence has, or for a negative number of tokens.
    * ``data`` is only read, never modified.
    """
    aug_data = []
    for sample in data.values.tolist():
        sample_id, text, token_ids = sample[0], sample[1], sample[2]
        tokens = text.split(' ')
        token_count = len(tokens)

        if token_count <= 2:
            # Too short to delete from; still emit the ids as a joined string
            # so every returned row has the same tokenID format.
            aug_data.append([text, sample_id, ' '.join(str(t) for t in token_ids)])
            continue

        if token_count <= 5:
            del_count = token_count // 2
        else:
            del_count = int(tau * token_count)

        # A set gives O(1) membership checks in the filters below.
        dropped = set(random.sample(range(token_count), del_count))
        kept_text = ' '.join(tok for pos, tok in enumerate(tokens) if pos not in dropped)
        kept_ids = ' '.join(str(t) for pos, t in enumerate(token_ids) if pos not in dropped)
        aug_data.append([kept_text, sample_id, kept_ids])

    return aug_data

In [6]:
def contrastiveDataGeneration(data_path, out_path, fn, tau, taus=(0.2, 0.4, 0.6, 0.1)):
    """Build a contrastive dataset of token-deletion views and write it as TSV.

    Parameters
    ----------
    data_path : str
        Tab-separated input file. Its first line is treated as a header and
        replaced by the column names ID, TEXT, SLOTS, INTENT.
    out_path : str
        Output location prefix; the file is written to ``out_path + fn``
        (plain string concatenation, so include the trailing separator).
    fn : str
        Output file name.
    tau : float
        Accepted for backward compatibility but not used; the deletion ratios
        actually applied come from ``taus``.
        NOTE(review): confirm whether ``tau`` was meant to drive the ratios.
    taus : sequence of float, optional
        Deletion ratios passed to ``carrier_aug``, one augmented view per
        ratio, generated in this order.

    Output
    ------
    For every input sentence the file contains one row per ratio in ``taus``
    followed by the unmodified sentence. Columns are TEXT, ID, tokenID, where
    ID is the sentence's row position and tokenID lists token ids that run
    consecutively across the whole dataset.
    """
    # loading dataset
    data = pd.read_csv(data_path, sep='\t', header=0, names=['ID', 'TEXT', 'SLOTS', 'INTENT'])

    # Give every token a dataset-wide running id so deleted-token views can
    # be traced back to positions in the original sentence.
    rows = []
    next_token_id = 0
    for row_id, text in enumerate(data['TEXT'].values.tolist()):
        token_count = len(get_phrase_length(text))
        rows.append([row_id, text, list(range(next_token_id, next_token_id + token_count))])
        next_token_id += token_count

    textData = pd.DataFrame(rows, columns=['ID', 'TEXT', 'tokenID'])

    # One full pass over the data per ratio, in order, so seeded runs consume
    # random numbers in the same sequence as the original hard-coded calls.
    augmented_views = [carrier_aug(textData, tau=ratio) for ratio in taus]

    # Background-noise views (BG_Noise) are intentionally not part of the output.
    contrastiveData = []
    for position, (row_id, text, token_ids) in enumerate(rows):
        for view in augmented_views:
            contrastiveData.append(view[position])
        contrastiveData.append([text, row_id, ' '.join(str(t) for t in token_ids)])

    contrastiveData = pd.DataFrame(contrastiveData, columns=['TEXT', 'ID', 'tokenID'])

    contrastiveData.to_csv(out_path + fn, sep='\t', index=False)
    

In [7]:
# Generate the contrastive training set for the English split.
# The tau argument is required by the signature, but the function does not
# use it to choose deletion ratios.
contrastiveDataGeneration(
    data_path='../data/multiATIS/raw/train_dev_test/train_EN.tsv',
    out_path='../data/multiATIS/split/train/contraSet/',
    fn="train_EN.tsv",
    tau=1,
)