In [1]:
import pandas as pd
from collections import Counter
import numpy as np
from copy import deepcopy
import random
from functools import reduce
import pickle
from sklearn.model_selection import train_test_split

In [2]:
def set_seed(seed):
    """Seed the global generators of ``random`` and ``numpy.random`` with ``seed``."""
    # Python's generator first, then NumPy's
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)

In [3]:
# Seed the `random` and `numpy.random` global generators before any data is processed.
set_seed(42)

# Processing ATIS dataset

In [4]:
# Build the raw ATIS training table (ID, TEXT, INTENT)

# utterances and intent labels come from two headerless files, each read into one named column
text = pd.read_csv('../data/ATIS/raw/train/text.tsv', sep='\t', header=None, names=['TEXT'])
labels = pd.read_csv('../data/ATIS/raw/train/label.tsv', sep='\t', header=None, names=['INTENT'])

# join column-wise, attach a running 0..n-1 ID and move it to the front
train = (
    pd.concat([text, labels], axis=1)
    .assign(ID=lambda frame: list(range(len(frame))))
    .loc[:, ['ID', 'TEXT', 'INTENT']]
)

# write the combined table next to its source files
train.to_csv('../data/ATIS/raw/train/train.tsv', sep='\t', index=False)

In [5]:
# processing dev set [ATIS]

# reading individual files (headerless; each is read into one named column)
text = pd.read_csv('../data/ATIS/raw/dev/text.tsv', sep='\t', header=None, names=['TEXT'])
labels = pd.read_csv('../data/ATIS/raw/dev/label.tsv', sep='\t', header=None, names=['INTENT'])

# merging them together; bound to `dev` rather than `train` so the name matches
# the split and the training frame built earlier is not overwritten with dev rows
dev = pd.concat([text, labels], axis=1)

# adding id column (row position, starting at 0)
dev['ID'] = list(range(len(dev)))

# reordering columns
dev = dev[['ID', 'TEXT', 'INTENT']]

# saving to file
dev.to_csv('../data/ATIS/raw/dev/dev.tsv', sep='\t', index=False)

In [6]:
# processing test set [ATIS]

# reading individual files (headerless; each is read into one named column)
text = pd.read_csv('../data/ATIS/raw/test/text.tsv', sep='\t', header=None, names=['TEXT'])
labels = pd.read_csv('../data/ATIS/raw/test/label.tsv', sep='\t', header=None, names=['INTENT'])

# merging them together; bound to `test` rather than `train` so the name matches
# the split and the training frame built earlier is not overwritten with test rows
test = pd.concat([text, labels], axis=1)

# adding id column (row position, starting at 0)
test['ID'] = list(range(len(test)))

# reordering columns
test = test[['ID', 'TEXT', 'INTENT']]

# saving to file
test.to_csv('../data/ATIS/raw/test/test.tsv', sep='\t', index=False)

In [7]:
# Collapse ATIS multi-intent labels ('a#b#...') to a single intent: every
# combined label listed here maps to its first '#'-separated component.
train_replace_dict = {
    combined: combined.split('#')[0]
    for combined in (
        'atis_flight#atis_airfare',
        'atis_ground_service#atis_ground_fare',
        'atis_aircraft#atis_flight#atis_flight_no',
        'atis_airfare#atis_flight_time',
        'atis_airline#atis_flight_no',
        'atis_flight_no#atis_airline',
        'atis_airfare#atis_flight',
        'atis_flight#atis_airline',
    )
}

# intent name -> integer id; the id is the row position of the name in intent_list.tsv
intent_list = pd.read_csv(
    '../data/ATIS/intent_list.tsv', sep='\t', header=None, names=['INTENT']
)['INTENT'].values.tolist()
intent2idx = dict(zip(intent_list, range(len(intent_list))))

In [8]:
def mapping2idx(df, intent_map=None):
    """Add an ``INTENT_ID`` column with the integer id of each row's intent.

    Intents missing from the lookup are printed and given the id stored under
    the key ``'UNK'``.

    Args:
        df: DataFrame with an ``INTENT`` column. It is modified in place.
        intent_map: optional intent -> id mapping. When omitted, the
            notebook-level ``intent2idx`` is looked up at call time.

    Returns:
        The same ``df`` object, now carrying ``INTENT_ID``.

    Raises:
        KeyError: if an unknown intent is met and the lookup has no ``'UNK'`` entry.
    """
    if intent_map is None:
        intent_map = intent2idx

    # adding intent encoding
    intent_ids = []
    for intent in df['INTENT']:
        try:
            intent_ids.append(intent_map[intent])
        except KeyError:
            # only a missing key means "unknown intent"; any other error
            # (including KeyboardInterrupt) must surface instead of being swallowed
            print(intent)
            intent_ids.append(intent_map['UNK'])

    df['INTENT_ID'] = intent_ids

    return df

In [11]:
def process_data(data_path, out_path, shuffle):
    """Clean one raw split and write it to ``out_path`` as a tab-separated file.

    Steps: read the raw TSV, lower-case ``TEXT``, re-map multi-label intents
    through ``train_replace_dict``, add ``INTENT_ID`` via ``mapping2idx``,
    optionally shuffle the rows, then save.

    Args:
        data_path: tab-separated input whose first row is a header; the columns
            are named ``ID``, ``TEXT``, ``INTENT`` on load.
        out_path: destination TSV path; missing parent directories are created.
        shuffle: when truthy, rows are shuffled (and re-indexed) before saving.

    Returns:
        None. The row count is printed and the result is written to ``out_path``.
    """
    from pathlib import Path  # function-scope import: nothing new needed at the top of the notebook

    # loading dataset (header=0 drops the file's own header row; names= supplies ours)
    data = pd.read_csv(data_path, sep='\t', header=0, names=['ID', 'TEXT', 'INTENT'])

    # lowerCasing the TEXT column
    data['TEXT'] = data['TEXT'].str.lower()

    # re-mapping multi-label classes
    data['INTENT'] = data['INTENT'].replace(train_replace_dict)

    data = mapping2idx(data)

    print('Length of data set... ', len(data))

    if shuffle:
        # no random_state is passed, so the order follows NumPy's global RNG state
        data = data.sample(frac=1).reset_index(drop=True)

    # to_csv does not create folders; make sure the target directory exists first
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    data.to_csv(out_path, index=False, sep='\t')
    

In [12]:
# Clean the three ATIS splits; only the training split is shuffled.
process_data(
    data_path='../data/ATIS/raw/train/train.tsv',
    out_path='../data/ATIS/experiments/clean/train/train.tsv',
    shuffle=True,
)
process_data(
    data_path='../data/ATIS/raw/dev/dev.tsv',
    out_path='../data/ATIS/experiments/clean/dev/dev.tsv',
    shuffle=False,
)
process_data(
    data_path='../data/ATIS/raw/test/test.tsv',
    out_path='../data/ATIS/experiments/clean/test/test.tsv',
    shuffle=False,
)

Length of data set...  4478
Length of data set...  500
atis_day_name
atis_day_name
Length of data set...  893


# Processing SNIPS dataset

In [14]:
# Build the raw SNIPS training table (ID, TEXT, INTENT)

# utterances and intent labels come from two headerless files, each read into one named column
text = pd.read_csv('../data/SNIPS/raw/train/text.tsv', sep='\t', header=None, names=['TEXT'])
labels = pd.read_csv('../data/SNIPS/raw/train/label.tsv', sep='\t', header=None, names=['INTENT'])

# join column-wise, then put a running 0..n-1 ID in the first position
train = pd.concat([text, labels], axis=1)
train.insert(0, 'ID', list(range(len(train))))

# write the combined table next to its source files
train.to_csv('../data/SNIPS/raw/train/train.tsv', sep='\t', index=False)

In [15]:
# processing dev set [SNIPS]

# reading individual files (headerless; each is read into one named column)
text = pd.read_csv('../data/SNIPS/raw/dev/text.tsv', sep='\t', header=None, names=['TEXT'])
labels = pd.read_csv('../data/SNIPS/raw/dev/label.tsv', sep='\t', header=None, names=['INTENT'])

# merging them together; bound to `dev` rather than `train` so the name matches
# the split and the training frame built earlier is not overwritten with dev rows
dev = pd.concat([text, labels], axis=1)

# adding id column (row position, starting at 0)
dev['ID'] = list(range(len(dev)))

# reordering columns
dev = dev[['ID', 'TEXT', 'INTENT']]

# saving to file
dev.to_csv('../data/SNIPS/raw/dev/dev.tsv', sep='\t', index=False)

In [17]:
# processing test set [SNIPS]

# reading individual files (headerless; each is read into one named column)
text = pd.read_csv('../data/SNIPS/raw/test/text.tsv', sep='\t', header=None, names=['TEXT'])
labels = pd.read_csv('../data/SNIPS/raw/test/label.tsv', sep='\t', header=None, names=['INTENT'])

# merging them together; bound to `test` rather than `train` so the name matches
# the split and the training frame built earlier is not overwritten with test rows
test = pd.concat([text, labels], axis=1)

# adding id column (row position, starting at 0)
test['ID'] = list(range(len(test)))

# reordering columns
test = test[['ID', 'TEXT', 'INTENT']]

# saving to file
test.to_csv('../data/SNIPS/raw/test/test.tsv', sep='\t', index=False)

In [18]:
# intent name -> integer id; the id is the row position of the name in intent_list.tsv
intent_list = pd.read_csv(
    '../data/SNIPS/intent_list.tsv', sep='\t', header=None, names=['INTENT']
)['INTENT'].values.tolist()
intent2idx = dict(zip(intent_list, range(len(intent_list))))

In [19]:
def mapping2idx(df, intent_map=None):
    """Add an ``INTENT_ID`` column with the integer id of each row's intent.

    Intents missing from the lookup are printed and given the id stored under
    the key ``'UNK'``.

    Note: this notebook defines ``mapping2idx`` in more than one dataset
    section; whichever definition was executed last is the one in effect.

    Args:
        df: DataFrame with an ``INTENT`` column. It is modified in place.
        intent_map: optional intent -> id mapping. When omitted, the
            notebook-level ``intent2idx`` is looked up at call time.

    Returns:
        The same ``df`` object, now carrying ``INTENT_ID``.

    Raises:
        KeyError: if an unknown intent is met and the lookup has no ``'UNK'`` entry.
    """
    if intent_map is None:
        intent_map = intent2idx

    # adding intent encoding
    intent_ids = []
    for intent in df['INTENT']:
        try:
            intent_ids.append(intent_map[intent])
        except KeyError:
            # only a missing key means "unknown intent"; any other error
            # (including KeyboardInterrupt) must surface instead of being swallowed
            print(intent)
            intent_ids.append(intent_map['UNK'])

    df['INTENT_ID'] = intent_ids

    return df

In [20]:
def process_data(data_path, out_path, shuffle):
    """Clean one raw split and write it to ``out_path`` as a tab-separated file.

    Steps: read the raw TSV, lower-case ``TEXT``, add ``INTENT_ID`` via
    ``mapping2idx``, optionally shuffle the rows, then save. No intent
    re-mapping is applied in this version.

    Args:
        data_path: tab-separated input whose first row is a header; the columns
            are named ``ID``, ``TEXT``, ``INTENT`` on load.
        out_path: destination TSV path; missing parent directories are created.
        shuffle: when truthy, rows are shuffled (and re-indexed) before saving.

    Returns:
        None. The row count is printed and the result is written to ``out_path``.
    """
    from pathlib import Path  # function-scope import: nothing new needed at the top of the notebook

    # loading dataset (header=0 drops the file's own header row; names= supplies ours)
    data = pd.read_csv(data_path, sep='\t', header=0, names=['ID', 'TEXT', 'INTENT'])

    # lowerCasing the TEXT column
    data['TEXT'] = data['TEXT'].str.lower()

    data = mapping2idx(data)

    print('Length of data set... ', len(data))

    if shuffle:
        # no random_state is passed, so the order follows NumPy's global RNG state
        data = data.sample(frac=1).reset_index(drop=True)

    # to_csv does not create folders; make sure the target directory exists first
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    data.to_csv(out_path, index=False, sep='\t')
    

In [21]:
# Clean the three SNIPS splits; only the training split is shuffled.
process_data(
    data_path='../data/SNIPS/raw/train/train.tsv',
    out_path='../data/SNIPS/experiments/clean/train/train.tsv',
    shuffle=True,
)
process_data(
    data_path='../data/SNIPS/raw/dev/dev.tsv',
    out_path='../data/SNIPS/experiments/clean/dev/dev.tsv',
    shuffle=False,
)
process_data(
    data_path='../data/SNIPS/raw/test/test.tsv',
    out_path='../data/SNIPS/experiments/clean/test/test.tsv',
    shuffle=False,
)

Length of data set...  13084
Length of data set...  700
Length of data set...  700


# Processing TOD dataset

In [83]:
# Load the three raw English TOD splits; the files are headerless and all get
# the same five column names.
train, dev, test = (
    pd.read_csv(
        raw_path,
        sep='\t',
        header=None,
        names=['INTENT', 'SLOTS', 'TEXT', 'LANG', 'TOKENIZATION'],
    )
    for raw_path in (
        '../data/TOD/raw/train/train-en.tsv',
        '../data/TOD/raw/dev/eval-en.tsv',
        '../data/TOD/raw/test/test-en.tsv',
    )
)

In [84]:
# Keep only the utterance and its intent, add a running 0..n-1 ID and put it first.
# Done as one chained expression: assigning a new column directly onto the result
# of train[['TEXT','INTENT']] is the pattern pandas flags with SettingWithCopyWarning.
train = (
    train[['TEXT', 'INTENT']]
    .assign(ID=lambda frame: list(range(len(frame))))
    .loc[:, ['ID', 'TEXT', 'INTENT']]
)
train.to_csv('../data/TOD/raw/train/train.tsv', sep='\t', index=False)

In [85]:
# Keep only the utterance and its intent, add a running 0..n-1 ID and put it first.
# Done as one chained expression: assigning a new column directly onto the result
# of dev[['TEXT','INTENT']] is the pattern pandas flags with SettingWithCopyWarning.
dev = (
    dev[['TEXT', 'INTENT']]
    .assign(ID=lambda frame: list(range(len(frame))))
    .loc[:, ['ID', 'TEXT', 'INTENT']]
)
# note: the dev split is written as eval.tsv
dev.to_csv('../data/TOD/raw/dev/eval.tsv', sep='\t', index=False)

In [86]:
# Keep only the utterance and its intent, add a running 0..n-1 ID and put it first.
# Done as one chained expression: assigning a new column directly onto the result
# of test[['TEXT','INTENT']] is the pattern pandas flags with SettingWithCopyWarning.
test = (
    test[['TEXT', 'INTENT']]
    .assign(ID=lambda frame: list(range(len(frame))))
    .loc[:, ['ID', 'TEXT', 'INTENT']]
)
test.to_csv('../data/TOD/raw/test/test.tsv', sep='\t', index=False)

In [87]:
# intent name -> integer id; the id is the row position of the name in intent_list.tsv
intent_list = pd.read_csv(
    '../data/TOD/intent_list.tsv', sep='\t', header=None, names=['INTENT']
)['INTENT'].values.tolist()
intent2idx = dict(zip(intent_list, range(len(intent_list))))

In [88]:
# Rename TOD intents from 'domain/intent' to 'domain_intent': each raw label
# listed here maps to itself with '/' replaced by '_'.
train_replace_dict = {
    raw_label: raw_label.replace('/', '_')
    for raw_label in (
        'alarm/cancel_alarm',
        'alarm/modify_alarm',
        'alarm/set_alarm',
        'alarm/show_alarms',
        'alarm/snooze_alarm',
        'alarm/time_left_on_alarm',
        'reminder/cancel_reminder',
        'reminder/set_reminder',
        'reminder/show_reminders',
        'weather/checkSunrise',
        'weather/checkSunset',
        'weather/find',
    )
}

In [89]:
def mapping2idx(df, intent_map=None):
    """Add an ``INTENT_ID`` column with the integer id of each row's intent.

    Intents missing from the lookup are printed and given the id stored under
    the key ``'UNK'``.

    Note: this notebook defines ``mapping2idx`` in more than one dataset
    section; whichever definition was executed last is the one in effect.

    Args:
        df: DataFrame with an ``INTENT`` column. It is modified in place.
        intent_map: optional intent -> id mapping. When omitted, the
            notebook-level ``intent2idx`` is looked up at call time.

    Returns:
        The same ``df`` object, now carrying ``INTENT_ID``.

    Raises:
        KeyError: if an unknown intent is met and the lookup has no ``'UNK'`` entry.
    """
    if intent_map is None:
        intent_map = intent2idx

    # adding intent encoding
    intent_ids = []
    for intent in df['INTENT']:
        try:
            intent_ids.append(intent_map[intent])
        except KeyError:
            # only a missing key means "unknown intent"; any other error
            # (including KeyboardInterrupt) must surface instead of being swallowed
            print(intent)
            intent_ids.append(intent_map['UNK'])

    df['INTENT_ID'] = intent_ids

    return df

In [92]:
def process_data(data_path, out_path, shuffle):
    """Clean one raw split and write it to ``out_path`` as a tab-separated file.

    Steps: read the raw TSV, lower-case ``TEXT``, rename intents through
    ``train_replace_dict``, add ``INTENT_ID`` via ``mapping2idx``, optionally
    shuffle the rows, then save.

    Args:
        data_path: tab-separated input whose first row is a header; the columns
            are named ``ID``, ``TEXT``, ``INTENT`` on load.
        out_path: destination TSV path; missing parent directories are created.
        shuffle: when truthy, rows are shuffled (and re-indexed) before saving.

    Returns:
        None. The row count is printed and the result is written to ``out_path``.
    """
    from pathlib import Path  # function-scope import: nothing new needed at the top of the notebook

    # loading dataset (header=0 drops the file's own header row; names= supplies ours)
    data = pd.read_csv(data_path, sep='\t', header=0, names=['ID', 'TEXT', 'INTENT'])

    # lowerCasing the TEXT column
    data['TEXT'] = data['TEXT'].str.lower()

    # renaming intents with the module-level train_replace_dict (read at call time)
    data['INTENT'] = data['INTENT'].replace(train_replace_dict)

    data = mapping2idx(data)

    print('Length of data set... ', len(data))

    if shuffle:
        # no random_state is passed, so the order follows NumPy's global RNG state
        data = data.sample(frac=1).reset_index(drop=True)

    # to_csv does not create folders; make sure the target directory exists first
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    data.to_csv(out_path, index=False, sep='\t')
    

In [93]:
# Clean the three TOD splits; only the training split is shuffled.
# The raw dev split is read from eval.tsv and written out as dev.tsv.
process_data(
    data_path='../data/TOD/raw/train/train.tsv',
    out_path='../data/TOD/experiments/clean/train/train.tsv',
    shuffle=True,
)
process_data(
    data_path='../data/TOD/raw/dev/eval.tsv',
    out_path='../data/TOD/experiments/clean/dev/dev.tsv',
    shuffle=False,
)
process_data(
    data_path='../data/TOD/raw/test/test.tsv',
    out_path='../data/TOD/experiments/clean/test/test.tsv',
    shuffle=False,
)

Length of data set...  30521
Length of data set...  4181
Length of data set...  8621
