In [2]:
import pandas as pd
from collections import Counter
import numpy as np
from copy import deepcopy
import random
from functools import reduce
import pickle
from sklearn.model_selection import train_test_split


In [14]:
def set_seed(seed):
    """Seed both global random number generators used by this notebook.

    Parameters
    ----------
    seed : int
        Value passed, in this order, to ``random.seed`` and ``np.random.seed``.
    """
    # Python's stdlib RNG first, then NumPy's legacy global RNG.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

In [15]:
# Seed the global `random` and NumPy RNGs once, before any data processing cell runs.
set_seed(42)

In [16]:
# processing training set [ATIS]

# reading individual files (none of them has a header row)
# NOTE(review): this split reads 'annotation.tsv' while the dev/test cells read
# 'annotations.tsv' -- confirm the filename on disk.
text = pd.read_csv('../data/ATIS/raw/train/text.tsv',sep='\t',header=None,names=['TEXT'])
annotations = pd.read_csv('../data/ATIS/raw/train/annotation.tsv',sep='\t',header=None,names=['SLOTS'])
labels = pd.read_csv('../data/ATIS/raw/train/label.tsv',sep='\t',header=None,names=['INTENT'])

# The three files are aligned purely by row position. pd.concat(axis=1) would
# silently pad a shorter file with NaN, so fail loudly on a row-count mismatch.
if not (len(text) == len(annotations) == len(labels)):
    raise ValueError(
        'ATIS train files are misaligned: text={}, annotations={}, labels={} rows'.format(
            len(text), len(annotations), len(labels)))

# merging them together
train = pd.concat([text,annotations,labels],axis=1)

# adding id column (0-based row number)
train['ID'] = list(range(len(train)))

# reordering columns
train = train[["ID","TEXT",'INTENT',"SLOTS"]]

# saving to file
train.to_csv('../data/ATIS/raw/train/train.tsv',sep='\t',index=False)

In [18]:
# processing dev set [ATIS]

# reading individual files (none of them has a header row)
text = pd.read_csv('../data/ATIS/raw/dev/text.tsv',sep='\t',header=None,names=['TEXT'])
annotations = pd.read_csv('../data/ATIS/raw/dev/annotations.tsv',sep='\t',header=None,names=['SLOTS'])
labels = pd.read_csv('../data/ATIS/raw/dev/label.tsv',sep='\t',header=None,names=['INTENT'])

# The three files are aligned purely by row position. pd.concat(axis=1) would
# silently pad a shorter file with NaN, so fail loudly on a row-count mismatch.
if not (len(text) == len(annotations) == len(labels)):
    raise ValueError(
        'ATIS dev files are misaligned: text={}, annotations={}, labels={} rows'.format(
            len(text), len(annotations), len(labels)))

# merging them together
# NOTE(review): the dev frame is bound to the name `train`, overwriting whatever
# `train` held before. The name is kept because later cells may rely on it.
train = pd.concat([text,annotations,labels],axis=1)

# adding id column (0-based row number)
train['ID'] = list(range(len(train)))

# reordering columns
train = train[["ID","TEXT",'INTENT',"SLOTS"]]

# saving to file
train.to_csv('../data/ATIS/raw/dev/dev.tsv',sep='\t',index=False)

In [23]:
# processing test set [ATIS]

# reading individual files (none of them has a header row)
text = pd.read_csv('../data/ATIS/raw/test/text.tsv',sep='\t',header=None,names=['TEXT'])
annotations = pd.read_csv('../data/ATIS/raw/test/annotations.tsv',sep='\t',header=None,names=['SLOTS'])
labels = pd.read_csv('../data/ATIS/raw/test/label.tsv',sep='\t',header=None,names=['INTENT'])

# The three files are aligned purely by row position. pd.concat(axis=1) would
# silently pad a shorter file with NaN, so fail loudly on a row-count mismatch.
if not (len(text) == len(annotations) == len(labels)):
    raise ValueError(
        'ATIS test files are misaligned: text={}, annotations={}, labels={} rows'.format(
            len(text), len(annotations), len(labels)))

# merging them together
# NOTE(review): the test frame is bound to the name `train`, overwriting whatever
# `train` held before. The name is kept because later cells may rely on it.
train = pd.concat([text,annotations,labels],axis=1)

# adding id column (0-based row number)
train['ID'] = list(range(len(train)))

# reordering columns
train = train[["ID","TEXT",'INTENT',"SLOTS"]]

# saving to file
train.to_csv('../data/ATIS/raw/test/test.tsv',sep='\t',index=False)

In [24]:
# combined ('#'-joined) intent labels -> the single intent they are replaced with
train_replace_dict = {
    'atis_flight#atis_airfare': 'atis_airfare',
    'atis_ground_service#atis_ground_fare': 'atis_ground_fare',
    'atis_aircraft#atis_flight#atis_flight_no': 'atis_flight_no',
    'atis_airfare#atis_flight_time': 'atis_flight_time',
    'atis_airline#atis_flight_no': 'atis_flight_no',
    'atis_flight_no#atis_airline': 'atis_flight_no',
    'atis_airfare#atis_flight': 'atis_airfare',
    'atis_flight#atis_airline': 'atis_airline',
}

# intent -> index; the index is the row position of the intent in intent_list.tsv
intent_list = pd.read_csv(
    '../data/ATIS/intent_list.tsv', sep='\t', header=None, names=['INTENT']
)['INTENT'].tolist()
intent2idx = dict(zip(intent_list, range(len(intent_list))))

# slot tag -> index (row position in slot_intent.tsv), plus the reverse lookup
final_slots = pd.read_csv(
    '../data/ATIS/slot_intent.tsv', sep='\t', header=None, names=['SLOTS']
)['SLOTS'].tolist()
slots2idx = dict(zip(final_slots, range(len(final_slots))))
idx2slots = {index: tag for tag, index in slots2idx.items()}

In [25]:
def mapping2idx(df, intent_map=None, slot_map=None):
    """Add integer-encoded ``INTENT_ID`` and ``SLOTS_ID`` columns to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an ``INTENT`` column and a ``SLOTS`` column whose values are
        whitespace-separated slot tags. The frame is modified in place.
    intent_map : dict, optional
        Intent label -> integer id. Defaults to the notebook-level
        ``intent2idx``. Needs an ``'UNK'`` entry if any intent is unknown.
    slot_map : dict, optional
        Slot tag -> integer id. Defaults to the notebook-level ``slots2idx``.
        Needs an ``'O'`` entry if any tag is unknown.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. ``INTENT_ID`` holds one id per row and
        ``SLOTS_ID`` holds the slot ids joined by single spaces.

    Notes
    -----
    Unknown intents are printed and encoded as ``'UNK'``; unknown slot tags are
    printed (prefixed with ``token``) and encoded as ``'O'``. Only a missing key
    triggers the fallback, so any other error propagates to the caller.
    """
    # Resolve the defaults at call time so the notebook globals are only needed
    # when the caller does not pass explicit maps.
    if intent_map is None:
        intent_map = intent2idx
    if slot_map is None:
        slot_map = slots2idx

    # adding intent encoding
    intent_ids = []
    for intent in df['INTENT']:
        try:
            intent_ids.append(intent_map[intent])
        except KeyError:
            print(intent)
            intent_ids.append(intent_map['UNK'])

    df['INTENT_ID'] = intent_ids

    # adding mapped slot column
    slot_id_strings = []
    for annotation in df['SLOTS']:
        tag_ids = []
        for tag in annotation.split():
            try:
                tag_ids.append(str(slot_map[tag]))
            except KeyError:
                print('token', tag)
                tag_ids.append(str(slot_map['O']))
        slot_id_strings.append(" ".join(tag_ids))

    df['SLOTS_ID'] = slot_id_strings

    return df

In [26]:
def process_data(data_path, out_path, shuffle):
    """Normalise one ATIS split and write the encoded result as a TSV file.

    Steps: read ``data_path`` (tab-separated; its first row is treated as a
    header and replaced by ID/TEXT/INTENT/SLOTS), lower-case ``TEXT``, rewrite
    ``INTENT`` through ``train_replace_dict``, add the id columns via
    ``mapping2idx``, optionally shuffle the rows, and write to ``out_path``.

    Parameters
    ----------
    data_path : str
        Path of the input TSV.
    out_path : str
        Path of the TSV to write (no index column).
    shuffle : bool
        If truthy, rows are permuted with ``DataFrame.sample(frac=1)`` and the
        index is reset. No ``random_state`` is passed, so the order depends on
        the global NumPy RNG state at call time.
    """
    column_names = ['ID','TEXT','INTENT','SLOTS']
    frame = pd.read_csv(data_path, sep='\t', header=0, names=column_names)

    # lower-case the utterances and collapse the multi-label intents
    frame = frame.assign(
        TEXT=frame['TEXT'].str.lower(),
        INTENT=frame['INTENT'].replace(train_replace_dict),
    )

    encoded = mapping2idx(frame)

    print('Length of data set... ',len(encoded))

    if shuffle:
        encoded = encoded.sample(frac=1).reset_index(drop=True)

    encoded.to_csv(out_path, sep='\t', index=False)
    

In [27]:
# (raw input, processed output, shuffle?) per split -- only the training split is shuffled
split_jobs = [
    ('../data/ATIS/raw/train/train.tsv', '../data/ATIS/processed/train.tsv', True),
    ('../data/ATIS/raw/dev/dev.tsv', '../data/ATIS/processed/dev.tsv', False),
    ('../data/ATIS/raw/test/test.tsv', '../data/ATIS/processed/test.tsv', False),
]
for raw_path, processed_path, do_shuffle in split_jobs:
    process_data(raw_path, processed_path, do_shuffle)

Length of data set...  4478
token I-return_date.day_number
Length of data set...  500
atis_day_name
atis_day_name
token I-flight_number
token B-compartment
token B-stoploc.airport_code
token I-state_name
token B-booking_class
token B-flight
Length of data set...  893
