In [14]:
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random
from functools import reduce
import pickle

In [15]:
# Multi-label intent strings (labels joined with '#') mapped to the single
# intent label that replaces them.
train_replace_dict = {
    'atis_flight#atis_airfare': 'atis_airfare',
    'atis_ground_service#atis_ground_fare': 'atis_ground_fare',
    'atis_aircraft#atis_flight#atis_flight_no': 'atis_flight_no',
    'atis_airfare#atis_flight_time': 'atis_flight_time',
    'atis_airline#atis_flight_no': 'atis_flight_no',
    'atis_flight_no#atis_airline': 'atis_flight_no',
    'atis_airfare#atis_flight': 'atis_airfare',
    'atis_flight#atis_airline': 'atis_airline',
}

# Intent vocabulary: one label per row of the CSV; a label's id is its row position.
intent_list = pd.read_csv(
    '../data/multiATIS/intent_list.csv',
    sep=',',
    header=None,
    names=['INTENT'],
)['INTENT'].values.tolist()
intent2idx = dict(zip(intent_list, range(len(intent_list))))

# Slot vocabulary, built the same way, plus the reverse (id -> slot label) lookup.
final_slots = pd.read_csv(
    '../data/multiATIS/slots_list.csv',
    sep=',',
    header=None,
    names=['SLOTS'],
)['SLOTS'].values.tolist()
slots2idx = dict(zip(final_slots, range(len(final_slots))))
idx2slots = {slot_id: slot_label for slot_label, slot_id in slots2idx.items()}

In [16]:
def mapping2idx(df, fn, intent_map=None, slot_map=None):
    """Add integer-encoded intent and slot columns to ``df``.

    Two columns are written onto ``df`` (the frame is modified in place and
    also returned):

    * ``INTENT_ID`` -- the id of each row's ``INTENT`` value. An intent that is
      missing from the mapping is printed and encoded as the ``'OOI'`` id.
    * ``SLOTS_ID`` -- the whitespace-separated ``SLOTS`` tags of each row,
      converted to ids and joined with single spaces. A tag that is missing
      from the mapping is printed (together with ``fn``) and encoded as the
      ``'O'`` id.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``INTENT`` column and a ``SLOTS`` column of
        whitespace-separated tag strings.
    fn : str
        Label printed next to unknown slot tags (the callers in this notebook
        pass a language code such as ``'EN'``).
    intent_map : dict, optional
        Intent label -> id mapping. Defaults to the module-level ``intent2idx``.
    slot_map : dict, optional
        Slot tag -> id mapping. Defaults to the module-level ``slots2idx``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``INTENT_ID`` and ``SLOTS_ID`` added.

    Raises
    ------
    KeyError
        If a fallback is needed but ``'OOI'`` (intents) or ``'O'`` (slots) is
        itself missing from the corresponding mapping.
    """
    if intent_map is None:
        intent_map = intent2idx
    if slot_map is None:
        slot_map = slots2idx

    # Encode intents, falling back to the 'OOI' id for unknown labels.
    encoded_intent = []
    for intent in df['INTENT']:
        if intent in intent_map:
            encoded_intent.append(intent_map[intent])
        else:
            print(intent)
            encoded_intent.append(intent_map['OOI'])
    df['INTENT_ID'] = encoded_intent

    # Encode slot tags. Ids are collected in a list and joined once, so every
    # tag -- including an unknown one replaced by 'O' -- stays a separate
    # space-delimited id and the output keeps one id per input tag.
    slots_ID = []
    for annotations in df['SLOTS']:
        tag_ids = []
        for token in annotations.split():
            if token in slot_map:
                tag_ids.append(str(slot_map[token]))
            else:
                print('token', token, fn)
                tag_ids.append(str(slot_map['O']))
        slots_ID.append(" ".join(tag_ids))
    df['SLOTS_ID'] = slots_ID

    return df

In [17]:
def process_data(data_path, out_path, fn):
    """Read one raw split, normalise and encode it, and write it to ``out_path``.

    The tab-separated file at ``data_path`` is read with its first row taken
    as the header and its columns renamed to ID / TEXT / SLOTS / INTENT.
    TEXT is lower-cased, INTENT values found in ``train_replace_dict`` are
    replaced by their mapped label, ``mapping2idx`` adds the encoded columns,
    and the result is written tab-separated without the index.

    ``fn`` is forwarded unchanged to ``mapping2idx``. Nothing is returned.
    """
    column_names = ['ID', 'TEXT', 'SLOTS', 'INTENT']
    raw_frame = pd.read_csv(data_path, sep='\t', header=0, names=column_names)

    # Lower-case the utterances and collapse multi-label intents in one step;
    # overwriting existing columns keeps the original column order.
    normalised = raw_frame.assign(
        TEXT=raw_frame['TEXT'].str.lower(),
        INTENT=raw_frame['INTENT'].replace(train_replace_dict),
    )

    encoded = mapping2idx(normalised, fn)
    encoded.to_csv(out_path, sep='\t', index=False)
    

In [18]:
# Process the training split for each language (EN, ES, DE, FR), in that order.
for lang_code in ('EN', 'ES', 'DE', 'FR'):
    process_data(
        '../data/multiATIS/raw/train_dev_test/train_' + lang_code + '.tsv',
        '../data/multiATIS/splits/train_' + lang_code + '.tsv',
        lang_code,
    )

In [19]:
# Process the dev split for each language (EN, ES, DE, FR), in that order.
for lang_code in ('EN', 'ES', 'DE', 'FR'):
    process_data(
        '../data/multiATIS/raw/train_dev_test/dev_' + lang_code + '.tsv',
        '../data/multiATIS/splits/dev_' + lang_code + '.tsv',
        lang_code,
    )

In [20]:
# Process the test split for each language (EN, ES, DE, FR), in that order.
for lang_code in ('EN', 'ES', 'DE', 'FR'):
    process_data(
        '../data/multiATIS/raw/train_dev_test/test_' + lang_code + '.tsv',
        '../data/multiATIS/splits/test_' + lang_code + '.tsv',
        lang_code,
    )

atis_day_name
atis_day_name
token B-compartment EN
token B-stoploc.airport_code EN
token B-booking_class EN
token B-flight EN
atis_day_name
atis_day_name
token B-compartment ES
token B-stoploc.airport_code ES
token B-booking_class ES
token B-flight ES
atis_day_name
atis_day_name
token B-compartment DE
token B-stoploc.airport_code DE
token B-booking_class DE
token B-flight DE
atis_day_name
atis_day_name
token B-compartment FR
token I-compartment FR
token B-stoploc.airport_code FR
token B-booking_class FR
token B-flight FR
