In [2]:
import pandas as pd
import json
import warnings

warnings.filterwarnings("ignore")

random_state = 0

In [None]:
# Preprocessing for dataset HWU64
#
# Reads the raw annotated CSV, down-samples every intent to the size of the
# rarest one, and writes ../Dataset/HWU64.csv (label, question, label_position)
# plus ../Dataset/HWU64_labels.json (list of labels, index == label_position).

# Read data from Dataset/NLU-Data-Home-Domain-Annotated-All.csv (semicolon-separated)
df = pd.read_csv('../Dataset/NLU-Data-Home-Domain-Annotated-All.csv', delimiter=';')

print('Num of samples in dataset:', len(df))

# Distinct intents in order of first appearance. This order defines both the
# content of HWU64_labels.json and the integer ids stored in 'label_position'.
intents = df['intent'].unique()
print('Num of different labels in dataset:', len(intents))

# Size of the rarest intent; every intent is down-sampled to this many rows.
# value_counts() tallies all intents in one pass (rows without an intent are
# ignored, matching the groupby below, which also skips them).
min_intent = int(df['intent'].value_counts().min())
print('Num of minimum samples in each label:', min_intent)

# If column answer_normalised is empty, fill it with the value in column answer
df['answer_normalised'] = df['answer_normalised'].fillna(df['answer'])

# Keep only the intent and the utterance text. The remaining columns (userid,
# answerid, scenario, status, answer_annotation, notes, suggested_entities,
# answer, question) are not needed downstream.
df = df[['intent', 'answer_normalised']]

# For each intent, select min_intent rows randomly (same seed for every group)
# and remove the rest. Groups are iterated explicitly instead of going through
# groupby().apply(): apply() operating on the grouping column is deprecated
# since pandas 2.2, so relying on it to keep 'intent' in the result is fragile.
df_balanced = pd.concat(
    [rows.sample(min_intent, random_state=random_state) for _, rows in df.groupby('intent')],
    ignore_index=True,
)

# Rename by column name rather than by position, so a change in column order
# can never swap label and question.
df_balanced = df_balanced.rename(columns={'intent': 'label', 'answer_normalised': 'question'})
assert (df_balanced['label'].value_counts() == min_intent).all(), 'dataset is not balanced'

# For each label, we will assign a position based on the order of the label in the intents list
label_dict = {label: pos for pos, label in enumerate(intents)}
labels = df_balanced['label'].tolist()
label_positions = [label_dict[label] for label in labels]
# add a column 'label_position' to df
df_balanced['label_position'] = label_positions

display(df_balanced)
print('Num of samples in dataset after balancing:', len(df_balanced))

# Count rows whose question, label or label_position is missing. A value counts
# as empty when it is NaN (what read_csv produces for an empty field) or a
# blank / whitespace-only string; comparing with '' alone would miss NaN.
for column in ('question', 'label', 'label_position'):
    is_blank = df_balanced[column].isna() | (df_balanced[column].astype(str).str.strip() == '')
    print(f'Num of rows with empty {column}:', int(is_blank.sum()))

# Save the balanced dataset to HWU64.csv file
df_balanced.to_csv('../Dataset/HWU64.csv', index=False)
# Save the list of labels to HWU64_labels.json file as an array
with open('../Dataset/HWU64_labels.json', 'w') as f:
    json.dump(intents.tolist(), f)


In [None]:
# Preprocessing for dataset CLINC150
#
# Merges the train / val / test splits of data_full.json into one frame and
# writes ../Dataset/CLINC150.csv (label, question, label_position), the label
# list, and a smaller per-label random subset.

# Read data from Dataset/data_full.json
with open('../Dataset/data_full.json') as f:
    d = json.load(f)

# Index the splits directly: a missing split raises KeyError here instead of
# silently becoming an empty DataFrame (which is what d.get(...) -> None did).
train = d['train']
val = d['val']
test = d['test']

# Each element in array is in the form of array [question, intent]. We need to convert it to DataFrame with columns question and label
df_train = pd.DataFrame(train, columns=['question', 'label'])
df_val = pd.DataFrame(val, columns=['question', 'label'])
df_test = pd.DataFrame(test, columns=['question', 'label'])

# Tag each split frame with its origin; the tag stays on the per-split frames
# only and is not part of the merged dataset.
df_train['source'] = 'train'
df_val['source'] = 'val'
df_test['source'] = 'test'

# Concatenate the splits into a single dataframe with columns ordered as 'label', 'question'
df = pd.concat(
    [split[['label', 'question']] for split in (df_train, df_val, df_test)],
    ignore_index=True,
)

# Distinct labels in order of first appearance. This order defines both the
# content of the *_labels.json files and the ids stored in 'label_position'.
intents = df['label'].unique()
print('Num of different intents in dataset:', len(intents))
print('Num of samples in dataset:', len(df))

# For each label, we will assign a position based on the order of the label in the intents list
label_dict = {label: pos for pos, label in enumerate(intents)}
labels = df['label'].tolist()
label_positions = [label_dict[label] for label in labels]
# add a column 'label_position' to df
df['label_position'] = label_positions

display(df)

# Count rows whose question, label or label_position is missing. A value counts
# as empty when it is NaN/None or a blank / whitespace-only string; comparing
# with '' alone would miss NaN and can never match the integer label_position.
for column in ('question', 'label', 'label_position'):
    is_blank = df[column].isna() | (df[column].astype(str).str.strip() == '')
    print(f'Num of rows with empty {column}:', int(is_blank.sum()))


# Save the full merged dataset to CLINC150.csv file
df.to_csv('../Dataset/CLINC150.csv', index=False)
# Save the list of labels to CLINC150_labels.json file as an array
with open('../Dataset/CLINC150_labels.json', 'w') as f:
    json.dump(intents.tolist(), f)

# Save a subset of the dataset with all the labels but only 25 samples for each label.
# NOTE(review): the previous comment said 50 while the code has always drawn 25;
# confirm which size is intended before changing the constant.
SUBSET_SAMPLES_PER_LABEL = 25
# Groups are iterated explicitly instead of going through groupby().apply():
# apply() operating on the grouping column is deprecated since pandas 2.2, so
# relying on it to keep 'label' in the result is fragile.
df_subset = pd.concat(
    [
        rows.sample(SUBSET_SAMPLES_PER_LABEL, random_state=random_state)
        for _, rows in df.groupby('label')
    ],
    ignore_index=True,
)
df_subset.to_csv('../Dataset/CLINC150_subset.csv', index=False)
# The subset keeps every label, so its label list is identical to the full one
with open('../Dataset/CLINC150_subset_labels.json', 'w') as f:
    json.dump(intents.tolist(), f)

In [None]:
# Preprocessing for dataset BANKING77
#
# Maps the integer label ids of the parquet file to their label names and
# writes ../Dataset/BANKING77.csv (label, question, label_position) plus
# ../Dataset/BANKING77_labels.json (list of labels, index == label_position).

# Read the dataset from the parquet file
banking77 = pd.read_parquet('../Dataset/banking77.parquet')
# Get the questions text from each sample contained in the dataset
sample_ls = list(banking77['text'])
# Get the id of the target label from each sample contained in the dataset, i.e. the position of the target label in the original list of labels
target_ls = list(banking77['label'])
# Original list of labels; the index of each entry is its label id.
# NOTE(review): "Refund_not_showing_up" (capital R) and "reverted_card_payment?"
# (trailing '?') look unusual - presumably copied verbatim from the dataset's
# own label list; confirm before normalising them.
label_ls_initial = [
    "activate_my_card",
    "age_limit",
    "apple_pay_or_google_pay",
    "atm_support",
    "automatic_top_up",
    "balance_not_updated_after_bank_transfer",
    "balance_not_updated_after_cheque_or_cash_deposit",
    "beneficiary_not_allowed",
    "cancel_transfer",
    "card_about_to_expire",
    "card_acceptance",
    "card_arrival",
    "card_delivery_estimate",
    "card_linking",
    "card_not_working",
    "card_payment_fee_charged",
    "card_payment_not_recognised",
    "card_payment_wrong_exchange_rate",
    "card_swallowed",
    "cash_withdrawal_charge",
    "cash_withdrawal_not_recognised",
    "change_pin",
    "compromised_card",
    "contactless_not_working",
    "country_support",
    "declined_card_payment",
    "declined_cash_withdrawal",
    "declined_transfer",
    "direct_debit_payment_not_recognised",
    "disposable_card_limits",
    "edit_personal_details",
    "exchange_charge",
    "exchange_rate",
    "exchange_via_app",
    "extra_charge_on_statement",
    "failed_transfer",
    "fiat_currency_support",
    "get_disposable_virtual_card",
    "get_physical_card",
    "getting_spare_card",
    "getting_virtual_card",
    "lost_or_stolen_card",
    "lost_or_stolen_phone",
    "order_physical_card",
    "passcode_forgotten",
    "pending_card_payment",
    "pending_cash_withdrawal",
    "pending_top_up",
    "pending_transfer",
    "pin_blocked",
    "receiving_money",
    "Refund_not_showing_up",
    "request_refund",
    "reverted_card_payment?",
    "supported_cards_and_currencies",
    "terminate_account",
    "top_up_by_bank_transfer_charge",
    "top_up_by_card_charge",
    "top_up_by_cash_or_cheque",
    "top_up_failed",
    "top_up_limits",
    "top_up_reverted",
    "topping_up_by_card",
    "transaction_charged_twice",
    "transfer_fee_charged",
    "transfer_into_account",
    "transfer_not_received_by_recipient",
    "transfer_timing",
    "unable_to_verify_identity",
    "verify_my_identity",
    "verify_source_of_funds",
    "verify_top_up",
    "virtual_card_not_working",
    "visa_or_mastercard",
    "why_verify_identity",
    "wrong_amount_of_cash_received",
    "wrong_exchange_rate_for_cash_withdrawal",
]

# Every id must be a valid non-negative index into the label list; a negative
# id would otherwise be accepted by Python indexing and map to the wrong label.
assert all(0 <= target < len(label_ls_initial) for target in target_ls), \
    'label id outside the range of label_ls_initial'

# Translate each label id into its label name
label_ls = [label_ls_initial[target] for target in target_ls]

# Create df from sample_ls, label_ls and target_ls with columns 'question', 'label' and 'label_position'
df = pd.DataFrame(list(zip(sample_ls, label_ls, target_ls)), columns=['question', 'label', 'label_position'])
# order columns in df as 'label', 'question', 'label_position'
df = df[['label', 'question', 'label_position']]
display(df.head())

# Count rows whose question, label or label_position is missing. A value counts
# as empty when it is NaN/None or a blank / whitespace-only string; comparing
# with '' alone would miss NaN and can never match the integer label_position.
for column in ('question', 'label', 'label_position'):
    is_blank = df[column].isna() | (df[column].astype(str).str.strip() == '')
    print(f'Num of rows with empty {column}:', int(is_blank.sum()))

# Save the dataset to BANKING77.csv file
df.to_csv('../Dataset/BANKING77.csv', index=False)
# Save the list of labels to BANKING77_labels.json file as an array
with open('../Dataset/BANKING77_labels.json', 'w') as f:
    json.dump(label_ls_initial, f)
