In [52]:
import pandas as pd
import os

DATASET_PATH = './cleaned-datasets/atis/'
HF_DATASET_URL = "hf://datasets/tuetschek/atis/"

# Maps the local split name to the CSV file name under HF_DATASET_URL.
splits = {'train': 'atis_train.csv', 'test': 'atis_test.csv'}


def _fetch_split(split: str) -> pd.DataFrame:
    """Read one split from HF_DATASET_URL, drop its ``id`` column, and save it.

    Args:
        split: A key of ``splits`` (``'train'`` or ``'test'``).

    Returns:
        The loaded frame without the ``id`` column; the same frame is written
        to ``<DATASET_PATH>/<split>.csv`` without the index.
    """
    df = pd.read_csv(HF_DATASET_URL + splits[split]).drop(columns='id')
    # os.path.join avoids the doubled slash that plain concatenation produces
    # because DATASET_PATH already ends with '/'.
    df.to_csv(os.path.join(DATASET_PATH, f'{split}.csv'), index=False)
    return df


# The output directory must exist before any split is written.
os.makedirs(DATASET_PATH, exist_ok=True)
train_df = _fetch_split('train')
test_df = _fetch_split('test')

In [53]:
def display_intents(dataset_type: str) -> None:
    """Print each intent's row count and percentage share for one saved split.

    Args:
        dataset_type: ``"train"`` or ``"test"``; selects ``train.csv`` or
            ``test.csv`` under ``DATASET_PATH``.

    Raises:
        ValueError: If ``dataset_type`` is neither ``"train"`` nor ``"test"``.
    """
    filenames = {"train": "train.csv", "test": "test.csv"}
    if dataset_type not in filenames:
        # Fail fast with a clear message instead of reaching the code below
        # with `df` never assigned.
        raise ValueError(
            f"Unknown dataset_type {dataset_type!r}; expected one of {sorted(filenames)}"
        )

    df = pd.read_csv(os.path.join(DATASET_PATH, filenames[dataset_type]))

    total_rows = len(df)
    # value_counts() yields intents ordered from most to least frequent.
    for intent, count in df['intent'].value_counts().items():
        percentage = (count / total_rows) * 100
        print(f"{intent}: {count} ({percentage:.2f}%)")

In [54]:
def remove_low_occurrence_intents_train(min_percentage: float = 3):
    """Drop rare intents from the saved train split and overwrite the file.

    An intent is removed when its share of the rows in ``train.csv`` (as read,
    before any removal) is strictly below ``min_percentage``; intents at or
    above the threshold are kept.

    Args:
        min_percentage: Minimum share of the train rows, in percent, that an
            intent needs in order to be kept. Defaults to 3.
    """
    train_path = os.path.join(DATASET_PATH, 'train.csv')
    df = pd.read_csv(train_path)

    print(f"\nKeeping intents with occurrence of more than {min_percentage:g}% of the dataset:")

    # Measure every intent against the ORIGINAL row count. Re-reading len(df)
    # while rows are being removed shrinks the denominator, which can push a
    # below-threshold intent over the bar.
    total_rows = len(df)
    intent_percentages = (df['intent'].value_counts() / total_rows) * 100
    rare_intents = intent_percentages[intent_percentages < min_percentage].index

    # Negated isin (rather than keeping a whitelist) leaves rows with a missing
    # intent untouched, since value_counts() never reports them as rare.
    df = df[~df['intent'].isin(rare_intents)]

    df.to_csv(train_path, index=False)


def remove_uncommon_intents_test():
    """Restrict the saved test split to intents present in the saved train split.

    Reads ``train.csv`` and ``test.csv`` from ``DATASET_PATH``, keeps only the
    test rows whose ``intent`` value also occurs in the train file, and writes
    the result back over ``test.csv`` (without the index).
    """
    print("\nRemoving intents not found in cleaned train dataset from test dataset:")
    test_path = DATASET_PATH + '/test.csv'

    # Distinct intent labels currently present in the train file.
    known_intents = pd.read_csv(DATASET_PATH + '/train.csv')['intent'].unique()

    test_rows = pd.read_csv(test_path)
    seen_in_train = test_rows['intent'].isin(known_intents)
    test_rows.loc[seen_in_train].to_csv(test_path, index=False)


In [55]:
# Intent distribution of each split before any filtering.
for split in splits:
    print("\n" + split.upper() + " DATASET INTENTS:")
    display_intents(split)

# Order matters: train is pruned first, then test is restricted to the intents
# still present in the rewritten train file.
remove_low_occurrence_intents_train()
remove_uncommon_intents_test()

# Intent distribution of each split after filtering.
for split in splits:
    print("\n" + split.upper() + " DATASET INTENTS:")
    display_intents(split)


TRAIN DATASET INTENTS:
flight: 3666 (73.64%)
airfare: 423 (8.50%)
ground_service: 255 (5.12%)
airline: 157 (3.15%)
abbreviation: 147 (2.95%)
aircraft: 81 (1.63%)
flight_time: 54 (1.08%)
quantity: 51 (1.02%)
flight+airfare: 21 (0.42%)
airport: 20 (0.40%)
distance: 20 (0.40%)
city: 19 (0.38%)
ground_fare: 18 (0.36%)
capacity: 16 (0.32%)
flight_no: 12 (0.24%)
meal: 6 (0.12%)
restriction: 6 (0.12%)
airline+flight_no: 2 (0.04%)
ground_service+ground_fare: 1 (0.02%)
airfare+flight_time: 1 (0.02%)
cheapest: 1 (0.02%)
aircraft+flight+flight_no: 1 (0.02%)

TEST DATASET INTENTS:
flight: 632 (70.77%)
airfare: 48 (5.38%)
airline: 38 (4.26%)
ground_service: 36 (4.03%)
abbreviation: 33 (3.70%)
capacity: 21 (2.35%)
airport: 18 (2.02%)
flight+airfare: 12 (1.34%)
distance: 10 (1.12%)
aircraft: 9 (1.01%)
flight_no: 8 (0.90%)
ground_fare: 7 (0.78%)
meal: 6 (0.67%)
city: 6 (0.67%)
quantity: 3 (0.34%)
day_name: 2 (0.22%)
flight_time: 1 (0.11%)
airfare+flight: 1 (0.11%)
flight+airline: 1 (0.11%)
flight_no+