In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load data
Before we can analyse the data, we first need to load the data that was provided to us by the UU. This data originates from the Dialog State Tracking Challenge (https://www.microsoft.com/en-us/research/event/dialog-state-tracking-challenge/). Since the data is a .dat file rather than a comma-separated file, we load it with the pd.read_table() function instead of the more commonly used pd.read_csv(). This gives us the following dataframe:

In [None]:
# The file has no header row, so every line is read into a single column named "data".
df = pd.read_table(
    "../data/dialog_acts.dat",
    names=["data"],
    header=None,
)

In [None]:
# Inspect the freshly loaded frame; at this point it only holds the raw "data" column.
df

# Correctly setting up axis
The .dat file contains no headers indicating which words represent the dialog act and which represent the utterance. To distinguish between the two, we split each line on the whitespace between the words: the first word is the act, and the remaining words together form the utterance.

In [None]:
def get_features(row):
    """Split one raw line into its dialog act and its utterance.

    The text before the first space is the act; everything after that space
    is the utterance (an empty string when the line contains no space).

    Parameters
    ----------
    row : str
        One raw line from the data file.

    Returns
    -------
    pd.Series
        Two values: the act followed by the utterance.
    """
    # partition cuts at the first space only, leaving the rest of the line untouched.
    label, _, text = row.partition(" ")
    return pd.Series([label, text])

In [None]:
# Run get_features over every raw line and store the two resulting fields as new columns.
df[["act", "utterance"]] = df["data"].apply(get_features)

In [None]:
# Preview the first rows to confirm the new "act" and "utterance" columns look right.
df.head()

# Cleaning
Before we look at the data, we want to check whether it needs cleaning before use. We go through the following steps: checking for occurrences of NaN values, setting all words to lowercase, and removing any leading whitespace that occurs in the utterances.

## Check NaN's

In [None]:
# Count missing values per column; any non-zero count means those rows need handling first.
df.isna().sum()

## Set lowercase

In [None]:
# Normalise casing so identical acts/utterances are not split by capitalisation.
for text_column in ("act", "utterance"):
    df[text_column] = df[text_column].str.lower()

## Check leading spaces

In [None]:
# Remove whitespace at the start of each utterance (e.g. left over when the act and the
# utterance were separated by more than one space); trailing whitespace is left as-is.
df['utterance'] = df['utterance'].str.lstrip()

# Distributions

## Act distribution

In [None]:
# "act" is categorical, so a log scale on the x (data) axis is meaningless. The act
# frequencies are heavily skewed, so the log scale belongs on the count axis instead.
fig, ax = plt.subplots()
sns.histplot(
    data=df,
    x="act",
    edgecolor=".3",
    linewidth=0.5,
    ax=ax,
)
ax.set_yscale("log")
ax.set(
    title="Number of utterances per dialog act",
    xlabel="Dialog act",
    ylabel="Count (log scale)",
)

# Rotate the act names so neighbouring labels do not overlap.
ax.tick_params(axis="x", labelrotation=35)

## Character count distribution

In [None]:
# Measure each utterance's length in characters, then average those lengths within each act.
utterance_lengths = df["utterance"].str.len()
mean_char_count_per_act = utterance_lengths.groupby(df["act"]).mean()

In [None]:
# Move the "act" index into a regular column so the result can be passed to seaborn;
# the "utterance" column of this frame holds the mean character count, not text.
mccpa = mean_char_count_per_act.reset_index()

In [None]:
# The "utterance" column holds the mean character count per act, so label the axes
# explicitly instead of relying on the misleading default column name.
fig, ax = plt.subplots()
sns.barplot(data=mccpa, x="act", y="utterance", ax=ax)
ax.set(
    title="Mean utterance length per dialog act",
    xlabel="Dialog act",
    ylabel="Mean utterance length (characters)",
)

# Rotate the act names so neighbouring labels do not overlap.
ax.tick_params(axis="x", labelrotation=35)