In [None]:
import pandas as pd
import ipysheet
from utils.nlu_engine import NLUEngine
from utils.nlu_engine import MacroDataRefinement
from utils.nlu_engine import DataUtils
from utils.nlu_engine import IntentMatcher, LR

# Data set cleaning

### Macro NLU Data Refinement
It's a bit like the TV show [Severance](https://www.imdb.com/title/tt11280740/).

![Helly R and Mark S](https://media.npr.org/assets/img/2022/02/15/atv_severance_photo_010103-5f8033cc2b219ba64fe265ce893eae4c90e83896-s1100-c50.jpg "Helly R and Mark S")

*Helly R*: `My job is to scroll through the spreadsheet and look for the numbers that feel scary?`

*Mark S*: `I told you, you’ll understand when you see it, so just be patient.`

![MDR](https://www.imore.com/sites/imore.com/files/styles/large/public/field/image/2022/03/refinement-software-severance-apple-tv.jpg "Severance macrodata refinement")

*Helly R*: `That was scary. The numbers were scary.`

Hopefully the intents and entities that are wrong aren't scary, just a bit frustrating. Let's see if we can find the right ones.

* Example of intent classification of an utterance (this will probably be moved to an example notebook solely for the NLU engine)
* Example of entity recognition of an utterance (this will probably be moved to an example notebook solely for the NLU engine)
* Example of cleaning a data set: Macro NLU Data Refinement


## Load and overview of data set

In [None]:
# Load the annotated NLU data set from its CSV file.
nlu_data_df = DataUtils.load_data('NLU-Data-Home-Domain-Annotated-All.csv')

There are some issues with the answer_annotation not being similar to the answer_normalised. Therefore, we will make our own answer_normalised from the answer_annotation.

In [None]:
# Rebuild the normalised utterances from the annotated utterances.
nlu_data_df = DataUtils.convert_annotated_utterances_to_normalised_utterances(nlu_data_df)

In [None]:
# Overview of the data set; it is also passed to the intent-renaming step below.
nlu_data_info_df = MacroDataRefinement.get_data_info(
    nlu_data_df,
)
nlu_data_info_df

If there are overlapping intents over domains in the dataset, we will rename them to be unique.

In [None]:
# Rename intents that overlap across domains, using the overview computed above.
nlu_data_df = MacroDataRefinement.rename_overlapping_intents(
    nlu_data_df,
    nlu_data_info_df,
)

In [None]:
# Inspect the data set after the overlapping-intent renaming step.
nlu_data_df

## Intent classification

### Example of a single utterance

Both the intents and the domains (scenarios/skills) can be used to label an utterance. In this example we will use domains to label the utterances' intents. 

In [None]:
# Domain (scenario) label of every utterance.
domains = nlu_data_df['scenario'].values

# Train a classifier on the domain labels; the call returns the fitted model
# together with the vectorizer needed to predict on new utterances.
# NOTE(review): the report cell further down passes 'scenario' as
# labels_to_predict while this call passes 'domain' - confirm that
# train_intent_classifier accepts both spellings.
(LR_domain_classifier_model, tfidf_vectorizer) = NLUEngine.train_intent_classifier(
    data_df_path=nlu_data_df, labels_to_predict='domain', classifier=LR)


Example: Let's try to predict an utterance's intent label using the domains.

In [None]:
utterance = "turn off the kitchen lights"

# Predict the label with the model and vectorizer trained above.
predicted_label = IntentMatcher.predict_label(
    LR_domain_classifier_model,
    tfidf_vectorizer,
    utterance,
)
print(predicted_label)


### Create intent classifier report

In [None]:
# Column whose values are used as labels for the report.
domain_labels = 'scenario'

# Evaluate the LR classifier on those labels and keep the resulting report.
domain_report_df = NLUEngine.evaluate_intent_classifier(
    data_df_path=nlu_data_df, labels_to_predict=domain_labels, classifier=LR)

In [None]:
# Ascending sort: the weakest labels (lowest f1-score) come first.
domain_report_df.sort_values('f1-score')

## Entity extraction

The entity extraction could be greatly improved by improving the features it uses. It would be great if someone would take a look at this. Perhaps the CRF features similar to what Snips uses would be better (probably).

In [None]:
#TODO: implement brown clustering to improve entity extraction (see entity_extractor.py)

In [None]:
from utils.nlu_engine import EntityExtractor

It is important to have the NLTK tokenizer to be able to extract entities.

In [None]:
import nltk

# Entity extraction needs NLTK's punkt tokenizer data; download it only when
# the lookup shows it is not installed yet.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

### Example: Extracting entities from an utterance

In [None]:
# Train the entity model on the whole data set.
crf_model = NLUEngine.train_entity_classifier(
    data_df=nlu_data_df,
)

Example: Let's try an example utterance for entity extraction.

In [None]:
# Example utterance for the entity extraction demo below.
utterance = "wake me up at five pm this week"

We can get the entity tags of a specific utterance with the EntityExtractor.

In [None]:
# Entity tags for the example utterance, as predicted by the trained model.
EntityExtractor.get_entity_tags(
    utterance,
    crf_model,
)

We can also get the entity tagged utterance with the NLUEngine.

In [None]:
# Same example utterance, returned with its entities tagged.
entity_tagged_utterance = NLUEngine.create_entity_tagged_utterance(utterance, crf_model)
entity_tagged_utterance


### Entity extraction report

Due to this error featured in [this git issue](https://github.com/TeamHG-Memex/sklearn-crfsuite/issues/60) we have to use an older version of scikit learn (sklearn<0.24), otherwise the latest version would work. Hopefully this gets fixed one day..

In [None]:
# Entity classifier report for the data set before any cleaning.
entity_report_df = NLUEngine.evaluate_entity_classifier(
    data_df=nlu_data_df,
)

In [None]:
# Ascending sort: the entity types with the lowest f1-score come first.
entity_report_df.sort_values('f1-score')

In [None]:
#TODO: Benchmark the state features to find the best and the worst, remove/replace worst: add in state features like here: https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html#let-s-check-what-classifier-learned
# Specifically, we want print_state_features()

## Cleaning the dataset
Now that we know what works and what doesn't, we can clean the dataset.

We don't want all of the columns, so we will drop some to review the data set

In [None]:
# Columns that are not needed while reviewing the data set.
columns_not_reviewed = ['userid', 'notes', 'answer', 'answerid', 'suggested_entities']

nlu_scenario_df = nlu_data_df.drop(columns=columns_not_reviewed)


Pick a domain (scenario) to review

For this example we are going to pick 'alarm'. The intent classification isn't bad, but the entity extraction for alarm_type is terrible. Perhaps it overlaps with another entity type, like 'event_name'. We will try to fix this.

In [None]:
# Restrict the review frame to the 'alarm' domain.
is_alarm_scenario = nlu_scenario_df['scenario'] == 'alarm'
nlu_scenario_df = nlu_scenario_df[is_alarm_scenario]
nlu_scenario_df

### Cleaning incorrect intents for the domain

TODO: add in description of the types of fixes we can do to the NLU data for intent
* intents that collide with other intents and how to fix them (separation by TFIDF terms and using checkboxes in ipysheet to annotate them into the correct intent): this leads to the visualization of the intents in the NLU data with venn word cloud diagrams
* utterances that are grammatically incorrect or contain incorrect spelling (grammar checker in the future?)
* utterances that are straight up wrong for the intent
* utterances that actually seem to contain multiple intents (this isn't supported by default)

Let's train an intent classifier on the whole data set for labeling intents and get the incorrect results for the intents on the domain we want to clean.
(why not split a training test set? Because we want to see the results of the intent classifier on the whole data set, I mean if it's still getting it wrong when it has trained on it, then perhaps there is something wrong with the utterance, tagging, overlapping intents, etc.)

In [None]:
# Train on the whole data set, this time with the intents as labels.
# Note: this rebinds tfidf_vectorizer, replacing the one returned by the
# domain classifier training earlier in the notebook.
(LR_intent_classifier_model, tfidf_vectorizer) = NLUEngine.train_intent_classifier(
    data_df_path=nlu_data_df, labels_to_predict='intent', classifier=LR)


In [None]:
# Utterances of the selected domain whose predicted label is incorrect.
incorrect_intent_predictions_df = IntentMatcher.get_incorrect_predicted_labels(
    nlu_scenario_df,
    LR_intent_classifier_model,
    tfidf_vectorizer,
)
incorrect_intent_predictions_df


In [None]:
# Report on the incorrectly predicted intents of the selected domain.
MacroDataRefinement.get_incorrect_predicted_intents_report(
    nlu_scenario_df,
    incorrect_intent_predictions_df,
)

### TODO: human in the for loop. 

Your job is to go through for each intent by their incorrect predicted intent and refine it. You can do this by:
...

Also note: some of the incorrectly predicted utterances are actually fine the way they are; you may need to review the intent that is falsely being predicted instead.

1. Does the utterance fit to the intent? -> mark as move, remove, or review

2. Is the utterance grammar or spelling wrong but 1 is fine? -> correct the utterance

3. Is this intent colliding with another intent because the scopes of both intents overlap? -> redefine the scope of the intents (either combine them or separate their functionality better)

4. Is the intent colliding with another intent because certain keywords overlap between intents? -> redefine the keywords to split between intents or merge them together if they are similar

In [None]:
# TODO: from here it's all just a work in progress. These 4 flows should be implemented in a human for loop pipeline with ipysheets.
# TODO: at the end of each flow, the dataframe will be appended with a column to indicate MDR was successfully completed. This way users can keep track of what they have refined.

In [None]:
# TODO: fix intent issues by removing incorrect utterances or fixing them otherwise
# TODO: export them and integrate them into the training data

In [None]:
# Intent with the most incorrect predictions in the selected domain.
# It is derived directly from the incorrect predictions, because no
# `domain_intent_counts_df` is defined anywhere earlier in this notebook
# (the original lookup raised NameError on a fresh kernel).
# value_counts() sorts descending, so the first entry is the worst intent.
most_incorrect_intent = (
    incorrect_intent_predictions_df['intent'].value_counts().head(1).index
)

# All incorrect predictions that belong to that intent.
incorrect_intent_df = incorrect_intent_predictions_df[
    incorrect_intent_predictions_df['intent'].isin(most_incorrect_intent)]

# The label this intent is most often mistaken for.
most_incorrect_prediction = (
    incorrect_intent_df['predicted_label'].value_counts().head(1).index.tolist()
)

# Worst intent, restricted to its most frequent wrong prediction.
incorrect_intent_for_worst_performing_intent_and_prediction_df = incorrect_intent_df[
    incorrect_intent_df['predicted_label'].isin(most_incorrect_prediction)]
incorrect_intent_for_worst_performing_intent_and_prediction_df


In [None]:
# Utterances whose intent is the label the worst intent is most often mistaken for.
has_confused_intent = nlu_data_df['intent'].isin(most_incorrect_prediction)
nlu_data_df[has_confused_intent]


In [None]:
# TODO: turn this into a function
# TODO: drop answer_normalised and question (?)
# TODO: go by specific intents from the most incorrect intents, e.g.
# incorrect_intent_df = incorrect_intent_predictions_df[incorrect_intent_predictions_df['intent'] == 'alarm_set']
incorrect_intent_df = (
    incorrect_intent_for_worst_performing_intent_and_prediction_df
    .drop(columns=['answer_normalised', 'question'])
    # Boolean flag columns for the reviewer; every row starts unflagged.
    .assign(review=False, remove=False)
)

# Build the sheet widget once. The second name is kept only as an alias so
# that both names refer to the sheet the reviewer actually edits.
incorrect_intent_sheet = ipysheet.from_dataframe(incorrect_intent_df)
incorrect_intent_df_sheet = incorrect_intent_sheet
incorrect_intent_sheet

Besides some incorrect utterances and intents, we can see that there is an overlap between the intent 'alarm_set' and the intent 'calendar_set'. This is because those two intents are not well defined and will require refinement. We will try to fix this.

In [None]:
#TODO: find separation criteria for alarm_set and calendar_set: get most popular tfidf words for each intent and assign them as the separation criteria. e.g.
# alarm -> alarm_set
# reminder -> calendar_set
# is alert alarm or reminder?
# wakeup or wake up -> alarm_set
# get up -> alarm_set
# timer -> alarm_set

#TODO: this will be visually represented in a venn diagram with word clouds

As we have seen from the entity extraction report, the entity extraction is not working for the alarm_type.

In [None]:
# TODO: move this below the intent cleaning flow
# Keep only the utterances whose annotation mentions the alarm_type entity.
# regex=False makes this a plain substring match; na=False treats a missing
# annotation as "no match" instead of producing NaN, which would make the
# boolean mask invalid for indexing.
has_alarm_type = nlu_scenario_df['answer_annotation'].str.contains(
    'alarm_type', regex=False, na=False)
nlu_scenario_df = nlu_scenario_df[has_alarm_type]

## Convert to ipysheet and review

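We mark each entry with the following flags (the sheet below currently only provides checkboxes for **review** and **remove**):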
* **review**: The entry should be further reviewed
* **remove**: We will drop the entry from the data set.
* **move**: We will move the entry to another domain.

Look at each utterance, check the following:
* Is the utterance grammatically correct (and spelled correctly)?
* Is the utterance in the correct language?
* Is the utterance in the correct domain?
* Is the utterance in the correct format?
* Does the utterance actually make sense? (i.e. does it make sense to say it?)

If you are unsure, mark  **review**.

In [None]:
#TODO: refactor this by moving the introduction above the intent cleaning flow and introduce here the entity cleaning flow

In [None]:
# TODO: turn this into a function
# Boolean flag columns for the reviewer; every row starts unflagged.
nlu_scenario_df = nlu_scenario_df.assign(review=False, remove=False)

# Build the sheet widget once. The second name is kept only as an alias so
# that both names refer to the sheet the reviewer actually edits.
nlu_scenario_sheet = ipysheet.from_dataframe(nlu_scenario_df)
nlu_scenario_df_sheet = nlu_scenario_sheet
nlu_scenario_sheet

For the example with 'alarm' and the alarm_type: 
* We see that the alarm_type entities are really event_name (ie wake up, soccer practice) except for ID 5879, we will need to change them to event_name and remove ID 5879.
* The last one (ID 6320) is a mistake. Someone got confused with the prompt and assumed alarm is a security system. This is out of scope for the alarm domain, as the alarms are ones set on a phone or other device. We will drop this utterance.

Once you are done reviewing, you convert it back to a dataframe and check to make sure it looks okay.

In [None]:
# Read the reviewer's edits back out of the sheet widget.
reviewed_scenario_df = ipysheet.to_dataframe(nlu_scenario_sheet)

# Make the index numeric again so that it can be matched against the index of
# nlu_data_df when the review is merged back.
numeric_index = pd.to_numeric(reviewed_scenario_df.index)
reviewed_scenario_df.index = numeric_index

reviewed_scenario_df.tail(50)

Let's change all alarm_type entities to event_name.

In [None]:
# Relabel every alarm_type entity in the annotations as event_name.
annotations = reviewed_scenario_df['answer_annotation']
reviewed_scenario_df['answer_annotation'] = annotations.str.replace('alarm_type', 'event_name')

In [None]:
# Inspect the reviewed frame after alarm_type was replaced with event_name.
reviewed_scenario_df

Okay dokey, now we can merge this with the original data set and see if it made a difference already (well of course it did!).

In [None]:
# Turn the 'remove' flags into a real boolean mask first. The original filter
# `~col == True` parses as `(~col) == True`; on an object-dtype column `~` is
# applied per element as integer bitwise NOT (~True == -2), so the filter
# would silently keep no rows at all.
marked_for_removal = reviewed_scenario_df['remove'].astype(bool)

# Drop the utterances flagged for removal from the full data set.
# errors='ignore' keeps the cell re-runnable when those rows are already gone.
nlu_data_df = nlu_data_df.drop(
    index=reviewed_scenario_df[marked_for_removal].index, errors='ignore')

# Keep only the reviewed utterances that stay in the data set.
reviewed_scenario_df = reviewed_scenario_df[~marked_for_removal]

# Copy the corrected annotations back for the rows both frames share.
shared_index = nlu_data_df.index.intersection(reviewed_scenario_df.index)
nlu_data_df.loc[shared_index, 'answer_annotation'] = reviewed_scenario_df.loc[
    shared_index, 'answer_annotation']


In [None]:
# Check the merge: alarm utterances that now carry an event_name entity.
in_alarm_scenario = nlu_data_df['scenario'].str.contains('alarm')
has_event_name = nlu_data_df['answer_annotation'].str.contains('event_name')
nlu_data_df[in_alarm_scenario & has_event_name]


### Benchmark changed data set
TODO: repeat reports for the changed data set for domain and entities and compare

In [None]:
# Re-run the entity report on the changed data set; lowest f1-score first.
entity_reviewed_report_df = NLUEngine.evaluate_entity_classifier(
    data_df=nlu_data_df,
)
entity_reviewed_report_df.sort_values('f1-score')

If you are sure it is okay, you can save it as a csv file, make sure to name it correctly (i.e. `alarm_domain_first_review.csv`)

In [None]:
# Persist the reviewed alarm domain; the index is written as the first column.
reviewed_scenario_df.to_csv(
    'alarm_domain_first_review.csv',
)


Load it back up and check to make sure it looks okay. Make sure to give it the right name!

In [None]:
# Reload the saved alarm review, using the first CSV column as the index.
# The frame is named after the domain it holds (alarm); the previous
# `audio_...` name is kept as an alias for anything still referring to it.
alarm_domain_first_review_df = pd.read_csv(
    'alarm_domain_first_review.csv', index_col=0)
audio_domain_first_review_df = alarm_domain_first_review_df
alarm_domain_first_review_df.tail(50)

In [None]:
# TODO: implement the evaluate_classifier in the NLU engine to check f1 score for intents and entities in the domain vs original NLU data of domain!
# Value: benchmark!

In [None]:
#TODO: implement a flow for getting the domains with the lowest f1 scores by intent/domain and entities and cleaning them by the order of the lowest f1 scores

In [None]:
# TODO: concat all reviewed dfs and save to csv

In [None]:
# TODO: add benchmark for whole NLU data set before and after cleaning! (by intents and domains!)
# TODO: review the review marked entries
# TODO: add new column for notes
# TODO: change flow of review for only ones that should be reviewed, not all of the ones that have been changed (track changes by comparing against the original data set)
# TODO: do the changed utterances have to be changed in other fields too, or is it enough to change just the tagged utterance field?

In [None]:
# TODO: add visualizations of domains, their intents, keywords in utterances, and entities to top