### Imports

In [None]:
import sys
sys.path.append("../")
from IPython.display import display, Markdown, HTML

from preprocessing.document_manager import DocumentManager

### Load Dataset

In [None]:
# Create the project's DocumentManager and have it cache its documents up
# front. Later cells read `doc_manager.documents` and
# `doc_manager.get_all_sentences()` from this instance.
# NOTE(review): what exactly `cache_documents()` loads is defined in
# preprocessing.document_manager — confirm there.
doc_manager = DocumentManager()
doc_manager.cache_documents()

### Plot Distribution of Emotions by Document and Sentence
If a document (or sentence) has a non-zero value for an emotion, we count it as '1' for that emotion when producing the distribution.

In [None]:
from collections import Counter


def _count_present_labels(items):
    """Count, per emotion label, how many items have a value above zero.

    Args:
        items: Iterable of objects exposing ``data["emotion_labels"]`` as a
            mapping of label -> numeric value.

    Returns:
        A plain ``dict`` mapping each label that appeared with a value
        greater than 0.0 to the number of items in which it did so. Labels
        are kept in first-seen order; labels never above zero are absent.
    """
    counts = Counter(
        label
        for item in items
        for label, value in item.data["emotion_labels"].items()
        if value > 0.0
    )
    # Hand back a plain dict so printing and the plotting cells see the same
    # type (and the same repr) as the original hand-rolled dictionaries.
    return dict(counts)


# Presence counts at document level and at sentence level.
cumulative_emotion = _count_present_labels(doc_manager.documents)
cumulative_sentence_emotions = _count_present_labels(doc_manager.get_all_sentences())

print(cumulative_emotion)
print(cumulative_sentence_emotions)

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the per-label document counts gathered in `cumulative_emotion`:
# one bar per label, with the label names as x-axis ticks.
print("Occurences of emotion labels in for documents:")
doc_bar_positions = range(len(cumulative_emotion))
doc_bar_heights = list(cumulative_emotion.values())
doc_bar_names = list(cumulative_emotion.keys())
plt.bar(doc_bar_positions, doc_bar_heights, align="center")
# The trailing semicolon keeps the notebook from echoing the return value.
plt.xticks(doc_bar_positions, doc_bar_names);

In [None]:
# Bar chart of the per-label sentence counts gathered in
# `cumulative_sentence_emotions`: one bar per label, label names as x ticks.
print("Occurences of emotion labels in for all sentences:")
sentence_bar_positions = range(len(cumulative_sentence_emotions))
sentence_bar_heights = list(cumulative_sentence_emotions.values())
sentence_bar_names = list(cumulative_sentence_emotions.keys())
plt.bar(sentence_bar_positions, sentence_bar_heights, align="center")
# The trailing semicolon keeps the notebook from echoing the return value.
plt.xticks(sentence_bar_positions, sentence_bar_names);

### Conclusions
* Given the observed imbalance in the label classes, we'll introduce a balanced sampler before fitting the model.

### Sample a document

In [None]:
# Pin the stdlib RNG to a fixed seed so this notebook is repeatable from run
# to run. NOTE(review): this only fixes the sampled document if
# `get_random_document()` draws from the `random` module — confirm.
import random

seed_constant = 1338
random.seed(a=seed_constant)

In [None]:
# Draw one document; the call yields a pair that is unpacked into the
# document's number `i` (shown in the heading below) and the document itself.
i, document = doc_manager.get_random_document()
# Prepare the document's cached data before it is rendered.
# NOTE(review): presumably this is what populates `document.data` — confirm
# in the Document class.
document.cache_data()

In [None]:
# Render the sampled document as Markdown: a heading with its number, then
# the title's polarity, emotion labels and text, and finally the body.
display(Markdown("### Document #%s" % i))

# Each title field is looked up and shown in turn, in the order listed.
for template, field in (
    ("Polarity: %s", "polarity"),
    ("Emotion Labels: \n    %s", "emotion_labels"),
    ("### %s", "text"),
):
    display(Markdown(template % document.data["title"][field]))

# The body markup is produced by the document from its own paragraphs.
body_text = document.get_body_html(document.paragraphs)
display(Markdown(body_text))