In [2]:
from preprocess import EmotionStimulus, DailyDialog, ISEAR
from utils import NCRlexicon2dict, lexicon_one_hot_encoding, sentence_strength
from typing import Dict
import numpy as np
import collections

In [None]:
# Load sentences through the EmotionStimulus preprocessor; `texts` feeds the cells below.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
# NOTE(review): later cells rebind `preprocessor` and `texts`, so on a top-to-bottom run
# the last dataset cell executed decides which corpus is used.
preprocessor = EmotionStimulus('/Users/yarikmenchaca/Documents/Datasets/Emotion_Cause/')
texts = preprocessor.sentences

In [3]:
# Load sentences through the DailyDialog preprocessor (rebinds `preprocessor` and `texts`).
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
preprocessor = DailyDialog('/Users/yarikmenchaca/Documents/Datasets/ijcnlp_dailydialog')
texts = preprocessor.sentences


In [7]:
# Number of entries in the current preprocessor's `sentence_tags`.
len(preprocessor.sentence_tags)

102979

In [8]:
# Load sentences through the ISEAR preprocessor (rebinds `preprocessor` and `texts`).
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
preprocessor = ISEAR('/Users/yarikmenchaca/Documents/Datasets/eng_dataset.csv')
texts = preprocessor.sentences

In [12]:
# Emotion labels exposed by the current preprocessor.
preprocessor.emotions

{'anger', 'fear', 'joy', 'sadness'}

# NRC labels
Get each text's emotion encoding and strength using the NRC Emotion Lexicon.

In [None]:
# Load the NRC word-level emotion lexicon via the project helper.
# Presumably a dict keyed by emotion name (a later cell calls `.keys()` on it) — confirm in utils.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
ncr_lexicon = NCRlexicon2dict('/Users/yarikmenchaca/Documents/Datasets/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt')

In [None]:
# Lexicon-derived emotion encoding, one entry per sentence.
emotion_encoding = [lexicon_one_hot_encoding(text, ncr_lexicon) for text in texts]

# Lexicon-derived strength, one entry per sentence. Each value is wrapped in a
# one-element list so it can later be stacked column-wise next to the encoding.
sentences_strength = [[sentence_strength(text, ncr_lexicon)] for text in texts]

In [None]:
# Sanity check: one encoding and one strength per sentence, so all three lengths should match.
len(emotion_encoding), len(sentences_strength), len(texts)

# Preprocess sentences

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.utils as ku

In [None]:
# Fit a word-level tokenizer on the corpus; words outside the vocabulary map to "<OOV>".
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)

# Vocabulary size plus one, since index 0 is never assigned to a word.
total_words = 1 + len(tokenizer.word_index)
print(total_words)

In [None]:
# Build n-gram prefixes: every sentence contributes its prefixes of length 2..len.
# The emotion encoding and strength are repeated once per prefix so the three
# lists stay aligned row by row.
input_sentences = list()
expanded_emotion_encodign = list()
expanded_setence_strength = list()

token_sequences = tokenizer.texts_to_sequences(texts)
for token_list, emotion, s_strength in zip(token_sequences, emotion_encoding, sentences_strength):
    prefixes = [token_list[:end] for end in range(2, len(token_list) + 1)]
    input_sentences.extend(prefixes)
    expanded_emotion_encodign.extend([emotion] * len(prefixes))
    expanded_setence_strength.extend([s_strength] * len(prefixes))

In [None]:
# Pad every n-gram to the length of the longest one.
# NOTE: this rebinds `input_sentences` from a list of lists to a 2-D array,
# so the cell is not meant to be re-run on its own output.
max_sequence_len = max(len(sequence) for sequence in input_sentences)
input_sentences = np.asarray(pad_sequences(input_sentences, maxlen=max_sequence_len))
print(max_sequence_len)

In [None]:
# The last token of each padded n-gram is the target; everything before it is the input.
predictor = input_sentences[:, :-1]
label = input_sentences[:, -1]

# One-hot encode the target word ids over the whole vocabulary.
word_label = ku.to_categorical(label, num_classes=total_words)

In [None]:
# Sanity check: the expanded condition lists must have one row per n-gram in `predictor`.
len(predictor), len(expanded_emotion_encodign), len(expanded_setence_strength)

In [None]:
# Per-row widths: token columns, emotion-encoding columns, strength columns.
len(predictor[0]),len(expanded_emotion_encodign[0]), len(expanded_setence_strength[0])

In [None]:
# Condition matrix: emotion-encoding columns followed by the strength column(s), one row per n-gram.
emotion_matrix = np.array(expanded_emotion_encodign)
strength_matrix = np.array(expanded_setence_strength)
conditional_vector = np.concatenate([emotion_matrix, strength_matrix], axis=1)
print(predictor.shape, conditional_vector.shape)


In [None]:
# Single-input variant: token columns and condition columns side by side in one matrix.
single_vector = np.concatenate([predictor, conditional_vector], axis=1)
print(single_vector.shape)

# LSTM Model

In [None]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, concatenate, Flatten
from tensorflow.keras.models import Sequential
import tensorflow as tf

In [None]:
# Scratch cell. The bare name below is not the cell's last expression, so nothing is displayed.
max_sequence_len
# NOTE(review): `conditiona` is not referenced anywhere else in the visible notebook — likely a leftover.
conditiona = conditional_vector

In [None]:
# Functional API variant: two inputs (token ids + condition vector) -> next-word distribution.
# NOTE: this cell only wires layers together; it does not wrap them in a tf.keras.Model.

# Input layers. `shape` must be a tuple, and the text input is as wide as
# `predictor` (one column fewer than `max_sequence_len`, because the last
# token of every n-gram was split off as the label).
text_input = tf.keras.Input(shape=(predictor.shape[1],), name = 'text')
conditional_input = tf.keras.Input(shape=(conditional_vector.shape[1],), name = 'condition')

# Embedding layer: (batch, steps) -> (batch, steps, 300).
text_embedding = Embedding(total_words, 300, name = 'text_embedding')(text_input)

# LSTM requires a 3-D (batch, steps, features) input. Flattening the embeddings
# would hand it a 2-D tensor, so the condition vector is instead repeated for
# every time step and appended to each token embedding.
repeated_condition = tf.keras.layers.RepeatVector(predictor.shape[1])(conditional_input)
concatenate_layer = concatenate([text_embedding, repeated_condition], name = 'Concatenate')
lstm_layer = LSTM(150)(concatenate_layer)
dense1_layer = Dense(200, activation='relu')(lstm_layer)
dense2_layer = Dense(200, activation='relu')(dense1_layer)

# Softmax over the vocabulary: probability of each word being the next token.
classification = Dense(total_words, activation='softmax')(dense2_layer)

In [None]:
# Single-input model trained on `single_vector` (token columns + condition columns).
# NOTE(review): the condition columns go through the same Embedding as the token ids,
# so their values are treated as vocabulary indices — confirm this is intended.
model = Sequential([
    Embedding(total_words, 300, input_length=single_vector.shape[1]),
    LSTM(150),
    Dense(200, activation='relu'),
    Dense(200, activation='relu'),
    # Softmax over the vocabulary: probability of each word being the next token.
    Dense(total_words, activation='softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

In [None]:
# Train on the combined (tokens + condition) matrix against the one-hot next-word targets.
# NOTE(review): no random seed is set in the visible notebook, so runs are not reproducible.
history = model.fit(single_vector, word_label, epochs=20, batch_size = 20)

In [None]:
# Save the trained model (HDF5 file, written to the current working directory).
# NOTE(review): the file name says "emotion_cause" regardless of which dataset cell was run.
model.save('affect_ml_emotion_cause.h5')

In [None]:
import io
import json

# Persist the fitted tokenizer next to the model so text can be encoded identically later.
# The string returned by to_json() is JSON-encoded once more on write, so a reader has to
# json.load() the file first to get the tokenizer's JSON string back.
tokenizer_json = tokenizer.to_json()
with io.open('affect_ml_emotion_cause_tokenizer.json', 'w', encoding='utf-8') as f:
    json.dump(tokenizer_json, f, ensure_ascii=False)

# Generate Text

In [None]:
def generate_conditional_text(seed_text: str,
                              seed_emotion: str,
                              seed_strength: float,
                              next_words: int = 20)-> str:
    """Extend ``seed_text`` word by word, conditioned on an emotion and a strength.

    Relies on the notebook-level ``tokenizer``, ``model``, ``ncr_lexicon`` and
    ``max_sequence_len``.

    Args:
        seed_text: Text to continue.
        seed_emotion: Emotion name; matched against the keys of ``ncr_lexicon``.
            A name that matches no key yields an all-zero emotion encoding.
        seed_strength: Numeric strength appended to the emotion encoding.
        next_words: Number of words to append.

    Returns:
        ``seed_text`` followed by ``next_words`` space-separated predictions.
        A predicted index with no associated word contributes an empty string.
    """
    # Condition vector: one-hot emotion (in lexicon key order) followed by the strength.
    seed_emotion_encoding = np.array(
        [1.0 if emotion == seed_emotion else 0.0 for emotion in ncr_lexicon]
    )
    seed_conditional_vector = np.concatenate(
        (seed_emotion_encoding, np.array([seed_strength], dtype=float))
    )

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1)

        # The model was trained on [tokens | condition] rows, so the condition
        # vector must be appended to the padded tokens before predicting.
        token_list_conditional = np.concatenate((token_list[0], seed_conditional_vector))
        model_input = token_list_conditional[np.newaxis, :]

        # `predict_classes` no longer exists in recent tf.keras; take the argmax
        # of the predicted distribution instead.
        probabilities = model.predict(model_input, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        # Direct index -> word lookup instead of scanning `word_index`.
        output_word = tokenizer.index_word.get(predicted, "")
        seed_text += " " + output_word

    return seed_text

In [None]:
# Keys of the lexicon; these are the names `seed_emotion` is compared against.
ncr_lexicon.keys()

In [None]:
# Generation settings used by the cells below.
seed_text = "I enjoy to"  # text to be continued
seed_emotion = 'joy'  # compared against the keys of `ncr_lexicon`
seed_strength = 2  # numeric strength appended to the condition vector

next_words = 20  # number of words appended by the scratch generation loop


In [None]:
# Generate 30 additional words for the configured seed text, emotion and strength.
generate_conditional_text(seed_text, seed_emotion, seed_strength, next_words=30)

In [None]:
# Scratch copy of the condition-vector construction from generate_conditional_text:
# one-hot emotion (in lexicon key order) followed by the strength.
seed_emotion_encoding = np.array(
    [1.0 if name == seed_emotion else 0.0 for name in ncr_lexicon]
)
seed_conditional_vector = np.concatenate((seed_emotion_encoding, np.array([seed_strength])))

In [None]:
# Inspect the condition vector built in the previous cell.
seed_conditional_vector

In [None]:
# Scratch copy of the generation loop from generate_conditional_text.
# NOTE: extends the notebook-level `seed_text` in place, so re-running this
# cell keeps appending to the previous result.
for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1)

    # The model was trained on [tokens | condition] rows, so the condition
    # vector must be part of the input passed to predict().
    token_list_conditional = np.concatenate((token_list[0], seed_conditional_vector))

    # `predict_classes` no longer exists in recent tf.keras; take the argmax
    # of the predicted distribution instead.
    probabilities = model.predict(token_list_conditional[np.newaxis, :], verbose=0)
    predicted = int(np.argmax(probabilities, axis=-1)[0])

    # Direct index -> word lookup; an index with no word yields an empty string.
    output_word = tokenizer.index_word.get(predicted, "")
    seed_text += " " + output_word
print(seed_text)