In [21]:
import collections
import os
from typing import Dict

import numpy as np
import pandas as pd
import tensorflow.keras.utils as ku

from preprocess import EmotionStimulus, DailyDialog, ISEAR
from utils import NCRlexicon2dict, lexicon_one_hot_encoding, sentence_strength

# Read preprocessed datasets

In [29]:
def one_hot_encoding(variables: list)-> np.ndarray:
    """One-hot encode a sequence of categorical labels.

    Columns are assigned in order of first appearance in ``variables``, so the
    label-to-column mapping is reproducible from one run to the next.
    Iterating a ``set`` of strings can yield a different order in every
    interpreter session (hash randomisation), which would silently permute
    the columns between training and later use.

    Args:
        variables: Hashable labels, one per sample.

    Returns:
        A float array of shape ``(len(variables), number of distinct labels)``
        holding a single 1 per row, in the column assigned to that row's label.
    """
    # dict.fromkeys de-duplicates while preserving first-seen order.
    variable_dict = {var: index for index, var in enumerate(dict.fromkeys(variables))}
    one_hot = np.zeros((len(variables), len(variable_dict)))
    for row, var in enumerate(variables):
        one_hot[row, variable_dict[var]] = 1
    return one_hot
        

In [41]:
# Load the preprocessed Emotion-Stimulus data and split it into three parallel
# lists: the sentence, its emotion label and its strength value.
# (`pd` is pandas; it must be imported in the notebook's import cell.)
DATASET_PATH = 'preprocess_dataset/Emotion_stimulus_strength.csv'
dataset = pd.read_csv(DATASET_PATH)

# Fail early, with a readable message, if the file does not have the expected schema.
missing_columns = {'text', 'emotion', 'strength'} - set(dataset.columns)
assert not missing_columns, f"{DATASET_PATH} is missing columns: {sorted(missing_columns)}"

texts = dataset['text'].to_list()
emotions = dataset['emotion'].to_list()
strength = dataset['strength'].to_list()

In [42]:
# One-hot rows for the emotion labels, plus one single-element row per
# sentence holding its strength rounded to two decimals.
emotion_encoding = one_hot_encoding(emotions)
rounded_strength_rows = [[round(value, 2)] for value in strength]
sentences_strength = np.array(rounded_strength_rows)

In [43]:
# Sanity check: these three parallel collections should all have the same length.
tuple(len(collection) for collection in (emotion_encoding, sentences_strength, texts))

(2412, 2412, 2412)

In [None]:
# NRC labels
# Derive each sentence's emotion encoding and strength from the NRC lexicon.
# Running this cell overwrites the `emotion_encoding` and `sentences_strength`
# that were built from the dataset's own columns.

# The lexicon location can be overridden through the NRC_LEXICON_PATH
# environment variable; the default is the original author's local copy.
NRC_LEXICON_PATH = os.environ.get(
    'NRC_LEXICON_PATH',
    '/Users/yarikmenchaca/Documents/Datasets/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt',
)
ncr_lexicon = NCRlexicon2dict(NRC_LEXICON_PATH)

emotion_encoding = [lexicon_one_hot_encoding(sentence, ncr_lexicon) for sentence in texts]

# Each strength is wrapped in a one-element list so it forms a column that can
# be concatenated next to the emotion encoding.
sentences_strength = [[sentence_strength(sentence, ncr_lexicon)] for sentence in texts]

# Preprocess sentences

In [44]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.utils as ku

In [45]:
# Fit a word-level tokenizer on the corpus; words outside the vocabulary map
# to the "<OOV>" token.
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)

# One more than the number of indexed words, so the largest word index is a
# valid class id.
vocabulary_size = len(tokenizer.word_index)
total_words = vocabulary_size + 1
print(total_words)

7528


In [46]:
# Create n-grams.
# Every sentence is expanded into all of its prefixes of two or more tokens;
# the sentence's emotion encoding and strength are repeated once per prefix so
# the three lists stay aligned.
input_sentences = list()
expanded_emotion_encodign = list()
expanded_setence_strength = list()

token_sequences = tokenizer.texts_to_sequences(texts)
for token_list, emotion, s_strength in zip(token_sequences, emotion_encoding, sentences_strength):
    for prefix_length in range(2, len(token_list) + 1):
        input_sentences.append(token_list[:prefix_length])
        expanded_emotion_encodign.append(emotion)
        expanded_setence_strength.append(s_strength)

In [47]:
# Pad every n-gram to the length of the longest one so they stack into a
# single 2-D array.
max_sequence_len = max(map(len, input_sentences))
padded_sentences = pad_sequences(input_sentences, maxlen=max_sequence_len)
input_sentences = np.array(padded_sentences)
print(max_sequence_len)

50


In [48]:
# Create predictors and labels: the last token of each padded n-gram is the
# word to predict, everything before it is the context.
predictor = input_sentences[:, :-1]
label = input_sentences[:, -1]

# One-hot encode the target word over the whole vocabulary.
word_label = ku.to_categorical(label, num_classes=total_words)

In [51]:
# Create the conditional vector: emotion encoding columns followed by the
# strength column, one row per n-gram.
emotion_block = np.array(expanded_emotion_encodign)
strength_block = np.array(expanded_setence_strength)
conditional_vector = np.concatenate((emotion_block, strength_block), axis=1)
print(predictor.shape, conditional_vector.shape)


(41460, 49) (41460, 8)


In [52]:
# Single input: context token ids followed by the conditional vector.
# NOTE(review): the whole row is later passed to an Embedding layer, so the
# conditioning values would be looked up as if they were token indices --
# confirm this is intended.
model_input_parts = (predictor, conditional_vector)
single_vector = np.concatenate(model_input_parts, axis=1)
print(single_vector.shape)

(41460, 57)


# LSTM Model

In [53]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, concatenate, Flatten
from tensorflow.keras.models import Sequential
import tensorflow as tf

In [54]:
# Bare expression: it is not the last line of the cell, so nothing is displayed.
max_sequence_len
# Second name for the same array object (no copy is made). It is not
# referenced again in the visible part of the notebook.
conditiona = conditional_vector

In [55]:
# Next-word classifier: embedding -> LSTM -> two dense ReLU layers -> softmax
# over the vocabulary. The input width is that of `single_vector`
# (context tokens plus conditional vector).
model = Sequential([
    Embedding(total_words, 300, input_length= single_vector.shape[1]),
    LSTM(150),
    Dense(200, activation='relu'),
    Dense(200, activation='relu'),
    Dense(total_words, activation='softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 57, 300)           2258400   
_________________________________________________________________
lstm (LSTM)                  (None, 150)               270600    
_________________________________________________________________
dense (Dense)                (None, 200)               30200     
_________________________________________________________________
dense_1 (Dense)              (None, 200)               40200     
_________________________________________________________________
dense_2 (Dense)              (None, 7528)              1513128   
Total params: 4,112,528
Trainable params: 4,112,528
Non-trainable params: 0
_________________________________________________________________
None


In [56]:
# Train the next-word classifier on the combined input; `history` keeps the
# object returned by fit().
history = model.fit(
    x=single_vector,
    y=word_label,
    batch_size=20,
    epochs=20,
)

Epoch 1/20
  34/2073 [..............................] - ETA: 4:38 - loss: 8.1824 - accuracy: 0.0412

KeyboardInterrupt: 

In [None]:
# Save the model to disk (file name relative to the working directory).
model_path = 'affect_ml_emotion_cause.h5'
model.save(model_path)

In [None]:
import io
import json

# Store the fitted tokenizer next to the model so generation can reuse the
# same vocabulary.
# NOTE(review): the value returned by to_json() is passed through json.dumps
# again before writing -- confirm the loading code expects that extra layer.
tokenizer_json = tokenizer.to_json()
serialized_tokenizer = json.dumps(tokenizer_json, ensure_ascii=False)
with open('affect_ml_emotion_cause_tokenizer.json', mode='w', encoding='utf-8') as tokenizer_file:
    tokenizer_file.write(serialized_tokenizer)

# Generate Text

In [None]:
def generate_conditional_text(seed_text: str,
                              seed_emotion: str,
                              seed_strength: float,
                              next_words: int = 20)-> str:
    """Extend ``seed_text`` one word at a time, conditioned on an emotion.

    Relies on the notebook globals ``ncr_lexicon``, ``tokenizer``, ``model``
    and ``max_sequence_len``.

    Args:
        seed_text: Text the generation starts from.
        seed_emotion: Emotion to condition on; must be a key of ``ncr_lexicon``.
        seed_strength: Strength value appended after the emotion one-hot
            (anything ``float()`` accepts).
        next_words: Number of words to append.

    Returns:
        ``seed_text`` followed by the generated words, separated by spaces.

    Raises:
        ValueError: If ``seed_emotion`` is not a key of ``ncr_lexicon``.
    """
    # Generate the conditional vector: emotion one-hot followed by the strength.
    # NOTE(review): the one-hot layout follows the key order of ncr_lexicon;
    # confirm it matches the emotion encoding the model was trained with.
    available_emotions = list(ncr_lexicon)
    if seed_emotion not in available_emotions:
        # An unknown emotion used to produce an all-zero condition silently.
        raise ValueError(
            f"Unknown emotion {seed_emotion!r}; expected one of {available_emotions}"
        )
    seed_emotion_encoding = np.zeros(len(available_emotions))
    seed_emotion_encoding[available_emotions.index(seed_emotion)] = 1
    seed_conditional_vector = np.concatenate((seed_emotion_encoding,
                                              np.array([float(seed_strength)])))

    # Generate conditional text
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1)
        # The model is fitted on context tokens followed by the conditional
        # vector, so the prediction input must have that same layout.
        token_list_conditional = np.concatenate((token_list[0], seed_conditional_vector))
        # predict_classes() no longer exists in recent Keras releases; taking
        # the argmax of the predicted distribution is the equivalent.
        probabilities = model.predict(token_list_conditional[np.newaxis, :], verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])
        # index_word is the reverse of word_index; an index with no word
        # (e.g. 0) contributes an empty string, as before.
        output_word = tokenizer.index_word.get(predicted, "")
        seed_text += " " + output_word

    return seed_text

In [None]:
# Available emotions: the keys of the lexicon mapping. The generation code
# compares `seed_emotion` against these keys.
ncr_lexicon.keys()

In [None]:
# Generation settings: how many words to append ...
next_words = 20

# ... and the prompt plus the emotion/strength condition to generate under.
seed_text = "I enjoy to"
seed_emotion = 'joy'
seed_strength = 2


In [None]:
# Generate 30 words from the seed (a literal 30, not the `next_words` setting).
generate_conditional_text(
    seed_text,
    seed_emotion,
    seed_strength,
    next_words=30,
)

In [None]:
# Generate the conditional vector: a 1 in the position of the lexicon key equal
# to `seed_emotion` (0 elsewhere), followed by the strength value.
emotion_flags = [1.0 if emotion == seed_emotion else 0.0 for emotion in ncr_lexicon]
seed_emotion_encoding = np.array(emotion_flags, dtype=float)

strength_part = np.array([seed_strength])
seed_conditional_vector = np.concatenate((seed_emotion_encoding, strength_part))

In [None]:
# Display the conditional vector (emotion one-hot followed by the strength).
seed_conditional_vector

In [None]:
# Step-by-step generation loop at notebook level.
# `seed_text` is extended in place, so re-running this cell keeps appending to
# the text produced by the previous run.
for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1)
    # The model is fitted on context tokens followed by the conditional
    # vector, so the prediction input must have that same layout.
    token_list_conditional = np.concatenate((token_list[0], seed_conditional_vector))
    # predict_classes() no longer exists in recent Keras releases; taking the
    # argmax of the predicted distribution is the equivalent.
    probabilities = model.predict(token_list_conditional[np.newaxis, :], verbose=0)
    predicted = int(np.argmax(probabilities, axis=-1)[0])
    # index_word is the reverse of word_index; an index with no word
    # (e.g. 0) contributes an empty string, as before.
    output_word = tokenizer.index_word.get(predicted, "")
    seed_text += " " + output_word
print(seed_text)