In [90]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [246]:
from tensorflow.keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding
from tensorflow.keras.models import Sequential

In [253]:
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical

In [94]:
data = pd.read_csv('../raw_data/emotion_data.csv')

In [95]:
data.head()

Unnamed: 0,Emotion,Text
0,sad,Layin n bed with a headache ughhhh...waitin o...
1,sad,Funeral ceremony...gloomy friday...
2,joy,wants to hang out with friends SOON!
3,worry,Re-pinging @ghostridah14: why didn't you go to...
4,sad,"I should be sleep, but im not! thinking about ..."


In [97]:
data['Emotion'].value_counts()

anger      15872
joy        14168
worry      11786
neutral    10212
sad         9233
Name: Emotion, dtype: int64

In [102]:
data.shape

(61271, 2)

In [272]:
X = data[['Text']]
y = data[['Emotion']]

In [273]:
from sklearn.model_selection import train_test_split

In [274]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.utils import simple_preprocess

In [275]:
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

In [276]:
def clean_data(dataframe):
    """
    clean and preprocess data
    """
    
    data = dataframe.copy()

    # Lowercase text
    data['clean_text'] = data['Text'].apply(
        lambda x: x.lower()
        )

    # Strip whitespace
    data['clean_text'] = data['clean_text'].apply(
        lambda x: x.strip()
        )

    # Remove numbers
    data['clean_text'] = data['clean_text'].apply(
        lambda x: ''.join(let for let in x if not let.isdigit())
        )

    # Remove punctuation
    data['clean_text'] = data['clean_text'].apply(
        lambda x: ''.join(let for let in x if not let in string.punctuation)
        )

    # Tokenization with nltk
    data['clean_text'] = data['clean_text'].apply(
        lambda x: word_tokenize(x)
    )
    
    # Remove stopwords
#     stop_words = set(stopwords.words('english'))
#     data['clean_text'] = data['clean_text'].apply(
#         lambda x: [word for word in x if word not in stop_words]
#         )

    # Lemmatizing with nltk
    lemmatizer = WordNetLemmatizer()
    data['clean_text'] = data['clean_text'].apply(
        lambda x: ' '.join(lemmatizer.lemmatize(word) for word in x)
        )
    
    # Tokenizing text
    data['tokenized_text'] = [simple_preprocess(line, deacc=True) for line in data['clean_text']] 

    # Return data
    return data

In [277]:
emotions_df = clean_data(data)

In [313]:
# Train Test Split Function
def split_train_test(df, test_size=0.3):
    X_train, X_test, y_train, y_test = train_test_split(df[['tokenized_text']], 
                                                        df[['Emotion']], 
                                                        test_size=test_size, 
                                                        random_state=15)
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = split_train_test(emotions_df, test_size=0.3)

In [279]:
X_train.head()

Unnamed: 0,tokenized_text
32200,"[yeah, it, lot, better, than, mine]"
1030,"[thomasfiss, think, you, should, come, tomarro..."
56436,"[ive, been, up, since, hate, wen, cant, sleep,..."
29605,"[sure, now, you, are, talking]"
28747,"[sometimes, ive, thought, ll, never, forget, w..."


Training data stats

In [280]:
all_training_words = [word for tokens in X_train["tokenized_text"] for word in tokens]
training_sentence_lengths = [len(tokens) for tokens in X_train["tokenized_text"]]
TRAINING_VOCAB = sorted(list(set(all_training_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))

531063 words total, with a vocabulary size of 39182
Max sentence length is 201


Testing data stats

In [281]:
all_test_words = [word for tokens in X_test["tokenized_text"] for word in tokens]
test_sentence_lengths = [len(tokens) for tokens in X_test["tokenized_text"]]
TEST_VOCAB = sorted(list(set(all_test_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_test_words), len(TEST_VOCAB)))
print("Max sentence length is %s" % max(test_sentence_lengths))

226316 words total, with a vocabulary size of 22205
Max sentence length is 152


In [282]:
from gensim import models

In [None]:
word2vec_path = '../raw_data/google-vectors.bin.gz'
word2vec = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [314]:
tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB), lower=True, char_level=False)
tokenizer.fit_on_texts(X_train["tokenized_text"].tolist())
training_sequences = tokenizer.texts_to_sequences(X_train["tokenized_text"].tolist())
train_word_index = tokenizer.word_index
MAX_SEQUENCE_LENGTH = 201
print("Found %s unique tokens." % len(train_word_index))
train_cnn_data = pad_sequences(training_sequences, 
                               maxlen=MAX_SEQUENCE_LENGTH)

Found 39182 unique tokens.


In [285]:
EMBEDDING_DIM = 300

In [286]:
train_embedding_weights = np.zeros((len(train_word_index)+1, 
 EMBEDDING_DIM))
for word,index in train_word_index.items():
 train_embedding_weights[index,:] = word2vec[word] if word in word2vec else np.random.rand(EMBEDDING_DIM)
print(train_embedding_weights.shape)

(39183, 300)


In [287]:
test_sequences = tokenizer.texts_to_sequences(X_test["tokenized_text"].tolist())
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [270]:
np.array(test_sequences).shape

  """Entry point for launching an IPython kernel.


(18382,)

In [315]:
def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index):
    
    embedding_layer = Embedding(num_words,
                            embedding_dim,
                            weights=[embeddings],
                            input_length=max_sequence_length,
                            trainable=False)
    
    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    convs = []
    filter_sizes = [2,3,4,5,6]

    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=200, kernel_size=filter_size, activation='relu')(embedded_sequences)
        l_pool = GlobalMaxPooling1D()(l_conv)
        convs.append(l_pool)


    l_merge = concatenate(convs, axis=1)

    x = Dropout(0.1)(l_merge)  
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.2)(x)
    preds = Dense(labels_index, activation='softmax')(x)

    model = Model(sequence_input, preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.summary()
    return model

In [316]:
label_names = ['anger', 'joy', 'worry', 'neutral', 'sad']

In [322]:
model = ConvNet(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM, 
                len(list(label_names)))

Model: "functional_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 201)]        0                                            
__________________________________________________________________________________________________
embedding_12 (Embedding)        (None, 201, 300)     11754900    input_4[0][0]                    
__________________________________________________________________________________________________
conv1d_27 (Conv1D)              (None, 200, 200)     120200      embedding_12[0][0]               
__________________________________________________________________________________________________
conv1d_28 (Conv1D)              (None, 199, 200)     180200      embedding_12[0][0]               
_______________________________________________________________________________________

In [289]:
# Prepare mapping for the sentiment
sentiment_coding = {'anger': 0 , 'joy': 4, 'worry': 2, 'sad': 1 , 'neutral': 3}

# apply mapping
y_train_coded = y_train['Emotion'].map(sentiment_coding)

# Transform the numbers to categories
y_train_cat = to_categorical(y_train_coded)

# Prepare y_test
y_test_coded= y_test['Emotion'].map(sentiment_coding)
y_test_coded.value_counts()
y_test_cat = to_categorical(y_test_coded)

In [290]:
y_train_cat

array([[1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.]], dtype=float32)

In [318]:
num_epochs = 3
batch_size = 34
X_train_model = train_cnn_data
y_tr = y_train_cat

In [319]:
len(X_train)

42889

In [320]:
y_tr

array([[1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.]], dtype=float32)

In [323]:
history = model.fit(X_train_model, y_tr,
                 validation_split=0.3,
                 batch_size=batch_size,
                 epochs=num_epochs,
                 verbose=1
                )

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [175]:
from gensim.models import Word2Vec
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

def convert_sentences(X):
    return [sentence.split(' ') for sentence in X]

def embed_sentence(word2vec, sentence):
    embedded_sentence = []
    for word in sentence:
        if word in word2vec.wv:
            embedded_sentence.append(word2vec.wv[word])
        
    return np.array(embedded_sentence)

def embedding(word2vec, sentences):
    embed = []
    for sentence in sentences:
        embedded_sentence = embed_sentence(word2vec, sentence)
        embed.append(embedded_sentence)
        
    return embed

def embedding_pipeline(word2vec, X):
    # Step #2: List of words to list of vectors
    X = convert_sentences(X)
    # Step #3: Sentences to list of words
    X = embedding(word2vec, X)
    # Step #4: Pad the inputs
    X = pad_sequences(X, dtype='float32', padding='post', value= -1000)
    
    return X

In [177]:
X_train_token = convert_sentences(X_train)

In [179]:
from tensorflow.keras.preprocessing.text import Tokenizer

texts = emotions_df.clean_text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

sequence_train = tokenizer.texts_to_sequences(X_train)
sequence_test = tokenizer.texts_to_sequences(X_test)

index_of_words = tokenizer.word_index

# vacab size is number of unique words + reserved 0 index for padding
vocab_size = len(index_of_words) + 1

In [180]:
#To know the longest sentence in X_train
max([len(X_train_token[k]) for k in range(len(X_train_token))])

# Step #1: Train a word2vec - with X_train token so it takes into account the words and not the letters 
word2vec = Word2Vec(sentences=X_train_token, size=60, min_count=10, window=10)

X_train_pad = embedding_pipeline(word2vec, X_train)

X_test_pad = embedding_pipeline(word2vec, X_test)

X_train_pad.shape

(42889, 215, 60)

In [181]:
X_test_pad.shape

(18382, 151, 60)

In [182]:
for X in [X_train_pad, X_test_pad]:
    assert type(X) == np.ndarray
    assert X.shape[-1] == word2vec.wv.vector_size


assert X_train_pad.shape[0] == len(X_train)
assert X_test_pad.shape[0] == len(X_test)

In [184]:
from sklearn.metrics import accuracy_score

unique, counts = np.unique(y_train, return_counts=True)
counts = dict(zip(unique, counts))
print('Number of labels in train set', counts)

Number of labels in train set {'anger': 11112, 'joy': 9890, 'neutral': 7145, 'sad': 6450, 'worry': 8292}


In [310]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Dense , Flatten   

embed_num_dims = 201
max_seq_len = 50
vocab_size

# Embedding layer before the actaul BLSTM 
embed_layer = Embedding(vocab_size,
                         embed_num_dims,
                         input_length = max_seq_len,
                         trainable=False)

def cnn_model():
    model = Sequential()
    model.add(embed_layer)
    model.add(Conv1D(filters = 3,input_shape=(215,60),kernel_size=3, activation='relu'))
    model.add(MaxPooling1D())
    model.add(Flatten())
    
    model.add(Dense(5, activation='softmax'))
    
    model.compile(loss = 'categorical_crossentropy', 
                  optimizer = 'adam', 
                  metrics = ['accuracy'])
    return model

model = cnn_model()    
model.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 50, 201)           10382052  
_________________________________________________________________
conv1d_20 (Conv1D)           (None, 48, 3)             1812      
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 24, 3)             0         
_________________________________________________________________
flatten_10 (Flatten)         (None, 72)                0         
_________________________________________________________________
dense_20 (Dense)             (None, 5)                 365       
Total params: 10,384,229
Trainable params: 2,177
Non-trainable params: 10,382,052
_________________________________________________________________


In [194]:
print(X_train_pad[6].shape)
y_train_cat

(215, 60)


array([[0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]], dtype=float32)

In [195]:
from tensorflow.keras.callbacks import EarlyStopping
batch_size = 32
epochs = 30

es = EarlyStopping(patience=15, restore_best_weights=True)

history = model.fit(X_train, y_train_cat,
                 validation_split=0.3,
                 batch_size=batch_size,
                 epochs=epochs,
                 callbacks=[es],
                 verbose=1
                )

Epoch 1/30


ValueError: in user code:

    /Users/lukemoberly/.pyenv/versions/3.7.7/envs/lewagon_project/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:806 train_function  *
        return step_function(self, iterator)
    /Users/lukemoberly/.pyenv/versions/3.7.7/envs/lewagon_project/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:796 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /Users/lukemoberly/.pyenv/versions/3.7.7/envs/lewagon_project/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:1211 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /Users/lukemoberly/.pyenv/versions/3.7.7/envs/lewagon_project/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:2585 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /Users/lukemoberly/.pyenv/versions/3.7.7/envs/lewagon_project/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:2945 _call_for_each_replica
        return fn(*args, **kwargs)
    /Users/lukemoberly/.pyenv/versions/3.7.7/envs/lewagon_project/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:789 run_step  **
        outputs = model.train_step(data)
    /Users/lukemoberly/.pyenv/versions/3.7.7/envs/lewagon_project/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:747 train_step
        y_pred = self(x, training=True)
    /Users/lukemoberly/.pyenv/versions/3.7.7/envs/lewagon_project/lib/python3.7/site-packages/tensorflow/python/keras/engine/base_layer.py:985 __call__
        outputs = call_fn(inputs, *args, **kwargs)
    /Users/lukemoberly/.pyenv/versions/3.7.7/envs/lewagon_project/lib/python3.7/site-packages/tensorflow/python/keras/engine/sequential.py:372 call
        return super(Sequential, self).call(inputs, training=training, mask=mask)
    /Users/lukemoberly/.pyenv/versions/3.7.7/envs/lewagon_project/lib/python3.7/site-packages/tensorflow/python/keras/engine/functional.py:386 call
        inputs, training=training, mask=mask)
    /Users/lukemoberly/.pyenv/versions/3.7.7/envs/lewagon_project/lib/python3.7/site-packages/tensorflow/python/keras/engine/functional.py:508 _run_internal_graph
        outputs = node.layer(*args, **kwargs)
    /Users/lukemoberly/.pyenv/versions/3.7.7/envs/lewagon_project/lib/python3.7/site-packages/tensorflow/python/keras/engine/base_layer.py:985 __call__
        outputs = call_fn(inputs, *args, **kwargs)
    /Users/lukemoberly/.pyenv/versions/3.7.7/envs/lewagon_project/lib/python3.7/site-packages/tensorflow/python/keras/layers/convolutional.py:247 call
        outputs = self._convolution_op(inputs, self.kernel)
    /Users/lukemoberly/.pyenv/versions/3.7.7/envs/lewagon_project/lib/python3.7/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        return target(*args, **kwargs)
    /Users/lukemoberly/.pyenv/versions/3.7.7/envs/lewagon_project/lib/python3.7/site-packages/tensorflow/python/ops/nn_ops.py:1017 convolution_v2
        name=name)
    /Users/lukemoberly/.pyenv/versions/3.7.7/envs/lewagon_project/lib/python3.7/site-packages/tensorflow/python/ops/nn_ops.py:1147 convolution_internal
        name=name)
    /Users/lukemoberly/.pyenv/versions/3.7.7/envs/lewagon_project/lib/python3.7/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        return target(*args, **kwargs)
    /Users/lukemoberly/.pyenv/versions/3.7.7/envs/lewagon_project/lib/python3.7/site-packages/tensorflow/python/util/deprecation.py:574 new_func
        return func(*args, **kwargs)
    /Users/lukemoberly/.pyenv/versions/3.7.7/envs/lewagon_project/lib/python3.7/site-packages/tensorflow/python/util/deprecation.py:574 new_func
        return func(*args, **kwargs)
    /Users/lukemoberly/.pyenv/versions/3.7.7/envs/lewagon_project/lib/python3.7/site-packages/tensorflow/python/ops/nn_ops.py:1888 conv1d
        name=name)
    /Users/lukemoberly/.pyenv/versions/3.7.7/envs/lewagon_project/lib/python3.7/site-packages/tensorflow/python/ops/gen_nn_ops.py:979 conv2d
        data_format=data_format, dilations=dilations, name=name)
    /Users/lukemoberly/.pyenv/versions/3.7.7/envs/lewagon_project/lib/python3.7/site-packages/tensorflow/python/framework/op_def_library.py:744 _apply_op_helper
        attrs=attr_protos, op_def=op_def)
    /Users/lukemoberly/.pyenv/versions/3.7.7/envs/lewagon_project/lib/python3.7/site-packages/tensorflow/python/framework/func_graph.py:593 _create_op_internal
        compute_device)
    /Users/lukemoberly/.pyenv/versions/3.7.7/envs/lewagon_project/lib/python3.7/site-packages/tensorflow/python/framework/ops.py:3485 _create_op_internal
        op_def=op_def)
    /Users/lukemoberly/.pyenv/versions/3.7.7/envs/lewagon_project/lib/python3.7/site-packages/tensorflow/python/framework/ops.py:1975 __init__
        control_input_ops, op_def)
    /Users/lukemoberly/.pyenv/versions/3.7.7/envs/lewagon_project/lib/python3.7/site-packages/tensorflow/python/framework/ops.py:1815 _create_c_op
        raise ValueError(str(e))

    ValueError: Negative dimension size caused by subtracting 3 from 1 for '{{node sequential_4/conv1d_3/conv1d}} = Conv2D[T=DT_FLOAT, data_format="NHWC", dilations=[1, 1, 1, 1], explicit_paddings=[], padding="VALID", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true](sequential_4/conv1d_3/conv1d/ExpandDims, sequential_4/conv1d_3/conv1d/ExpandDims_1)' with input shapes: [?,1,1,60], [1,3,60,3].
