# Starter pack library

In [7]:
import pandas as pd 
import matplotlib.pyplot as plt
import datetime
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

# Loading data

In [89]:
emotions_df = pd.read_csv("../london-emotions/data/emotion_data.csv")

In [157]:
emotions_df['Emotion'].value_counts()

anger      15872
joy        14168
worry      11786
neutral    10212
sad         9233
Name: Emotion, dtype: int64

# Cleaning function

In [94]:
def clean_data(data):
    """
    clean and preprocess data
    """

    # Lowercase text
    data['clean_text'] = data['Text'].apply(
        lambda x: x.lower()
        )
    # Strip whitespace
    data['clean_text'] = data['clean_text'].apply(
        lambda x: x.strip()
        )
    # Remove numbers
    data['clean_text'] = data['clean_text'].apply(
        lambda x: ''.join(let for let in x if not let.isdigit())
        )
    # Remove punctuation
    data['clean_text'] = data['clean_text'].apply(
        lambda x: ''.join(let for let in x if not let in string.punctuation)
        )
    # Tokenization with nltk
    data['clean_text'] = data['clean_text'].apply(
        lambda x: word_tokenize(x)
    )
    # Remove stopwords
    # stop_words = set(stopwords.words('english'))
    # data['clean_text'] = data['clean_text'].apply(
    #     lambda x: [word for word in x if word not in stop_words]
    #     )
    #Lemmatizing with nltk
    lemmatizer = WordNetLemmatizer()
    data['clean_text'] = data['clean_text'].apply(
        lambda x: ' '.join(lemmatizer.lemmatize(word) for word in x)
        )

    # Return data
    return data

In [95]:
clean_data(emotions_df)

Unnamed: 0,Emotion,Text,clean_text
0,sad,Layin n bed with a headache ughhhh...waitin o...,layin n bed with a headache ughhhhwaitin on yo...
1,sad,Funeral ceremony...gloomy friday...,funeral ceremonygloomy friday
2,joy,wants to hang out with friends SOON!,want to hang out with friend soon
3,worry,Re-pinging @ghostridah14: why didn't you go to...,repinging ghostridah why didnt you go to prom ...
4,sad,"I should be sleep, but im not! thinking about ...",i should be sleep but im not thinking about an...
5,worry,Hmmm. http://www.djhero.com/ is down,hmmm httpwwwdjherocom is down
6,sad,@charviray Charlene my love. I miss you,charviray charlene my love i miss you
7,sad,@kelcouch I'm sorry at least it's Friday?,kelcouch im sorry at least it friday
8,worry,Choked on her retainers,choked on her retainer
9,sad,Ugh! I have to beat this stupid song to get to...,ugh i have to beat this stupid song to get to ...


# library for NN

In [16]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Split dataset

In [96]:
# Train Test Split Function
def split_train_test(df, test_size=0.3):
    X_train, X_test, y_train, y_test = train_test_split(df['clean_text'], 
                                                        df['Emotion'], 
                                                        test_size=test_size, 
                                                        random_state=15)
    return X_train, X_test, y_train, y_test

In [97]:
X_train, X_test, y_train, y_test = split_train_test(emotions_df, test_size=0.3)

# Vectorizing X_train & X_test

In [110]:
from gensim.models import Word2Vec
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

def convert_sentences(X):
    return [sentence.split(' ') for sentence in X]

def embed_sentence(word2vec, sentence):
    embedded_sentence = []
    for word in sentence:
        if word in word2vec.wv:
            embedded_sentence.append(word2vec.wv[word])
        
    return np.array(embedded_sentence)

def embedding(word2vec, sentences):
    embed = []
    
    for sentence in sentences:
        embedded_sentence = embed_sentence(word2vec, sentence)
        embed.append(embedded_sentence)
        
    return embed

def embedding_pipeline(word2vec, X):
    # Step #2: List of words to list of vectors
    X = convert_sentences(X)
    
    # Step #3: Sentences to list of words
    X = embedding(word2vec, X)
    
    # Step #4: Pad the inputs
    X = pad_sequences(X, dtype='float32', padding='post', value= -1000)
    
    return X

In [116]:
# Step #1: Train a word2vec - with possible hyperparameters
word2vec = Word2Vec(sentences=X_train_token, size=60, min_count=10, window=10)




In [117]:
X_train_pad = embedding_pipeline(word2vec, X_train)

In [119]:
X_test_pad = embedding_pipeline(word2vec, X_test)

In [118]:
X_train_pad.shape

(42889, 215, 60)

In [120]:
X_test_pad.shape

(18382, 151, 60)

In [112]:
word2vec.wv.vocab

{'yeah': <gensim.models.keyedvectors.Vocab at 0x131702910>,
 'it': <gensim.models.keyedvectors.Vocab at 0x1317029d0>,
 '’': <gensim.models.keyedvectors.Vocab at 0x131702a10>,
 's': <gensim.models.keyedvectors.Vocab at 0x131702950>,
 'a': <gensim.models.keyedvectors.Vocab at 0x131702b50>,
 'lot': <gensim.models.keyedvectors.Vocab at 0x131706e90>,
 'better': <gensim.models.keyedvectors.Vocab at 0x131706310>,
 'than': <gensim.models.keyedvectors.Vocab at 0x131706050>,
 'mine': <gensim.models.keyedvectors.Vocab at 0x131702d90>,
 'thomasfiss': <gensim.models.keyedvectors.Vocab at 0x131706bd0>,
 'i': <gensim.models.keyedvectors.Vocab at 0x131706b50>,
 'think': <gensim.models.keyedvectors.Vocab at 0x131706f50>,
 'you': <gensim.models.keyedvectors.Vocab at 0x131706b90>,
 'should': <gensim.models.keyedvectors.Vocab at 0x131706c90>,
 'come': <gensim.models.keyedvectors.Vocab at 0x131706f90>,
 'wa': <gensim.models.keyedvectors.Vocab at 0x131706d50>,
 'really': <gensim.models.keyedvectors.Vocab at

In [109]:
X_train_pad.shape

(42889, 31, 60)

In [106]:
X_train_token= convert_sentences(X_train)

In [108]:
max([len(X_train_token[k]) for k in range(len(X_train_token))])

232

## Making sure X_train_pad & X_train_pad have the right size

In [122]:
for X in [X_train_pad, X_test_pad]:
    assert type(X) == np.ndarray
    assert X.shape[-1] == word2vec.wv.vector_size


assert X_train_pad.shape[0] == len(X_train)
assert X_test_pad.shape[0] == len(X_test)

# Create baseline accuracy

In [123]:
from sklearn.metrics import accuracy_score

unique, counts = np.unique(y_train, return_counts=True)
counts = dict(zip(unique, counts))
print('Number of labels in train set', counts)


Number of labels in train set {'anger': 11112, 'joy': 9890, 'neutral': 7145, 'sad': 6450, 'worry': 8292}


# Create NN 

In [68]:
from tensorflow.keras import Sequential
from tensorflow.keras import layers

def init_model():
    model = Sequential()
    model.add(layers.Masking())
    
    model.add(layers.Conv1D(20, activation='relu'))
    model.add(layers.Dense(15, activation='relu'))
    model.add(layers.Dense(10, activation='relu'))
    model.add(layers.Dense(5, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    
    return model

model = init_model()

In [148]:
sentiment_coding = {'anger': 0 , 'joy': 4, 'worry': 2, 'sad': 1 , 'neutral': 3}

In [149]:
y_train_coded= y_train.map(sentiment_coding)

In [151]:
y_train_coded.value_counts()

0    11112
4     9890
2     8292
3     7145
1     6450
Name: Emotion, dtype: int64

In [152]:
from tensorflow.keras.utils import to_categorical

In [153]:
y_train_cat = to_categorical(y_train_coded)

# Train model

In [156]:
from tensorflow.keras.callbacks import EarlyStopping
X_train_pad_short = X_train_pad[:201]
y_train_cat_short = y_train_cat[:201]
es = EarlyStopping(patience=2, restore_best_weights=True)

model.fit(X_train_pad_short, y_train_cat_short, 
          batch_size = 32,
          epochs=10,
          callbacks=[es],
          verbose=1
         )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x159864fd0>

In [82]:
res = model.evaluate(X_test_pad, y_test, verbose=0)

ValueError: in user code:

    /Users/audehamdi/.pyenv/versions/3.7.7/envs/le-wagon-7/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:1224 test_function  *
        return step_function(self, iterator)
    /Users/audehamdi/.pyenv/versions/3.7.7/envs/le-wagon-7/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:1215 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /Users/audehamdi/.pyenv/versions/3.7.7/envs/le-wagon-7/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:1211 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /Users/audehamdi/.pyenv/versions/3.7.7/envs/le-wagon-7/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:2585 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /Users/audehamdi/.pyenv/versions/3.7.7/envs/le-wagon-7/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:2945 _call_for_each_replica
        return fn(*args, **kwargs)
    /Users/audehamdi/.pyenv/versions/3.7.7/envs/le-wagon-7/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:1208 run_step  **
        outputs = model.test_step(data)
    /Users/audehamdi/.pyenv/versions/3.7.7/envs/le-wagon-7/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:1177 test_step
        y, y_pred, sample_weight, regularization_losses=self.losses)
    /Users/audehamdi/.pyenv/versions/3.7.7/envs/le-wagon-7/lib/python3.7/site-packages/tensorflow/python/keras/engine/compile_utils.py:204 __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    /Users/audehamdi/.pyenv/versions/3.7.7/envs/le-wagon-7/lib/python3.7/site-packages/tensorflow/python/keras/losses.py:149 __call__
        losses = ag_call(y_true, y_pred)
    /Users/audehamdi/.pyenv/versions/3.7.7/envs/le-wagon-7/lib/python3.7/site-packages/tensorflow/python/keras/losses.py:253 call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    /Users/audehamdi/.pyenv/versions/3.7.7/envs/le-wagon-7/lib/python3.7/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        return target(*args, **kwargs)
    /Users/audehamdi/.pyenv/versions/3.7.7/envs/le-wagon-7/lib/python3.7/site-packages/tensorflow/python/keras/losses.py:1535 categorical_crossentropy
        return K.categorical_crossentropy(y_true, y_pred, from_logits=from_logits)
    /Users/audehamdi/.pyenv/versions/3.7.7/envs/le-wagon-7/lib/python3.7/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        return target(*args, **kwargs)
    /Users/audehamdi/.pyenv/versions/3.7.7/envs/le-wagon-7/lib/python3.7/site-packages/tensorflow/python/keras/backend.py:4687 categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)
    /Users/audehamdi/.pyenv/versions/3.7.7/envs/le-wagon-7/lib/python3.7/site-packages/tensorflow/python/framework/tensor_shape.py:1134 assert_is_compatible_with
        raise ValueError("Shapes %s and %s are incompatible" % (self, other))

    ValueError: Shapes (None, 1) and (None, 5) are incompatible


In [None]:
res