# Emotion Detection using GRU on tweets.

## Select tensorflow 2 in colab

In [None]:
%tensorflow_version 2.x

## Mount Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/gdrive/')

### Go to your file path

In [None]:
%cd /content/gdrive/My\ Drive/app/sentiment

In [4]:
!pwd

/home/agn/PycharmProjects/Emotion-Detection-RNN


In [5]:
!ls

configuration.cfg  requirements.txt
data		   test_configuration.cfg
handler.ipynb	   trained_models
handler.py	   vectors
handler-test.py    weights_bestsurprisecrawl-300d-2M-subword.h5
model.png	   weights_bestsurpriseglove.6B.300d.h5
README.md	   WordsNotFound.txt


## Header

In [6]:
import os
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn import metrics

## Configuration

In [13]:
embedding_file = "vectors/crawl-300d-2M-subword.vec"
dataset_File = "data/wang_cleaned_full_dataset.csv"
traget_Emotion = "anger"
max_features = 25000
maxlen = 35
batchsize = 250
num_epochs = 20

## Definitions

In [10]:
puncts = [
    ',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']',
    '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', '·', '_', '{', '}', '©', '^',
    '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½',
    'à', '…', '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±',
    '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', '¼', '⊕', '▼', '▪',
    '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã',
    '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦',
    '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√',
]


def clean_data(x):
    """
    Seperates punctuations from words in given string.
    :param x: String
    :return: Cleaned string.
    """
    x = str(x).strip().lower()
    for punct in puncts:
        x = x.replace(punct, ' %s ' % punct)
    return x


def prepare_data(target_emotion='anger', other_emotions=None):
    """
    Prepares Train, Validation, and Test set along with the vocabulary
    for a given target emotion.
    :param target_emotion:
    :param other_emotions:
    :return:
    """
    dataset_all = pd.read_csv(dataset_File)
    
    # cleans up the text and makes it lower case
    dataset_all["text"] = dataset_all["text"].apply(lambda x: clean_data(x))
        
    dataset_all["emotion"] = dataset_all["emotion"].apply(lambda x: clean_data(x))
    
    print('Number of unique tweets: {}'.format(len(dataset_all['id'].unique().tolist())))

    # prints distribution of emotions in full dataset
    s = pd.Series(dataset_all['emotion'])
    print(s.value_counts())

    # select data based on a target emotion with random selection from others
    dataset_target = dataset_all.loc[dataset_all['emotion'] == target_emotion]
    target_count = dataset_target['emotion'].count()
    if other_emotions is None:
        dataset_other = dataset_all.loc[dataset_all['emotion'] != target_emotion].sample(target_count)
    else:
        dataset_other = dataset_all.loc[dataset_all['emotion'] == other_emotions].sample(target_count)
    
    # assign float values to class labels
    dataset_target['emotion'] = 1.0
    dataset_other['emotion'] = 0.0

    dataset = pd.concat([dataset_target, dataset_other])
    
    # prints distribution of emotions in selected dataset
    s = pd.Series(dataset['emotion'])
    print(s.value_counts())

    # split to train, validation and test
    train_df, val_test_df = train_test_split(dataset, test_size=0.2, random_state=2018)  # .08 since the datasize is large enough.
    test_df, val_df = train_test_split(val_test_df, test_size=0.5, random_state=2018)

    # prints distribution of emotions in train, validation and test sets
    s = pd.Series(train_df['emotion'])
    print('**************')
    print(s.value_counts())
    
    s = pd.Series(test_df['emotion'])
    print('**************')
    print(s.value_counts())

    s = pd.Series(val_df['emotion'])
    print('**************')
    print(s.value_counts())

    # fill up the missing values
    all_X = dataset['text'].fillna("_##_").values
    train_X = train_df["text"].fillna("_##_").values
    val_X = val_df["text"].fillna("_##_").values
    test_X = test_df["text"].fillna("_##_").values

    # Tokenize the sentences
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(all_X))
    print('#### number of words: ')
    print(tokenizer.num_words)
    
    train_X = tokenizer.texts_to_sequences(train_X)
    val_X = tokenizer.texts_to_sequences(val_X)
    test_X = tokenizer.texts_to_sequences(test_X)
    all_X = tokenizer.texts_to_sequences(all_X)
    lengths = [len(l) for l in all_X]

    print('=========================================')

    # plt.hist(lengths, bins = 'auto')
    # plt.show()

    # Pad the sentences. We need to pad the sequence with 0's to achieve consistent length across examples.
    train_X = tf.keras.preprocessing.sequence.pad_sequences(train_X, maxlen=maxlen)
    val_X = tf.keras.preprocessing.sequence.pad_sequences(val_X, maxlen=maxlen)
    test_X = tf.keras.preprocessing.sequence.pad_sequences(test_X, maxlen=maxlen)

    # Get the target values
    train_y = train_df['emotion'].values
    val_y = val_df['emotion'].values
    test_y = test_df['emotion'].values
    print(type(train_y))

    # shuffling the data
    np.random.seed(2018)
    trn_idx = np.random.permutation(len(train_X))
    val_idx = np.random.permutation(len(val_X))
    tst_idx = np.random.permutation(len(test_X))

    train_X = train_X[trn_idx]
    val_X = val_X[val_idx]
    test_X = test_X[tst_idx]
    train_y = train_y[trn_idx]
    val_y = val_y[val_idx]
    test_y = test_y[tst_idx]

    return train_X, val_X, test_X, train_y, val_y, test_y, tokenizer.word_index


def load_embedding(word_index, embedding_file):
    """
    Create an embedding matrix in which we keep only the embeddings for words which are in our word_index
    :param word_index:
    :param embedding_file:
    :return:
    """

    def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(embedding_file))
    embed_size = len(embeddings_index[next(iter(embeddings_index))])
    
    # make sure all embeddings have the right format
    key_to_del = []
    for key, value in embeddings_index.items():
        if not len(value) == embed_size:
            key_to_del.append(key)
    
    for key in key_to_del:
        del embeddings_index[key]

    notFountWords = []
    all_embs = np.stack(embeddings_index.values())
    emb_mean, emb_std = -0.005838499, 0.48782197
    embed_size = all_embs.shape[1]
    print("*****embedding Size********")
    print(embed_size)
    # word_index = tokenizer.word_index
    print(f"word_index length: {len(word_index)}\n")
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    count = 0
    for word, i in word_index.items():
        if i >= max_features:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            count += 1
        else:
            notFountWords.append(word)
    
    # with open('WordsNotFound.txt', 'w') as f:
    #     for item in notFountWords:
    #         f.write("%s\n" % item)

    print('# of embeding changed: ')
    print(count)
    return embedding_matrix, embed_size


def model_gru(embedding_matrix, embed_size):
    """
    Create model by Tensorflow.
    :param embedding_matrix:
    :param embed_size:
    :return:
    """
    inp = tf.keras.layers.Input(shape=(maxlen,))
    x = tf.keras.layers.Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(inp)
 
    x = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(35, return_sequences=True))(x)
    avg_pool = tf.keras.layers.GlobalAveragePooling1D()(x)
    max_pool = tf.keras.layers.GlobalMaxPooling1D()(x)
    conc = tf.keras.layers.concatenate([avg_pool, max_pool])
    conc = tf.keras.layers.Dense(70, activation="relu")(conc)

    conc = tf.keras.layers.Dropout(0.5)(conc)
    outp = tf.keras.layers.Dense(1, activation="sigmoid")(conc)
    model = tf.keras.models.Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


def train_model(model):
    """
    Train model.
    :param model:
    :return:
    """
    # embedding_name = os.path.splitext(os.path.basename(embedding_file))[0]
    # fileName = 'weights_best' + traget_Emotion + embedding_name + '.h5'
    # filepath = fileName
    # # filepath = "weights_best.h5"
    # checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath,
    #                                                 monitor='val_loss', verbose=2,
    #                                                 save_best_only=True, mode='min')
    # reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5,
    #                                                  patience=1, min_lr=0.0001, verbose=2)
    # earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0001,
    #                                                  patience=2, verbose=2, mode='auto')
    # callbacks = [checkpoint, reduce_lr]

    history = model.fit(train_X, train_y,
                        batch_size=batchsize, epochs=num_epochs,
                        validation_data=(val_X, val_y))  # , callbacks=callbacks)
    # model.load_weights(filepath)
    # plot_graphs(history, 'accuracy')
    # plot_graphs(history, 'loss')
    pred_val_y = model.predict([val_X], batch_size=1024, verbose=0)
    pred_test_y = model.predict([test_X], batch_size=1024, verbose=0)

    return pred_val_y, pred_test_y


def f1_smart(y_true, y_pred):
    """
    This function computes the best F1 score by looking at predictions for evaluation.
    :param y_true:
    :param y_pred:
    :return:
    """
    thresholds = []
    for thresh in np.arange(0.1, 0.501, 0.01):
        thresh = np.round(thresh, 2)
        res = metrics.f1_score(y_true, (y_pred > thresh).astype(int))
        thresholds.append([thresh, res])
        printout = "F1 score at threshold {0} is {1}".format(thresh, res)
        print(printout)

    thresholds.sort(key=lambda x: x[1], reverse=True)
    best_thresh = thresholds[0][0]
    best_f1 = thresholds[0][1]
    print("Best threshold: ", best_thresh)
    return best_f1, best_thresh


def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_'+string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_'+string])
    plt.show()


## Run

In [9]:
train_X, val_X, test_X, train_y, val_y, test_y, word_index = prepare_data(traget_Emotion)

Number of unique tweets: 1387787
joy             393631
sadness         338015
anger           298480
love            169267
thankfulness     79341
fear             73575
nan              23349
surprise         13535
Name: emotion, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


1.0    298480
0.0    298480
Name: emotion, dtype: int64
**************
1.0    238958
0.0    238610
Name: emotion, dtype: int64
**************
0.0    29919
1.0    29777
Name: emotion, dtype: int64
**************
0.0    29951
1.0    29745
Name: emotion, dtype: int64
#### number of words: 
25000
<class 'numpy.ndarray'>


In [11]:
embedding_matrix, embedding_size = load_embedding(word_index, embedding_file)




*****embedding Size********
300
# of embeding changed: 
23506


In [12]:
model1 = model_gru(embedding_matrix, embedding_size)
print(model1.summary())
pred_val_y, pred_test_y = train_model(model1)
f1, threshold = f1_smart(test_y, pred_test_y)
print(f'Optimal F1: {f1} at threshold: {threshold}')

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 35)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 35, 300)      7500000     input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 35, 70)       70770       embedding[0][0]                  
__________________________________________________________________________________________________
global_average_pooling1d (Globa (None, 70)           0           bidirectional[0][0]              
______________________________________________________________________________________________

Epoch 17/20
Epoch 00017: val_loss did not improve from 0.38630
Epoch 18/20
Epoch 00018: val_loss did not improve from 0.38630
Epoch 19/20
Epoch 00019: val_loss did not improve from 0.38630
Epoch 20/20
Epoch 00020: val_loss did not improve from 0.38630
F1 score at threshold 0.1 is 0.7743996183003764
F1 score at threshold 0.11 is 0.7781752175537877
F1 score at threshold 0.12 is 0.7820252517782468
F1 score at threshold 0.13 is 0.7850163748071561
F1 score at threshold 0.14 is 0.7882195958825771
F1 score at threshold 0.15 is 0.7911780821917808
F1 score at threshold 0.16 is 0.7938125869502335
F1 score at threshold 0.17 is 0.7961275310933216
F1 score at threshold 0.18 is 0.7982391374002201
F1 score at threshold 0.19 is 0.8009642742014604
F1 score at threshold 0.2 is 0.803031370173684
F1 score at threshold 0.21 is 0.8046208077919504
F1 score at threshold 0.22 is 0.8069066632503451
F1 score at threshold 0.23 is 0.8084156928930017
F1 score at threshold 0.24 is 0.8103078847840781
F1 score at thre