### Deep learning - LSTM

In [1]:
import pandas as pd
from gensim.models import Word2Vec

# form embedding matrix (w2v)
import numpy as np
from tqdm import tqdm

# padding for word embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import keras
from sklearn.preprocessing import LabelEncoder

word2vec_model = Word2Vec.load("word2vec_twitter_50.model")

train_df = pd.read_pickle("./train_df_clean.pkl")
test_df = pd.read_pickle("./test_df_clean.pkl")



MAX_SEQUENCE_LENGTH = 20

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_df['clean_text'])
sequences = tokenizer.texts_to_sequences(train_df['clean_text'])

word_index = tokenizer.word_index

wordEmbedding_w2v_train  = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)


vector_dim = 50
embedding_matrix = np.zeros((len(word_index) + 1, vector_dim))
missingWord = []
for word, i in tqdm(word_index.items()):
    try:
        embedding_vector = word2vec_model.wv[word]
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    except:
        missingWord.append(word)
        
#modeling
# for a classificaiton problem, you need to provide both training & testing data
div = int(train_df.shape[0]*0.8)

wordEmbedding_w2v_X_train = wordEmbedding_w2v_train[:div]
wordEmbedding_w2v_y_train = train_df['emotion'][:div]

wordEmbedding_w2v_X_test = wordEmbedding_w2v_train[div:]
wordEmbedding_w2v_y_test = train_df['emotion'][div:]

def label_encode(le, labels):
    enc = le.transform(labels)
    return keras.utils.to_categorical(enc)

def label_decode(le, one_hot_label):
    dec = np.argmax(one_hot_label, axis=1)
    return le.inverse_transform(dec)

def encode(y_train, y_test):
    label_encoder = LabelEncoder()
    label_encoder.fit(y_train)
    print('check label: ', label_encoder.classes_)
    print('\n## Before convert')
    print('y_train[0:4]:\n', y_train[0:4])
    print('\ny_train.shape: ', y_train.shape)
    print('y_test.shape: ', y_test.shape)

    re_y_train = label_encode(label_encoder, y_train)
    re_y_test = label_encode(label_encoder, y_test)

    print('\n\n## After convert')
    print('y_train[0:4]:\n', y_train[0:4])
    print('\ny_train.shape: ', y_train.shape)
    print('y_test.shape: ', y_test.shape)
    
    return re_y_train, re_y_test, label_encoder

le_wordEmbedding_w2v_y_train, le_wordEmbedding_w2v_y_test, label_encoder = encode(wordEmbedding_w2v_y_train, wordEmbedding_w2v_y_test)

# I/O check
input_shape = wordEmbedding_w2v_X_train.shape[1]
print('input_shape: ', input_shape)

output_shape = len(label_encoder.classes_)
print('output_shape: ', output_shape)

from keras.models import Sequential
from keras.layers import Dense, LSTM, GRU, Dropout, Activation, ActivityRegularization, SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.regularizers import l2
from keras.initializers import Constant
from keras import optimizers

model = Sequential()
model.add(Embedding(len(word_index) + 1,
                            vector_dim,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False))
# model.add(SpatialDropout1D(0.7))
model.add(LSTM(256, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.3))#0.2
model.add(Dense(8, activation='softmax'))
model.add(Dense(8, activation='softmax'))

# adam = optimizers.Adamax(lr=0.002, beta_1=0.9, beta_2=0.999)
model.compile(
    optimizer="nadam", 
    loss='categorical_crossentropy', 
    metrics=['acc'])

model.summary()

epochs = 5
batch_size = 32
history = model.fit(wordEmbedding_w2v_X_train, le_wordEmbedding_w2v_y_train, 
                    epochs=epochs, batch_size=batch_size, verbose=1,
                    validation_data = (wordEmbedding_w2v_X_test, le_wordEmbedding_w2v_y_test))

## precision, recall, f1-score,
from sklearn.metrics import classification_report

y_pred_result = model.predict(x = wordEmbedding_w2v_X_test, batch_size=128)
y_pred_result = label_decode(label_encoder, y_pred_result)
print(classification_report(y_true=wordEmbedding_w2v_y_test, y_pred=y_pred_result))

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
100%|██████████| 138999/138999 [00:00<00:00, 241942.83it/s]


check label:  ['anger' 'anticipation' 'disgust' 'fear' 'joy' 'sadness' 'surprise'
 'trust']

## Before convert
y_train[0:4]:
 0    anticipation
1         sadness
3            fear
5             joy
Name: emotion, dtype: object

y_train.shape:  (1164450,)
y_test.shape:  (291113,)


## After convert
y_train[0:4]:
 0    anticipation
1         sadness
3            fear
5             joy
Name: emotion, dtype: object

y_train.shape:  (1164450,)
y_test.shape:  (291113,)
input_shape:  20
output_shape:  8
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 50)            6950000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               314368    
_________________________________________________________________
dense_1 (Dense)              (None, 512)               131584    
____________________________________

In [2]:
## predict
# tests = pd.read_pickle("test_df_clean.pkl")

test_sequences = tokenizer.texts_to_sequences(test_df["clean_text"])
test_sequences  = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

pred_result = model.predict(test_sequences, batch_size=128)
pred_result = np.array(label_decode(label_encoder, pred_result))
print(pred_result.shape)
test_df['emotion'] = pred_result
test_df = test_df.drop('hashtags', axis=1)
test_df = test_df.drop('text', axis=1)
test_df = test_df.drop('identification', axis=1)
test_df = test_df.drop('clean_text', axis=1)
test_df = test_df.drop('tokenized', axis=1)
test_df.rename(columns={'tweet_id':'id'}, inplace=True)

test_df.to_csv('prediction.csv', index=False)

(411972,)
