- https://towardsdatascience.com/multiclass-text-classification-using-keras-to-predict-emotions-comparison-with-and-without-word-5ef0a5eaa1a0

- https://medium.com/analytics-vidhya/train-keras-model-with-large-dataset-batch-training-6b3099fdf366

In [1]:
import pandas as pd
import numpy as np

# NLP
import nltk
import multiprocessing
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import spacy

# Keras
import tensorflow as tf
import tensorflow.keras as keras
from keras_rnn import SentimentLSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences

# visualizations
import plotly.express as px

# utils
from tqdm import tqdm
import os
import re

2023-04-14 15:18:44.414070: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Input files, both read from the current working directory.
TOKENIZED_CSV_PATH = 'tokenized_df.csv'
EMBEDDINGS_TXT_PATH = 'embeddings_unks_25.txt'

# Tokenised dataset (expects at least the `tokens` and `label` columns used
# in the cells below).
tokenized_df = pd.read_csv(TOKENIZED_CSV_PATH)

# Word vectors stored in the textual word2vec format.
# NOTE(review): the recorded run of this cell stopped with
# "ValueError: could not convert string to float: ''" - presumably a
# malformed line in the embeddings file (empty field or a token containing
# whitespace); verify the file before re-running.
embedding_25 = KeyedVectors.load_word2vec_format(
    EMBEDDINGS_TXT_PATH,
    binary=False,
)
embedding_25

ValueError: could not convert string to float: ''

In [None]:
# Every key known to the embedding model, in the mapping's iteration order.
vocab_list = [*embedding_25.key_to_index]
print(len(vocab_list))

def remove_non_vocab_words(text: list, vocab):
    """Drop every token that is not in ``vocab`` from each tokenised document.

    Args:
        text: List of documents, each an iterable of tokens.  The list is
            modified in place: every element is replaced by a new, filtered
            list of tokens.
        vocab: Collection of known tokens.  Anything that is not already a
            set, frozenset or dict is converted to a set once, so each
            membership test is a hash lookup instead of a scan over a list.

    Returns:
        The same ``text`` list object, after filtering.
    """
    known = vocab if isinstance(vocab, (set, frozenset, dict)) else set(vocab)
    # Re-assigning text[i] while iterating is safe: the list length never
    # changes and only the slot that was just visited is replaced.
    for i, tokens in enumerate(tqdm(text)):
        text[i] = [word for word in tokens if word in known]
    return text

POS_LABEL = 0
NEUTRAL_LABEL = 1
NEG_LABEL = 2
NUM_CLASSES = 3

# Raw sentiment value stored in the dataset -> class index fed to the model.
RAW_LABEL_TO_CLASS = {1: POS_LABEL, 0: NEUTRAL_LABEL, -1: NEG_LABEL}


def encode_labels(raw_labels):
    """Map raw sentiment labels (1 / 0 / -1) to the class indices above.

    The same mapping is applied to every split.  Previously the training
    split used the explicit POS/NEUTRAL/NEG mapping while validation and test
    used pandas category codes, which number the sorted categories
    (-1 -> 0, 0 -> 1, 1 -> 2) and therefore swapped positive and negative
    relative to the training labels.

    Args:
        raw_labels: pandas Series holding 1, 0 or -1, either as numbers or as
            their string form (the values are passed through
            ``pd.to_numeric`` first, so both work).

    Returns:
        Integer Series of class indices with the same index as the input.

    Raises:
        ValueError: if a value other than 1, 0 or -1 is present.
    """
    numeric = pd.to_numeric(raw_labels).astype(int)
    encoded = numeric.map(RAW_LABEL_TO_CLASS)
    unmapped = encoded.isna()
    if unmapped.any():
        unexpected = sorted(set(numeric[unmapped]))
        raise ValueError(f"Unexpected label values: {unexpected}")
    return encoded.astype(int)


# NOTE(review): `tokens` comes straight from pd.read_csv, so unless it is
# parsed elsewhere each entry is presumably a string rather than a list of
# tokens - confirm before relying on per-token iteration downstream.

# 70 % of the rows for training; the rest is halved into validation and test.
# Fixed seeds keep the split reproducible.
train_data = tokenized_df.sample(frac=0.7, random_state=200)
train_data['label'] = encode_labels(train_data['label'])
train_features = train_data.tokens
train_labels = tf.one_hot(np.asarray(train_data['label']), NUM_CLASSES)

test_data = tokenized_df.drop(train_data.index)
validation_data = test_data.sample(frac=0.5, random_state=200)
test_data = test_data.drop(validation_data.index)

validation_data['label_cat'] = encode_labels(validation_data['label'])
validation_features = validation_data.tokens
validation_labels = tf.one_hot(np.asarray(validation_data['label_cat']), NUM_CLASSES)

test_data['label_cat'] = encode_labels(test_data['label'])
test_features = test_data.tokens
test_labels = tf.one_hot(np.asarray(test_data['label_cat']), NUM_CLASSES)

# tokenized_sequences = [remove_non_vocab_words(line, vocab_list) for line in tqdm(data_lines)]
print(train_labels[:100])
train_data.head(100)

In [None]:
# Use the model's own weight array: row i is the vector of the key whose
# index is i, so rows line up with the indexes taken from `key_to_index`
# without rebuilding the matrix key by key or relying on dict iteration order.
embedding_matrix = embedding_25.vectors

In [None]:
# Histogram of review lengths, i.e. len() of each entry in the `tokens` column

all_lengths = np.array([len(tokens) for tokens in tokenized_df['tokens']])
length_mean = np.mean(all_lengths)
length_std = np.std(all_lengths)
# Keep only lengths below mean + 2 standard deviations so that a handful of
# very long reviews does not stretch the x-axis.
length_cutoff = length_mean + 2 * length_std
review_lengths = all_lengths[all_lengths < length_cutoff].tolist()
fig = px.histogram(
    x=review_lengths,
    labels={'x': 'Review Length'},
    title="Review Length Distribution",
)
fig.show()

In [None]:
# Fixed sequence length: the pad_sequences calls below pad or truncate every
# indexed review to exactly this many entries.
max_seq_len = 40

def sequences_to_token_indexes(w2v_model, list_features):
    """Convert tokenised sentences into lists of embedding-vocabulary indexes.

    Args:
        w2v_model: Object exposing a ``key_to_index`` mapping from token to
            integer index.
        list_features: Iterable of sentences, each an iterable of tokens.

    Returns:
        A list with one list of integer indexes per sentence, in input order.
        Tokens missing from ``key_to_index`` are skipped, so a result can be
        shorter than its sentence (or empty).
    """
    # Resolve the mapping once instead of once per token.
    key_to_index = w2v_model.key_to_index
    indexed_features = []
    for sentence in tqdm(list_features):
        # Out-of-vocabulary tokens are dropped on purpose (best effort).  A
        # membership test replaces the former try/except so that unknown
        # tokens do not pay for raising and catching a KeyError each time.
        indexed_features.append(
            [key_to_index[word] for word in sentence if word in key_to_index]
        )
    return indexed_features

# Index every split with the vectors loaded earlier as `embedding_25`.  The
# previous name `wv_25` is never defined in this notebook, and using the same
# object that `embedding_matrix` is built from keeps indexes and matrix rows
# consistent.
indexed_train_features = sequences_to_token_indexes(embedding_25, train_features)
indexed_validation_features = sequences_to_token_indexes(embedding_25, validation_features)
indexed_test_features = sequences_to_token_indexes(embedding_25, test_features)

# Pad / truncate at the end of each sequence so every row has max_seq_len entries.
# NOTE(review): pad_sequences presumably fills with 0 by default, while index
# 0 is also a real vocabulary entry - confirm whether padding needs its own
# reserved index (or masking) in the model.
padded_train = pad_sequences(indexed_train_features, maxlen=max_seq_len, padding='post', truncating='post')
padded_validation = pad_sequences(indexed_validation_features, maxlen=max_seq_len, padding='post', truncating='post')
padded_test = pad_sequences(indexed_test_features, maxlen=max_seq_len, padding='post', truncating='post')

In [None]:
# create batches

def batch_generator(features, labels, batch_size, *, repeat=False):
    """Yield consecutive ``(features, labels)`` slices of ``batch_size`` rows.

    Only full batches are produced: trailing rows that do not fill a whole
    batch are dropped.

    Args:
        features: Sliceable sequence supporting ``len()``.
        labels: Sliceable sequence aligned row-for-row with ``features``.
        batch_size: Number of rows per batch.
        repeat: When False (default) the generator makes a single pass and
            then stops.  When True it starts over from the first batch after
            the last one, indefinitely.  This is needed when the same
            generator object is consumed for several epochs; a single-pass
            generator is exhausted after the first one.

    Yields:
        Tuples ``(features[start:end], labels[start:end])``.
    """
    num_batches = len(features) // batch_size
    while True:
        for batch in range(num_batches):
            start = batch * batch_size
            end = start + batch_size
            yield features[start:end], labels[start:end]
        # Stop after one pass unless repetition was requested.  Also stop when
        # there is nothing to yield, otherwise this loop would spin forever.
        if not repeat or num_batches <= 0:
            return

# def load_data(features, labels, batch_size):
#     dataset = tf.data.Dataset.from_generator(
#         lambda: batch_generator(features, labels, batch_size),
#         output_types=(tf.int32, tf.int32),
#         output_shapes=([None, max_seq_len], [None, 3])
#     )
#     return dataset

batch_size = 64
# The training and validation generators are handed to the fit call below,
# which runs for several epochs, so they must keep producing batches.
training_batch_generator = batch_generator(padded_train, train_labels, batch_size, repeat=True)
validation_batch_generator = batch_generator(padded_validation, validation_labels, batch_size, repeat=True)
# Single pass over the test split.
testing_batch_generator = batch_generator(padded_test, test_labels, batch_size)

In [None]:
# create the model

# Stop once `val_loss` has failed to improve by at least 1e-2 for 2 epochs in
# a row, then roll the model back to the best weights seen.
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    min_delta=1e-2,
    patience=2,
    verbose=1,
    restore_best_weights=True,
)

# Write the model to disk, keeping only the best one so far.
model_checkpoint = keras.callbacks.ModelCheckpoint(
    filepath='models/lstm_with_w2v.hdf5',
    verbose=1,
    save_best_only=True,
)

callbacks = [early_stopping, model_checkpoint]

print(len(vocab_list))
print(embedding_matrix.shape)

model = SentimentLSTM(vocab_size=len(vocab_list),
                      output_dim=25,
                      weights=embedding_matrix,
                      max_seq_length=max_seq_len)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# NOTE(review): forces tf.function code to run eagerly; presumably left in for
# debugging - confirm it is still wanted for real training runs.
tf.config.run_functions_eagerly(True)

# NOTE(review): the batch generators were created before these casts and were
# given the arrays as they were at that time, so the casts presumably only
# affect the prints and the commented-out fit call below - confirm.
padded_train = np.asarray(padded_train).astype('float32')
train_labels = np.asarray(train_labels).astype('float32')

# Debug output: model input/output signatures, per-layer shapes and the shape
# of one training example.  Plain loops are used because the prints are side
# effects, not values to collect.
for model_input in model.inputs:
    print(model_input.shape, model_input.dtype)
for model_output in model.outputs:
    print(model_output.shape, model_output.dtype)
for layer in model.layers:
    print(layer.name, layer.input_shape, layer.dtype)
print(padded_train[0].shape)
print(train_labels[0].shape)

# history = model.fit(padded_train,
#                     train_labels,
#                     validation_split=0.33,
#                     callbacks=callbacks,
#                     epochs=3)

# storing model training details to analyze later
# `Model.fit` accepts generators directly; `fit_generator` is deprecated.
# validation_steps is derived from the validation split (it was previously
# computed from the test split, which is not what the validation generator
# iterates over).
# NOTE(review): each epoch draws steps_per_epoch training batches and
# validation_steps validation batches from the same generator objects, so
# both generators must be able to supply batches for all epochs - confirm.
history = model.fit(
    training_batch_generator,
    steps_per_epoch=len(padded_train) // batch_size,
    epochs=3,
    validation_data=validation_batch_generator,
    validation_steps=len(padded_validation) // batch_size,
    callbacks=callbacks,
)


In [None]:
# Run the trained model on the padded test sequences.
# NOTE(review): despite the name, the result is presumably a per-class score
# for each row rather than strict one-hot vectors - confirm against the
# model's output layer before comparing with `test_labels`.
y_pred_one_hot = model.predict(padded_test)