- https://towardsdatascience.com/multiclass-text-classification-using-keras-to-predict-emotions-comparison-with-and-without-word-5ef0a5eaa1a0

- https://medium.com/analytics-vidhya/train-keras-model-with-large-dataset-batch-training-6b3099fdf366

In [49]:
import pandas as pd
import numpy as np

# NLP
import nltk
import multiprocessing
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import spacy

# Keras
import tensorflow as tf
import tensorflow.keras as keras
from keras_rnn import SentimentLSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences

# visualizations
import plotly.express as px

# utils
from tqdm import tqdm
import os
import re

In [59]:


# Read in the data
df1 = pd.read_csv('data/Reddit_Data.csv',encoding_errors='ignore')
df2 = pd.read_csv('data/test.csv',encoding_errors='ignore')
df3 = pd.read_csv('data/train.csv',encoding_errors='ignore')
df4 = pd.read_csv('data/Twitter_Data.csv',encoding_errors='ignore')
# df5 = pd.read_csv('data/train.tsv',encoding_errors='ignore', delimiter='\t')
# df6 = pd.read_csv('data/training.1600000.processed.noemoticon.csv',encoding_errors='ignore')

# print out value counts for sentiment or category column of each dataframe
print("df1\n",df1["category"].value_counts())
print("df2\n",df2["sentiment"].value_counts())
print("df3\n",df3["sentiment"].value_counts())
print("df4\n",df4["category"].value_counts())
# print("df4\n",df5["Sentiment"].value_counts())
# print("df6\n",df6["polarity of tweet"].value_counts())


# df6

df1
  1    15830
 0    13142
-1     8277
Name: category, dtype: int64
df2
 neutral     1430
positive    1103
negative    1001
Name: sentiment, dtype: int64
df3
 neutral     11118
positive     8582
negative     7781
Name: sentiment, dtype: int64
df4
  1.0    72250
 0.0    55213
-1.0    35510
Name: category, dtype: int64


In [4]:
df1_std = df1.copy()
df2_std = df2.copy()
df3_std = df3.copy()
df4_std = df4.copy()
# df5_std = df5.copy()
# df6_std = df6.copy()

df1_std.columns = ["text","label"]
df1_std.dropna(axis='index',inplace=True)

df2_std = df2_std.loc[:,["text","sentiment"]]
df2_std.dropna(axis='index',inplace=True)
df2_std.columns = ["text","label"]
df2_std["label"] = df2_std["label"].map({"positive":'1',"neutral":'0',"negative":'-1'})

df3_std = df3_std.loc[:,["text","sentiment"]]
df3_std.dropna(axis='index',inplace=True)
df3_std.columns = ["text","label"]
df3_std["label"] = df3_std["label"].map({"positive":'1',"neutral":'0',"negative":'-1'})
df3_std

df4_std = df4_std.loc[:,["clean_text","category"]]
df4_std.dropna(axis='index',inplace=True)
df4_std.columns = ["text","label"]
df4_std["label"] = df4_std["label"].map({1.0:'1',0.0:'0',-1.0:'-1'})
df4_std

# df5_std = df5_std.loc[:,["text","Sentiment"]]
# df5_std.columns = ["text","label"]
# df5_std["label"] = df5_std["label"].map({"positive":1,"neutral":0,"negative":-1})
# df5_std

# df6_std = df6_std.loc[:,["text of the tweet","polarity of tweet"]]
# df6_std.dropna(axis='index',inplace=True)
# df6_std.columns = ["text","label"]
# df6_std["label"] = df6_std["label"].map({4:'1',1:'-1'})
# df6_std

# combine df1_std and df2_std dataframes
combined_df = pd.concat([df1_std,df2_std,df3_std,df4_std],axis=0)
combined_df

# print out the shapes of all dataframes
print("df1_std",df1_std.shape)
print("df2_std",df2_std.shape)
print("df3_std",df3_std.shape)
print("df4_std",df4_std.shape)
# print("df5_std",df5_std.shape)
print("combined_df",combined_df.shape)

combined_df



df1_std (37149, 2)
df2_std (3534, 2)
df3_std (27480, 2)
df4_std (162969, 2)
combined_df (231132, 2)


Unnamed: 0,text,label
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1
...,...,...
162975,why these 456 crores paid neerav modi not reco...,-1
162976,dear rss terrorist payal gawar what about modi...,-1
162977,did you cover her interaction forum where she ...,0
162978,there big project came into india modi dream p...,0


In [64]:
# clean the data
def text_preprocessing(text):
    # lowercase
    text = text.lower()

    # Remove anything that is not a letter, number, or punctuation
    text = re.sub(r"[^a-zA-Z0-9.,!'?]", " ", text)

    # replace digits
    text = re.sub(r"\d", "", text)

    return text

In [65]:
# tqdm to show progress bar
tqdm.pandas(desc="progress-bar")

# make all the text lowercase
combined_df["text"] = combined_df["text"].progress_map(text_preprocessing)

# save the data as a list of tokenized sentences
combined_df['tokens'] = combined_df.text.progress_apply(nltk.word_tokenize)
data_lines = combined_df.tokens.tolist()

## stuff if we wanna do NGRAM related stuff
# NGRAM = 3
# spooky_lines = data.apply(lambda x: ["<s>"]*(NGRAM-1) + x + ["</s>"]*(NGRAM-1)).tolist()

# with open(f"spooky_lines_{NGRAM}.txt","w",encoding="utf-8") as f:
#     for line in combined_df["text"]:
#         f.write("<line> "*(NGRAM-1) + line + " </line>"*(NGRAM-1)+"\n")

# The dimension of word embedding. 
# This variable will be used throughout the program
# you may vary this as you desire
# EMBEDDINGS_SIZE = 100
for EMBEDDINGS_SIZE in tqdm((25,50,100,200)):

    # only generate embeddings if file doesn't exist
    if os.path.exists(f'embeddings_{EMBEDDINGS_SIZE}.txt'):
        continue

    # Train the Word2Vec model from Gensim.
    sg = 1
    window = 5
    vector_size = EMBEDDINGS_SIZE
    min_count = 1


    embeddings = Word2Vec(sentences=data_lines, vector_size=vector_size, window=window, min_count=min_count, sg=sg, workers=multiprocessing.cpu_count())
    # save file in txt format, then load later if you wish.
    embeddings.wv.save_word2vec_format(f'embeddings_{EMBEDDINGS_SIZE}.txt', binary=False)


progress-bar: 100%|██████████| 231132/231132 [00:01<00:00, 216132.44it/s]
progress-bar: 100%|██████████| 231132/231132 [00:25<00:00, 9226.89it/s]
100%|██████████| 4/4 [00:00<00:00, 10414.16it/s]


In [51]:
# if you save your Word2Vec as the variable model, this will
# print out the vocabulary size
# print('Vocab size: {}'.format(len(embeddings.wv)))

In [66]:
wv_25 = KeyedVectors.load_word2vec_format('embeddings_25.txt', binary=False)

In [67]:
vocab_list = list(wv_25.key_to_index.keys())
print(len(vocab_list))

def remove_non_vocab_words(text: list, vocab):
    for i in tqdm(range(len(text))):
        text[i] = [word for word in text[i] if word in vocab]
    return text

POS_LABEL = 0
NEUTRAL_LABEL = 1
NEG_LABEL = 2

train_data = combined_df.sample(frac=0.7,random_state=200)
train_data['label'] = train_data['label'].map({'1':POS_LABEL,'0':NEUTRAL_LABEL,'-1':NEG_LABEL})
train_features, train_labels = train_data.tokens, tf.one_hot(np.asarray(train_data['label']), 3)

test_data = combined_df.drop(train_data.index)
validation_data = test_data.sample(frac=0.5,random_state=200)
test_data = test_data.drop(validation_data.index)

validation_data['label'] = validation_data['label'].astype('category')
validation_data['label_cat'] = validation_data['label'].cat.codes
validation_features, validation_labels = validation_data.tokens, tf.one_hot(validation_data['label_cat'], 3)

test_data['label'] = test_data['label'].astype('category')
test_data['label_cat'] = test_data['label'].cat.codes
test_features, test_labels = test_data.tokens, tf.one_hot(test_data['label_cat'], 3)

# tokenized_sequences = [remove_non_vocab_words(line, vocab_list) for line in tqdm(data_lines)]
print(train_labels[:100])
train_data.head(100)

151298
tf.Tensor(
[[0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 0.]
 [1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 0.

Unnamed: 0,text,label,tokens
609,sulibelesirfeltverybadthatthistimecannotvotebc...,2.0,[sulibelesirfeltverybadthatthistimecannotvoteb...
110105,thankyouforstrengtheningrevivingthecongresspar...,1.0,[thankyouforstrengtheningrevivingthecongresspa...
23486,johnmakingmewearamaskandglovesatworkcauseimsic...,2.0,[johnmakingmewearamaskandglovesatworkcauseimsi...
25259,maybesomedayilovayafriendsmycomputersucksliste...,1.0,[maybesomedayilovayafriendsmycomputersuckslist...
42262,modigoingannouncethatcalcuttabiryanithegreatest,0.0,[modigoingannouncethatcalcuttabiryanithegreatest]
...,...,...,...
16573,thatremindsmeineedtopickupamaskmrwthanksiprefe...,0.0,[thatremindsmeineedtopickupamaskmrwthanksipref...
131994,slowingeconomyrisingjoblessnessnewsclickmarthe...,1.0,[slowingeconomyrisingjoblessnessnewsclickmarth...
98658,groundzeroformodionly,1.0,[groundzeroformodionly]
18199,atthespokeclubfellinlovewithtomarnoldalloverag...,0.0,[atthespokeclubfellinlovewithtomarnoldallovera...


In [56]:
# create embedding matrix
embedding_matrix = wv_25[wv_25.key_to_index.keys()]

In [57]:
# plot review length distribution

review_lengths = [len(x) for x in combined_df['tokens']]
length_mean = np.mean(review_lengths)
length_std = np.std(review_lengths)
# remove outliers whose length is very large
review_lengths = [x for x in review_lengths if x < length_mean + 2*length_std]
fig = px.histogram(x=review_lengths, labels={'x':'Review Length'}, title="Review Length Distribution")
fig.show()

In [None]:
max_seq_len = 40

def sequences_to_token_indexes(w2v_model, list_features):
    indexed_features = []
    for sentence in tqdm(list_features):
        indexed_sentence = []
        for word in sentence:
            try:
                indexed_sentence.append(w2v_model.key_to_index[word])
            except KeyError as e:
                pass
        indexed_features.append(indexed_sentence)
    return indexed_features

indexed_train_features = sequences_to_token_indexes(wv_25, train_features)
indexed_validation_features = sequences_to_token_indexes(wv_25, validation_features)
indexed_test_features = sequences_to_token_indexes(wv_25, test_features)

padded_train = pad_sequences(indexed_train_features, maxlen=max_seq_len, padding='post', truncating='post')
padded_validation = pad_sequences(indexed_validation_features, maxlen=max_seq_len, padding='post', truncating='post')
padded_test = pad_sequences(indexed_test_features, maxlen=max_seq_len, padding='post', truncating='post')

In [37]:
# create batches

def batch_generator(features, labels, batch_size):
    num_batches = len(features) // batch_size
    for batch in range(num_batches):
        start = batch * batch_size
        end = start + batch_size
        yield features[start:end], labels[start:end]

# def load_data(features, labels, batch_size):
#     dataset = tf.data.Dataset.from_generator(
#         lambda: batch_generator(features, labels, batch_size),
#         output_types=(tf.int32, tf.int32),
#         output_shapes=([None, max_seq_len], [None, 3])
#     )
#     return dataset

batch_size = 64
training_batch_generator = batch_generator(padded_train, train_labels, batch_size)
validation_batch_generator = batch_generator(padded_validation, validation_labels, batch_size)
testing_batch_generator = batch_generator(padded_test, test_labels, batch_size)

In [38]:
# create the model

callbacks = [
    keras.callbacks.EarlyStopping(
        # Stop training when `val_loss` is no longer improving
        monitor="val_loss",
        # "no longer improving" being defined as "no better than 1e-2 less"
        min_delta=1e-2,
        # "no longer improving" being further defined as "for at least 2 epochs"
        patience=2,
        verbose=1,
        restore_best_weights=True),
    keras.callbacks.ModelCheckpoint(
        filepath='models/lstm_with_w2v.hdf5',
        verbose=1,
        save_best_only=True)
]

print(len(vocab_list))
print(embedding_matrix.shape)

model = SentimentLSTM(vocab_size=len(vocab_list),
                      output_dim=25,
                      weights=embedding_matrix,
                      max_seq_length=max_seq_len)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
tf.config.run_functions_eagerly(True)

# storing model training details to analyze later
padded_train = np.asarray(padded_train).astype('float32')
train_labels = np.asarray(train_labels).astype('float32')

[print(i.shape, i.dtype) for i in model.inputs]
[print(o.shape, o.dtype) for o in model.outputs]
[print(l.name, l.input_shape, l.dtype) for l in model.layers]
print(padded_train[0].shape)
print(train_labels[0].shape)

# history = model.fit(padded_train,
#                     train_labels,
#                     validation_split=0.33,
#                     callbacks=callbacks,
#                     epochs=3)

history = model.fit_generator(
    training_batch_generator,
    steps_per_epoch=len(padded_train) // batch_size,
    epochs=3,
    validation_data=validation_batch_generator,
    validation_steps=len(padded_test) // batch_size,
    callbacks=callbacks
)


151298
(151298, 25)
(None, 40) <dtype: 'float32'>
(None, 3) <dtype: 'float32'>
embedding_5 (None, 40) float32
dropout_5 (None, 40, 25) float32
lstm_10 (None, 40, 25) float32
lstm_11 (None, 40, 40) float32
dense_5 (None, 3) float32
(40,)
(3,)
Epoch 1/3



`Model.fit_generator` is deprecated and will be removed in a future version. Please use `Model.fit`, which supports generators.


Even though the `tf.config.experimental_run_functions_eagerly` option is set, this option does not apply to tf.data functions. To force eager execution of tf.data functions, please use `tf.data.experimental.enable_debug_mode()`.



Object was never used (type <class 'tensorflow.python.ops.tensor_array_ops.TensorArray'>):
<tensorflow.python.ops.tensor_array_ops.TensorArray object at 0x7fb02eb5fe20>
If you want to mark it as used call its "mark_used()" method.
It was originally created here:
  File "/Users/benwyant/opt/anaconda3/envs/NN-Language-Model/lib/python3.10/site-packages/keras/backend.py", line 5129, in <genexpr>
    output_ta_t = tuple(  File "/Users/benwyant/opt/anaconda3/envs/NN-Language-Model/lib/python3.10/site-packages/tensorflow/python/util/tf_should_use.py", line 243, in wrapped
Epoch 1: val_loss improved from inf to 0.03010, saving model to models/lstm_with_w2v.hdf5
Epoch 2/3


In [39]:
y_pred_one_hot = model.predict(padded_test)



In [40]:
print(test_labels[:10], y_pred_one_hot[:10])

tf.Tensor(
[[0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]], shape=(10, 3), dtype=float32) [[0.58430547 0.12908232 0.28661227]
 [0.5842917  0.1290927  0.2866156 ]
 [0.5843073  0.12908089 0.2866118 ]
 [0.5842977  0.12908812 0.28661418]
 [0.5843067  0.12908125 0.28661206]
 [0.5843075  0.12908068 0.28661188]
 [0.5843065  0.12908156 0.28661194]
 [0.58427817 0.12910387 0.286618  ]
 [0.5842677  0.12911043 0.28662187]
 [0.58430225 0.12908408 0.28661367]]
