<a href="https://colab.research.google.com/github/athishr88/NN_DL/blob/main/Coding_tutorial7/CT7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
## PART 2: Emotion classification with word embeddings from a pretrained model (GloVe)

## Fetch the "emotion" dataset (inputs are tweets, targets are one of 6 emotions)
import nlp

dataset = nlp.load_dataset('emotion')

## Pull out the three predefined splits by name
train, val, test = (dataset[split] for split in ('train', 'validation', 'test'))


Using custom data configuration default


In [19]:
## Prepare input and output pairs for train dataset
import numpy as np

## Split the train dataset into samples (input text) and labels (emotion names)
train_samples = [x['text'] for x in train]
train_labels = [x['label'] for x in train]

print("Classes:", np.unique(train_labels))
print("Number of samples in train:", len(train_samples))
print(train_samples[0])

## Map each label name to a unique integer.
## Sort the names so the name -> index mapping is identical on every run:
## iterating a plain set of strings can yield a different order per kernel
## (string hashing is randomized), which would silently permute the class ids.
classes = sorted(set(train_labels))
class_to_index = {c: i for i, c in enumerate(classes)}


def names_to_ids(labels):
    """Convert an iterable of label names to a NumPy array of class indices.

    Raises KeyError for a label that is not present in `class_to_index`
    instead of silently placing None in the returned array.
    """
    return np.array([class_to_index[x] for x in labels])


## Convert the train labels to corresponding int values
train_labels = names_to_ids(train_labels)

Classes: ['anger' 'fear' 'joy' 'love' 'sadness' 'surprise']
Number of samples in train: 16000
i didnt feel humiliated


In [20]:
## Validation split: raw tweet text plus integer-encoded labels
val_samples = [row['text'] for row in val]
val_labels = names_to_ids([row['label'] for row in val])

## Test split: same treatment
test_samples = [row['text'] for row in test]
test_labels = names_to_ids([row['label'] for row in test])

In [21]:
## Build a TextVectorization layer and index the vocabulary from the train samples
from keras.layers import TextVectorization
import tensorflow as tf

vectorizer = TextVectorization(
    max_tokens=10000,
    output_sequence_length=100,
)

## Feed the train text to adapt() in batches of 128 samples
text_ds = tf.data.Dataset.from_tensor_slices(train_samples)
text_ds = text_ds.batch(128)
vectorizer.adapt(text_ds)

In [22]:
## Show the vocabulary size and its first five entries
vocabulary = vectorizer.get_vocabulary()
print(len(vocabulary)) ## We set max_tokens=10000
vocabulary[:5]

10000


['', '[UNK]', 'i', 'feel', 'and']

In [23]:
## Sanity check: vectorize one sentence and look at its first four token ids
output = vectorizer([["I feel good today"]])
first_row = output.numpy()[0]
first_row[:4]

array([  2,   3, 101, 122])

In [24]:
## Create a map from each vocabulary word to its integer index
voc = vectorizer.get_vocabulary()
word_index = {word: idx for idx, word in enumerate(voc)}

## Look up the same sentence through "word_index"; the ids should match the vectorizer output.
## The token list gets its own name: calling it `test` would overwrite the test split
## of the dataset and break the data-prep cell if that cell is ever re-run.
example_tokens = ["i", "feel", "good", "today"]
[word_index[w] for w in example_tokens]

[2, 3, 101, 122]

In [25]:
## Vectorize our data (convert the string data to integer token ids)
def _as_column(samples):
    """Return the samples as a 2-D array holding one single-string row per sample."""
    return np.array([[s] for s in samples])

x_train = vectorizer(_as_column(train_samples)).numpy()
x_val = vectorizer(_as_column(val_samples)).numpy()

## Labels as NumPy arrays
y_train = np.array(train_labels)
y_val = np.array(val_labels)

In [None]:
## Download and unzip the Stanford GloVe model (pretrained word embeddings)
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

In [26]:
## Read the embeddings in the pretrained model (we are using the 100D version of GloVe)
import os
path_to_glove_file = "glove.6B.100d.txt"

## embeddings_index maps each word to its vector of float coefficients.
embeddings_index = {}
## Open explicitly as UTF-8: without an encoding, open() falls back to the platform
## default, which can fail or mis-decode the non-ASCII tokens in the file.
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        ## Each line is "<word> <coef> <coef> ..."; split off the word only
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [27]:
## Build "embedding_matrix": row i holds the GloVe vector of the vocabulary word with index i
num_tokens = len(voc)
embedding_dim = 100 ## 100 dimensions
hits = 0 ## vocabulary words that were found in the pretrained model
misses = 0 ## vocabulary words that were missing in the pretrained model

# Start from all zeros so that any word without a pretrained vector
# (this includes the representation for "padding" and "OOV") keeps an all-zero row.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        misses += 1
        continue
    embedding_matrix[row] = vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 9627 words (373 misses)


In [28]:
## Embedding layer for the training model.
## It is initialized from embedding_matrix and created with trainable=False
## to avoid retraining this layer.

from keras.layers import Embedding
from keras.initializers import Constant

pretrained_init = Constant(embedding_matrix)
embedding_layer = Embedding(
    input_dim=num_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=pretrained_init,
    trainable=False,
)

In [29]:
## Two stacked Bidirectional LSTM layers followed by a softmax classifier

from keras import layers, Input, Model

int_sequences_input = Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)

## The first BiLSTM returns the full sequence so the second one can consume it
first_bilstm = layers.Bidirectional(layers.LSTM(20, return_sequences=True))
second_bilstm = layers.Bidirectional(layers.LSTM(20))
x = second_bilstm(first_bilstm(embedded_sequences))

## One softmax output per emotion class
preds = layers.Dense(len(classes), activation="softmax")(x)
model = Model(int_sequences_input, preds)
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_1 (Embedding)     (None, None, 100)         1000000   
                                                                 
 bidirectional_2 (Bidirectio  (None, None, 40)         19360     
 nal)                                                            
                                                                 
 bidirectional_3 (Bidirectio  (None, 40)               9760      
 nal)                                                            
                                                                 
 dense_1 (Dense)             (None, 6)                 246       
                                                                 
Total params: 1,029,366
Trainable params: 29,366
Non-trainable params: 1,000,000

In [30]:
## Compile and train the model (the labels are integer class ids, hence the sparse loss)
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["acc"],
)
model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=20,
    batch_size=128,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f77b8643ed0>

In [35]:
## Try the trained model on a sample tweet from the test split

## Wrap the vectorizer and the classifier in one Model that reads a raw string of arbitrary length
string_input = Input(shape=(1,), dtype="string")
x = vectorizer(string_input)
preds = model(x)
end_to_end_model = Model(string_input, preds)

## Predict the class probabilities for test sample 11
sample_text = test_samples[11]
probabilities = end_to_end_model.predict([sample_text])

## The predicted class is the one with the highest probability
predicted_id = np.argmax(probabilities[0])
print("String: {}".format(sample_text))
print("Target output: {}".format(classes[test_labels[11]]))
print("Predicted output: {}".format(classes[predicted_id]))

String: i feel beautifully emotional knowing that these women of whom i knew just a handful were holding me and my baba on our journey
Target output: sadness
Predicted output: sadness


In [36]:
## PART 3: Semantic Analogies using Gensim library
## Gensim is an open source Python library for NLP
!pip install --upgrade gensim

Collecting gensim
  Downloading gensim-4.1.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 72.0 MB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.1.2


In [37]:
## Convert the GloVe text file to word2vec format with Gensim's built-in helper
## (Gensim works on Word2Vec and ships a function to convert GloVe to Word2Vec)

from gensim.scripts.glove2word2vec import glove2word2vec

glove_input_file = 'glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.txt.word2vec'
glove2word2vec(
    glove_input_file=glove_input_file,
    word2vec_output_file=word2vec_output_file,
)

  import sys


(400000, 100)

In [39]:
## Try a semantic analogy with the converted vectors

from gensim.models import KeyedVectors

## Load the GloVe vectors from the word2vec-format text file (binary=False).
## NOTE: this rebinds `model`, which held the Keras classifier in the cells above.
filename = 'glove.6B.100d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(filename, binary=False)

## Example: (king - man) + woman = ?
analogy_query = dict(positive=['woman', 'king'], negative=['man'], topn=1)
result = model.most_similar(**analogy_query)
print(result)

[('queen', 0.7698540687561035)]


In [40]:
## Another analogy: (moscow - russia) + france = ?
## `negative` is passed as a list of words. A bare string may be iterated character
## by character (depending on the gensim version), which would subtract the vectors
## of 'r', 'u', 's', ... instead of the vector of "russia".
model.most_similar(positive=["moscow", "france"], negative=["russia"], topn=1)

[('paris', 0.8822440505027771)]