<a href="https://colab.research.google.com/github/arind123/GenAI/blob/main/3-CBOW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install textblob 'keras-nlp' 'keras-preprocessing' 'gensim==4.2.0' np_utils



In [2]:
import multiprocessing
import tensorflow as tf
import sys
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
import np_utils
from tensorflow.keras.utils import to_categorical
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from textblob import TextBlob, Word
from keras_preprocessing.sequence import pad_sequences
import numpy as np
import random
import os
import pandas as pd
import gensim
import warnings
import nltk
import re
import spacy

nlp = spacy.load("en_core_web_sm")

TRACE = False  # Setting to true is useful when debugging to know which device is being used
embedding_dim = 50
epochs=100
batch_size = 500
BATCH = True

def set_seeds_and_trace():
  os.environ['PYTHONHASHSEED'] = '0'
  np.random.seed(42)
  tf.random.set_seed(42)
  random.seed(42)
  if TRACE:
    tf.debugging.set_log_device_placement(True)

def set_session_with_gpus_and_cores():
  cores = multiprocessing.cpu_count()
  gpus = len(tf.config.list_physical_devices('GPU'))
  config = tf.compat.v1.ConfigProto( device_count = {'GPU': gpus  , 'CPU': cores} , intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
  sess = tf.compat.v1.Session(config=config)
  tf.compat.v1.keras.backend.set_session(sess)

set_seeds_and_trace()
set_session_with_gpus_and_cores()
warnings.filterwarnings('ignore')
nltk.download('punkt')
textblob_tokenizer = lambda x: TextBlob(x).words

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
%%writefile get_data.sh
if [ ! -f yelp.csv ]; then
  wget -O yelp.csv https://www.dropbox.com/s/xds4lua69b7okw8/yelp.csv?dl=0
fi

Overwriting get_data.sh


In [4]:
!bash get_data.sh

In [5]:
path = './yelp.csv'
yelp = pd.read_csv(path)
# Create a new DataFrame that only contains the 5-star and 1-star reviews.
yelp_best_worst = yelp[(yelp.stars==5) | (yelp.stars==1)]
X = yelp_best_worst.text
y = yelp_best_worst.stars.map({1:0, 5:1})

In [6]:
X.shape

(4086,)

In [7]:
y.shape

(4086,)

In [8]:
# Create corpus of sentences such that the sentence has more than 3 words
# corpus = []

# def get_sentences(review):
#   sentences = []
#   sentences_in_review = [sent.text for sent in nlp(review).sents]
#   for sentence in sentences_in_review:

#     if sentence.endswith('..'):
#       sentence = re.sub(r'.$', '', sentence)
#     sentence = sentence.replace('..', ',')
#     sentence = re.sub('\s+', ' ', sentence)
#     sentence = sentence.strip()

#     if len(sentence.split(' ')) > 3:
#       sentences.append(sentence)

#   return sentences

# for reviews in X:
#   sentences = get_sentences(reviews)
#   if len(sentences) > 0:
#     corpus.append(sentences)

# corpus = [item for sublist in corpus for item in sublist]
# corpus = list(set(corpus))

# import swifter
# sentences = X.swifter.apply(get_sentences)
# sentences = X.apply(lambda x: get_sentences(x))

corpus = [line for line in X.values if len(textblob_tokenizer(line))>3]

At this point we have a list (any iterable will do) of queries that are longer than 3 words. This is normal to filter random queries. Now we must use the `Tokenizer` object to `fit` on the corpus, in order to convert each wor to an ID, and later convert such corpus of list of words into their identifiers.


In [9]:
print(len(corpus))
print(corpus[0:5])

4056
['My wife took me here on my birthday for breakfast and it was excellent.  The weather was perfect which made sitting outside overlooking their grounds an absolute pleasure.  Our waitress was excellent and our food arrived quickly on the semi-busy Saturday morning.  It looked like the place fills up pretty quickly so the earlier you get here the better.\n\nDo yourself a favor and get their Bloody Mary.  It was phenomenal and simply the best I\'ve ever had.  I\'m pretty sure they only use ingredients from their garden and blend them fresh when you order it.  It was amazing.\n\nWhile EVERYTHING on the menu looks excellent, I had the white truffle scrambled eggs vegetable skillet and it was tasty and delicious.  It came with 2 pieces of their griddled bread with was amazing and it absolutely made the meal complete.  It was the best "toast" I\'ve ever had.\n\nAnyway, I can\'t wait to go back!', 'I have no idea why some people give bad reviews about this place. It goes to show you, you

In [10]:
tokenizer = Tokenizer()
# Use the fit_on_texts method to fit the tokenizer
tokenizer.fit_on_texts(corpus) # Fill

print(f'Before the tokenizer: {corpus[:1]}')

#Now use the same "trained" tokenizer to convert the corpus from words to IDs with the texts_to_sequences method
tokenized_corpus = tokenizer.texts_to_sequences(corpus)

print(f'After the tokenizer: {tokenized_corpus[:1]}')

Before the tokenizer: ['My wife took me here on my birthday for breakfast and it was excellent.  The weather was perfect which made sitting outside overlooking their grounds an absolute pleasure.  Our waitress was excellent and our food arrived quickly on the semi-busy Saturday morning.  It looked like the place fills up pretty quickly so the earlier you get here the better.\n\nDo yourself a favor and get their Bloody Mary.  It was phenomenal and simply the best I\'ve ever had.  I\'m pretty sure they only use ingredients from their garden and blend them fresh when you order it.  It was amazing.\n\nWhile EVERYTHING on the menu looks excellent, I had the white truffle scrambled eggs vegetable skillet and it was tasty and delicious.  It came with 2 pieces of their griddled bread with was amazing and it absolutely made the meal complete.  It was the best "toast" I\'ve ever had.\n\nAnyway, I can\'t wait to go back!']
After the tokenizer: [[12, 447, 202, 35, 41, 20, 12, 571, 11, 282, 2, 9, 8

In [11]:
nb_samples = sum(len(s) for s in tokenized_corpus)
vocab_size = len(tokenizer.word_index) + 1

In [12]:
print(f'First 5 corpus items are {tokenized_corpus[:5]}')
print(f'Length of corpus is {len(tokenized_corpus)}')



First 5 corpus items are [[12, 447, 202, 35, 41, 20, 12, 571, 11, 282, 2, 9, 8, 196, 1, 1549, 8, 201, 71, 123, 654, 319, 4500, 43, 2394, 58, 1408, 1478, 50, 483, 8, 196, 2, 50, 28, 572, 664, 20, 1, 3444, 458, 616, 450, 9, 388, 38, 1, 27, 4501, 53, 178, 664, 25, 1, 1631, 15, 46, 41, 1, 138, 85, 600, 4, 1632, 2, 46, 43, 2217, 2726, 9, 8, 1388, 2, 693, 1, 66, 74, 109, 23, 86, 178, 163, 17, 77, 356, 632, 45, 43, 1036, 2, 2395, 80, 130, 54, 15, 113, 9, 9, 8, 99, 170, 140, 20, 1, 122, 545, 196, 3, 23, 1, 475, 2218, 3230, 770, 1409, 2727, 2, 9, 8, 301, 2, 108, 9, 154, 16, 144, 859, 6, 43, 8190, 243, 16, 8, 99, 2, 9, 364, 123, 1, 179, 998, 9, 8, 1, 66, 812, 74, 109, 23, 750, 3, 142, 139, 5, 48, 64], [3, 19, 69, 730, 273, 62, 107, 187, 197, 351, 52, 14, 27, 9, 731, 5, 610, 15, 15, 59, 551, 272, 17, 22, 305, 8191, 52, 190, 13, 43, 335, 1821, 33, 22, 157, 107, 38, 13, 10, 121, 928, 12, 283, 2, 3, 572, 26, 52, 159, 560, 1367, 14, 524, 633, 9, 8, 178, 897, 73, 97, 3, 320, 11, 4, 633, 715, 2, 320, 1

In [13]:
type(tokenized_corpus)

list

In [14]:
# This is the algorithmic part of batching the dataset and yielding the window of words and expected middle word for each bacth as a generator.
def generate_data(corpus, vocab_size, window_size=2, sentence_batch_size=15,  batch_size=250):
    np.random.shuffle(np.array(corpus))
    number_of_sentence_batches = (len(corpus) // sentence_batch_size) + 1
    for batch in range(number_of_sentence_batches):
        lower_end = batch*batch_size
        upper_end = (batch+1)*batch_size if batch+1 < number_of_sentence_batches else len(corpus)
        mini_batch_size = upper_end - lower_end
        maxlen = window_size*2
        X = []
        Y = []
        for review_id, words in enumerate(corpus[lower_end:upper_end]):
            L = len(words)
            for index, word in enumerate(words):
                contexts = []
                labels   = []
                s = index - window_size
                e = index + window_size + 1

                contexts.append([words[i] for i in range(s, e) if 0 <= i < L and i != index])
                labels.append(word)

                x = pad_sequences(contexts, maxlen=maxlen)
                y = to_categorical(labels, vocab_size)
                X.append(x)
                Y.append(y)
        X = tf.constant(X)
        Y = tf.constant(Y)
        number_of_batches = len(X) // batch_size
        for real_batch in range(number_of_batches):
          lower_end = batch*batch_size
          upper_end = (batch+1)*batch_size
          batch_X = tf.squeeze(X[lower_end:upper_end])
          batch_Y = tf.squeeze(Y[lower_end:upper_end])
          yield (batch_X, batch_Y)

Notice now in a sample how we construct X and y to predict words

In [15]:
iterable = generate_data(corpus=tokenized_corpus, vocab_size=vocab_size, batch_size=10)
sample_x, sample_y = next(iterable)

In [16]:
sample_y_numpy = sample_y.numpy()

sample_x

<tf.Tensor: shape=(10, 4), dtype=int32, numpy=
array([[  0,   0, 447, 202],
       [  0,  12, 202,  35],
       [ 12, 447,  35,  41],
       [447, 202,  41,  20],
       [202,  35,  20,  12],
       [ 35,  41,  12, 571],
       [ 41,  20, 571,  11],
       [ 20,  12,  11, 282],
       [ 12, 571, 282,   2],
       [571,  11,   2,   9]], dtype=int32)>

In [17]:
np.where(sample_y_numpy == 1)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([ 12, 447, 202,  35,  41,  20,  12, 571,  11, 282]))

Now comes the core part, defining the model. Keras provides a convenient Sequential model class to just `add` layers of any type and they will just work. Let's add an `Embedding` layer (that will map the word ids into a vector of size 100), a `Lambda` to average the words out in a sentence, and a `Dense layer` to select the best word on the other end. This is classic CBOW.


In [18]:
window = 2
cbow = Sequential()
cbow.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=window*2))  # Add an Embedding layer with input_dim vocab_size, output_dim to be embedding_dim, and the input_length to be twice our window
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embedding_dim,)))  # Add a Lambda that takes a lambda function using the K.mean method to average the words. The output_shape should be (dim, ).
cbow.add(Dense(vocab_size, activation='softmax'))  # Add a classic Dense layer to just select with a softmax the best word
# Compile the model with a loss and optimizer of your liking.
cbow.compile(loss='categorical_crossentropy', optimizer='adam')
cbow.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 50)             997500    
                                                                 
 lambda (Lambda)             (None, 50)                0         
                                                                 
 dense (Dense)               (None, 19950)             1017450   
                                                                 
Total params: 2014950 (7.69 MB)
Trainable params: 2014950 (7.69 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [19]:
def fit_model():
    if not BATCH:
        # If we are not batching, Fill how to get X AND Y
        # X, Y = generate_data(corpus=tokenized_corpus, vocab_size=vocab_size, batch_size=len(y)) # Fill
        # print(f'Size of X is {X.shape} and Y is {Y.shape}')
        # cbow.fit(X, Y, epochs = epochs)
        pass
    else:
      for X, Y in generate_data(corpus=tokenized_corpus, vocab_size=vocab_size, batch_size=10):
          cbow.train_on_batch(X, Y)
      # for epoch in range(0,epochs):
      #   print(epoch)
      #   for X, Y in generate_data(corpus=tokenized_corpus, vocab_size=vocab_size, batch_size=10):
      #     cbow.train_on_batch(X, Y)
        # Implement the batching logic to train the model (Hint: use the train_on_batch method of Keras models)


In [None]:
fit_model()

In [None]:
with open('./cbow_scratch_synonims.txt' ,'w') as f:
    f.write('{} {}\n'.format(vocab_size-1, embedding_dim))
    vectors = cbow.get_weights()[0]
    for word, i in tokenizer.word_index.items():
        str_vec = ' '.join(map(str, list(vectors[i, :])))
        f.write('{} {}\n'.format(word, str_vec))

In [None]:
w2v = gensim.models.KeyedVectors.load_word2vec_format('./cbow_scratch_synonims.txt', binary=False)

In [None]:
w2v.most_similar(positive=['gasoline'])

In [None]:
w2v.most_similar(negative=['apple'])