In [1]:
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

#data visualisation and manipulation

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns

#set matplotlib inline and displays graphs below the corressponding cell

%matplotlib inline
style.use('fivethirtyeight')
sns.set(style='whitegrid', color_codes=True)

import nltk

In [2]:
from nltk import word_tokenize, sent_tokenize

# tensorflow keras
import tensorflow as tf
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Flatten, Embedding, Input

print(tf.__version__)

2.0.0


In [3]:
sample_1 = 'bitty bought a bit of butter'
sample_2 = 'but the bit of butter was a bit bitter'
sample_3 = 'so she bought some better butter to make the bitter butter better'


In [4]:
corp = [sample_1, sample_2, sample_3]
no_docs = len(corp)
print(corp)
print(no_docs)

['bitty bought a bit of butter', 'but the bit of butter was a bit bitter', 'so she bought some better butter to make the bitter butter better']
3


In [5]:
# encoding
vocab_size = 50
encod_corp = []
for i, doc in enumerate(corp):
    encod_corp.append(one_hot(doc, 50))
    print(f" The encoding for document {i+1} is: {one_hot(doc, 50)}")
    

 The encoding for document 1 is: [44, 31, 1, 45, 20, 33]
 The encoding for document 2 is: [47, 8, 45, 20, 33, 28, 1, 45, 49]
 The encoding for document 3 is: [12, 49, 31, 7, 24, 33, 3, 13, 8, 49, 33, 24]


In [6]:
# step 2
# Padding the doc
# the keras embedding layer requires all individual documents to be the same length

#length of maximum document, will be nedded whenever create embeddings for words

maxlen = -1 
for doc in corp:
    tokens = nltk.word_tokenize(doc)
    if maxlen < len(tokens):
        maxlen = len(tokens)
print(f'The maximum number of words in any document is: {maxlen}')

The maximum number of words in any document is: 12


In [7]:
# now to create embeddings all of our docs need to be of same length

pad_corp = pad_sequences(encod_corp, maxlen=maxlen, padding='post',
                         value=0.0)
print(f" No of padded documents: {len(pad_corp)}")

 No of padded documents: 3


In [8]:
pad_corp

array([[44, 31,  1, 45, 20, 33,  0,  0,  0,  0,  0,  0],
       [47,  8, 45, 20, 33, 28,  1, 45, 49,  0,  0,  0],
       [12, 49, 31,  7, 24, 33,  3, 13,  8, 49, 33, 24]])

In [10]:
for i, doc in enumerate(pad_corp):
    print(i+1, doc)

1 [44 31  1 45 20 33  0  0  0  0  0  0]
2 [47  8 45 20 33 28  1 45 49  0  0  0]
3 [12 49 31  7 24 33  3 13  8 49 33 24]


In [11]:
##Actually creating the embeddings using keras embedding layer
# wwe will embed the words into vectors of 8 dimensions

#Specifying the input shape

input = Input(shape=(no_docs, maxlen),dtype='float64')
"""
shape of input 
each document has 12 elenment or words which is the value of our maxlen variable
"""


'\nshape of input \neach document has 12 elenment or words which is the value of our maxlen variable\n'

In [13]:
word_input = Input(shape=(maxlen,), dtype='float64')

#create the embedding

word_embedding = Embedding(input_dim=vocab_size, output_dim=8, 
                           input_length=maxlen)(word_input)
word_vec = Flatten()(word_embedding)
embed_model = Model([word_input], word_vec)
# combining all into a keras model

# Parameters of the embedding layer: 
# input_dim = the vocab size that we will choose. the number of unique words in the vocab
# out_dim = the number of dimensions we wish to embed into.
# each word will be represented by a vector of this much dimensions

# input_length = length of the maximum ducument. which is stored in maxlen variable in our case

In [15]:
print(word_embedding)
print(word_vec)


Tensor("embedding/Identity:0", shape=(None, 12, 8), dtype=float32)
Tensor("flatten/Identity:0", shape=(None, 96), dtype=float32)


In [17]:
embed_model.compile(optimizer= tf.keras.optimizers.Adam(lr=1e-3), 
                    loss='binary_crossentropy', metrics=['acc'])


In [18]:
print(type(word_embedding))
print(word_embedding)

<class 'tensorflow.python.framework.ops.Tensor'>
Tensor("embedding/Identity:0", shape=(None, 12, 8), dtype=float32)


In [19]:
embed_model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 12)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 12, 8)             400       
_________________________________________________________________
flatten (Flatten)            (None, 96)                0         
Total params: 400
Trainable params: 400
Non-trainable params: 0
_________________________________________________________________


In [20]:
embeddings = embed_model.predict(pad_corp)
#finally getting the embeddings

In [21]:
print(f" shape of embeddings: {embeddings.shape}")
print(embeddings)

 shape of embeddings: (3, 96)
[[-0.02321883  0.00590817 -0.03908654  0.00891554 -0.04585463 -0.03270767
  -0.04084574  0.0353308  -0.01433499  0.04560712 -0.04818144 -0.00751668
   0.03635092 -0.04925726 -0.01476747 -0.02798288 -0.0201286   0.03721992
   0.00277926  0.00698855  0.01600209 -0.01949042  0.03407076  0.02152048
   0.00773871 -0.01210929  0.04502001  0.03467814  0.01702353 -0.04315567
   0.00123763 -0.04305519 -0.0088929  -0.02109972 -0.02930001 -0.00044409
   0.01457237  0.01910624  0.02606511 -0.01090483  0.03869852  0.04664112
   0.04851368  0.04248032  0.04791268  0.01435616 -0.0347581   0.00446444
  -0.02522277  0.01396323  0.02518037 -0.00034447 -0.04707885 -0.03438509
   0.01970177  0.04845733 -0.02522277  0.01396323  0.02518037 -0.00034447
  -0.04707885 -0.03438509  0.01970177  0.04845733 -0.02522277  0.01396323
   0.02518037 -0.00034447 -0.04707885 -0.03438509  0.01970177  0.04845733
  -0.02522277  0.01396323  0.02518037 -0.00034447 -0.04707885 -0.03438509
   0.019

In [23]:
embeddings = embeddings.reshape(-1, maxlen, 8)
print(f"shape of embeddings: {embeddings.shape}")
print(embeddings)

shape of embeddings: (3, 12, 8)
[[[-0.02321883  0.00590817 -0.03908654  0.00891554 -0.04585463
   -0.03270767 -0.04084574  0.0353308 ]
  [-0.01433499  0.04560712 -0.04818144 -0.00751668  0.03635092
   -0.04925726 -0.01476747 -0.02798288]
  [-0.0201286   0.03721992  0.00277926  0.00698855  0.01600209
   -0.01949042  0.03407076  0.02152048]
  [ 0.00773871 -0.01210929  0.04502001  0.03467814  0.01702353
   -0.04315567  0.00123763 -0.04305519]
  [-0.0088929  -0.02109972 -0.02930001 -0.00044409  0.01457237
    0.01910624  0.02606511 -0.01090483]
  [ 0.03869852  0.04664112  0.04851368  0.04248032  0.04791268
    0.01435616 -0.0347581   0.00446444]
  [-0.02522277  0.01396323  0.02518037 -0.00034447 -0.04707885
   -0.03438509  0.01970177  0.04845733]
  [-0.02522277  0.01396323  0.02518037 -0.00034447 -0.04707885
   -0.03438509  0.01970177  0.04845733]
  [-0.02522277  0.01396323  0.02518037 -0.00034447 -0.04707885
   -0.03438509  0.01970177  0.04845733]
  [-0.02522277  0.01396323  0.02518037 -0

In [24]:
# 3 - 3 document
# 12 - each document is made of 12 words which was our maximun length of any document
# 8 - each word is 8 dimensional


# getting encoding for a particular word in a specific document 

for i, doc in enumerate(embeddings):
    for j, word in enumerate(doc):
        print(f" The encoding for {j+1} th word in {i+1} document is: \n\n {word}")

 The encoding for 1 th word in 1 document is: 

 [-0.02321883  0.00590817 -0.03908654  0.00891554 -0.04585463 -0.03270767
 -0.04084574  0.0353308 ]
 The encoding for 2 th word in 1 document is: 

 [-0.01433499  0.04560712 -0.04818144 -0.00751668  0.03635092 -0.04925726
 -0.01476747 -0.02798288]
 The encoding for 3 th word in 1 document is: 

 [-0.0201286   0.03721992  0.00277926  0.00698855  0.01600209 -0.01949042
  0.03407076  0.02152048]
 The encoding for 4 th word in 1 document is: 

 [ 0.00773871 -0.01210929  0.04502001  0.03467814  0.01702353 -0.04315567
  0.00123763 -0.04305519]
 The encoding for 5 th word in 1 document is: 

 [-0.0088929  -0.02109972 -0.02930001 -0.00044409  0.01457237  0.01910624
  0.02606511 -0.01090483]
 The encoding for 6 th word in 1 document is: 

 [ 0.03869852  0.04664112  0.04851368  0.04248032  0.04791268  0.01435616
 -0.0347581   0.00446444]
 The encoding for 7 th word in 1 document is: 

 [-0.02522277  0.01396323  0.02518037 -0.00034447 -0.04707885 -0