In [1]:
!pip install tensorflow



In [2]:
### Libraries USed Tensorflow> 2.0  and keras
import tensorflow as tf
print(tf.__version__)

2.18.0


In [23]:
##tensorflow >2.0
from tensorflow.keras.preprocessing.text import one_hot

In [24]:
### sentences
sent=[  'the glass of milk',
     'the glass of juice',
     'the cup of tea',
    'I am a good boy',
     'I am a good developer',
     'understand the meaning of words',
     'your videos are good']

In [25]:
sent # each sentence is of different length, in NN sentence size should be fixed/same. so we do post and pre padding

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'I am a good boy',
 'I am a good developer',
 'understand the meaning of words',
 'your videos are good']

In [26]:
### Vocabulary size #hyperparameter
voc_size=500

In [34]:
## one hot representation # will give index number where 1's are there. deprecated by tensorflow
#Deprecated: tf.keras.text.preprocessing.one_hot does not operate on tensors and is not recommended 
#for new code. Prefer tf.keras.layers.Hashing with output_mode='one_hot' which provides 
#equivalent functionality through a layer which accepts tf.Tensor input. 
#See the preprocessing layer guide for an overview of preprocessing layers. So using vectorizer below.
onehot_repr=[one_hot(words,voc_size)for words in sent] 
print(onehot_repr)


[[23, 390, 252, 321], [23, 390, 252, 56], [23, 322, 252, 496], [388, 449, 137, 331, 44], [388, 449, 137, 331, 151], [271, 23, 268, 252, 112], [94, 254, 89, 331]]


In [35]:
#Neural networks (especially embeddings / RNNs / transformers) expect fixed-length input sequences.
#But real sentences have variable length. So output_sequence_length tells TensorFlow how to handle that:
#If a sentence is shorter than output_sequence_length → it gets padded with 0s.
# If a sentence is longer → it gets truncated.
# Define TextVectorization
vectorizer = tf.keras.layers.TextVectorization(output_mode="int", output_sequence_length=8)
# "Adapt" builds the vocabulary
vectorizer.adapt(sent)

In [36]:
#It analyzes your dataset and builds the internal state the layer needs.
#For TextVectorization, adapt():
#tokenizes all your text samples counts word frequencies creates a vocabulary (mapping words → integer IDs)
#After that, the layer knows how to transform new unseen text consistently.
# Encode text to integer sequences
embedded_docs = vectorizer(sent)
print(embedded_docs)

tf.Tensor(
[[ 2  6  3 14  0  0  0  0]
 [ 2  6  3 16  0  0  0  0]
 [ 2 18  3 13  0  0  0  0]
 [ 5  7  8  4 19  0  0  0]
 [ 5  7  8  4 17  0  0  0]
 [12  2 15  3 10  0  0  0]
 [ 9 11 20  4  0  0  0  0]], shape=(7, 8), dtype=int64)


In [37]:
## word embedding representation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [38]:
import numpy as np

In [39]:
#In NN sentence size should be fixed/same. so we do post and pre padding. Assume max sentence length can be 8
#input size is fixed now
## pre padding
sent_length=8
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[  0   0   0   0  23 390 252 321]
 [  0   0   0   0  23 390 252  56]
 [  0   0   0   0  23 322 252 496]
 [  0   0   0 388 449 137 331  44]
 [  0   0   0 388 449 137 331 151]
 [  0   0   0 271  23 268 252 112]
 [  0   0   0   0  94 254  89 331]]


In [40]:
embedded_docs.shape

(7, 8)

In [41]:
type(embedded_docs)

numpy.ndarray

In [42]:
#conert each and every value to vectors
#feature representation size is considered 10 example the represented with 10 different values in a vector(10 * 1)
## 10 feature dimesnions
dim=10

In [43]:
model=Sequential()
model.add(Embedding(voc_size,10)) # (500,10,input_length=sent_length =8)
model.compile('adam','mse') # adam optimizer and mean squared error as loss functionS

In [44]:
model.summary()

In [45]:
##'the glass of milk',
embedded_docs[0] # max sentence length is 8

array([  0,   0,   0,   0,  23, 390, 252, 321])

In [17]:
model.predict(embedded_docs[0]) #0th sentence, 0 represented by 10 dimensions as feature dimension is 10
#0,0,0,0-> padded zeroes,126 -> the,241->glass,40->of, milk-> 109

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 334ms/step


array([[-0.01112232, -0.00081172,  0.02831871, -0.0245816 ,  0.01532907,
        -0.00180618,  0.01032201, -0.01420678, -0.01558369,  0.00040852],
       [-0.01112232, -0.00081172,  0.02831871, -0.0245816 ,  0.01532907,
        -0.00180618,  0.01032201, -0.01420678, -0.01558369,  0.00040852],
       [-0.01112232, -0.00081172,  0.02831871, -0.0245816 ,  0.01532907,
        -0.00180618,  0.01032201, -0.01420678, -0.01558369,  0.00040852],
       [-0.01112232, -0.00081172,  0.02831871, -0.0245816 ,  0.01532907,
        -0.00180618,  0.01032201, -0.01420678, -0.01558369,  0.00040852],
       [-0.02016089, -0.03144081, -0.03939272,  0.00629536, -0.00222921,
        -0.00986203,  0.03867291, -0.01435559, -0.02521435,  0.00554601],
       [-0.02333982,  0.02279809, -0.02882886,  0.01799783,  0.0167642 ,
         0.03396301,  0.01661063, -0.03770345,  0.01494106, -0.04478148],
       [-0.0358403 , -0.0272306 , -0.00584207,  0.03340179, -0.03380632,
        -0.03197195, -0.01240156,  0.00941103

In [18]:
#for index,value in np.ndenumerate(embedded_docs):
#    print(f"index:{index}, value:{value}")
#    print(model.predict(value))
X = []
for row in range(embedded_docs.shape[0]):
    for col in range(embedded_docs.shape[1]):
        print(f"\n row:{row}, col:{col},embedded_docs[row,col] :{embedded_docs[row,col]}\n")
        X.append(model.predict(embedded_docs[row]))
        print(X)


 row:0, col:0,embedded_docs[row,col] :0

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[array([[-0.01112232, -0.00081172,  0.02831871, -0.0245816 ,  0.01532907,
        -0.00180618,  0.01032201, -0.01420678, -0.01558369,  0.00040852],
       [-0.01112232, -0.00081172,  0.02831871, -0.0245816 ,  0.01532907,
        -0.00180618,  0.01032201, -0.01420678, -0.01558369,  0.00040852],
       [-0.01112232, -0.00081172,  0.02831871, -0.0245816 ,  0.01532907,
        -0.00180618,  0.01032201, -0.01420678, -0.01558369,  0.00040852],
       [-0.01112232, -0.00081172,  0.02831871, -0.0245816 ,  0.01532907,
        -0.00180618,  0.01032201, -0.01420678, -0.01558369,  0.00040852],
       [-0.02016089, -0.03144081, -0.03939272,  0.00629536, -0.00222921,
        -0.00986203,  0.03867291, -0.01435559, -0.02521435,  0.00554601],
       [-0.02333982,  0.02279809, -0.02882886,  0.01799783,  0.0167642 ,
         0.03396301,  0.01661063, -0.03770345,  0.01494106, -0.04478148],
   

In [19]:
import numpy as np
X_new=np.array(X)

In [20]:
# Train Test Split
# y not their in data set so cannot run ML algo on embeddings
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size = 0.20, random_state = 0)

NameError: name 'y' is not defined

In [None]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)

In [None]:
#prediction
y_pred=classifier.predict(X_test)

In [None]:
score=accuracy_score(y_test,y_pred)
print(score)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

In [None]:
### Assignment
sent=["The world is a better place",
      "Marvel series is my favourite movie",
      "I like DC movies",
      "the cat is eating the food",
      "Tom and Jerry is my favourite movie",
      "Python is my favourite programming language"
      ]
#assignment: movie review of 50k dataset imdb dataset kaggle