**Bag of Words**

  * Words into numerical form

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

docs = [
    "I am a good Cricket player",
    "Rajiv is a bad Chess player"
]

vectorizer = CountVectorizer()
bow_matrix =  vectorizer.fit_transform(docs)
#print(vectorizer.get_feature_names_out())

pd.DataFrame(bow_matrix.toarray() , columns=vectorizer.get_feature_names_out())

Unnamed: 0,am,bad,chess,cricket,good,is,player,rajiv
0,1,0,0,1,1,0,1,0
1,0,1,1,0,0,1,1,1


**TFiD vectorizer**

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

docs = [
    "AI is the future",
    "AI and ML are the future",
    "Physics is really intersting thing",
    "I am intrested in concepts of physics"
]

In [31]:
vectorizer = TfidfVectorizer()
tfid_matrix = vectorizer.fit_transform(docs)

In [32]:
vectorizer.get_feature_names_out()

array(['ai', 'am', 'and', 'are', 'concepts', 'future', 'in', 'intersting',
       'intrested', 'is', 'ml', 'of', 'physics', 'really', 'the', 'thing'],
      dtype=object)

In [33]:
import numpy as np
import pandas as pd

pd.DataFrame(tfid_matrix.toarray() , columns = vectorizer.get_feature_names_out())

Unnamed: 0,ai,am,and,are,concepts,future,in,intersting,intrested,is,ml,of,physics,really,the,thing
0,0.5,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.5,0.0
1,0.357455,0.0,0.453386,0.453386,0.0,0.357455,0.0,0.0,0.0,0.0,0.453386,0.0,0.0,0.0,0.357455,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.485461,0.0,0.382743,0.0,0.0,0.382743,0.485461,0.0,0.485461
3,0.0,0.421765,0.0,0.0,0.421765,0.0,0.421765,0.0,0.421765,0.0,0.0,0.421765,0.332524,0.0,0.0,0.0


**Tokenization & Word embedding**

  * Tokenization is the process of breaking a sentence or paragraph into smaller units called tokens usually words or subwords.

  * **Integer encoding** - After tokenization, we convert each unique word into integer ID, SO that it can be used for predictive modelling or in a neural network

In [52]:
from tensorflow.keras.preprocessing.text import Tokenizer


texts = [
    "I am playing good cricket",
    "He is playing chess",
]

#Initialize the tokenizer
tokenizer = Tokenizer()

#Fit the text on tokenizer
tokenizer.fit_on_texts(texts)

#View the word index
print(tokenizer.word_index)

{'playing': 1, 'i': 2, 'am': 3, 'good': 4, 'cricket': 5, 'he': 6, 'is': 7, 'chess': 8}


In [53]:
#Convert text to sequence of integers
sequences = tokenizer.texts_to_sequences(texts)
print(sequences)

[[2, 3, 1, 4, 5], [6, 7, 1, 8]]


In [54]:
from keras.utils import pad_sequences
padded_sequences = pad_sequences(sequences , padding='post')
padded_sequences

array([[2, 3, 1, 4, 5],
       [6, 7, 1, 8, 0]], dtype=int32)

In [55]:
from keras.utils import to_categorical
to_categorical(padded_sequences)

array([[[0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0.]]])

**Keras embedding**

In [60]:
from tensorflow.keras import models, layers

model = models.Sequential()
model.add(layers.Embedding(input_dim = 9 , output_dim = 4, input_length=5))
model.summary()

In [61]:
model.compile('adam' , 'accuracy')
word_vectors = model.predict(padded_sequences)
word_vectors

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 216ms/step


array([[[-0.02020686, -0.04125232,  0.01534704,  0.02173528],
        [ 0.01239552, -0.04860717, -0.04437011, -0.04814874],
        [ 0.00399593,  0.04833022, -0.02193937, -0.00617837],
        [-0.03564187, -0.01835127, -0.00079627,  0.02289392],
        [ 0.04922066, -0.02339051,  0.03565074,  0.04176407]],

       [[ 0.04921648,  0.02977195,  0.04304693,  0.02741532],
        [ 0.04451391, -0.01525299, -0.04584635,  0.01142229],
        [ 0.00399593,  0.04833022, -0.02193937, -0.00617837],
        [ 0.03769281,  0.04309872, -0.03978114,  0.0454892 ],
        [-0.0490764 ,  0.02441097,  0.00674274,  0.03950968]]],
      dtype=float32)

In [64]:
word_vectors[0].flatten()

array([-0.02020686, -0.04125232,  0.01534704,  0.02173528,  0.01239552,
       -0.04860717, -0.04437011, -0.04814874,  0.00399593,  0.04833022,
       -0.02193937, -0.00617837, -0.03564187, -0.01835127, -0.00079627,
        0.02289392,  0.04922066, -0.02339051,  0.03565074,  0.04176407],
      dtype=float32)

In [65]:
word_vectors[1].flatten()

array([ 0.04921648,  0.02977195,  0.04304693,  0.02741532,  0.04451391,
       -0.01525299, -0.04584635,  0.01142229,  0.00399593,  0.04833022,
       -0.02193937, -0.00617837,  0.03769281,  0.04309872, -0.03978114,
        0.0454892 , -0.0490764 ,  0.02441097,  0.00674274,  0.03950968],
      dtype=float32)

**Build a predictive model which can predict or identify text is about cricket or about chess**

In [None]:
texts = [
    "I am playing good cricket",
    "He is playing chess",
]

In [66]:
import numpy as np
sent_levels = np.array([1,0])

In [67]:
from tensorflow.keras import models, layers

model = models.Sequential()
model.add(layers.Embedding(input_dim = 9 , output_dim = 4, input_length=5))
model.add(layers.Flatten())
model.add(layers.Dense(4))
model.add(layers.Dense(1, activation='sigmoid'))



In [68]:
model.compile('adam' , loss='binary_crossentropy' , metrics = ['accuracy'])

In [69]:
model.fit(padded_sequences , sent_levels , epochs=10)

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.0000e+00 - loss: 0.7058
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step - accuracy: 0.5000 - loss: 0.7030
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - accuracy: 0.5000 - loss: 0.7001
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.5000 - loss: 0.6973
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.5000 - loss: 0.6945
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.5000 - loss: 0.6917
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 1.0000 - loss: 0.6889
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - accuracy: 1.0000 - loss: 0.6862
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m

<keras.src.callbacks.history.History at 0x7fd3d89becd0>

In [89]:
sent_to_predict = ["I am playing good cricket"]
sentseq = tokenizer.texts_to_sequences(sent_to_predict)
pred_data = pad_sequences(sentseq  , padding='post' , maxlen=5)
pred_data

array([[2, 3, 1, 4, 5]], dtype=int32)

In [90]:
model.predict(pred_data)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step


array([[0.5066875]], dtype=float32)

In [91]:
print(int(model.predict(pred_data) > 0.5))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
1


  print(int(model.predict(pred_data) > 0.5))


**Word2Vec**

**Glove**