In [297]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import gensim
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense
from keras.initializers import Constant

In [265]:
with open('Machine Learning.txt','r',encoding='utf-8') as myfile:
    mytext=myfile.read()

In [266]:
mytext[:1000]

"Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.\nRecently, artificial neural networks have been able to surpass many previous approaches in performance.\nML finds application in many fields, including natural language processing, computer vision, speech recognition, email filtering, agriculture, and medicine.\nWhen applied to business problems, it is known under the name predictive analytics. Although not all machine learning is statistically based, computational statistics is an important source of the field's methods.\nThe mathematical foundations of ML are provided by mathematical optimization (mathematical programming) methods.\nData mining is a related (parallel) field of study, focusing on exploratory data analysis (EDA) through unsupervised learning.\nFrom a theoretical viewpoint, pr

In [267]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts([mytext])

In [268]:
tokenizer.word_index

{'the': 1,
 'of': 2,
 'a': 3,
 'learning': 4,
 'to': 5,
 'and': 6,
 'in': 7,
 'is': 8,
 'data': 9,
 'machine': 10,
 'that': 11,
 'as': 12,
 'by': 13,
 'an': 14,
 'for': 15,
 'algorithms': 16,
 'with': 17,
 'are': 18,
 'or': 19,
 'from': 20,
 'on': 21,
 'model': 22,
 'be': 23,
 'training': 24,
 'it': 25,
 'can': 26,
 'set': 27,
 'based': 28,
 'supervised': 29,
 'methods': 30,
 'unsupervised': 31,
 'this': 32,
 'has': 33,
 'examples': 34,
 'used': 35,
 'algorithm': 36,
 'compression': 37,
 'feature': 38,
 'such': 39,
 'not': 40,
 'also': 41,
 'field': 42,
 'other': 43,
 'knowledge': 44,
 'been': 45,
 'which': 46,
 'one': 47,
 'input': 48,
 'example': 49,
 'many': 50,
 'analysis': 51,
 'was': 52,
 'more': 53,
 'its': 54,
 'ai': 55,
 'between': 56,
 'rule': 57,
 'have': 58,
 'systems': 59,
 'reinforcement': 60,
 'into': 61,
 'classification': 62,
 'but': 63,
 'detection': 64,
 'artificial': 65,
 'neural': 66,
 'performance': 67,
 'computer': 68,
 'known': 69,
 'mathematical': 70,
 'program

In [269]:
all_sequences=[]
for line in mytext.split('\n'):
    sequence=tokenizer.texts_to_sequences([line])[0]
    all_sequences.append(sequence)

In [270]:
all_sequences[1]

[533, 65, 66, 113, 58, 45, 329, 5, 534, 50, 235, 142, 7, 67]

In [271]:
all_seq=[]
for seq in all_sequences:
    for i in range(1,len(seq)):
        se=seq[:i+1]
        all_seq.append(se)

In [272]:
all_seq[:10]

[[10, 4],
 [10, 4, 177],
 [10, 4, 177, 8],
 [10, 4, 177, 8, 3],
 [10, 4, 177, 8, 3, 42],
 [10, 4, 177, 8, 3, 42, 2],
 [10, 4, 177, 8, 3, 42, 2, 109],
 [10, 4, 177, 8, 3, 42, 2, 109, 7],
 [10, 4, 177, 8, 3, 42, 2, 109, 7, 65],
 [10, 4, 177, 8, 3, 42, 2, 109, 7, 65, 100]]

In [273]:
maxlen=max([len(i) for i in all_seq])
pad_seq=np.array(pad_sequences(all_seq,maxlen=maxlen,padding='pre'))

In [274]:
pad_seq

array([[   0,    0,    0, ...,    0,   10,    4],
       [   0,    0,    0, ...,   10,    4,  177],
       [   0,    0,    0, ...,    4,  177,    8],
       ...,
       [   0,    0,    0, ...,    2,    3,  248],
       [   0,    0,    0, ...,    3,  248, 1319],
       [   0,    0,    0, ...,  248, 1319,   27]])

In [275]:
X=pad_seq[:,:-1]
y=pad_seq[:,-1]

In [276]:
X

array([[   0,    0,    0, ...,    0,    0,   10],
       [   0,    0,    0, ...,    0,   10,    4],
       [   0,    0,    0, ...,   10,    4,  177],
       ...,
       [   0,    0,    0, ...,  470,    2,    3],
       [   0,    0,    0, ...,    2,    3,  248],
       [   0,    0,    0, ...,    3,  248, 1319]])

In [277]:
y

array([   4,  177,    8, ...,  248, 1319,   27])

In [278]:
len(X)

4499

In [279]:
len(y)

4499

In [287]:
embedding_dictionary=dict()
glove_file=open('glove.6B.100d.txt',encoding='utf8')
for line in glove_file:
    records=line.split()
    word=records[0]
    vector_dimensions=np.array(records[1:],dtype='float32')
    embedding_dictionary[word]=vector_dimensions
glove_file.close()

In [290]:
embedding_dictionary['the']

array([-0.038194, -0.24487 ,  0.72812 , -0.39961 ,  0.083172,  0.043953,
       -0.39141 ,  0.3344  , -0.57545 ,  0.087459,  0.28787 , -0.06731 ,
        0.30906 , -0.26384 , -0.13231 , -0.20757 ,  0.33395 , -0.33848 ,
       -0.31743 , -0.48336 ,  0.1464  , -0.37304 ,  0.34577 ,  0.052041,
        0.44946 , -0.46971 ,  0.02628 , -0.54155 , -0.15518 , -0.14107 ,
       -0.039722,  0.28277 ,  0.14393 ,  0.23464 , -0.31021 ,  0.086173,
        0.20397 ,  0.52624 ,  0.17164 , -0.082378, -0.71787 , -0.41531 ,
        0.20335 , -0.12763 ,  0.41367 ,  0.55187 ,  0.57908 , -0.33477 ,
       -0.36559 , -0.54857 , -0.062892,  0.26584 ,  0.30205 ,  0.99775 ,
       -0.80481 , -3.0243  ,  0.01254 , -0.36942 ,  2.2167  ,  0.72201 ,
       -0.24978 ,  0.92136 ,  0.034514,  0.46745 ,  1.1079  , -0.19358 ,
       -0.074575,  0.23353 , -0.052062, -0.22044 ,  0.057162, -0.15806 ,
       -0.30798 , -0.41625 ,  0.37972 ,  0.15006 , -0.53212 , -0.2055  ,
       -1.2526  ,  0.071624,  0.70565 ,  0.49744 , 

In [292]:
embedding_matrix=np.zeros((vocab_size,DIM))
for word,index in tokenizer.word_index.items():
    embedding_vector=embedding_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index]=embedding_vector

In [293]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.038194  , -0.24487001,  0.72812003, ..., -0.1459    ,
         0.82779998,  0.27061999],
       [-0.1529    , -0.24279   ,  0.89837003, ..., -0.59100002,
         1.00390005,  0.20664001],
       ...,
       [-0.09901   , -0.28292999,  0.59741998, ..., -0.20057   ,
         0.011939  ,  0.61221999],
       [ 0.76877999, -0.33102   , -0.0075087 , ..., -0.34268001,
         0.66249001, -0.92395997],
       [ 0.080861  , -0.85575002, -0.69593   , ..., -0.056623  ,
        -0.21822999, -0.44773999]])

In [303]:
model=Sequential()

In [304]:
model.add(Embedding(vocab_size,DIM,embeddings_initializer=Constant(embedding_matrix),trainable=False,input_shape=(maxlen,)))
model.add(LSTM(150))
model.add(Dense(vocab_size,activation='softmax'))

In [305]:
model.summary()

In [306]:
model.compile(optimizer='adam',
             loss='sparse_categorical_crossentropy',
             metrics=['accuracy'])

In [307]:
model.fit(X,y,epochs=50,verbose=1)

Epoch 1/50
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 88ms/step - accuracy: 0.0398 - loss: 6.6431
Epoch 2/50
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 90ms/step - accuracy: 0.0661 - loss: 5.9995
Epoch 3/50
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 90ms/step - accuracy: 0.0876 - loss: 5.7499
Epoch 4/50
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 92ms/step - accuracy: 0.1171 - loss: 5.3899
Epoch 5/50
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 92ms/step - accuracy: 0.1322 - loss: 5.1173
Epoch 6/50
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 95ms/step - accuracy: 0.1590 - loss: 4.7604
Epoch 7/50
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 93ms/step - accuracy: 0.1852 - loss: 4.4876
Epoch 8/50
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 93ms/step - accuracy: 0.2043 - loss: 4.1339
Epoch 9/50
[1m141/141[

<keras.src.callbacks.history.History at 0x1f1997b5e40>

In [313]:
def next_word(input_text,predict_next_words):
    for _ in range(predict_next_words):
        input_seq=tokenizer.texts_to_sequences([input_text])[0]
        input_pad=np.array(pad_sequences([input_seq],maxlen=maxlen-1,padding='pre'))
        predict=np.argmax(model.predict(input_pad),axis=-1)
        output_text=''
        for word,index in tokenizer.word_index.items():
            if predict==index:
                output_text=word
                break
        input_text=input_text+' '+output_text
    return input_text

In [315]:
next_word('Machine learning is',10)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step


'Machine learning is a general term for any machine learning method that identifies'

In [318]:
next_word('used in this',5)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step


'used in this context is to classify data'

In [319]:
next_word('supervised learning is',10)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step


'supervised learning is a rule based machine learning method for discovering relationships between'

In [320]:
next_word('unsupervised learning',10)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step


'unsupervised learning algorithms find structures in data that has not been labeled'

In [328]:
next_word('A popular heuristic method',10)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step


'A popular heuristic method for sparse dictionary learning is the k svd algorithm is'

In [329]:
next_word('A popular heuristic way',10)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step


'A popular heuristic way that can be used in a way that makes it'