In [1]:
#!pip install keras_nlp

In [2]:
import re
import tensorflow as tf
import numpy as np
import keras
from keras.layers import Dense, Input, TextVectorization
from keras.models import Sequential
from keras_nlp.layers import TokenAndPositionEmbedding, TransformerDecoder
from keras_nlp.metrics import Perplexity
from keras.callbacks import ReduceLROnPlateau,EarlyStopping
from nltk.metrics.distance import edit_distance

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


Using TensorFlow backend


In [3]:
with open("/kaggle/input/textgen/text.txt", "r") as f:
    raw_text = f.readlines()

In [4]:
with open("/kaggle/input/textgen/words_alpha.txt","r") as f:
    data = [word.strip() for word in f.readlines()]
dictionary = dict(zip(data, range(len(data))))

In [5]:
def preprocessing_text(raw_text):
    text = str(raw_text[3:]) #First 3 lines are header data, so I'm taking into account for the data 4th line onwards
    text = re.sub(r'@@\d+','',text) #Replacing text like @@1234
    text = re.sub(r'#','',text) #Replacing # from text
    text = re.sub(r"@(\s@)+",'',text)
    text = re.sub(r'\(.*?\)','',text)
    text = re.sub(r'\s+\'s', '', text)
    text = re.sub(r'<\w+>','',text)
    text = re.sub(r'</\w+>','',text)
    text = re.sub(r'\*\*(\d+;\d+;[A-Za-z])','',text) #**37;11433;TOOLONG
    text = re.sub(r"\\'","",text) #\' \' Nature \' \'
    text = re.sub(r'#&[a-zA-Z]+;[a-zA-Z]+\s*;','',text) #&amp;ndash ;
    text = re.sub(r'&[a-zA-Z]*;','',text) #&amp;
    text = re.sub(r"\n",' ',text)

    text = text.lower()
    return text.split(".")


In [6]:
text = preprocessing_text(raw_text)
text = list(text[:50000])


In [7]:
maxlen = 50
vectorize = TextVectorization(
    output_mode="int",
    output_sequence_length=maxlen+1
)

vectorize.adapt(text)
vocab = vectorize.get_vocabulary()

In [8]:
vocab_list = dict(zip(range(len(vocab)), vocab))
vocab_size = len(vocab_list)
print("Total number of unique words found from the data: ", vocab_size)

Total number of unique words found from the data:  69598


In [9]:
train = text[:int(0.85*len(text))]
test = text[int(0.85*len(text)):]

train = tf.data.Dataset.from_tensor_slices(train)
test = tf.data.Dataset.from_tensor_slices(test)
train = train.batch(128)
test = test.batch(128)

In [10]:
def vectorize_dataset(text):

  text = tf.expand_dims(text, -1)
  tokenized_text = vectorize(text)

  x = tokenized_text[:, :-1]
  y = tokenized_text[:, 1:]
  return x,y

train = train.map(vectorize_dataset)
train = train.prefetch(tf.data.AUTOTUNE)
#train = train.batch(1000)

test = test.map(vectorize_dataset)
test = test.prefetch(tf.data.AUTOTUNE)
#test = test.batch(1000)

In [11]:
input_layer = Input(shape=(maxlen,), dtype = tf.int32)
embedding_layer = TokenAndPositionEmbedding(vocab_size, maxlen, 128)(input_layer)
decoder_layer = TransformerDecoder(
    intermediate_dim = 128,
    num_heads = 4,
    dropout=0.2,
    activation="relu")(embedding_layer)
output_layer = Dense(vocab_size,
                     activation="softmax")(decoder_layer)

model = keras.Model(inputs = input_layer, outputs = output_layer)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 50)]              0         
                                                                 
 token_and_position_embeddin  (None, 50, 128)          8914944   
 g (TokenAndPositionEmbeddin                                     
 g)                                                              
                                                                 
 transformer_decoder (Transf  (None, 50, 128)          99584     
 ormerDecoder)                                                   
                                                                 
 dense_2 (Dense)             (None, 50, 69598)         8978142   
                                                                 
Total params: 17,992,670
Trainable params: 17,992,670
Non-trainable params: 0
_________________________________________________

In [12]:
model.compile(
        optimizer="adam",
        loss='sparse_categorical_crossentropy',
        metrics=[Perplexity(), 'accuracy']
    )

In [14]:
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.001)
early_stopping = EarlyStopping(monitor="val_accuracy", patience=5, verbose=1)
model.fit(train, validation_data = test, batch_size=2500, epochs=20, verbose=1,callbacks=[reduce_lr,early_stopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 9: early stopping


<keras.callbacks.History at 0x7a032001b220>

In [16]:
def predict_word(input_text):
  predicted_words = []
  #print(input_text)
  tokenized_prompt = vectorize([input_text])[:, :-1]
  predictions = model.predict([tokenized_prompt], verbose=0)
  sample_index = len(input_text.strip().split())-1
  #print(predictions[0][sample_index])
  logits, indices = tf.math.top_k(predictions[0][sample_index], k=20, sorted=True)
  indices = np.asarray(indices).astype("int32")
  #print(indices)
  for i in indices:
    predicted_words.append(vocab_list[i])

  return predicted_words


In [27]:
def autocorrect_sentence(sentence):
  new_sentence = " "
  mistake = 0
  wrong_word = ""
  tmp_predict = dict()
  tmp_word=[]
  tmp_key=[]
  correct_word = ""
  words = sentence.lower()
  words = sentence.split()
  for word in words:
    if word in dictionary:
      new_sentence += word
      new_sentence += " "
    else:
      wrong_word = word
      mistake = 1
      break

  if mistake == 0:
    print("The final corrected sentence is: ", new_sentence)
    #return new_sentence
  else:
  #if len(new_sentence) < len(sentence):
    predicted_words = predict_word(new_sentence)
    min_distance = 100
    for word in predicted_words:
      distance = edit_distance(word, wrong_word)
      #print(word," : ",distance)
      tmp_key.append(distance)
      tmp_word.append(word)
      """
      predict_dict = sorted(predict_dict)
      print(predict_dict)
      if distance < min_distance:
        correct_word = word
        min_distance = distance
    """

    tmp_predict = dict(zip(tmp_word,tmp_key))
    keys = list(tmp_predict.keys())
    values = list(tmp_predict.values())
    sorted_value_index = np.argsort(values)
    tmp_predict = {keys[i]: values[i] for i in sorted_value_index}

    #print(tmp_predict)
    for word in tmp_predict.keys():
      if len(word)>3 and (word[0] == wrong_word.lower()[0] and word[1] == wrong_word.lower()[1]) :
        correct_word = word
        new_sentence += correct_word

    print("Wrong word: ",wrong_word," Correct word", correct_word)
    #print(new_sentence)
    #print(len(words))

    new_sentence_words = new_sentence.split()
    for i in range(len(new_sentence_words),len(words)):
      new_sentence += " "
      new_sentence += words[i]
    #print(new_sentence)


    autocorrect_sentence(new_sentence)




In [28]:
texts = ["i am extremely ANSIOUS about the matter how it will apper to them",
        "i have been thinking of all the foreastd areas",
        "i am going away to meet my wift",
        "where have you been all this while"]

for text in texts:
  print("\n\nInput Sentence:", text.lower())
  output = autocorrect_sentence(text)
  print(output)



Input Sentence: i am extremely ansious about the matter how it will apper to them
Wrong word:  ANSIOUS  Correct word anxious
Wrong word:  apper  Correct word appear
The final corrected sentence is:   i am extremely anxious about the matter how it will appear to them 
None


Input Sentence: i have been thinking of all the foreastd areas
Wrong word:  foreastd  Correct word forested
The final corrected sentence is:   i have been thinking of all the forested areas 
None


Input Sentence: i am going away to meet my wift
Wrong word:  wift  Correct word wife
The final corrected sentence is:   i am going away to meet my wife 
None


Input Sentence: where have you been all this while
The final corrected sentence is:   where have you been all this while 
None
