# Importing the libraries

In [None]:
import nltk
from nltk.corpus import brown

import spacy
from spacy.tokens import Doc
import matplotlib.pyplot as plt

import numpy as np

# Load the small English pipeline by its full package name: the 'en' shortcut
# link was removed in spaCy v3, while the full name works in v2 and v3 alike.
nlp = spacy.load('en_core_web_sm')
# Download the Brown corpus and the universal tagset resources for NLTK.
nltk.download('brown')
nltk.download('universal_tagset')

!pip install sklearn-crfsuite

import sklearn
import sklearn_crfsuite

from sklearn_crfsuite import metrics

from sklearn.model_selection import train_test_split

import tensorflow.keras as keras
from keras.models import Model

from sklearn.metrics.pairwise import cosine_similarity

import scipy.spatial.distance as distance

!pip install -q annoy

import annoy

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [None]:
words = brown.words()
sentences = brown.sents()

# Assign unique ids to each word

For this task, every word must be mapped to a unique index.
Indices start from 1 (0 is reserved for padding).

In [None]:
def Assign_ids(words_list):
  """Map every distinct word to a unique integer id, starting at 1.

  Ids are handed out in order of first occurrence in `words_list`; repeated
  words keep the id of their first occurrence. Id 0 is never assigned.
  """
  word_to_id = {}
  for word in words_list:
    if word not in word_to_id:
      # The next free id is one more than the number of words seen so far.
      word_to_id[word] = len(word_to_id) + 1
  return word_to_id

In [None]:
ids = Assign_ids(words)

In [None]:
# Sanity check: look up the word holding the smallest id and display that id.
key_min = min(ids, key=ids.get)
ids[key_min]

1

## Global Variables

In [None]:

################

# Fixed length that every id sequence is padded or trimmed to.
MAX_INPUT_SIZE=10
# Number of distinct words that received an id.
NUM_WORDS = len(ids)

################

# String to IDs function

This function transforms a sentence (a sequence of tokens, such as a spaCy document) into a pair of lists of IDs. The first list represents the sentence without its last word; the second one represents it without its first word.

Before returning the result, padding/trimming should be applied to get uniform shapes.

In [None]:
def string_to_model_input(sentence):
  """Turn a tokenised sentence into padded (input, target) id sequences.

  Args:
    sentence: iterable of tokens; each one is looked up in `ids` by its
      string form (`str(token)`).

  Returns:
    A tuple `(X, Y)` of two lists of length MAX_INPUT_SIZE. `X` holds the ids
    of every token except the last one, `Y` the ids of every token except the
    first one. Both are right-padded with 0 or trimmed to MAX_INPUT_SIZE.

  Raises:
    KeyError: if a token has no entry in `ids`.
  """
  token_ids = [ids[str(token)] for token in sentence]

  # Slicing instead of pop(): an empty sentence yields all-padding sequences
  # rather than raising IndexError.
  X = token_ids[:-1]
  Y = token_ids[1:]

  # Pad with zeros, then trim, so both lists have exactly MAX_INPUT_SIZE ids.
  padding = [0] * MAX_INPUT_SIZE
  X = (X + padding)[:MAX_INPUT_SIZE]
  Y = (Y + padding)[:MAX_INPUT_SIZE]

  return (X, Y)

In [None]:
string_to_model_input(sentences[0])

([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [2, 3, 4, 5, 6, 7, 8, 9, 10, 11])




# Running the string_to_model_input function on the sentences

In [None]:
# Convert every sentence once and split the (input, target) pairs afterwards,
# instead of running string_to_model_input twice per sentence.
model_inputs = [string_to_model_input(sentence) for sentence in sentences]
X = [pair[0] for pair in model_inputs]
Y = [pair[1] for pair in model_inputs]

# Building the model

This function builds the language model. It is an LSTM model with an Embedding layer for dimensionality reduction and for embedding the corpus. 

The result of the prediction will be a matrix with dimension (MAX_INPUT_SIZE, 1, NUM_WORDS).

In [None]:
def build_model():
  """Build and compile the LSTM language model.

  The network is an Embedding layer, two stacked LSTM layers that both return
  the full sequence, and a softmax Dense layer applied at every position. The
  second LSTM layer is named 'target' so that its output can be retrieved by
  name later on.

  Returns:
    The compiled `keras.Sequential` model (sparse categorical cross-entropy
    loss, Adam optimizer, accuracy metric).
  """
  # Word ids run from 1 to NUM_WORDS and 0 is the padding id, so the largest
  # index fed to the embedding / used as a label is NUM_WORDS. Both the
  # embedding table and the softmax therefore need NUM_WORDS + 1 slots.
  vocab_size = NUM_WORDS + 1

  # build network topology
  model = keras.Sequential()
  model.add(keras.layers.Embedding(vocab_size, 10, input_length=MAX_INPUT_SIZE))
  model.add(keras.layers.LSTM(150, return_sequences=True))
  model.add(keras.layers.LSTM(100, return_sequences=True, name='target'))
  model.add(keras.layers.Dense(vocab_size, activation="softmax"))

  model.compile(loss=keras.losses.sparse_categorical_crossentropy, optimizer='Adam', metrics=['accuracy'])

  return model

In [None]:
model = build_model()

In [None]:
x= X[99]

In [None]:
model.fit(X,Y, epochs=100)

In [None]:
# GET THE TRAINED MODEL FROM DRIVE
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
model = keras.models.load_model('/content/gdrive/MyDrive/nlp03.pt')

# function for getting the LSTM cell state:



In [None]:
def get_cell_state(word_id_vector):
  """Return the output of the model's 'target' layer for one id sequence.

  Args:
    word_id_vector: list of word ids for a single sentence, already padded
      to the model's input length.

  Returns:
    The first (and only) element of the batch predicted by a model that is
    truncated at the layer named 'target'.
    NOTE(review): with `return_sequences=True` this is presumably the
    per-position LSTM output rather than the internal cell state c_t --
    confirm against the loaded model.
  """
  # Building a keras Model is expensive and this function is called once per
  # sentence, so the truncated model is reused for as long as the global
  # `model` is the same object.
  cache = getattr(get_cell_state, '_intermediate', None)
  if cache is None or cache[0] is not model:
    intermediate_layer = Model(inputs=model.input, outputs=model.get_layer('target').output)
    cache = (model, intermediate_layer)
    get_cell_state._intermediate = cache
  return cache[1].predict([word_id_vector])[0]

# prediction function

After the training, the model will be capable of predicting the next word of a sentence. To achieve that, the index of the largest probability in the last row of the predicted matrix (last word) is taken and mapped to the original list of words.


In [None]:
def predict_next_word(x):
  """Predict the word that follows the padded id sequence `x`.

  Args:
    x: list of MAX_INPUT_SIZE word ids, right-padded with 0.

  Returns:
    The most probable next word (a key of `ids`).

  Raises:
    ValueError: if `x` starts with padding, i.e. contains no word at all.
  """
  # Number of real (non-padding) ids: everything before the first 0, or the
  # whole sequence when there is no padding.
  num_words = x.index(0) if 0 in x else MAX_INPUT_SIZE
  if num_words == 0:
    raise ValueError("predict_next_word needs at least one word id before the padding")

  y = model.predict([x])[0]

  # The targets are the inputs shifted by one position, so row t of `y` is the
  # distribution of the word following x[t]. The next word of the sentence is
  # therefore read from the row of the last real word.
  next_probs = y[num_words - 1].tolist()

  # Labels are the word ids themselves, so the class index IS the word id.
  # Class 0 is padding and is skipped so that a real word is always returned.
  word_id = max(range(1, len(next_probs)), key=next_probs.__getitem__)

  id_to_word = {idx: word for word, idx in ids.items()}
  return id_to_word[word_id]
   

In [None]:
x= [1,2,3,4,5,6,7,8,9,0]
predict_next_word(x)

'conducted'

# Predict next word

In this part, an input field is available to enter a list of words in a loop.
This sequence is then processed and fed to the model to predict the next word.

In [None]:
# Read one word per line until an empty line is entered, then parse the
# space-separated words with spaCy.
typed_words = []

while True:
  word = input()
  if not word:
    break
  typed_words.append(word)

initial_text = ' '.join(typed_words)
initial_text = nlp(initial_text)


hello
how
are



In [None]:

def string_to_ids(initial_text):
  """Encode a sequence of tokens as a fixed-length list of word ids.

  Each token's `.text` is looked up in `ids` (a missing word raises
  KeyError). The result is right-padded with 0 or trimmed so that it has
  exactly MAX_INPUT_SIZE entries.
  """
  token_ids = [ids[token.text] for token in initial_text]
  token_ids.extend([0] * MAX_INPUT_SIZE)
  return token_ids[:MAX_INPUT_SIZE]

In [None]:
 pred_input = string_to_ids(initial_text)

In [None]:
print("sentence: ", initial_text, predict_next_word(pred_input))

sentence:  hello how are The


# cosine similarity
For this similarity metric, the cell state of the LSTM layer is used. In this task, a pair of sentences is read, transformed to IDs, and fed to the model to get the cell states. The results are flattened, and then the similarity is calculated.

In [None]:
# Keep prompting for sentence pairs until the first sentence is left empty.
# Every completed pair overwrites `sentence_pair`, so only the most recently
# entered pair is kept for the following cells.
while True:
  print("first sentence")
  first_sentence = input()
  # An empty first sentence ends the loop.
  if len(first_sentence) == 0:
    break
  print("second sentence")
  second_sentence = input()
  sentence_pair=[first_sentence, second_sentence]
  

first sentence
the cat is fast
second sentence
the dog is fast
first sentence



In [None]:
# Encode both sentences with the shared string_to_ids helper instead of
# repeating its look-up and padding logic inline. Each sentence is parsed
# with spaCy first so that the helper receives tokens exposing `.text`.
X = [string_to_ids(nlp(sentence)) for sentence in sentence_pair]

In [None]:
X

[[31, 15809, 143, 4877, 0, 0, 0, 0, 0, 0],
 [31, 4149, 143, 4877, 0, 0, 0, 0, 0, 0]]

In [None]:
# One state matrix per encoded sentence, in the same order as X.
sentences_cell_states = [get_cell_state(encoded_sentence) for encoded_sentence in X]

In [None]:
def _flatten_rows(rows):
  """Concatenate the rows of a two-level sequence into one flat list."""
  flat = []
  for row in rows:
    flat.extend(row)
  return flat

# Flatten each sentence's state matrix into a single vector.
fx1 = _flatten_rows(sentences_cell_states[0])
fx2 = _flatten_rows(sentences_cell_states[1])
print(len(fx1))

In [None]:
# Each flattened vector is wrapped in a list to form a one-row matrix; the
# similarity of the two sentences is then the entry at [0][0] of the result.
d = cosine_similarity([fx1], [fx2])
similarity_percent = d[0][0]*100

print(sentence_pair[0], ' and ', sentence_pair[1], ' are ', similarity_percent, '% similar' )


the cat is fast  and  the dog is fast  are  97.88339734077454 %  similar


# Mini search engine

In this task, all the corpus sentences are transformed to IDs in order to get all the cell states and index them with annoy. (I took the first 200 sentences for speed purposes.)

Then, an input field is provided in a loop to get the word sequence, and the 5 nearest neighbors are searched for.

An additional test is done at the end with the first sentence of the corpus; the nearest neighbor is the sentence itself, as it is supposed to be. 



In [None]:
def sentences_to_ids(corpus):
  """Encode every sentence of `corpus` as a padded list of word ids.

  Each sentence (a list of words) is wrapped in a spaCy `Doc` built on
  `nlp.vocab` and then encoded with `string_to_ids`.
  """
  return [string_to_ids(Doc(nlp.vocab, words=sentence)) for sentence in corpus]

In [None]:
# Select the first 200 sentence for speed purposes
sentences = sentences[:200]

In [None]:
sentenses_ids = sentences_to_ids(sentences)


In [None]:
def get_all_cell_states(vectors):
  """Run `get_cell_state` on every id vector and collect the results.

  Returns a list with one entry per element of `vectors`, in the same order.
  """
  return [get_cell_state(id_vector) for id_vector in vectors]

In [None]:
all_statevectors = get_all_cell_states(sentenses_ids)

In [None]:
all_statevectors = np.array(all_statevectors)

In [None]:
all_statevectors.shape

(200, 10, 100)

In [None]:
# Read the query one word per line until an empty line is entered.
query_words = []

while True:
  word = input()
  if not word:
    break
  query_words.append(word)

# Parse the space-joined query with spaCy, encode it as ids and fetch the
# state used as the search vector.
initial_text = ' '.join(query_words)
initial_text = nlp(initial_text)
search_ids = string_to_ids(initial_text)
search_state = get_cell_state(search_ids)

hell
is
not
that
bad



In [None]:
# The index dimension must equal the length of one flattened state matrix,
# so it is derived from the stored states instead of hard-coding 1000.
annoy_index = annoy.AnnoyIndex(int(np.prod(all_statevectors.shape[1:])), metric='angular')

In [None]:
# Store every sentence's flattened state matrix under its position in the
# corpus slice, so that a returned item id can be used to index `sentences`.
for item_id, state in enumerate(all_statevectors):
  annoy_index.add_item(item_id, list(state.flat))


In [None]:
annoy_index.build(100)

True

In [None]:
v= [item for sublist in search_state for item in sublist]



In [None]:
nns = annoy_index.get_nns_by_vector(v, 5, search_k=-1, include_distances=False)

In [None]:
for n in nns:
  print(sentences[n][:MAX_INPUT_SIZE])

['Ask', 'jail', 'deputies']
['Wards', 'protected']
['Colquitt']
['``', 'Must', 'solve', 'problem', "''"]
['Construction', 'bonds']


In [None]:
# Self-retrieval check: query the index with the first corpus sentence.
test = Doc(nlp.vocab, words=sentences[0])
search_ids = string_to_ids(test)
search_state = get_cell_state(search_ids)

# Flatten the state matrix row by row into the query vector.
v = []
for state_row in search_state:
  v.extend(state_row)
nns = annoy_index.get_nns_by_vector(v, 5, search_k=-1, include_distances=False)
nns



[0, 76, 154, 75, 91]

In [None]:
for n in nns:
  print(sentences[n][:MAX_INPUT_SIZE])

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of']
['Rep.', 'Mac', 'Barber', 'of', 'Commerce', 'is', 'asking', 'the', 'House', 'in']
['Rep.', 'James', 'Cotten', 'of', 'Weatherford', 'insisted', 'that', 'a', 'water', 'development']
['A', 'veteran', 'Jackson', 'County', 'legislator', 'will', 'ask', 'the', 'Georgia', 'House']
['The', 'former', 'county', 'school', 'superintendent', ',', 'George', 'P.', 'Callan', ',']
