## Import Libraries

In [None]:
import nltk
# Download the NLTK data packages used later in this notebook: the WordNet
# corpus (imported below as `wn`), the Punkt models (presumably what
# sent_tokenize / word_tokenize load — confirm), and the stop-word lists.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import Counter
from keras.preprocessing.text import Tokenizer
import numpy as np 
import pandas as pd 
import random  
import string
from string import punctuation
import bs4 as bs  
import urllib.request  
import re
import collections
import operator
import multiprocessing
import gensim.models.word2vec as w2v
import os
import warnings
from scipy import spatial
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

In [None]:
# Ignore every FutureWarning raised from this point on, to keep cell outputs readable.
warnings.simplefilter(action='ignore', category=FutureWarning)

## Bag of Words

In [None]:
# Three toy documents; "dog" is repeated in the first one on purpose so the
# count matrix below contains a value greater than 1.
docs = [
  'I took the dog dog for a walk',
  'I went for a walk with my dog last evening',
  'I went to a movie last evening',
]

## Step 1: Determine the Vocabulary
# fit_on_texts builds `word_index`, the word -> integer-id mapping printed below.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(docs)
print(f'Vocabulary: {list(tokenizer.word_index.keys())}')

## Step 2: Count
# One row per document, one column per word id. The matrix has one more
# column than there are words: column 0 is never assigned to a word.
vectors = tokenizer.texts_to_matrix(docs, mode='count')
print(vectors) # First element is always 0

Vocabulary: ['i', 'dog', 'a', 'for', 'walk', 'went', 'last', 'evening', 'took', 'the', 'with', 'my', 'to', 'movie']
[[0. 1. 2. 1. 1. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0.]
 [0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 0. 0.]
 [0. 1. 0. 1. 0. 0. 1. 1. 1. 0. 0. 0. 0. 1. 1.]]


## Text processing and TF

In [None]:
# fetch the content from Internet by providing URL

def get_article(url):
  """Download a web page and return the text of its paragraphs as sentences.

  Args:
    url: Address of the page to fetch.

  Returns:
    A list of sentence strings, produced by ``nltk.sent_tokenize`` from the
    combined text of every ``<p>`` element found on the page.

  Raises:
    urllib.error.URLError: If the page cannot be retrieved.
  """
  # The context manager closes the HTTP response even if reading fails.
  with urllib.request.urlopen(url) as url_read:
    raw_html = url_read.read()

  article_html = bs.BeautifulSoup(raw_html, 'lxml')

  # Join paragraphs with a space so the last sentence of one paragraph can
  # never fuse with the first sentence of the next when the markup carries
  # no whitespace between them.
  article_text = ' '.join(para.text for para in article_html.find_all('p'))

  return nltk.sent_tokenize(article_text)

In [None]:
# process the text - lower case, remove spaces, punctuations and stopwords
 
def get_TF(text):
  """Compute term frequencies for a document, most frequent term first.

  The sentences are joined, lower-cased and tokenised; tokens that are
  punctuation or English stop words are dropped, and the remaining tokens
  are counted with ``CountVectorizer``.

  Args:
    text: Iterable of strings (for example the sentence list returned by
      ``get_article``).

  Returns:
    A list of ``(word, count)`` tuples sorted by descending count.
  """
  punc = string.punctuation
  # Build the stop-word set once: calling stopwords.words() inside the
  # filter would re-create the whole list for every single token.
  stop_words = set(stopwords.words("english"))

  words = nltk.word_tokenize(' '.join(text).lower())
  words = [
      re.sub(r'\s+', ' ', w)
      for w in words
      if w not in punc and w not in stop_words
  ]

  # Every token is handed to CountVectorizer as its own tiny "document";
  # summing the columns therefore gives the corpus-wide count of each term.
  cv = CountVectorizer()
  vec = cv.fit_transform(words)
  # Sum on the sparse matrix directly instead of densifying it first.
  count_list = np.asarray(vec.sum(axis=0)).ravel()

  # get_feature_names() was removed in scikit-learn 1.2; use its replacement
  # when available and fall back to the old name on older releases.
  if hasattr(cv, 'get_feature_names_out'):
    word_list = cv.get_feature_names_out()
  else:
    word_list = cv.get_feature_names()

  vocab = sorted(zip(word_list, count_list), key=operator.itemgetter(1), reverse=True)

  return vocab


In [None]:
# Provide URL, fetch articles, get TF of the entire vocabulary used

# The two pages compared throughout the TF / TF-IDF sections.
NLP_ARTICLE_URL = 'https://en.wikipedia.org/wiki/Natural_language_processing'
FA_CUP_ARTICLE_URL = 'https://indianexpress.com/article/sports/football/fa-cup-final-2020-arsenal-vs-chelsea-live-score-updates-6534611/'

# Download both articles first (each as a list of sentences) ...
doc1 = get_article(NLP_ARTICLE_URL)
doc2 = get_article(FA_CUP_ARTICLE_URL)

# ... then compute the (word, count) list of each one.
vocab1 = get_TF(doc1)
vocab2 = get_TF(doc2)

In [None]:
# See the article fetched - how many sentences and the article

import textwrap

# Number of sentences extracted from the first article.
print(len(doc1))

# Named `preview` rather than `slice` so the built-in `slice` type is not
# shadowed for every later cell of the notebook.
preview = doc1[0:20]

# Join with a space: the sentences carry no trailing whitespace, so an empty
# separator glues them together ("...language data.Challenges in ...").
textwrap.wrap(' '.join(preview), width = 100)

59


['Natural language processing (NLP) is a subfield of linguistics, computer science, information',
 'engineering, and artificial intelligence concerned with the interactions between computers and human',
 '(natural) languages, in particular how to program computers to process and analyze large amounts of',
 'natural language data.Challenges in natural language processing frequently involve speech',
 'recognition, natural language understanding, and natural-language generation.The history of natural',
 'language processing (NLP) generally started in the 1950s, although work can be found from earlier',
 'periods.In 1950, Alan Turing published an article titled "Computing Machinery and Intelligence"',
 'which proposed what is now called the Turing test as a criterion of intelligence[clarification',
 'needed].The Georgetown experiment in 1954 involved fully automatic translation of more than sixty',
 'Russian sentences into English.The authors claimed that within three or five years, machin

In [None]:
# See words and frequency
# `vocab2` is the value returned by get_TF(doc2): (word, count) pairs sorted
# by descending count. NOTE(review): this displays the complete list.

vocab2

## TF - IDF

In [None]:
# Get TFIDF of two docs fetched previously

# Each article is collapsed into a single string, so the TF-IDF matrix has
# one row per article (row 0 = doc1, row 1 = doc2) and one column per term.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([' '.join(doc1), ' '.join(doc2)])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
  feature_names = list(vectorizer.get_feature_names_out())
else:
  feature_names = vectorizer.get_feature_names()

dense = vectors.todense()
denselist = dense.tolist()
# `df` is read by get_doc() and by the inspection cell further down.
df = pd.DataFrame(denselist, columns=feature_names)
df

In [None]:
# Get TFIDF score of each document based on the query

def get_doc(query):
  """Score the two documents against a query by summing TF-IDF weights.

  The query is lower-cased and tokenised with ``nltk.word_tokenize``. Every
  token that is a column of the module-level ``df`` adds that column's
  row-0 value to the first score and its row-1 value to the second score.
  Tokens that are not columns are ignored; a token repeated in the query is
  counted once per occurrence.

  Args:
    query: The search string.

  Returns:
    A ``(score_doc1, score_doc2)`` tuple of the accumulated weights.
  """
  tokens = nltk.word_tokenize(query.lower())
  known_terms = df.columns

  score_first = 0.
  score_second = 0.
  for token in tokens:
    if token not in known_terms:
      continue
    weights = np.array(df[token])
    score_first += weights[0]
    score_second += weights[1]

  return score_first, score_second


In [None]:
  # Enter the query here
  # (the following cells read it through the module-level name `query`)

  query = 'i want to learn machine learning'

In [None]:
  # Show which document to fetch based on the query above

  tfidf_doc1, tfidf_doc2 = get_doc(query)

  # Document 1 wins ties; the id is kept inside a one-element list, which is
  # why it is printed with square brackets.
  if tfidf_doc1 >= tfidf_doc2:
    docid = [1]
  else:
    docid = [2]

  print('Show document', docid,'\n')
  print('TF-IDF of doc1 = %0.4f'%(tfidf_doc1))
  print('TF-IDF of doc2 = %0.4f'%(tfidf_doc2),'\n')

Show document [1] 

TF-IDF of doc1 = 0.4532
TF-IDF of doc2 = 0.1714 



In [None]:
  # See data in the dataframe

  # Keep only the query tokens that are columns of `df`. The intersection is
  # turned into a sorted list because pandas does not accept a set as a
  # column indexer (deprecated in 1.5, TypeError from 2.0); sorting also
  # makes the column order deterministic.
  col = sorted(set(word_tokenize(query)).intersection(df.columns))
  df[col]

Unnamed: 0,machine,learning,learn,to
0,0.123699,0.108237,0.023194,0.198029
1,0.0,0.0,0.0,0.17144


## Word to Vector - word2vec

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Three six-component vectors, each written directly as a single-row 2-D array.
per1 = np.array([[90, 60, 75, 88, 98, 45]])
per2 = np.array([[87, 65, 55, 90, 95, 35]])
per3 = np.array([[40, 77, 95, 38, 48, 95]])

# Pairwise similarities, computed in the order 1-2, 1-3, 2-3.
sim_1_2 = cosine_similarity(per1, per2)
sim_1_3 = cosine_similarity(per1, per3)
sim_2_3 = cosine_similarity(per2, per3)
print(sim_1_2, sim_1_3, sim_2_3)


[[0.99343892]] [[0.84337893]] [[0.80204826]]


In [None]:
# Read the whole corpus; the `with` block closes the file handle as soon as
# the text has been read (the original handle was never closed).
with open("/content/1-18 books combined.txt", "r") as fileObject:
  mb = fileObject.read()

# Split the raw text into sentences for the Word2Vec pipeline below.
mb_sent = sent_tokenize(mb)

# Show only the opening of the corpus: printing all of `mb` writes the
# complete text into the notebook output.
print(mb[:1000])

In [None]:
# Sanity check: show the first sentence produced by sent_tokenize.
print(mb_sent[0])

In [None]:
def sent2word(sentences):
  """Tokenise each sentence into a list of word tokens.

  Args:
    sentences: Iterable of sentence strings.

  Returns:
    A list with one token list (from ``word_tokenize``) per non-empty
    sentence, in input order. Empty strings are skipped.
  """
  return [
      word_tokenize(raw_sentence)
      for raw_sentence in sentences
      if len(raw_sentence) > 0
  ]


In [None]:
# Tokenise every corpus sentence; `sentences` is a list of token lists.
sentences = sent2word(mb_sent)
# Spot-check one tokenised sentence.
sentences[5]

In [None]:
# Total number of tokens across all tokenised sentences.
token_count = sum(map(len, sentences))
print("The book corpus contains {0:,} tokens.".format(token_count))

## Build Word2Vec Model

In [None]:
# Hyper-parameters forwarded to the Word2Vec constructor in the next cell.
# Embedding dimensionality (the vector-size argument).
num_features = 500
# Passed as `min_count`.
min_word_count = 6
# One worker per CPU core reported by the OS; passed as `workers`.
num_workers = multiprocessing.cpu_count()
# Passed as `window`.
context_size = 15
# Passed as `sample`.
downsampling = 1e-3
# Passed as `seed`.
seed = 1

In [None]:
# Arguments whose names are the same in every gensim release.
# sg=1 selects the skip-gram training algorithm.
_w2v_params = dict(
    sg=1,
    seed=seed,
    workers=num_workers,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling,
)

# The embedding dimensionality was renamed from `size` to `vector_size` in
# gensim 4.0, and each release rejects the other spelling with a TypeError.
# Try the current name first and fall back to the legacy one.
try:
    mb2vec = w2v.Word2Vec(vector_size=num_features, **_w2v_params)
except TypeError:
    mb2vec = w2v.Word2Vec(size=num_features, **_w2v_params)

In [None]:
# Scan the tokenised sentences to build the model vocabulary.
mb2vec.build_vocab(sentences)

# gensim >= 4.0 exposes the vocabulary as `wv.key_to_index`; the `wv.vocab`
# attribute only exists in older releases.
if hasattr(mb2vec.wv, "key_to_index"):
    vocab_size = len(mb2vec.wv.key_to_index)
else:
    vocab_size = len(mb2vec.wv.vocab)
print("Word2Vec vocabulary length:", vocab_size)

In [None]:
# Train on the tokenised sentences for 10 epochs. `total_examples` is read
# back from the model (presumably recorded by build_vocab above — confirm).
mb2vec.train(sentences, total_examples=mb2vec.corpus_count, epochs = 10)

In [None]:
# Query the trained vectors through `.wv`, as the king/woman cell below
# already does; the model-level `most_similar` shortcut is gone in gensim 4.
mb2vec.wv.most_similar('Draupadi')

In [None]:
def nearest_similarity_cosmul(start1, end1, start2):
    '''Find the word that completes the relationship.

    Solves the analogy "start1 is to end1 as start2 is to ?" on the
    module-level ``mb2vec`` model and prints the completed sentence.

    Args:
        start1: First word of the known pair.
        end1: Second word of the known pair.
        start2: First word of the pair to complete.

    Returns:
        The top-ranked word returned by ``most_similar_cosmul``.
    '''
    # To complete start1:end1 :: start2:?, the answer must be close to end1
    # and start2 and far from start1. This is the same arrangement as the
    # king/woman/man query in this notebook (positive=['woman', 'king'],
    # negative=['man']). The original had start1 and end1 swapped, which
    # solves the reverse analogy.
    similarities = mb2vec.wv.most_similar_cosmul(
        positive=[end1, start2],
        negative=[start1])
    end2 = similarities[0][0]
    # Name the fields explicitly rather than passing every local via **locals().
    print("{start1} is related to {end1}, as {start2} is related to {end2}".format(
        start1=start1, end1=end1, start2=start2, end2=end2))
    return end2

In [None]:
# Each call prints one analogy sentence and returns the completing word;
# only the last call is the cell's final expression.
nearest_similarity_cosmul("Dhritarastra" ,"Pandu", "Nakula")  
nearest_similarity_cosmul("Bhima" ,"Arjuna", "Ambika") 
nearest_similarity_cosmul("Karna" ,"Duryodhana", "Kunti")
nearest_similarity_cosmul("Bhima" ,"Draupadi", "Ulupi")

In [None]:
# Analogy query on the trained vectors: 'woman' and 'king' are the positive
# terms, 'man' is the negative term.
mb2vec.wv.most_similar(positive=['woman', 'king'], negative=['man'])

## Save the Model

In [None]:
# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs("trained", exist_ok=True)
mb2vec.save(os.path.join("trained", "mb2vec.w2v"))


In [None]:
# Reload the model from the file written by the save cell above, rebinding `mb2vec`.
mb2vec = w2v.Word2Vec.load(os.path.join("trained", "mb2vec.w2v"))

## GloVe - the pre-trained model

In [None]:
!pip install glove_python

Collecting glove_python
[?25l  Downloading https://files.pythonhosted.org/packages/3e/79/7e7e548dd9dcb741935d031117f4bed133276c2a047aadad42f1552d1771/glove_python-0.1.0.tar.gz (263kB)
[K     |████████████████████████████████| 266kB 2.9MB/s 
Building wheels for collected packages: glove-python
  Building wheel for glove-python (setup.py) ... [?25l[?25hdone
  Created wheel for glove-python: filename=glove_python-0.1.0-cp36-cp36m-linux_x86_64.whl size=700224 sha256=3601c6c5f07ba5c5d9a0984c1aec7abe41ead4694a727dd8b333169a1b60010e
  Stored in directory: /root/.cache/pip/wheels/88/4b/6d/10c0d2ad32c9d9d68beec9694a6f0b6e83ab1662a90a089a4b
Successfully built glove-python
Installing collected packages: glove-python
Successfully installed glove-python-0.1.0


In [None]:
# Tiny toy corpus used to train a GloVe model from scratch.
corp_glove = [
    'Hello this is a tutorial on how to convert the word in an integer format',
    'this is a beautiful day',
    'Jack is going to office',
]

# Tokenise with the helper defined in the word2vec section; that cell must
# have been run first, otherwise this raises NameError.
corp_sent = sent2word(corp_glove)

NameError: ignored

In [None]:
# Display the tokenised toy corpus (one token list per sentence).
corp_sent

In [None]:
from glove import Corpus, Glove

In [None]:
# Fit the Corpus object on the tokenised sentences with a window of 10;
# it exposes the `matrix` and `dictionary` attributes used below
# (presumably the co-occurrence matrix and the word -> id map — confirm).
corpus = Corpus()
corpus.fit(corp_sent, window=10)
# Train 5-component vectors on that matrix for 30 epochs with 4 threads.
glove = Glove(no_components=5, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
# Attach the corpus dictionary to the model; the next cell looks words up
# through `glove.dictionary`.
glove.add_dictionary(corpus.dictionary)

In [None]:
# Look up the row index of 'tutorial', then print its trained vector.
tutorial_index = glove.dictionary['tutorial']
print(glove.word_vectors[tutorial_index])


In [None]:
# Load the pre-trained GloVe vectors into a {word: vector} dictionary.
embeddings_dict = {}
with open('/content/glove.6B.50d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # Each line is a token followed by its vector components, all
        # separated by whitespace.
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [None]:
def get_embedding(word):
  """Return the vector stored for `word` in `embeddings_dict`.

  The lookup uses the lower-cased word. Returns ``None`` when the word has
  no entry.
  """
  return embeddings_dict.get(word.lower())

In [None]:
# Look up one word; nothing is displayed if it has no entry (the function then returns None).
get_embedding('kailash')

In [None]:
def find_closest_embeddings(embedding):
  """Rank every word in `embeddings_dict` by distance to a reference vector.

  Args:
    embedding: Either a word present in ``embeddings_dict`` (its stored
      vector is used as the reference) or the reference vector itself.

  Returns:
    All keys of ``embeddings_dict`` sorted by increasing Euclidean distance
    to the reference, nearest first.

  Raises:
    KeyError: If `embedding` is a word with no entry in ``embeddings_dict``.
  """
  # The original built a list holding one never-evaluated generator and
  # returned nothing. This returns the ranking its commented-out line
  # described, and still accepts a word key as the live code did.
  target = embeddings_dict[embedding] if isinstance(embedding, str) else embedding
  return sorted(
      embeddings_dict.keys(),
      key=lambda word: spatial.distance.euclidean(embeddings_dict[word], target))


In [None]:
# Collect every word whose vector lies within a Euclidean distance of 4.0
# of the vector for 'king', together with that distance.
word_list, distance_list = [], []
for word, vector in embeddings_dict.items():
  distance = spatial.distance.euclidean(vector, embeddings_dict['king'])
  if distance < 4.0:
    word_list.append(word)
    distance_list.append(distance)

print(word_list)
print(distance_list)

In [None]:
import gensim.downloader as api
# `model` is queried by the following cells (item lookup, most_similar, doesnt_match).
model = api.load("glove-wiki-gigaword-100")  # download the model and return as object ready for use


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
# Get embeddings
# Indexing the model by a word displays that word's vector as a NumPy array.
model['samsung']

array([ 0.20165 , -0.10063 ,  0.70741 ,  0.38328 ,  0.26653 , -0.73444 ,
        0.18056 ,  0.10257 ,  0.36666 , -0.10203 ,  0.74039 ,  0.064539,
        0.17015 , -0.3904  , -0.028034,  0.010385,  0.16209 ,  0.45656 ,
        1.1704  , -0.081035,  0.9173  , -0.87593 ,  0.025113,  1.2481  ,
       -0.11057 ,  0.14544 ,  0.013   ,  0.47959 , -0.37063 , -0.20684 ,
        0.15821 ,  0.64398 , -0.53477 ,  0.050977,  0.42249 , -0.31015 ,
        0.5536  ,  0.28029 , -0.14466 , -0.55467 ,  0.31696 ,  0.067902,
       -0.48795 , -0.10996 , -0.86542 ,  1.0242  ,  0.67504 ,  0.77755 ,
        0.30693 , -0.79164 , -0.34635 , -0.83695 , -0.36242 , -0.52085 ,
        0.18611 , -0.23385 , -1.1157  , -0.25272 ,  0.70889 , -0.017324,
       -0.040643, -0.37613 , -0.42664 ,  0.51648 , -0.33523 ,  0.092375,
        0.53513 ,  1.2845  ,  0.18675 ,  1.005   ,  0.75454 ,  0.56702 ,
       -0.45096 , -0.83243 , -0.92564 ,  0.83302 ,  0.53917 ,  0.095942,
       -0.11653 ,  0.47449 ,  2.1476  , -0.33144 , 

In [None]:
# Get similar words
def similar_words(word, topn=5):
  """Return the words most similar to the given word(s) in `model`.

  Args:
    word: A list of words, or a single word given as a plain string.
    topn: How many neighbours to return (default 5).

  Returns:
    The result of ``model.most_similar`` for the lower-cased word(s); the
    caller in this notebook unpacks each entry as a (word, score) pair.
  """
  # A bare string would otherwise be iterated character by character.
  if isinstance(word, str):
    word = [word]
  word = [item.lower() for item in word]
  return model.most_similar(positive=word, topn=topn)

In [None]:
# Query the model for the neighbours of one word and print them as a table.
word = ['gachibowli']
output = similar_words(word)

# One line per neighbour: rank, title-cased word, similarity score.
for rank, (neighbour, score) in enumerate(output):
  print ('{:4s} {:10s} : {:0.4f},'.format(repr(rank),repr(neighbour.title()), score))

0    'Fedexfield' : 0.7650,
1    'Madhapur' : 0.7549,
2    'Barabati' : 0.7416,
3    'Higashi-Ku' : 0.7330,
4    'Kingsmeadow' : 0.7311,


In [None]:
# Finding the odd one out
# doesnt_match returns the one word from the list that the model picks as not belonging.
model.doesnt_match(['mango','banana', 'apricot','peach','guava'])

'banana'

## Wordnet

In [None]:
# Readable labels for the part-of-speech codes returned by synset.pos().
poses = {'n':'noun', 'v': 'verb', 's':'adj (s)','a':'adj', 'r':'adv'}

# For every synset of 'good', print its part of speech and its lemma names.
for synset in wn.synsets('good'):
  pos_label = poses[synset.pos()]
  lemma_names = '.'.join(lemma.name() for lemma in synset.lemmas())
  print('{}:{}'.format(pos_label, lemma_names))

In [None]:
panda = wn.synset('panda.n.01')
hyper = lambda s: s.hypernyms()

# closure() yields the hypernym chain lazily and the resulting iterator has
# no `.synset` attribute, so the original print raised AttributeError.
# Materialise the chain into a list and print that.
list1 = list(panda.closure(hyper))
print(list1)

## Understanding RNN Layer

In [None]:
class myRNNCell(tf.keras.layers.Layer):
  """Minimal recurrent cell that keeps its hidden state between calls.

  Each call updates the state as ``h = tanh(W_hh @ h + W_xh @ x)`` and
  returns ``W_hy @ h`` together with the new state.

  NOTE(review): `tf` is not imported in the visible part of the notebook;
  `import tensorflow as tf` must have been run before this cell — confirm.

  Args:
    rnn_units: Size of the hidden state.
    input_dim: Size of each input vector.
    output_dim: Size of each output vector.
  """

  def __init__(self, rnn_units, input_dim, output_dim):
    super(myRNNCell, self).__init__()

    # initialise weight matrices; `shape` is passed by keyword because the
    # first positional parameter of add_weight differs between Keras releases
    self.W_xh = self.add_weight(shape=[rnn_units, input_dim])
    self.W_hh = self.add_weight(shape=[rnn_units, rnn_units])
    self.W_hy = self.add_weight(shape=[output_dim, rnn_units])

    # initialise hidden state to zeros as a TensorFlow tensor: a float64
    # NumPy array has a different dtype from the weights (presumably
    # float32 by default — confirm)
    self.h = tf.zeros([rnn_units, 1])

  def call(self, x):
    """Advance the cell one step; returns ``(output, hidden_state)``."""
    # assumes x is a column vector of shape (input_dim, 1) — TODO confirm

    # update hidden state (matrix products, not elementwise `*`)
    self.h = tf.math.tanh(tf.matmul(self.W_hh, self.h) + tf.matmul(self.W_xh, x))

    # compute output
    output = tf.matmul(self.W_hy, self.h)

    # return the current output and hidden state
    return output, self.h


In [None]:
# above code is same as
# NOTE(review): `rnn_units` is only a constructor parameter of myRNNCell in
# the visible source; it needs a module-level value before this line runs.
myRNN = tf.keras.layers.SimpleRNN(rnn_units)