# Import Libraries

In [3]:
import gensim
from gensim.models import Word2Vec
import numpy as np
import nltk
import itertools
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import scipy
from scipy import spatial
from nltk.tokenize.toktok import ToktokTokenizer
import re
from nltk.corpus import stopwords

In [4]:
# Shared tokenizer instance used by remove_stopwords().
tokenizer = ToktokTokenizer()

# English stopword list used by remove_stopwords(). NLTK raises LookupError when
# the "stopwords" corpus has not been downloaded yet, so fetch it on first use
# to keep the notebook runnable on a fresh environment.
try:
    stopwords_list = stopwords.words("english")
except LookupError:
    nltk.download("stopwords")
    stopwords_list = stopwords.words("english")

# Sample Dataset

In [5]:
# Four single-element lists, each holding one sample document as a string.
# The text is split across adjacent string literals with an explicit trailing
# space on each piece: a backslash line-continuation inside a string literal
# joins the lines with no whitespace and would fuse words together
# (e.g. "theMotor"), which then fail to match any vocabulary entry.
Doc1 = [
    (
        "With the Union cabinet approving the amendments to the "
        "Motor Vehicles Act, 2016, those caught for drunken driving will "
        "have to have really deep pockets, as the fine payable in court "
        "has been enhanced to Rs 10,000 for first-time offenders."
    )
]
Doc2 = [
    (
        "Natural language processing (NLP) is an area of "
        "computer science and artificial intelligence concerned with the "
        "interactions between computers and human (natural) languages, "
        "in particular how to program computers to process and analyze "
        "large amounts of natural language data."
    )
]
Doc3 = [
    (
        "He points out that public transport is very good in "
        "Mumbai and New Delhi, where there is a good network of suburban "
        "and metro rail systems."
    )
]
Doc4 = [
    (
        "But the man behind the wickets at the other end was "
        "watching just as keenly. With an affirmative nod from Dhoni, "
        "India captain Rohit Sharma promptly asked for a review. Sure "
        "enough, the ball would have clipped the top of middle and leg."
    )
]

In [6]:
# Flatten the four single-document lists into one corpus list, in order.
fin = [*Doc1, *Doc2, *Doc3, *Doc4]

In [7]:
# Load the pretrained Word2Vec vectors from a local file in binary
# word2vec format (binary=True). The file must sit next to the notebook.
# Download link: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
model = gensim.models.KeyedVectors.load_word2vec_format(
    fname="GoogleNews-vectors-negative300.bin",
    binary=True,
)

# Preprocessing

In [88]:
def remove_stopwords(text, is_lower_case=False):
    """Strip punctuation and English stopwords from ``text``.

    Args:
        text: A string, or an iterable of strings, which is concatenated
            with no separator before processing.
        is_lower_case: When True, tokens are compared against the stopword
            list as-is (the caller guarantees lowercase input). When False,
            each token is lowercased for the comparison only; the returned
            text keeps its original casing.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Keep only ASCII letters, digits, whitespace and hyphens. The range must
    # be A-Z: "A-z" would also let through the characters [ \ ] ^ _ `.
    pattern = r"[^a-zA-Z0-9\s-]"
    text = re.sub(pattern, "", "".join(text))

    # Trim each token and drop any that end up empty.
    stripped = (token.strip() for token in tokenizer.tokenize(text))
    tokens = [token for token in stripped if token]

    # Set membership avoids rescanning the whole stopword list per token.
    stopword_set = set(stopwords_list)
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_set]
    else:
        filtered_tokens = [
            token for token in tokens if token.lower() not in stopword_set
        ]
    return " ".join(filtered_tokens)

# Get the N-dimensional embedding vector for a word

In [89]:
def get_embedding(word):
    """Look up the embedding vector for ``word`` in the loaded ``model``.

    Args:
        word: The token to look up.

    Returns:
        ``model[word]`` when the word is in the model; otherwise an all-zero
        numpy vector so that out-of-vocabulary tokens still produce a vector.
    """
    if word in model:
        return model[word]
    # Size the zero vector from the model itself so it always matches the
    # real embeddings; fall back to 300 if the model exposes no vector_size.
    return np.zeros(getattr(model, "vector_size", 300))

# Getting the average vector for each document

In [None]:
# Map each document's text to the mean of the embedding vectors of its
# tokens after punctuation and stopword removal.
out_dict = {}
for sen in fin:
    token_vectors = np.array(
        [get_embedding(token) for token in word_tokenize(remove_stopwords(sen))],
        dtype=float,
    )
    # Assign directly rather than through a temporary named `dict`, which
    # would shadow the builtin for every later cell in the kernel session.
    out_dict[sen] = np.mean(token_vectors, axis=0)

# Calculate the similarity between the query vector and document vector

In [97]:
def get_sim(query_embedding, average_vector):
    """Cosine similarity between a query vector and a document vector.

    Args:
        query_embedding: 1-D numeric vector for the query.
        average_vector: 1-D numeric vector for the document.

    Returns:
        A one-element list holding the similarity (1 - cosine distance).
        When either vector is all zeros the similarity is reported as 0.0,
        because the cosine is undefined there and would otherwise come out
        as NaN, which cannot be ordered meaningfully when sorting scores.
    """
    query_vec = np.asarray(query_embedding, dtype=float)
    doc_vec = np.asarray(average_vector, dtype=float)
    # A zero-norm vector has no direction; treat it as "no similarity".
    if not np.any(query_vec) or not np.any(doc_vec):
        return [0.0]
    return [1 - scipy.spatial.distance.cosine(query_vec, doc_vec)]

# Ranking the documents and query using word embeddings

In [98]:
def randDocuments(query):
    """Rank every document in ``out_dict`` by similarity to ``query``.

    The query is lowercased, tokenized, and represented as the mean of its
    token embeddings. Each document vector is then scored with ``get_sim``.

    Args:
        query: The query string.

    Returns:
        A list of ``(document, similarity)`` tuples, highest score first.
    """
    token_vectors = [get_embedding(token) for token in word_tokenize(query.lower())]
    query_vector = np.array(token_vectors, dtype=float).mean(axis=0)

    scored = [
        (document, get_sim(query_vector, doc_vector))
        for document, doc_vector in out_dict.items()
    ]
    # Highest similarity first; the sort is stable, so ties keep dict order.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    print("Ranked Documents : ")
    return scored

In [None]:
# Example query: rank the sample documents against the word "cricket".
randDocuments(query="cricket")