In [39]:
import csv
import gensim
import numpy as np
from gensim.models import KeyedVectors
import os
from scipy.spatial.distance import cdist

In [2]:
def extract_w2v():
    if not os.path.exists('vectors.csv'):
        if not os.path.exists('GoogleNews-vectors-negative300.bin.gz'):
            print("Download and place the model in the folder.")
            exit(0)
        wv = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True, limit=1000000)
        print("Extracting embeddings of top 1000000 words.")
        wv.save_word2vec_format('vectors.csv')
    else:
        print("using existing vector.csv")

In [3]:
def build_w2v_mapping():
    embeddings = {}
    with open('vectors.csv', encoding="utf8") as f:
        wv_file = f.readlines()
        for embed in wv_file[1:]:
            embed_split = embed.split(" ")
            embeddings[embed_split[0]] = np.array([float(x) for x in embed_split[1:]])
    return embeddings

In [20]:
def phrase_vector(embeddings, phrase):
    phrase_vector = np.array([0.0] * 300)
    words = phrase.split()
    words_found = 0
    for word in words:
        if word in embeddings:
            phrase_vector += embeddings[word]
            words_found += 1
    print(f"{words_found} words found in '{phrase}'")
    if words_found > 0:
        phrase_vector /= words_found
    return phrase_vector

In [15]:
# extract_w2v()
# embeddings = build_w2v_mapping()

In [68]:
phrase_embeddings = []
phrases = []
with open('phrases (1).csv', 'r') as csv_file:
    reader = csv.reader(csv_file)
    for phrase in reader:
        phrases.append(phrase[0])
        p_vector = phrase_vector(embeddings, phrase[0])
        phrase_embeddings.append(p_vector)

1 words found in 'Phrases'
4 words found in 'how company compares to its peers?'
7 words found in 'what is the detailed income statement breakdown of Axa?'
4 words found in 'world premium penetration in 2020'
12 words found in 'How does the forecasted insurance premium penetration in country trend compare to its peers?'
11 words found in 'what are the total losses for companies in country  non life market?'
5 words found in 'Insurance premiums market in Country'
13 words found in 'How have the profit margins been for the airlines industry through the years, by regions?'
8 words found in 'What is the complete profile of the top oil rigs?'
4 words found in 'what is company general information?'
6 words found in 'How is the airline industry performing globally?'
8 words found in 'how does economic profit for in country compare to others?'
5 words found in 'most profitable insurance company India'
9 words found in 'Give me a detailed breakup of the income and expenses in South-East Asia'
9

In [69]:
phrases = phrases[1:]
phrase_embeddings = np.stack(phrase_embeddings[1:])

In [70]:
phrase_distance = cdist(phrase_embeddings, phrase_embeddings,metric='cosine')

In [71]:
phrase_distance.round(2)

array([[0.  , 0.49, 0.63, ..., 0.58, 0.43, 0.65],
       [0.49, 0.  , 0.69, ..., 0.54, 0.53, 0.52],
       [0.63, 0.69, 0.  , ..., 0.58, 0.61, 0.63],
       ...,
       [0.58, 0.54, 0.58, ..., 0.  , 0.45, 0.27],
       [0.43, 0.53, 0.61, ..., 0.45, 0.  , 0.41],
       [0.65, 0.52, 0.63, ..., 0.27, 0.41, 0.  ]])

In [72]:
def closest_phrase(new_phrase, phrase_embeddings):
    new_phrase_embed = phrase_vector(embeddings, new_phrase).reshape(1, -1)
    phrase_distance = cdist(phrase_embeddings, new_phrase_embed, metric='cosine').reshape(-1)
    closest_phrase_idx = phrase_distance.argmin()
    print(f"Closest phrase is {phrases[closest_phrase_idx]}")

In [73]:
closest_phrase("Which are the best airports?", phrase_embeddings)

4 words found in 'Which are the best airports?'
Closest phrase is Which are the top airports?
