# Comparing word2vec embeddings for a user input phrase against a list of preloaded phrases

## Loading dependancies

In [38]:
import os

import pandas as pd
import numpy as np

import gensim
from gensim.models import KeyedVectors

## Setting paths for models and files

In [2]:
WORD2VEC_MODEL_PATH = "../models/GoogleNews-vectors-negative300.bin.gz"
PHRASES_FILE_PATH = "../data/phrases.csv"
VECTORS_FILE_PATH = "../data/vectors.csv"

In [14]:
# Check if VECTORS_FILE exists

if os.path.exists(VECTORS_FILE_PATH):
    print(f"Vectors file exists, loading from {VECTORS_FILE_PATH}")
    wv = KeyedVectors.load_word2vec_format(VECTORS_FILE_PATH)
elif os.path.exists(WORD2VEC_MODEL_PATH):
    print(f"Vectors file does not exist, loading from {WORD2VEC_MODEL_PATH}")
    wv = KeyedVectors.load_word2vec_format(WORD2VEC_MODEL_PATH, binary=True, limit=1000000)
    print(f"Saving vectors to {VECTORS_FILE_PATH}")
    wv.save_word2vec_format(VECTORS_FILE_PATH)
else:
    print("Word2Vec model not found. Please place the model at models/GoogleNews-vectors-negative300.bin.gz")

Vectors file exists, loading from ../data/vectors.csv


## Loading Phrases CSV

In [4]:
phrases = pd.read_csv(PHRASES_FILE_PATH)

In [5]:
phrases

Unnamed: 0,Phrases
0,how company compares to its peers?
1,what is the detailed income statement breakdow...
2,world premium penetration in 2020
3,How does the forecasted insurance premium pene...
4,what are the total losses for companies in cou...
5,Insurance premiums market in Country
6,How have the profit margins been for the airli...
7,What is the complete profile of the top oil rigs?
8,what is company general information?
9,How is the airline industry performing globally?


## Defining functions to convert phrases to embeddings


Making an assumption here, normalising the vector in the traditional sense, by dividing it by the norm of the said vector. \
This might be incorrect for word2vec embeddings, or not. I'm not sure.

In [61]:
def get_phrase_vector(phrase):
    combined_vectors = None
    normalized_vector = None

    # Splitting phrase into words
    for word in phrase.split(): 
        try:
            vector = wv.get_vector(word)
            if combined_vectors is None:
                combined_vectors = vector
            else:
                vectors = combined_vectors + vector
        except KeyError as e:
            print(f"Word {word} not found in the embeddings. Skipping.")

    # Normalising the vector, not sure if this is correct for word2vec vectors
    if combined_vectors is not None:
        norm = np.linalg.norm(combined_vectors)
        normalized_vector = combined_vectors / norm
    
    return normalized_vector

## Computing embeddings for all given phrases

In [62]:
phrases['Embedding'] = phrases['Phrases'].apply(get_phrase_vector)

Word to not found in the embeddings. Skipping.
Word peers? not found in the embeddings. Skipping.
Word of not found in the embeddings. Skipping.
Word Axa? not found in the embeddings. Skipping.
Word 2020 not found in the embeddings. Skipping.
Word to not found in the embeddings. Skipping.
Word peers? not found in the embeddings. Skipping.
Word market? not found in the embeddings. Skipping.
Word years, not found in the embeddings. Skipping.
Word regions? not found in the embeddings. Skipping.
Word of not found in the embeddings. Skipping.
Word rigs? not found in the embeddings. Skipping.
Word information? not found in the embeddings. Skipping.
Word globally? not found in the embeddings. Skipping.
Word to not found in the embeddings. Skipping.
Word others? not found in the embeddings. Skipping.
Word a not found in the embeddings. Skipping.
Word of not found in the embeddings. Skipping.
Word and not found in the embeddings. Skipping.
Word South-East not found in the embeddings. Skipping.


## Looks like we really need some cleanup here
Going forward the following approach
- Remove special characters
- Remove stop words
- convert all words to lowercase


For stop words, using spaCy's list of english stop words. We will use some regular expressions for cleaning up strings.

In [66]:
from spacy.lang.en.stop_words import STOP_WORDS
import re

### We need to redefine get_phrase_vector function to remove stop words and special characters

In [75]:
def get_phrase_vector(phrase):
    combined_vectors = None
    normalized_vector = None

    phrase = phrase.lower().strip()
    phrase = re.sub(r'[^a-z0-9\s]', '', phrase) # Removing anything that's not a lower case letter or a digit
    phrase = re.sub(r'\s{2,}', ' ', phrase) # Replacing consecutive spaces with single space

    tokens = [token for token in phrase.split() if token not in STOP_WORDS]

    # Splitting phrase into words
    for word in tokens:
        try:
            vector = wv.get_vector(word)
            if combined_vectors is None:
                combined_vectors = vector
            else:
                vectors = combined_vectors + vector
        except KeyError as e:
            print(f"Word {word} not found in the embeddings. Skipping.")

    # Normalising the vector, not sure if this is correct for word2vec vectors
    if combined_vectors is not None:
        norm = np.linalg.norm(combined_vectors)
        normalized_vector = combined_vectors / norm
    
    return normalized_vector

In [74]:
# Let's try again
phrases['Embedding'] = phrases['Phrases'].apply(get_phrase_vector)

Word axa not found in the embeddings. Skipping.
Word 2020 not found in the embeddings. Skipping.
Word 2020 not found in the embeddings. Skipping.
Word cholamandalam not found in the embeddings. Skipping.
Word 2020 not found in the embeddings. Skipping.
Word qantas not found in the embeddings. Skipping.
Word kdo not found in the embeddings. Skipping.
Word nejvt not found in the embeddings. Skipping.
Word konkurent not found in the embeddings. Skipping.
Word esk not found in the embeddings. Skipping.
Word pojiovny not found in the embeddings. Skipping.
Word aban not found in the embeddings. Skipping.
Word abraham not found in the embeddings. Skipping.
Word lloyds not found in the embeddings. Skipping.
Word lloyds not found in the embeddings. Skipping.
Word lloyds not found in the embeddings. Skipping.
Word axa not found in the embeddings. Skipping.
Word 2020 not found in the embeddings. Skipping.
Word 2020 not found in the embeddings. Skipping.
Word 2020 not found in the embeddings. Skip

Data is much cleaner now, only some words have been skipped.

In [81]:
phrases

Unnamed: 0,Phrases,Embedding
0,how company compares to its peers?,"[-0.01728776, -0.0648883, -0.035522792, -0.045..."
1,what is the detailed income statement breakdow...,"[-0.06807313, -0.071081884, -0.044567212, -0.0..."
2,world premium penetration in 2020,"[-0.02804473, 0.029971467, 0.09847768, 0.05780..."
3,How does the forecasted insurance premium pene...,"[-0.08757816, -0.02301324, 0.0032761905, 0.012..."
4,what are the total losses for companies in cou...,"[0.045693465, -0.06264081, 0.05298726, 0.03303..."
5,Insurance premiums market in Country,"[0.017501833, 0.015246042, 0.010423314, 0.0448..."
6,How have the profit margins been for the airli...,"[-0.045135155, -0.036137626, -0.044545155, 0.0..."
7,What is the complete profile of the top oil rigs?,"[-0.056534067, -0.04306306, -0.076850995, 0.01..."
8,what is company general information?,"[-0.01728776, -0.0648883, -0.035522792, -0.045..."
9,How is the airline industry performing globally?,"[0.042373963, -0.06820466, -0.035988845, 0.076..."


### This solves 1(a)

## Calculating distances for all combinations of phrases
Using scipy for calculating distances for all combination of points

In [102]:
from scipy.spatial.distance import cdist

In [91]:
vectors = np.array([x for x in phrases['Embedding']])
distances = cdist(vectors, vectors, 'euclidean')

In [105]:
distances

array([[0.        , 1.3112473 , 1.2982951 , ..., 1.35003452, 1.34635131,
        1.39190258],
       [1.3112473 , 0.        , 1.40876738, ..., 1.37352588, 1.36544314,
        1.26578919],
       [1.2982951 , 1.40876738, 0.        , ..., 1.19965208, 1.40668654,
        1.33857932],
       ...,
       [1.35003452, 1.37352588, 1.19965208, ..., 0.        , 1.37200837,
        1.17078994],
       [1.34635131, 1.36544314, 1.40668654, ..., 1.37200837, 0.        ,
        1.35255317],
       [1.39190258, 1.26578919, 1.33857932, ..., 1.17078994, 1.35255317,
        0.        ]])

In [106]:
# Confirming the shape of the distance array, it should be 50x50 for all combinations of embeddings
distances.shape

(50, 50)

### All pairwise distances for this dataset. This solves 1b.

## Now to find matching phrases to a given phrase

In [117]:
user_input = input()

 What has been the profitability of airline companies in India?


In [124]:
def get_closest_phrase(input_phrase):
    phrase_vector = get_phrase_vector(input_phrase)
    vectors = np.array([x for x in phrases['Embedding']])
    distances = cdist([phrase_vector], vectors)
    index_min = np.argmin(distances[0])
    return phrases["Phrases"][index_min]

In [125]:
get_closest_phrase(user_input)

'most profitable insurance company India'

### Closest phrase in the dataset from user input. This solves 1c.