

# Understanding Word Embeddings Using a Real-World Example



## Word Embeddings using spaCy

In [32]:
!pip install spacy
import spacy
!python -m spacy download en_core_web_lg
nlp = spacy.load("en_core_web_lg")
xx=nlp("spirituality")
yy=nlp("spirituality criminality")

for token in yy:
  print(f"{token.text} <-> {xx.text}:", token.similarity(xx))

2023-12-29 07:07:30.759594: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-29 07:07:30.759663: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-29 07:07:30.766169: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Collecting en-core-web-lg==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.6.0/en_core_web_lg-3.6.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m?[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the 

## Word Embeddings using Continuous Bag of Words (CBOW) and Skip-Gram

In [30]:
!pip install gensim
!pip install gensim-downloader

import gensim.downloader as api
from scipy.spatial.distance import cosine

# Load pre-trained Word2Vec model (CBOW)
word_vectors_cbow = api.load("word2vec-google-news-300")

# Load pre-trained model that can serve as an approximation for Skip-Gram
word_vectors_skipgram = api.load("glove-wiki-gigaword-100")

def calculate_similarity(model, word1, word2):
    """Return ``model.similarity(word1, word2)``, or ``None`` for unknown words.

    Args:
        model: Object exposing a ``key_to_index`` mapping and a
            ``similarity(a, b)`` method.
        word1: First word of the pair.
        word2: Second word of the pair.

    Returns:
        The value returned by ``model.similarity`` when both words are keys of
        ``model.key_to_index``; otherwise ``None``.
    """
    vocabulary = model.key_to_index
    # Bail out early when either word is missing from the vocabulary
    if word1 not in vocabulary or word2 not in vocabulary:
        return None
    return model.similarity(word1, word2)

# Word pair under comparison
word1, word2 = "spirituality", "criminality"

# Score the pair with each set of vectors, in the order CBOW then Skip-Gram
similarity_cbow, similarity_skipgram = (
    calculate_similarity(vectors, word1, word2)
    for vectors in (word_vectors_cbow, word_vectors_skipgram)
)

# Report both scores (None is printed when a word is out of vocabulary)
print(f"CBOW model similarity between '{word1}' and '{word2}': {similarity_cbow}")
print(f"Skip-Gram model similarity between '{word1}' and '{word2}': {similarity_skipgram}")



[31mERROR: Could not find a version that satisfies the requirement gensim-downloader (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for gensim-downloader[0m[31m
CBOW model similarity between 'spirituality' and 'criminality': 0.19835983216762543
Skip-Gram model similarity between 'spirituality' and 'criminality': 0.27936938405036926


## Word Embeddings using BERT

In [24]:
!pip install transformers
from transformers import BertModel, BertTokenizer
import torch
import torch.nn.functional as F

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to get BERT embeddings
def get_bert_embedding(sentence, tokenizer, model):
    """Return a mean-pooled embedding of ``sentence`` from ``model``.

    The last hidden state is averaged over the token axis. Positions that the
    tokenizer marks as padding (attention mask 0) are excluded from the
    average, so a list of sentences of different lengths can be embedded in
    one call without padding vectors diluting the result. For a single
    sentence nothing is padded and the result is the plain mean.

    NOTE: any special tokens the tokenizer adds (e.g. [CLS]/[SEP]) are part of
    the average; for one-word inputs they make up most of the tokens.

    Args:
        sentence: Text to embed (a string, or a list of strings for a batch).
        tokenizer: Callable returning a mapping of model inputs as tensors.
        model: Callable whose output has a ``last_hidden_state`` tensor of
            shape (batch, tokens, hidden).

    Returns:
        A tensor of shape (hidden,) for one sentence, or (batch, hidden) for
        several, after ``squeeze()``.
    """
    # padding=True is a no-op for one string but allows batching several
    inputs = tokenizer(sentence, return_tensors="pt", padding=True)
    with torch.no_grad():  # inference only; no gradients needed
        outputs = model(**inputs)
    hidden_states = outputs.last_hidden_state

    attention_mask = inputs.get("attention_mask")
    if attention_mask is None:
        # No mask available: fall back to averaging every position
        return hidden_states.mean(dim=1).squeeze()

    # Zero out padded positions and divide by the number of real tokens
    weights = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
    token_counts = weights.sum(dim=1).clamp(min=1)
    return ((hidden_states * weights).sum(dim=1) / token_counts).squeeze()

# Embed each word on its own, without any surrounding context
embedding_spirituality, embedding_criminality = (
    get_bert_embedding(word, tokenizer, model)
    for word in ("spirituality", "criminality")
)

# Add a leading batch axis to each vector and compare them.
# NOTE: `cosine_similarity` is a tensor here; the OpenAI and ELMo sections
# define a function with the same name.
cosine_similarity = F.cosine_similarity(
    torch.unsqueeze(embedding_spirituality, 0),
    torch.unsqueeze(embedding_criminality, 0),
)
print(f"Cosine Similarity between 'spirituality' and 'criminality': {cosine_similarity.item()}")



Cosine Similarity between 'spirituality' and 'criminality': 0.7088658809661865


## Word Embeddings using FastText

In [31]:
import gensim.downloader as api
from scipy.spatial.distance import cosine

# Pre-trained FastText vectors, fetched through gensim's downloader
fasttext_model = api.load("fasttext-wiki-news-subwords-300")

# Word pair under comparison
word1, word2 = "spirituality", "criminality"

# Function to calculate cosine similarity
def calculate_similarity(model, w1, w2):
    """Return ``model.similarity(w1, w2)`` if both words are known, else ``None``.

    Same logic as the helper in the CBOW / Skip-Gram section; it is defined
    again here so that this cell can run on its own.

    Args:
        model: Object exposing a ``key_to_index`` mapping and a
            ``similarity(a, b)`` method.
        w1: First word of the pair.
        w2: Second word of the pair.

    Returns:
        The value returned by ``model.similarity``, or ``None`` when either
        word is not a key of ``model.key_to_index``.
    """
    known_words = model.key_to_index
    both_known = w1 in known_words and w2 in known_words
    return model.similarity(w1, w2) if both_known else None

# Score the word pair with the FastText vectors and print the result.
# calculate_similarity returns None when either word is out of vocabulary,
# in which case "None" is printed in place of a score.
similarity = calculate_similarity(fasttext_model, word1, word2)
print(f"FastText model similarity between '{word1}' and '{word2}': {similarity}")


FastText model similarity between 'spirituality' and 'criminality': 0.4805312156677246


## Word Embeddings using OpenAI (this code may not work in every environment and may need some debugging)

In [29]:
import openai
import numpy as np
from scipy.spatial.distance import cosine

import os

# Read the API key from the OPENAI_API_KEY environment variable so that no
# secret is stored in (or accidentally shared with) the notebook.
# Set the variable before starting the kernel; the key is None when it is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY")

def get_embedding(text, model="text-embedding-3-small"):
    """Fetch the OpenAI embedding vector for ``text``.

    Works with both generations of the ``openai`` package: the 1.x client
    (``openai.OpenAI``) and the legacy 0.x module-level ``openai.Embedding``
    resource, which was removed in 1.0. The default model replaces the
    deprecated first-generation ``text-similarity-babbage-001`` engine.

    Args:
        text: The string to embed.
        model: Name of the OpenAI embedding model to query.

    Returns:
        A 1-D ``numpy`` array holding the embedding of ``text``.

    Raises:
        Whatever the ``openai`` client raises on authentication or network
        failure; errors are not caught here.
    """
    if hasattr(openai, "OpenAI"):
        # openai>=1.0: requests go through a client object and the response
        # is an object with attributes rather than a dict.
        client = openai.OpenAI(api_key=openai.api_key)
        response = client.embeddings.create(input=[text], model=model)
        vector = response.data[0].embedding
    else:
        # Legacy openai<1.0 interface
        response = openai.Embedding.create(input=[text], model=model)
        vector = response['data'][0]['embedding']
    return np.array(vector)

# Fetch one embedding per word; each call goes through get_embedding, which
# queries the OpenAI API.
# NOTE: these names are also assigned in the BERT and ELMo sections, so they
# hold whichever model's vectors were computed last.
embedding_spirituality = get_embedding("spirituality")
embedding_criminality = get_embedding("criminality")

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of ``vec1`` and ``vec2``.

    SciPy's ``cosine`` computes the cosine *distance*; the similarity is one
    minus that distance.

    Args:
        vec1: First 1-D vector.
        vec2: Second 1-D vector.

    Returns:
        ``1 - cosine(vec1, vec2)``.
    """
    distance = cosine(vec1, vec2)
    return 1 - distance

# Compare the two OpenAI embeddings with the cosine_similarity helper defined
# in this cell and print the score.
similarity = cosine_similarity(embedding_spirituality, embedding_criminality)
print(f"Cosine Similarity between 'spirituality' and 'criminality': {similarity}")


ImportError: ignored

## Word Embeddings using ELMo (Embeddings from Language Models)

In [3]:
!pip install torch
!pip install allennlp allennlp-models



Collecting allennlp
  Downloading allennlp-2.10.1-py3-none-any.whl (730 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m730.2/730.2 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting allennlp-models
  Downloading allennlp_models-2.10.1-py3-none-any.whl (464 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m464.5/464.5 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch<1.13.0,>=1.10.0 (from allennlp)
  Downloading torch-1.12.1-cp310-cp310-manylinux1_x86_64.whl (776.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.3/776.3 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision<0.14.0,>=0.8.1 (from allennlp)
  Downloading torchvision-0.13.1-cp310-cp310-manylinux1_x86_64.whl (19.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.1/19.1 MB[0m [31m67.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cached-path<1.2.0,>=1.1.3 (from allennlp)
  Downloading cac

In [4]:
import numpy as np
from allennlp.modules.elmo import Elmo, batch_to_ids
from scipy.spatial.distance import cosine

# Function to calculate cosine similarity
def cosine_similarity(vec1, vec2):
    """Return one minus the SciPy cosine distance between two vectors.

    Args:
        vec1: First 1-D vector.
        vec2: Second 1-D vector.

    Returns:
        The cosine similarity, ``1 - cosine(vec1, vec2)``.
    """
    return 1 - cosine(u=vec1, v=vec2)

# Locations of the pre-trained ELMo configuration and weights
options_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

# Build the ELMo module: a single output representation, dropout disabled
elmo = Elmo(
    options_file=options_file,
    weight_file=weight_file,
    num_output_representations=1,
    dropout=0,
)

# Function to get ELMo embeddings
def get_elmo_embedding(word):
    """Return the ELMo embedding of a single word as a NumPy array.

    The word is wrapped as a batch holding one sentence of one token, run
    through the module-level ``elmo`` model, and the first returned
    representation is averaged over dimension 1 (presumably the token axis;
    with a single token this leaves the values unchanged).

    Args:
        word: The word to embed.

    Returns:
        The squeezed embedding, detached from the graph, as a NumPy array.
    """
    # batch_to_ids expects a list of tokenized sentences
    character_ids = batch_to_ids([[word]])
    elmo_output = elmo(character_ids)
    representation = elmo_output['elmo_representations'][0]
    pooled = representation.mean(dim=1).squeeze()
    return pooled.detach().numpy()

# Embed both words with ELMo
embedding_spirituality, embedding_criminality = map(
    get_elmo_embedding, ("spirituality", "criminality")
)

# Compare the two vectors and report the score
similarity = cosine_similarity(embedding_spirituality, embedding_criminality)
print(f"ELMo model similarity between 'spirituality' and 'criminality': {similarity}")


Output()

Output()

ELMo model similarity between 'spirituality' and 'criminality': 0.5876579880714417


## Word Embedding Similarity Results

| Model | Similarity Score |
| --- | --- |
| Spacy (en_core_web_lg)                     | 0.556077702724868   |
| Gensim CBOW (word2vec-google-news-300)     | 0.19835983216762543 |
| Gensim GloVe, used as a Skip-Gram stand-in (glove-wiki-gigaword-100) | 0.27936938405036926 |
| FastText (fasttext-wiki-news-subwords-300) | 0.4805312156677246  |
| BERT                                       | 0.7088658809661865  |
| ELMo model                                 | 0.5876579880714417  |

# Conclusion
How good are these embedding models at capturing the true essence of words? We decided to put them to the test with two words that seem like oil and water – "spirituality" and "criminality." Using different embedding models, we asked: how close are these words in the computer's mind?

The results were surprising, to say the least. Some models thought they were practically best friends (BERT scored about 0.71), while others kept them at arm's length (word2vec scored about 0.20). Keep in mind that each model defines its own vector space, so raw scores are not directly comparable from one model to another. This got us thinking – are these machines really understanding what these words mean, or are they just playing a numbers game?