# Data Augmentation Using Word Embeddings



## Install Dependencies

In [None]:
!pip install gensim
from gensim.models import KeyedVectors
import numpy as np


## Download Google News Vectors

In [None]:
# Download the GoogleNews word2vec archive into /root/vectors/ (-P sets the
# target directory); -c lets wget resume a partially downloaded file.
!wget -P /root/vectors/ -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

In [None]:
# Location of the archive fetched by the wget cell (-P /root/vectors/).
GOOGLE_NEWS_VECTORS = '/root/vectors/GoogleNews-vectors-negative300.bin.gz'
# binary=True: the file name indicates the binary (.bin) word2vec format.
word2vec = KeyedVectors.load_word2vec_format(GOOGLE_NEWS_VECTORS, binary=True)

# gensim 4.0 removed KeyedVectors.vocab (accessing it raises AttributeError) in
# favour of key_to_index. Try the new attribute first and fall back to .vocab
# so the cell also runs on gensim 3.x.
try:
  vocab_size = len(word2vec.key_to_index)
except AttributeError:
  vocab_size = len(word2vec.vocab)
print("Vectors loaded successfully: "+str(vocab_size))

## Vectors

Example Sentences:

*   `Winter` is coming.
*   Any `man` who must say "I am the `king`" is no true `king`.




In [None]:
# Each keyword is wrapped in its own single-element list; the list form is
# what later gets passed as `positive=` to most_similar.
keywords = [["winter"], ["man"], ["king"]]

# Show the raw embedding and its dimensionality for every keyword.
for entry in keywords:
  word = entry[0]
  print("Word:", word)
  vector = word2vec.get_vector(word)
  print("Vector length:", len(vector))
  print(vector)

## Cosine Similarity



Cosine similarity formula:

$\text{cos}(a,b) = \frac{a \cdot b}{||a|| \cdot  ||b||}$

In [None]:
def cosine_similarity(x1, x2):
  """Return the cosine similarity of two vectors, rounded to 5 decimals.

  Computed as dot(x1, x2) / (||x1|| * ||x2||), matching the formula above.

  Args:
    x1: first vector (anything np.dot / np.linalg.norm accept).
    x2: second vector, same length as x1.

  Returns:
    The similarity as a NumPy scalar rounded to 5 decimal places.
  """
  dot_product = np.dot(x1, x2)
  norm_product = np.linalg.norm(x1) * np.linalg.norm(x2)
  return np.round(dot_product / norm_product, 5)
# Try the hand-written similarity on the embeddings of two season words.
print(
  cosine_similarity(
    word2vec.get_vector("winter"),
    word2vec.get_vector("summer"),
  )
)

In [None]:
# gensim's built-in similarity for the same word pair. Presumably this agrees
# with the hand-written cosine_similarity value up to rounding — verify
# against the number printed by the previous cell.
word2vec.similarity("winter","summer")

## Similar Words

In [None]:
# For each keyword list, print the 10 entries gensim ranks as most similar.
for positive_words in keywords:
  neighbours = word2vec.most_similar(positive=positive_words, topn=10)
  print(neighbours)


## References






*   [Efficient estimation of word representations in vector space](https://arxiv.org/pdf/1301.3781.pdf) 
*   [Word embeddings](https://en.wikipedia.org/wiki/Word_embedding) 
*   [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity)
*   [Gensim](https://github.com/RaRe-Technologies/gensim) 