In [48]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp39-cp39-macosx_11_0_arm64.whl.metadata (8.3 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-7.0.5-py3-none-any.whl.metadata (24 kB)
Downloading gensim-4.3.3-cp39-cp39-macosx_11_0_arm64.whl (24.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.0/24.0 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading smart_open-7.0.5-py3-none-any.whl (61 kB)
Installing collected packages: smart-open, gensim
Successfully installed gensim-4.3.3 smart-open-7.0.5


In [49]:
# Optional: the full word2vec Google News model takes a long time to download (1.6GB), so the lines below are left commented out
#import gensim.downloader as api

#print(api.info())

#model = api.load("word2vec-google-news-300")


{'corpora': {'semeval-2016-2017-task3-subtaskBC': {'num_records': -1, 'record_format': 'dict', 'file_size': 6344358, 'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/semeval-2016-2017-task3-subtaskB-eng/__init__.py', 'license': 'All files released for the task are free for general research use', 'fields': {'2016-train': ['...'], '2016-dev': ['...'], '2017-test': ['...'], '2016-test': ['...']}, 'description': 'SemEval 2016 / 2017 Task 3 Subtask B and C datasets contain train+development (317 original questions, 3,169 related questions, and 31,690 comments), and test datasets in English. The description of the tasks and the collected data is given in sections 3 and 4.1 of the task paper http://alt.qcri.org/semeval2016/task3/data/uploads/semeval2016-task3-report.pdf linked in section “Papers” of https://github.com/RaRe-Technologies/gensim-data/issues/18.', 'checksum': '701ea67acd82e75f95e1d8e62fb0ad29', 'file_name': 'semeval-2016-2017-task3-subtaskBC.gz',

In [51]:
import re
import numpy as np
import gensim.downloader as api

In [59]:
# text normalization helper
def preprocess_text(text):
    """Normalize a raw string and split it into word tokens.

    The text is lowercased, then every character that is neither an ASCII
    letter nor whitespace is deleted (digits and punctuation are removed
    outright, not replaced by a space), and the remainder is split on runs
    of whitespace.

    Args:
        text: the input string.

    Returns:
        A list of token strings; empty when nothing survives the cleanup.
    """
    # strip everything except letters and whitespace from the lowercased text
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    # whitespace tokenization
    return letters_only.split()

In [60]:
# sentence embedding generator by averaging word vectors for each token
def get_word2vec_embeddings(tokens, model):
    """Build a sentence embedding by averaging the vectors of its tokens.

    Tokens that are not present in the model are skipped, so a single
    out-of-vocabulary word no longer aborts the whole sentence with a
    KeyError.

    Args:
        tokens: iterable of word strings (e.g. the output of preprocess_text).
        model: word-vector lookup that supports ``word in model``,
            ``model[word]`` and a ``vector_size`` attribute
            (for example a gensim KeyedVectors object).

    Returns:
        A numpy array holding the element-wise mean of the vectors of all
        known tokens, or a zero vector of length ``model.vector_size`` when
        no token is known (including when ``tokens`` is empty).
    """
    # keep only tokens the model actually has a vector for
    word_vectors = [model[word] for word in tokens if word in model]

    if word_vectors:
        # compute the mean of all word vectors
        sentence_embedding = np.mean(word_vectors, axis=0)
    else:
        # if there are no valid words return a vector of zeros
        sentence_embedding = np.zeros(model.vector_size)

    return sentence_embedding

In [61]:
# Show which pretrained models the gensim downloader offers: api.info() returns
# a catalog dict, and the keys of its 'models' entry are the model names.
print("Available models:")
print(api.info()['models'].keys())

Available models:
dict_keys(['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis'])


In [62]:
# chose a smaller model to save time and space
model_name = "glove-wiki-gigaword-50"  # 70MB

# load the pretrained vectors by name; later cells index this object by word
model = api.load(model_name)

In [63]:
# look up and display the pretrained vector (and its shape) for a few sample names
words = ["arda", "asmi", "paige", "tanisha", "rebecca", "vaishnavi"]
for word in words:
    embedding = model[word]
    # header, vector and shape each on their own line
    print(f"Embedding for '{word}':", embedding, embedding.shape, sep="\n")

Embedding for 'arda':
[ 0.34357   -0.46947    0.14199    0.57961   -0.21052   -0.50971
 -0.081962   0.17801    1.0739     0.56665    0.5446     0.54928
  0.11969    0.31244    0.31591    0.10055    0.056002  -0.49547
 -0.095288   1.9341    -0.64631   -0.27184    0.69072   -0.22731
  1.1693     0.48167    0.17152    0.15155   -0.81181   -0.71838
 -0.85394   -0.20436    0.81764    0.22587   -0.82647    0.39545
 -0.13125   -0.0040177  1.0222     1.1695    -0.2633     0.34738
 -0.65852    0.48763   -1.1467     0.015759   0.33977    0.73546
  0.22286   -1.0063   ]
(50,)
Embedding for 'asmi':
[-1.012     -0.35748   -0.028167  -0.45164   -0.82435    0.077309
  0.60695    0.17228    0.42893    0.11335    1.1651    -0.24633
  0.32278    0.11193   -0.25446    0.1523     0.0054664  0.74313
  0.56203   -0.45239   -0.55464   -0.044636  -0.3816     0.5075
  0.072137   0.70254    0.19183    0.46638   -0.28685    0.3488
 -1.2844     0.10795   -0.33361    0.77058   -0.37058    0.28308
  0.38504   -0.03

In [64]:
# End-to-end example: clean and tokenize one sentence, then average its word vectors
sample_text = "american express team 1 is the best team"
tokens = preprocess_text(sample_text)
print(f"preprocessed tokens: {tokens}")

sentence_embedding = get_word2vec_embeddings(tokens, model)
# shape first, then the vector itself, each on its own line
print(sentence_embedding.shape, f"Sentence embedding: {sentence_embedding}", sep="\n")


preprocessed tokens: ['american', 'express', 'team', 'is', 'the', 'best', 'team']
(50,)
Sentence embedding: [-2.46224269e-01  4.54204291e-01 -3.28541458e-01  4.16515738e-01
  3.67826253e-01  1.33671630e-02 -8.58426571e-01  5.33828549e-02
 -2.78489082e-03 -2.11018562e-01  4.55962837e-01  1.98742852e-01
 -6.72284126e-01  1.59288570e-01  2.00416207e-01 -1.87341705e-01
  2.37975433e-01  3.46383154e-01 -8.59257162e-01 -3.27792853e-01
 -2.90891558e-01  2.01734304e-01 -3.04371747e-03  2.28831425e-01
  6.31000102e-02 -1.68061435e+00 -5.70835710e-01 -2.15054139e-01
 -1.91969290e-01 -2.82030553e-01  3.17280006e+00  4.23662871e-01
 -2.45778650e-01 -4.42038268e-01  2.34448746e-01 -1.04000732e-01
 -3.15715163e-03  2.31771439e-01 -1.65591836e-01 -4.64741856e-01
 -5.60125746e-02 -2.17643946e-01 -2.53777802e-02 -1.62444428e-01
 -9.45385695e-02  3.16428602e-01 -1.37081295e-01  2.41646871e-01
 -1.93311386e-02  1.09048426e-01]
