In [1]:
# Imports
from gensim.test.utils import common_texts
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec, FastText
from keras.layers import Embedding

import re
import numpy as np

2023-08-14 10:11:36.402321: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-14 10:11:36.494338: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-08-14 10:11:36.494414: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Task 0. Bag of Words
def bag_of_words(sentences, vocab=None):
    """
    Creates a bag of words embedding matrix

    Every sentence is normalised before counting: each character outside
    a-z / A-Z is replaced by a space, the text is lower-cased and split on
    whitespace (so "children's" becomes the tokens "children" and "s").

    Args:
        sentences: list of sentences (strings) to analyze
        vocab: list of the vocabulary words to use for the analysis
            - If vocab is None or empty, the vocabulary is built from every
              token of two or more letters found in sentences, sorted
              alphabetically

    Returns:
        embeddings: np.ndarray shape (s, f) containing the embeddings
            s: number of sentences in sentences
            f: number of features analyzed
        features: list of the features used for embeddings (the vocab
            object itself when one was supplied)
    """
    # A missing and an empty vocabulary are treated the same way; an empty
    # list would otherwise reach CountVectorizer as an empty vocabulary.
    use_vocab = bool(vocab)
    # Set gives O(1) membership tests while filtering tokens.
    allowed = set(vocab) if use_vocab else None

    corpus = []
    for sentence in sentences:
        # Remove punctuation/digits, lowercase, tokenise on whitespace.
        tokens = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()
        if use_vocab:
            # Keep only the words that belong to the supplied vocabulary.
            tokens = [token for token in tokens if token in allowed]
        corpus.append(' '.join(tokens))

    if use_vocab:
        features = vocab
    else:
        # One-letter tokens (e.g. the "s" left over from "children's") are
        # excluded; CountVectorizer's default token pattern skips them too.
        # Sorting once at the end replaces a sort after every insertion.
        features = sorted({token
                           for document in corpus
                           for token in document.split()
                           if len(token) > 1})

    cv = CountVectorizer(vocabulary=features)
    embeddings = cv.fit_transform(corpus).toarray()

    return embeddings, features




In [3]:
# 0-main - Vocab is None
# No vocabulary is passed, so bag_of_words derives the features itself.
sentences = [
    "Holberton school is Awesome!",
    "Machine learning is awesome",
    "NLP is the future!",
    "The children are our future",
    "Our children's children are our grandchildren",
    "The cake was not very good",
    "No one said that the cake was not very good",
    "Life is beautiful",
]
E, F = bag_of_words(sentences)
# Embedding matrix first, then the feature list, one per line.
print(E, F, sep="\n")

[[0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0]
 [1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0]
 [1 0 0 0 2 0 0 1 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0]
 [0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1]
 [0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1 0 1 1 1 1]
 [0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0]]
['are', 'awesome', 'beautiful', 'cake', 'children', 'future', 'good', 'grandchildren', 'holberton', 'is', 'learning', 'life', 'machine', 'nlp', 'no', 'not', 'one', 'our', 'said', 'school', 'that', 'the', 'very', 'was']


In [4]:
# 0-main - Vocab is not None
# Only the words listed in `vocab` are counted.
sentences = [
    "Holberton school is Awesome!",
    "Machine learning is awesome",
    "NLP is the future!",
    "The children are our future",
    "Our children's children are our grandchildren",
    "The cake was not very good",
    "No one said that the cake was not very good",
    "Life is beautiful",
]
vocab = [
    "children", "is", "awesome", "cake", "are", "our", "future",
]
E, F = bag_of_words(sentences, vocab)
# Embedding matrix first, then the feature list, one per line.
print(E, F, sep="\n")

[[0 1 1 0 0 0 0]
 [0 1 1 0 0 0 0]
 [0 1 0 0 0 0 1]
 [1 0 0 0 1 1 1]
 [2 0 0 0 1 2 0]
 [0 0 0 1 0 0 0]
 [0 0 0 1 0 0 0]
 [0 1 0 0 0 0 0]]
['children', 'is', 'awesome', 'cake', 'are', 'our', 'future']


In [5]:
# Task 1. TF-IDF
def tf_idf(sentences, vocab=None):
    """
    Creates a TF-IDF embedding

    Every sentence is normalised before weighting: each character outside
    a-z / A-Z is replaced by a space, the text is lower-cased and split on
    whitespace (so "children's" becomes the tokens "children" and "s").

    Args:
        sentences: list of sentences (strings) to analyze
        vocab: list of vocab words to use for the analysis
            - If vocab is None or empty, use all words of two or more
              letters within sentences, sorted alphabetically

    Returns:
        embeddings: np.ndarray shape (s, f) containing the embeddings
            s: number of sentences in sentences
            f: number of features analyzed
        features: list of the features used for embeddings (the vocab
            object itself when one was supplied)
    """
    # A missing and an empty vocabulary are treated the same way; an empty
    # list would otherwise reach TfidfVectorizer as an empty vocabulary.
    use_vocab = bool(vocab)
    # Set gives O(1) membership tests while filtering tokens.
    allowed = set(vocab) if use_vocab else None

    corpus = []
    for sentence in sentences:
        # Remove punctuation/digits, lowercase, tokenise on whitespace.
        tokens = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()
        if use_vocab:
            # Keep only the words that belong to the supplied vocabulary.
            tokens = [token for token in tokens if token in allowed]
        corpus.append(' '.join(tokens))

    if use_vocab:
        features = vocab
    else:
        # One-letter tokens (e.g. the "s" left over from "children's") are
        # excluded from the vocabulary. Sorting once at the end replaces a
        # sort after every insertion.
        features = sorted({token
                           for document in corpus
                           for token in document.split()
                           if len(token) > 1})

    tfidf = TfidfVectorizer(vocabulary=features)
    embedding = tfidf.fit_transform(corpus).toarray()

    return embedding, features

In [6]:
# 1-main
# TF-IDF restricted to `vocab`; "none" does not occur in any sentence.
sentences = [
    "Holberton school is Awesome!",
    "Machine learning is awesome",
    "NLP is the future!",
    "The children are our future",
    "Our children's children are our grandchildren",
    "The cake was not very good",
    "No one said that the cake was not very good",
    "Life is beautiful",
]
vocab = [
    "awesome", "learning", "children", "cake", "good", "none", "machine",
]
E, F = tf_idf(sentences, vocab)
# Embedding matrix first, then the feature list, one per line.
print(E, F, sep="\n")

[[1.         0.         0.         0.         0.         0.
  0.        ]
 [0.5098139  0.60831315 0.         0.         0.         0.
  0.60831315]
 [0.         0.         0.         0.         0.         0.
  0.        ]
 [0.         0.         1.         0.         0.         0.
  0.        ]
 [0.         0.         1.         0.         0.         0.
  0.        ]
 [0.         0.         0.         0.70710678 0.70710678 0.
  0.        ]
 [0.         0.         0.         0.70710678 0.70710678 0.
  0.        ]
 [0.         0.         0.         0.         0.         0.
  0.        ]]
['awesome', 'learning', 'children', 'cake', 'good', 'none', 'machine']


In [7]:
# Task 2. Train Word2Vec
def word2vec_model(sentences, size=100, min_count=5, window=5, negative=5,
                   cbow=True, iterations=5, seed=0, workers=1):
    """
    Create and train a gensim word2vec model

    Args:
        sentences: List of sentences to be trained on
        size: Dimensionality of the embedding layer
        min_count: Minimum number of occurrences of a word for use in training
        window: Maximum distance between the current and predicted word within
            a sentence
        negative: Size of negative sampling
        cbow: Training type; truthy for CBOW, falsy for Skip-gram
        iterations: Number of iterations to train over
        seed: Seed for the random number generator
        workers: Number of worker threads to train the model

    Returns:
        model: The trained model
    """
    # gensim's `sg` flag is the inverse of `cbow`: 0 selects CBOW, 1 selects
    # skip-gram. Truthiness is used instead of `is True` so that flags such
    # as 1 or numpy.bool_(True) still select CBOW.
    sg = 0 if cbow else 1

    # Passing the corpus to the constructor builds the vocabulary and trains.
    model = Word2Vec(sentences,
                     vector_size=size,
                     window=window,
                     min_count=min_count,
                     workers=workers,
                     sg=sg,
                     negative=negative,
                     seed=seed,
                     epochs=iterations)

    return model


In [8]:
# 2-main
# Show the first two tokenised sentences of the gensim sample corpus.
print(common_texts[0:2])
# min_count=1 lets words that occur only once take part in training.
w2v = word2vec_model(sentences=common_texts, min_count=1)
print(w2v.wv["computer"])

[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time']]
[-9.17425146e-04  4.23241127e-03  5.63164940e-03  6.88221911e-03
 -6.18189573e-03  3.55597492e-03 -4.59551578e-03 -2.62356992e-03
 -2.58884183e-03  1.51444075e-03  1.76495546e-03  1.26824854e-03
 -8.70202854e-03  8.73132143e-03  7.04515446e-03 -2.24651699e-03
  1.43263815e-03 -6.70434721e-03  2.69516581e-03  7.53865717e-03
  8.56675580e-03  7.89457001e-03 -8.89756717e-03 -9.03468858e-03
  4.73744608e-03 -6.03551976e-03 -6.21854421e-03  2.72567268e-03
 -8.80681351e-03  5.77690266e-03 -6.42151944e-03  2.13384978e-03
  2.60995259e-03 -6.16821647e-03 -1.97864044e-03 -7.64716882e-03
  9.61878430e-03  1.19452474e-04 -7.03770155e-03  6.31020777e-03
  4.79384791e-03 -5.65865776e-03 -3.22094793e-03 -8.49734619e-03
  1.13402959e-03  1.02089881e-03 -8.94187670e-03 -6.16365811e-03
 -9.08331887e-04 -8.65152571e-03  4.83665941e-03  5.46529191e-03
  4.19504056e-03  6.42453181e-03  6.02116482e-03 -2.0332

In [9]:
# Task 3. Extract Word2Vec
def gensim_to_keras(model, train_embeddings=False):
    """Get a Keras 'Embedding' layer with weights set from Word2Vec model's
    learned word embeddings.

    Parameters
    ----------
    model : trained gensim model
        Model whose ``wv.vectors`` 2D array (one row per word) supplies the
        layer's initial weights.
    train_embeddings : bool
        If False, the returned weights are frozen and stopped from being
        updated.
        If True, the weights can / will be further updated in Keras.

    Returns
    -------
    `keras.layers.Embedding`
        Embedding layer, to be used as input to deeper network layers.

    Note: get_keras_embedding used to be a part of the KeyedVectors class in
    gensim.models. This was removed, and their wiki gave this function as a
    replacement.
    """
    # vectors themselves, a 2D numpy array: rows are words, columns are
    # embedding dimensions
    weights = model.wv.vectors

    layer = Embedding(
        input_dim=weights.shape[0],
        output_dim=weights.shape[1],
        weights=[weights],
        trainable=train_embeddings,
    )
    return layer

In [10]:
# 3-main
# Show the first two tokenised sentences of the gensim sample corpus.
print(common_texts[0:2])
# min_count=1 lets words that occur only once take part in training.
w2v = word2vec_model(sentences=common_texts, min_count=1)
# Prints the repr of the Keras Embedding layer built from the model.
print(gensim_to_keras(model=w2v))

[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time']]
<keras.layers.core.embedding.Embedding object at 0x7fc5377eaee0>


In [11]:
# Task 4. FastText
def fasttext_model(sentences, size=100, min_count=5, negative=5, window=5,
                   cbow=True, iterations=5, seed=0, workers=1):
    """
    Creates and trains a gensim fastText model

    Args:
        sentences: List of sentences to be trained on
        size: Dimensionality of the embedding layer
        min_count: Minimum number of occurrences of a word for use in training
        negative: Size of negative sampling
        window: Maximum distance between the current and predicted word within
            a sentence
        cbow: Training type; truthy for CBOW, falsy for Skip-gram
        iterations: Number of iterations to train over
        seed: Seed for the random number generator
        workers: Number of worker threads to train the model

    Returns:
        model: The trained model
    """
    # gensim's `sg` flag is the inverse of `cbow`: 0 selects CBOW, 1 selects
    # skip-gram. Truthiness is used instead of `is True` so that flags such
    # as 1 or numpy.bool_(True) still select CBOW.
    sg = 0 if cbow else 1

    # Passing the corpus to the constructor builds the vocabulary and trains.
    model = FastText(sentences,
                     vector_size=size,
                     window=window,
                     min_count=min_count,
                     workers=workers,
                     sg=sg,
                     negative=negative,
                     seed=seed,
                     epochs=iterations)

    return model

In [12]:
# 4-main
# Show the first two tokenised sentences of the gensim sample corpus.
print(common_texts[0:2])
# min_count=1 lets words that occur only once take part in training.
ft = fasttext_model(sentences=common_texts, min_count=1)
print(ft.wv["computer"])

[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time']]


[-4.4518875e-04  1.9057443e-04  7.1344204e-04  1.5088863e-04
  7.3785416e-04  2.0828047e-03 -1.4264339e-03 -6.6978252e-04
 -3.9446630e-04  6.1643129e-04  3.7035978e-04 -1.7527672e-03
  2.0829479e-05  1.0929988e-03 -6.6954875e-04  7.9767447e-04
 -9.0742309e-04  1.9187949e-03 -6.9725298e-04  3.7622583e-04
 -5.0849823e-05  1.6160590e-04 -8.3575735e-04 -1.4309353e-03
  1.8365250e-04 -1.1365860e-03 -2.1796341e-03  3.3816829e-04
 -1.0266158e-03  1.9360909e-03  9.3765622e-05 -1.2577525e-03
  1.7052694e-04 -1.0470246e-03  9.1582153e-04 -1.1945128e-03
  1.2874184e-03 -3.1551000e-04 -1.1084992e-03  2.2345960e-04
  5.9021922e-04 -5.7232735e-04  1.6017178e-04 -1.0333696e-03
 -2.6842864e-04 -1.2489735e-03 -3.4248878e-05  2.0717620e-03
  1.0997808e-03  4.9419136e-04 -4.3252495e-04  7.6816598e-04
  3.0231036e-04  6.4548600e-04  2.5580439e-03 -1.2883682e-04
 -3.8391326e-04 -2.1800243e-04  6.5950496e-04 -2.8844117e-04
 -7.4177544e-04 -6.5318396e-04  1.4357771e-03  1.7945657e-03
  3.2790678e-03 -1.13009