In [42]:
# Imports
from sklearn.feature_extraction.text import CountVectorizer

import re
import string

In [59]:
# Task 0. Bag of Words
def bag_of_words(sentences, vocab=None):
    """
    Creates a bag of words embedding matrix
    Args:
        sentences: list of sentences to analyze
        vocab: list of the vocabulary words to use for the analysis

    Returns:
        embeddings: np.ndarray shape (s, f) containing the embeddings
            s: number of sentences in sentences
            f: number of features analyzed
        features: list of the features used for embeddings
    """
    # If vocab is None, make an empty list to append to
    if vocab is None:
        f = []

    short_sentences = []
    real_list = []
    corpus = []
    # First, convert sentences list to corpus, lowercase, no punctuation
    for i in range(len(sentences)):
        review = re.sub('[^a-zA-Z]', ' ', sentences[i])
        review = review.lower()
        review = review.split()
        if vocab:
            short_sentences.clear()
            for j in range(0, len(review)):
                if review[j] in vocab:
                    word = review[j]
                    short_sentences.append(word)
            review = ' '.join(short_sentences)
        else:
            review = ' '.join(review)
        corpus.append(review)

    if vocab is None:
        print(corpus)
        # All of this to get the vocab list
        for sentence in corpus:
            words = sentence.split()
            for word in words:
                if len(word) > 1 and word not in real_list:
                    real_list.append(word)
                    real_list.sort()
        cv = CountVectorizer()
        s = cv.fit_transform(corpus).toarray()
        f = real_list

        return s, f

    else:
        print(corpus)
        cv = CountVectorizer(vocabulary=vocab)
        s = cv.fit_transform(corpus).toarray()

        return s, vocab




In [55]:
# 0-main - Vocab is None
sentences = ["Holberton school is Awesome!",
             "Machine learning is awesome",
             "NLP is the future!",
             "The children are our future",
             "Our children's children are our grandchildren",
             "The cake was not very good",
             "No one said that the cake was not very good",
             "Life is beautiful"]
E, F = bag_of_words(sentences)
print(E)
print(F)

['holberton school is awesome', 'machine learning is awesome', 'nlp is the future', 'the children are our future', 'our children s children are our grandchildren', 'the cake was not very good', 'no one said that the cake was not very good', 'life is beautiful']
[[0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0]
 [1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0]
 [1 0 0 0 2 0 0 1 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0]
 [0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1]
 [0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1 0 1 1 1 1]
 [0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0]]
['are', 'awesome', 'beautiful', 'cake', 'children', 'future', 'good', 'grandchildren', 'holberton', 'is', 'learning', 'life', 'machine', 'nlp', 'no', 'not', 'one', 'our', 'said', 'school', 'that', 'the', 'very', 'was']


In [60]:
#0-main - Vocab is not None
sentences = ["Holberton school is Awesome!",
             "Machine learning is awesome",
             "NLP is the future!",
             "The children are our future",
             "Our children's children are our grandchildren",
             "The cake was not very good",
             "No one said that the cake was not very good",
             "Life is beautiful"]
vocab = ["children", "is", "awesome", "cake", "are", "our", "future"]
E, F = bag_of_words(sentences, vocab)
print(E)
print(F)

['is awesome', 'is awesome', 'is future', 'children are our future', 'our children children are our', 'cake', 'cake', 'is']
[[0 1 1 0 0 0 0]
 [0 1 1 0 0 0 0]
 [0 1 0 0 0 0 1]
 [1 0 0 0 1 1 1]
 [2 0 0 0 1 2 0]
 [0 0 0 1 0 0 0]
 [0 0 0 1 0 0 0]
 [0 1 0 0 0 0 0]]
['children', 'is', 'awesome', 'cake', 'are', 'our', 'future']
