<a href="https://colab.research.google.com/github/armandossrecife/mysentimentanalysis/blob/main/my_bag_of_words.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exemplo 1

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample documents
documents = [
    "The quick brown fox jumps over the lazy dog.",
    "The dog is lazy. It jumps slowly over the brown fox."
]
# Preprocessing (optional)
# You can add additional preprocessing steps like stop word removal here

# Create a CountVectorizer object
vectorizer = CountVectorizer()

# Fit the vectorizer to the documents (learn the vocabulary)
vectorizer.fit(documents)

# Transform the documents into BoW feature vectors
bow_matrix = vectorizer.transform(documents)

# Print the vocabulary (unique words)
print(vectorizer.get_feature_names_out())

# Print the BoW feature vectors as a dense matrix (easier to read)
print(bow_matrix.toarray())

['brown' 'dog' 'fox' 'is' 'it' 'jumps' 'lazy' 'over' 'quick' 'slowly'
 'the']
[[1 1 1 0 0 1 1 1 1 0 2]
 [1 1 1 1 1 1 1 1 0 1 2]]


# Exemplo 2

In [9]:
# Make sure to install the necessary packages first
# pip install --upgrade pip
# pip install tensorflow
from tensorflow import keras
from typing import List
from keras.preprocessing.text import Tokenizer

p1 = "Sarah enjoys watching fantasy movies in her free time. Michael likes watching them too, especially ones with dragons"
p2 = "Lisa loves baking delicious cookies and watching movies on weekends. Her brother David enjoys watching movies with her while they enjoy the cookies"
p3 = "The park is a great place to relax and watch movies outdoors on a projector. Many people come here to unwind after work and enjoy a movie under the stars"
p4 = "Learning a new language can be challenging, but watching movies with subtitles in that language can be a fun way to practice. The rewards of fluency are definitely worth it. 5. Traveling to new places broadens your horizons and exposes you to new cultures. Watching movies made in those countries can be a great way to experience their stories and traditions"

sentence1 = [p1]
sentence2 = [p2]
sentence3 = [p3]
sentence4 = [p4]

def print_bow(sentence: List[str]) -> None:
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(sentence)
    sequences = tokenizer.texts_to_sequences(sentence)
    word_index = tokenizer.word_index
    bow = {}
    for key in word_index:
        bow[key] = sequences[0].count(word_index[key])

    print(f"Bag of word sentence :\n{bow}")
    print(f"We found {len(word_index)} unique tokens.")

In [8]:
print_bow(sentence1)
print_bow(sentence2)
print_bow(sentence3)
print_bow(sentence4)

Bag of word sentence 1:
{'watching': 2, 'sarah': 1, 'enjoys': 1, 'fantasy': 1, 'movies': 1, 'in': 1, 'her': 1, 'free': 1, 'time': 1, 'michael': 1, 'likes': 1, 'them': 1, 'too': 1, 'especially': 1, 'ones': 1, 'with': 1, 'dragons': 1}
We found 17 unique tokens.
Bag of word sentence 1:
{'cookies': 2, 'watching': 2, 'movies': 2, 'her': 2, 'lisa': 1, 'loves': 1, 'baking': 1, 'delicious': 1, 'and': 1, 'on': 1, 'weekends': 1, 'brother': 1, 'david': 1, 'enjoys': 1, 'with': 1, 'while': 1, 'they': 1, 'enjoy': 1, 'the': 1}
We found 19 unique tokens.
Bag of word sentence 1:
{'a': 3, 'the': 2, 'to': 2, 'and': 2, 'park': 1, 'is': 1, 'great': 1, 'place': 1, 'relax': 1, 'watch': 1, 'movies': 1, 'outdoors': 1, 'on': 1, 'projector': 1, 'many': 1, 'people': 1, 'come': 1, 'here': 1, 'unwind': 1, 'after': 1, 'work': 1, 'enjoy': 1, 'movie': 1, 'under': 1, 'stars': 1}
We found 25 unique tokens.
Bag of word sentence 1:
{'to': 4, 'a': 3, 'new': 3, 'can': 3, 'be': 3, 'language': 2, 'watching': 2, 'movies': 2, '

In [11]:
all_p = p1 + p2 + p3 + p4
all_sentences = [all_p]
print_bow(all_sentences)

Bag of word sentence :
{'watching': 6, 'movies': 6, 'a': 6, 'to': 6, 'and': 5, 'in': 3, 'her': 3, 'with': 3, 'the': 3, 'new': 3, 'can': 3, 'be': 3, 'enjoys': 2, 'on': 2, 'enjoy': 2, 'great': 2, 'language': 2, 'way': 2, 'sarah': 1, 'fantasy': 1, 'free': 1, 'time': 1, 'michael': 1, 'likes': 1, 'them': 1, 'too': 1, 'especially': 1, 'ones': 1, 'dragonslisa': 1, 'loves': 1, 'baking': 1, 'delicious': 1, 'cookies': 1, 'weekends': 1, 'brother': 1, 'david': 1, 'while': 1, 'they': 1, 'cookiesthe': 1, 'park': 1, 'is': 1, 'place': 1, 'relax': 1, 'watch': 1, 'outdoors': 1, 'projector': 1, 'many': 1, 'people': 1, 'come': 1, 'here': 1, 'unwind': 1, 'after': 1, 'work': 1, 'movie': 1, 'under': 1, 'starslearning': 1, 'challenging': 1, 'but': 1, 'subtitles': 1, 'that': 1, 'fun': 1, 'practice': 1, 'rewards': 1, 'of': 1, 'fluency': 1, 'are': 1, 'definitely': 1, 'worth': 1, 'it': 1, '5': 1, 'traveling': 1, 'places': 1, 'broadens': 1, 'your': 1, 'horizons': 1, 'exposes': 1, 'you': 1, 'cultures': 1, 'made': 1