In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from gensim.corpora import Dictionary
# Fetch the NLTK resources used below; packages already present locally are
# reported as up-to-date rather than downloaded again.
for tokenizer_resource in ("punkt", "punkt_tab"):
    nltk.download(tokenizer_resource)

# Kept as the final bare expression so the cell still echoes the download status.
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /Users/anushka/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/anushka/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anushka/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
 # Sample reviews
# Two short product reviews that share most of their vocabulary; they are the
# whole corpus for this walkthrough.
review1 = (
    "The TechTrend X1 camera captures stunning photos, "
    "but the battery life could be better. "
    "I'm very impressed with the camera quality."
)
review2 = (
    "I'm disappointed with the TechTrend X1 battery life, "
    "although the camera quality is exceptional. "
    "However, the camera features are lacking."
)

In [3]:
# Tokenization: split each review into word and punctuation tokens using
# NLTK's English tokenizer.
tokens1, tokens2 = (
    word_tokenize(text, language="english", preserve_line=False)
    for text in (review1, review2)
)

In [4]:
# Inspect the tokens for the first review: punctuation comes out as separate
# tokens and the contraction "I'm" is split into "I" and "'m".
tokens1

['The',
 'TechTrend',
 'X1',
 'camera',
 'captures',
 'stunning',
 'photos',
 ',',
 'but',
 'the',
 'battery',
 'life',
 'could',
 'be',
 'better',
 '.',
 'I',
 "'m",
 'very',
 'impressed',
 'with',
 'the',
 'camera',
 'quality',
 '.']

In [5]:
# NLTK's English stop-word list, held in a set so the `in` tests in the
# filtering step are hash lookups. The entries shown below are all lowercase,
# which is why tokens are lowercased before being compared against it.
stop_words = set(stopwords.words("english"))
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [6]:
def remove_stopwords(tokens, stop_words, drop_punctuation=False):
    """Return the tokens that are not stop words, preserving order and case.

    Args:
        tokens: Iterable of token strings (e.g. the output of ``word_tokenize``).
        stop_words: Collection of lowercase stop words. Each token is
            lowercased before the membership test, so matching is
            case-insensitive.
        drop_punctuation: When True, also discard tokens that contain no
            alphanumeric character (such as "," and ".") and compare
            contraction fragments such as "'m" against the stop list without
            their leading apostrophe. Defaults to False, which applies only
            the plain stop-word test.

    Returns:
        A new list holding the surviving tokens in their original order.
    """
    kept = []
    for token in tokens:
        lowered = token.lower()
        if drop_punctuation:
            # Pure punctuation can never match a stop word, so drop it here.
            if not any(ch.isalnum() for ch in lowered):
                continue
            # "'m" -> "m", which the stop list does contain.
            lowered = lowered.lstrip("'")
        if lowered not in stop_words:
            kept.append(token)
    return kept


# With the default settings, punctuation and the fragment "'m" pass through
# and end up in the vocabulary; pass drop_punctuation=True to remove them.
filtered_tokens1 = remove_stopwords(tokens1, stop_words)
filtered_tokens2 = remove_stopwords(tokens2, stop_words)

In [7]:
# Inspect the first review after stop-word removal. Punctuation and the
# contraction fragment "'m" are still present because none of them match an
# entry in the stop-word set.
filtered_tokens1

['TechTrend',
 'X1',
 'camera',
 'captures',
 'stunning',
 'photos',
 ',',
 'battery',
 'life',
 'could',
 'better',
 '.',
 "'m",
 'impressed',
 'camera',
 'quality',
 '.']

In [8]:
# Build the vocabulary: gensim gives every distinct token found across the two
# filtered reviews its own integer id.
documents = [filtered_tokens1, filtered_tokens2]
dictionary = Dictionary(documents=documents)

In [9]:
# List the vocabulary as "<id> : <token>", one entry per line.
for token_id, token in dictionary.items():
    print(f"{token_id} : {token}")

0 : 'm
1 : ,
2 : .
3 : TechTrend
4 : X1
5 : battery
6 : better
7 : camera
8 : captures
9 : could
10 : impressed
11 : life
12 : photos
13 : quality
14 : stunning
15 : However
16 : although
17 : disappointed
18 : exceptional
19 : features
20 : lacking


In [10]:
# Generate bag-of-words vectors: convert each filtered review into a sparse
# list of (token_id, count) pairs using the shared dictionary.
bow_vector1, bow_vector2 = map(
    dictionary.doc2bow, (filtered_tokens1, filtered_tokens2)
)

In [11]:
# Sparse bag-of-words for the first review as (token_id, count) pairs,
# e.g. (7, 2) records that token id 7 ("camera") occurs twice.
bow_vector1

[(0, 1),
 (1, 1),
 (2, 2),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 2),
 (8, 1),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 1)]

In [12]:
# Print results: one labelled line per intermediate artefact, in pipeline order.
for label, value in (
    ("Filtered Tokens 1:", filtered_tokens1),
    ("Filtered Tokens 2:", filtered_tokens2),
    ("Dictionary:", dictionary.token2id),
    ("BoW Vector 1:", bow_vector1),
    ("BoW Vector 2:", bow_vector2),
):
    print(label, value)

Filtered Tokens 1: ['TechTrend', 'X1', 'camera', 'captures', 'stunning', 'photos', ',', 'battery', 'life', 'could', 'better', '.', "'m", 'impressed', 'camera', 'quality', '.']
Filtered Tokens 2: ["'m", 'disappointed', 'TechTrend', 'X1', 'battery', 'life', ',', 'although', 'camera', 'quality', 'exceptional', '.', 'However', ',', 'camera', 'features', 'lacking', '.']
Dictionary: {"'m": 0, ',': 1, '.': 2, 'TechTrend': 3, 'X1': 4, 'battery': 5, 'better': 6, 'camera': 7, 'captures': 8, 'could': 9, 'impressed': 10, 'life': 11, 'photos': 12, 'quality': 13, 'stunning': 14, 'However': 15, 'although': 16, 'disappointed': 17, 'exceptional': 18, 'features': 19, 'lacking': 20}
BoW Vector 1: [(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1), (6, 1), (7, 2), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1)]
BoW Vector 2: [(0, 1), (1, 2), (2, 2), (3, 1), (4, 1), (5, 1), (7, 2), (11, 1), (13, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1)]
