In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import re
import string

# Fetch the NLTK resources this notebook relies on. Packages that are already
# installed are reported as up-to-date rather than downloaded again.
nltk.download('stopwords')
# nltk.word_tokenize (used in the tokenization step below) needs the Punkt
# tokenizer models and raises LookupError when they are missing. Newer NLTK
# releases look for 'punkt_tab', older ones for 'punkt', so request both.
# quiet=True suppresses the routine progress messages for these two packages.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Sample text: four short movie-review style sentences used throughout the notebook
documents = [
    "I like this movie, it's funny.",
    "I hate this movie.",
    "This was awesome! I like it.",
    "Nice one. I love it.",
]


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Text preprocessing steps - drop tokens that contain digits, lowercase the
# text, and replace punctuation with spaces.
def alphanumeric(x):
    """Replace every run of word characters that contains a digit with a space."""
    # Raw string: \w and \d must reach the regex engine verbatim; in a normal
    # string literal they are invalid escape sequences and trigger a warning.
    return re.sub(r'\w*\d\w*', ' ', x)


def punc_lower(x):
    """Lowercase ``x`` and replace each ``string.punctuation`` character with a space."""
    return re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())


# Digit-token removal runs first, then lowercasing / punctuation stripping.
documents = [punc_lower(alphanumeric(document)) for document in documents]
documents

['i like this movie  it s funny ',
 'i hate this movie ',
 'this was awesome  i like it ',
 'nice one  i love it ']

In [6]:
# Tokenization: turn each cleaned document string into a list of word tokens
documents = list(map(nltk.word_tokenize, documents))
documents

[['i', 'like', 'this', 'movie', 'it', 's', 'funny'],
 ['i', 'hate', 'this', 'movie'],
 ['this', 'was', 'awesome', 'i', 'like', 'it'],
 ['nice', 'one', 'i', 'love', 'it']]

In [8]:
# Stopword removal: keep only tokens that are absent from NLTK's English stopword list
stop_words = set(stopwords.words('english'))
documents = [
    [token for token in tokens if token not in stop_words]
    for tokens in documents
]
documents

[['like', 'movie', 'funny'],
 ['hate', 'movie'],
 ['awesome', 'like'],
 ['nice', 'one', 'love']]

In [9]:
# Stemming: map every remaining token to its Porter stem
stemmer = PorterStemmer()
documents = [list(map(stemmer.stem, tokens)) for tokens in documents]
documents

[['like', 'movi', 'funni'],
 ['hate', 'movi'],
 ['awesom', 'like'],
 ['nice', 'one', 'love']]

In [10]:
# Re-assemble each token list into a single space-separated string,
# the input format the vectorizers below are fitted on
documents = list(map(" ".join, documents))
documents

['like movi funni', 'hate movi', 'awesom like', 'nice one love']

In [11]:
# Vectorization - Bag of Words: fit a CountVectorizer on the joined documents
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

# Print the dense matrix as a table whose columns are the learned feature names
print(
    pd.DataFrame(
        data=X.toarray(),
        columns=vectorizer.get_feature_names_out(),
    )
)


   awesom  funni  hate  like  love  movi  nice  one
0       0      1     0     1     0     1     0    0
1       0      0     1     0     0     1     0    0
2       1      0     0     1     0     0     0    0
3       0      0     0     0     1     0     1    1


In [12]:
# Vectorization - TF-IDF: refit on the same documents with a TfidfVectorizer
# (this rebinds `vectorizer` and `X` from the Bag-of-Words cell)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)

# Print the dense weight matrix as a table whose columns are the learned feature names
print(
    pd.DataFrame(
        data=X.toarray(),
        columns=vectorizer.get_feature_names_out(),
    )
)

     awesom     funni      hate      like     love      movi     nice      one
0  0.000000  0.667679  0.000000  0.526405  0.00000  0.526405  0.00000  0.00000
1  0.000000  0.000000  0.785288  0.000000  0.00000  0.619130  0.00000  0.00000
2  0.785288  0.000000  0.000000  0.619130  0.00000  0.000000  0.00000  0.00000
3  0.000000  0.000000  0.000000  0.000000  0.57735  0.000000  0.57735  0.57735
