In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt

In [2]:
# Toy corpus: six short restaurant reviews used throughout the notebook.
corpus = [
    "The food was very tasty and expensive",
    "The food was not tasty and was bland",
    "The food was spicy and good",
    "The worst dining experience ever",
    "Great dining experience, good menu and delicious food",
    "Food was as delicious as always",
]

# Echo the raw documents before any preprocessing is applied.
print(corpus)


['The food was very tasty and expensive', 'The food was not tasty and was bland', 'The food was spicy and good', 'The worst dining experience ever', 'Great dining experience, good menu and delicious food', 'Food was as delicious as always']


In [3]:
# NLTK pieces needed for preprocessing: the tokenizer class and the stop-word corpus.
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer

# Shared tokenizer instance, used by preprocess_document.
wpt = WordPunctTokenizer()

# English stop words that preprocess_document filters out.
stop_words = stopwords.words('english')

def preprocess_document(doc):
    """Normalize one raw document into a space-separated string of content words.

    The text is stripped of every character that is not an ASCII letter or
    ASCII whitespace, lower-cased, trimmed, tokenized with the module-level
    ``wpt`` tokenizer, and filtered against the module-level ``stop_words``.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (empty if none survive).
    """
    # remove special characters (anything that is not a letter or whitespace).
    # `flags` must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so passing re.I|re.A positionally never applied the
    # flags and silently capped the number of replacements per document.
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A)

    # lower case and trim surrounding whitespace
    doc = doc.lower().strip()

    # tokenize document
    tokens = wpt.tokenize(doc)

    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # re-create document from filtered tokens
    return ' '.join(filtered_tokens)


# Element-wise wrapper around preprocess_document; the output is a NumPy array of str.
# `otypes` is given explicitly so NumPy does not infer the output type by calling
# preprocess_document an extra time on the first element, and so that an empty
# corpus yields an empty array instead of raising ValueError.
preprocess_corpus = np.vectorize(preprocess_document, otypes=[str])

In [4]:
# Clean every document in the corpus with the vectorized preprocessing function.
clean_corpus = preprocess_corpus(corpus)

# Number of cleaned documents. (A bare `clean_corpus` before this line displayed
# nothing, because only a cell's last expression is shown, so it was dropped.)
print(len(clean_corpus))

6


In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: fit the vocabulary on the cleaned corpus and count the
# terms of each document. fit_transform returns the counts in sparse format.
cv = CountVectorizer()
cv_matrix = cv.fit_transform(clean_corpus)
# Trailing expression so the notebook displays a summary of the sparse matrix.
cv_matrix


<6x14 sparse matrix of type '<class 'numpy.int64'>'
	with 23 stored elements in Compressed Sparse Row format>

In [6]:
# Vocabulary learned by the fitted CountVectorizer; it is used further down as
# the column labels of the document-term matrix.
print(cv.get_feature_names_out())

['always' 'bland' 'delicious' 'dining' 'ever' 'expensive' 'experience'
 'food' 'good' 'great' 'menu' 'spicy' 'tasty' 'worst']


In [7]:
# Printing the sparse matrix lists only its stored entries, one per line,
# as "(row, column)  value".
print(cv_matrix)

  (0, 7)	1
  (0, 12)	1
  (0, 5)	1
  (1, 7)	1
  (1, 12)	1
  (1, 1)	1
  (2, 7)	1
  (2, 11)	1
  (2, 8)	1
  (3, 13)	1
  (3, 3)	1
  (3, 6)	1
  (3, 4)	1
  (4, 7)	1
  (4, 8)	1
  (4, 3)	1
  (4, 6)	1
  (4, 9)	1
  (4, 10)	1
  (4, 2)	1
  (5, 7)	1
  (5, 2)	1
  (5, 0)	1


In [8]:
# view dense representation
# warning might give a memory error if data is too big

# Re-derive the counts from the fitted vectorizer rather than calling
# cv_matrix.toarray() on the variable itself: that form only worked the first
# time (an ndarray has no .toarray()), so re-running this cell raised
# AttributeError. This version gives the same dense count matrix on every run.
cv_matrix = cv.transform(clean_corpus).toarray()
cv_matrix

array([[0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0],
       [1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]], dtype=int64)

# Creating Document Term Matrix (DTM)
Bag of Words representation

In [9]:

# Heading for the table displayed below.
print("Document Term Matrix")

# Column labels: the unique words the CountVectorizer found in the corpus.
vocab = cv.get_feature_names_out()

# Bag-of-words view: one row per document, one column per vocabulary word.
pd.DataFrame(data=cv_matrix, columns=vocab)

Document Term Matrix


Unnamed: 0,always,bland,delicious,dining,ever,expensive,experience,food,good,great,menu,spicy,tasty,worst
0,0,0,0,0,0,1,0,1,0,0,0,0,1,0
1,0,1,0,0,0,0,0,1,0,0,0,0,1,0
2,0,0,0,0,0,0,0,1,1,0,0,1,0,0
3,0,0,0,1,1,0,1,0,0,0,0,0,0,1
4,0,0,1,1,0,0,1,1,1,1,1,0,0,0
5,1,0,1,0,0,0,0,1,0,0,0,0,0,0


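# Creating TF-IDF matrix
 Reference: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html (the `TfidfVectorizer` used below is equivalent to `CountVectorizer` followed by `TfidfTransformer`; see also https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html#sklearn.feature_extraction.text.TfidfTransformer)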
 

In [10]:
from nltk.corpus import stopwords
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# English stop words, passed to the vectorizer below.
stop_words = stopwords.words('english')

# norm=None: no per-document normalisation is applied to the weights.
transformer = TfidfVectorizer(norm=None, stop_words=stop_words)

# toarray() gives a plain ndarray, consistent with the dense count matrix built
# earlier; todense() returned the legacy np.matrix type instead.
# NOTE(review): this rebinds `cv_matrix`, which previously held the raw term
# counts, to the TF-IDF weights. The name is kept so existing references still
# resolve; consider a dedicated name such as `tfidf_matrix`.
cv_matrix = transformer.fit_transform(clean_corpus).toarray()

# Column labels for the TF-IDF table.
vocab = transformer.get_feature_names_out()
print("TF-IDF")
pd.DataFrame(cv_matrix, columns=vocab)

TF-IDF


Unnamed: 0,always,bland,delicious,dining,ever,expensive,experience,food,good,great,menu,spicy,tasty,worst
0,0.0,0.0,0.0,0.0,0.0,2.252763,0.0,1.154151,0.0,0.0,0.0,0.0,1.847298,0.0
1,0.0,2.252763,0.0,0.0,0.0,0.0,0.0,1.154151,0.0,0.0,0.0,0.0,1.847298,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.154151,1.847298,0.0,0.0,2.252763,0.0,0.0
3,0.0,0.0,0.0,1.847298,2.252763,0.0,1.847298,0.0,0.0,0.0,0.0,0.0,0.0,2.252763
4,0.0,0.0,1.847298,1.847298,0.0,0.0,1.847298,1.154151,1.847298,2.252763,2.252763,0.0,0.0,0.0
5,2.252763,0.0,1.847298,0.0,0.0,0.0,0.0,1.154151,0.0,0.0,0.0,0.0,0.0,0.0
