In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt



from nltk.corpus import stopwords

# English stop-word list, loaded once and consulted by preprocess_document
# when it filters tokens.
stop_words = stopwords.words('english')


In [2]:
# Toy corpus: six short restaurant reviews, one document per list entry.
corpus = [
    "The food was very tasty and expensive",
    "The food was not tasty and was bland",
    "The food was spicy and good",
    "The worst dining experience ever",
    "Great dining experience, good menu and delicious food",
    "Food was as delicious as always",
]

print(corpus)


['The food was very tasty and expensive', 'The food was not tasty and was bland', 'The food was spicy and good', 'The worst dining experience ever', 'Great dining experience, good menu and delicious food', 'Food was as delicious as always']


In [3]:
# Import the WordPunctTokenizer class from nltk
from nltk.tokenize import WordPunctTokenizer

# Single tokenizer instance, reused by preprocess_document for every document
wpt = WordPunctTokenizer()

# Define Function 

def preprocess_document(doc):
    """Normalize a single document into a space-joined string of kept tokens.

    Steps, in order:
      1. delete every character that is neither an ASCII letter nor whitespace,
      2. lower-case and strip leading/trailing whitespace,
      3. tokenize with the module-level ``wpt`` tokenizer,
      4. drop tokens found in the module-level ``stop_words`` collection,
      5. re-join the remaining tokens with single spaces.

    Parameters
    ----------
    doc : str
        Raw text of one document.

    Returns
    -------
    str
        The cleaned document; an empty string if no token survives.
    """
    # The flags MUST be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so `re.sub(p, '', doc, re.I | re.A)` silently capped
    # the number of replacements at 258 and applied no flags at all.
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A)
    doc = doc.lower().strip()

    # tokenize document
    tokens = wpt.tokenize(doc)

    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # re-create document from filtered tokens
    return ' '.join(filtered_tokens)




In [4]:
# Lift preprocess_document so it can be applied element-wise to a whole
# corpus; calling the wrapper returns a NumPy array of cleaned strings.
# otypes is pinned to str so the wrapper does not have to probe the first
# element to infer the output type, which also lets it accept an empty corpus
# (without otypes, np.vectorize raises ValueError on size-0 input).
preprocess_corpus = np.vectorize(preprocess_document, otypes=[str])

In [5]:
# Clean every document in one vectorized call, then report how many
# documents came back as a quick sanity check.
clean_corpus = preprocess_corpus(corpus)
print(len(clean_corpus))

6


In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features in sparse format: fit a default CountVectorizer on the
# cleaned corpus and keep both the fitted vectorizer (cv) and the resulting
# sparse count matrix (cv_matrix).
cv = CountVectorizer()
cv_matrix = cv.fit_transform(clean_corpus)
# Last expression of the cell: displays the sparse matrix summary.
cv_matrix


<6x14 sparse matrix of type '<class 'numpy.int64'>'
	with 23 stored elements in Compressed Sparse Row format>

In [7]:
# Vocabulary learned by the fitted vectorizer; the same array is later used as
# the column labels of the document-term matrix.
print(cv.get_feature_names_out())

['always' 'bland' 'delicious' 'dining' 'ever' 'expensive' 'experience'
 'food' 'good' 'great' 'menu' 'spicy' 'tasty' 'worst']


In [8]:
# Printing the sparse matrix lists only its stored entries, one per line, as
# (document row, vocabulary column) followed by the count.
print(cv_matrix)

  (0, 7)	1
  (0, 12)	1
  (0, 5)	1
  (1, 7)	1
  (1, 12)	1
  (1, 1)	1
  (2, 7)	1
  (2, 11)	1
  (2, 8)	1
  (3, 13)	1
  (3, 3)	1
  (3, 6)	1
  (3, 4)	1
  (4, 7)	1
  (4, 8)	1
  (4, 3)	1
  (4, 6)	1
  (4, 9)	1
  (4, 10)	1
  (4, 2)	1
  (5, 7)	1
  (5, 2)	1
  (5, 0)	1


In [9]:
# view dense representation
# warning might give a memory error if data is too big

# Build the dense array from the fitted vectorizer instead of calling
# .toarray() on cv_matrix itself: the self-referential form only works once,
# because after the first run cv_matrix is already an ndarray (which has no
# .toarray()) and re-running the cell raises AttributeError.
cv_matrix = cv.transform(clean_corpus).toarray()
cv_matrix

array([[0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0],
       [1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]])

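# Creating the Document-Term Matrix (DTM)
Bag-of-Words representation: one row per document, one column per vocabulary term, and each cell holds the count of that term in that document.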

In [11]:

# Vocabulary terms, in column order, become the DTM column labels
vocab = cv.get_feature_names_out()

# Bag-of-words view of the corpus as a Document Term Matrix (DTM):
# rows are documents, columns are vocabulary terms
print("Document Term Matrix")
pd.DataFrame(data=cv_matrix, columns=vocab)

Document Term Matrix


Unnamed: 0,always,bland,delicious,dining,ever,expensive,experience,food,good,great,menu,spicy,tasty,worst
0,0,0,0,0,0,1,0,1,0,0,0,0,1,0
1,0,1,0,0,0,0,0,1,0,0,0,0,1,0
2,0,0,0,0,0,0,0,1,1,0,0,1,0,0
3,0,0,0,1,1,0,1,0,0,0,0,0,0,1
4,0,0,1,1,0,0,1,1,1,1,1,0,0,0
5,1,0,1,0,0,0,0,1,0,0,0,0,0,0


# Creating N-Grams (starting with unigrams, n = 1)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Unigram features only (ngram_range=(1, 1)): fit on the cleaned corpus and
# densify the sparse counts in a single step.
# NOTE(review): the cells in this section repeat this pipeline and differ only
# in ngram_range; a small parameterized helper would remove the duplication.
cv = CountVectorizer(ngram_range=(1, 1))
cv_matrix = cv.fit_transform(clean_corpus).toarray()

# Learned n-grams, in column order
vocab = cv.get_feature_names_out()

# Document Term Matrix: one row per document, one column per n-gram
print("Document Term Matrix")
pd.DataFrame(data=cv_matrix, columns=vocab)

Document Term Matrix


Unnamed: 0,always,bland,delicious,dining,ever,expensive,experience,food,good,great,menu,spicy,tasty,worst
0,0,0,0,0,0,1,0,1,0,0,0,0,1,0
1,0,1,0,0,0,0,0,1,0,0,0,0,1,0
2,0,0,0,0,0,0,0,1,1,0,0,1,0,0
3,0,0,0,1,1,0,1,0,0,0,0,0,0,1
4,0,0,1,1,0,0,1,1,1,1,1,0,0,0
5,1,0,1,0,0,0,0,1,0,0,0,0,0,0


# Creating Bi-Grams

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

# Bigram features only (ngram_range=(2, 2)): fit on the cleaned corpus and
# densify the sparse counts in a single step.
cv = CountVectorizer(ngram_range=(2, 2))
cv_matrix = cv.fit_transform(clean_corpus).toarray()

# Learned bigrams, in column order
vocab = cv.get_feature_names_out()

# Document Term Matrix: one row per document, one column per bigram
print("Document Term Matrix")
pd.DataFrame(data=cv_matrix, columns=vocab)

Document Term Matrix


Unnamed: 0,delicious always,delicious food,dining experience,experience ever,experience good,food delicious,food spicy,food tasty,good menu,great dining,menu delicious,spicy good,tasty bland,tasty expensive,worst dining
0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
2,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
3,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1
4,0,1,1,0,1,0,0,0,1,1,1,0,0,0,0
5,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0


# Creating Tri-Grams

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

# Trigram features only (ngram_range=(3, 3)): fit on the cleaned corpus and
# densify the sparse counts in a single step.
cv = CountVectorizer(ngram_range=(3, 3))
cv_matrix = cv.fit_transform(clean_corpus).toarray()

# Learned trigrams, in column order
vocab = cv.get_feature_names_out()

# Document Term Matrix: one row per document, one column per trigram
print("Document Term Matrix")
pd.DataFrame(data=cv_matrix, columns=vocab)

Document Term Matrix


Unnamed: 0,dining experience ever,dining experience good,experience good menu,food delicious always,food spicy good,food tasty bland,food tasty expensive,good menu delicious,great dining experience,menu delicious food,worst dining experience
0,0,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,1
4,0,1,1,0,0,0,0,1,1,1,0
5,0,0,0,1,0,0,0,0,0,0,0


In [18]:
from sklearn.feature_extraction.text import CountVectorizer

# 4-gram features only (ngram_range=(4, 4)): fit on the cleaned corpus and
# densify the sparse counts in a single step.
cv = CountVectorizer(ngram_range=(4, 4))
cv_matrix = cv.fit_transform(clean_corpus).toarray()

# Learned 4-grams, in column order
vocab = cv.get_feature_names_out()

# Document Term Matrix: one row per document, one column per 4-gram
print("Document Term Matrix")
pd.DataFrame(data=cv_matrix, columns=vocab)

Document Term Matrix


Unnamed: 0,dining experience good menu,experience good menu delicious,good menu delicious food,great dining experience good,worst dining experience ever
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,1
4,1,1,1,1,0
5,0,0,0,0,0


In [19]:
from sklearn.feature_extraction.text import CountVectorizer

# 5-gram features only (ngram_range=(5, 5)): fit on the cleaned corpus and
# densify the sparse counts in a single step.
cv = CountVectorizer(ngram_range=(5, 5))
cv_matrix = cv.fit_transform(clean_corpus).toarray()

# Learned 5-grams, in column order
vocab = cv.get_feature_names_out()

# Document Term Matrix: one row per document, one column per 5-gram
print("Document Term Matrix")
pd.DataFrame(data=cv_matrix, columns=vocab)

Document Term Matrix


Unnamed: 0,dining experience good menu delicious,experience good menu delicious food,great dining experience good menu
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,1,1,1
5,0,0,0


In [40]:
from sklearn.feature_extraction.text import CountVectorizer

# Unigrams through trigrams together (ngram_range=(1, 3)): fit on the cleaned
# corpus and densify the sparse counts in a single step.
cv = CountVectorizer(ngram_range=(1, 3))
cv_matrix = cv.fit_transform(clean_corpus).toarray()

# Learned n-grams (n = 1..3), in column order
vocab = cv.get_feature_names_out()

# Document Term Matrix: one row per document, one column per n-gram
print("Document Term Matrix")
pd.DataFrame(data=cv_matrix, columns=vocab)

Document Term Matrix


Unnamed: 0,always,bland,delicious,delicious always,delicious food,dining,dining experience,dining experience ever,dining experience good,ever,...,menu delicious,menu delicious food,spicy,spicy good,tasty,tasty bland,tasty expensive,worst,worst dining,worst dining experience
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
3,0,0,0,0,0,1,1,1,0,1,...,0,0,0,0,0,0,0,1,1,1
4,0,0,1,0,1,1,1,0,1,0,...,1,1,0,0,0,0,0,0,0,0
5,1,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
