In [5]:
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer

In [23]:
stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

# add stemming and lemmatisation in the preprocess function
def preprocess(document, stem=True):
    
    #'changes document to lower case and removes stopwords' and tokenize into words
    words = word_tokenize(document)

    # remove stop words
    words = [word for word in words if word.lower() not in stopwords.words("english")]
    
    if stem:
        words = [stemmer.stem(word.lower()) for word in words]
    else:
        words = [wordnet_lemmatizer.lemmatize(word.lower(), pos='v') for word in words]

    # join words to make sentence
    # document = " ".join(words)
    
    return words # document

In [20]:
d1 = 'Vapour, Bangalore has a really great terrace seating and an awesome view of the Bangalore skyline'
d2 = 'The beer at Vapour, Bangalore was amazing. My favourites are the wheat beer and the ale beer.'
d3 = 'Vapour, Bangalore has the best view in Bangalore.'

d = [d1, d2, d3]

In [21]:
d

['Vapour, Bangalore has a really great terrace seating and an awesome view of the Bangalore skyline',
 'The beer at Vapour, Bangalore was amazing. My favourites are the wheat beer and the ale beer.',
 'Vapour, Bangalore has the best view in Bangalore.']

In [25]:
d1 = preprocess(d1, True)
d2 = preprocess(d2, True)
d3 = preprocess(d3, True)

d = [d1, d2, d3]

print(d)

[['vapour', ',', 'bangalor', 'realli', 'great', 'terrac', 'seat', 'awesom', 'view', 'bangalor', 'skylin'], ['beer', 'vapour', ',', 'bangalor', 'amaz', '.', 'favourit', 'wheat', 'beer', 'ale', 'beer', '.'], ['vapour', ',', 'bangalor', 'best', 'view', 'bangalor', '.']]


In [14]:
cv = CountVectorizer()
bow_mod = cv.fit_transform(d)

In [16]:
print(bow_mod.toarray())

[[0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
