In [1]:
#nltk - natural language toolkit
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Toy corpus used by the rest of the notebook: three short English sentences
# with mixed case and punctuation, so each preprocessing step has a visible
# effect on them.
documents = [
    "Hello this is online machine learning course going on..",
    "Yesterday it was raining and I was having a cup of tea !!",
    "This weather is good to play cricket, Let's play cricket"
]

In [3]:
import string

In [5]:
# The ASCII punctuation characters; these are what the translation table
# built from this constant deletes from the documents.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [6]:
# Raw first document, before any cleaning (note the trailing "..").
documents[0]

'Hello this is online machine learning course going on..'

In [8]:
# Translation table for str.translate: every punctuation character's code
# point maps to None, which tells translate() to delete that character.
table = {ord(symbol): None for symbol in string.punctuation}

In [9]:
# Preview on the first document: the trailing ".." is removed.
documents[0].translate(table)

'Hello this is online machine learning course going on'

In [10]:
# Removing "!!" keeps the space that preceded it, so the result ends with a
# trailing blank.
documents[1].translate(table)

'Yesterday it was raining and I was having a cup of tea '

In [11]:
# Both the comma and the apostrophe are deleted, so "Let's" becomes "Lets".
documents[2].translate(table)

'This weather is good to play cricket Lets play cricket'

In [12]:
# Normalise the corpus in place: lower-case each document, then delete every
# punctuation character using the translation table.
documents[:] = [text.lower().translate(table) for text in documents]

In [13]:
# Inspect the cleaned corpus: everything is lower-case and punctuation-free.
documents

['hello this is online machine learning course going on',
 'yesterday it was raining and i was having a cup of tea ',
 'this weather is good to play cricket lets play cricket']

In [14]:
# Try the tokenizer on one cleaned document before applying it to all of them.
# NOTE(review): word_tokenize presumably needs NLTK's tokenizer data to be
# downloaded beforehand; confirm on a fresh environment.
word_tokenize(documents[0])

['hello',
 'this',
 'is',
 'online',
 'machine',
 'learning',
 'course',
 'going',
 'on']

In [15]:
# Tokenise every cleaned document; tokens[k] is the word list of documents[k].
tokens = [word_tokenize(text) for text in documents]

In [17]:
# One token list per document, in the same order as `documents`.
print(tokens)

[['hello', 'this', 'is', 'online', 'machine', 'learning', 'course', 'going', 'on'], ['yesterday', 'it', 'was', 'raining', 'and', 'i', 'was', 'having', 'a', 'cup', 'of', 'tea'], ['this', 'weather', 'is', 'good', 'to', 'play', 'cricket', 'lets', 'play', 'cricket']]


In [21]:
# English stop words from NLTK's stopwords corpus; tokens found in this
# collection are dropped by the filtering cell.
# NOTE(review): presumably requires the 'stopwords' corpus to be downloaded
# (nltk.download('stopwords')) on a fresh environment; confirm.
eng_stopwords = stopwords.words('english')

In [24]:
# Drop English stop words from every tokenised document.
# wordsList[k] keeps, in order, the tokens of tokens[k] that are not stop words.
# The stop words are copied into a set once, so each membership test is a hash
# lookup instead of a scan through the whole collection for every token.
stopword_lookup = set(eng_stopwords)
wordsList = [
    [token for token in doc_tokens if token not in stopword_lookup]
    for doc_tokens in tokens
]

In [25]:
# Tokens that survived stop-word removal, one list per document.
wordsList

[['hello', 'online', 'machine', 'learning', 'course', 'going'],
 ['yesterday', 'raining', 'cup', 'tea'],
 ['weather', 'good', 'play', 'cricket', 'lets', 'play', 'cricket']]

In [26]:
# Porter stemmer; in the visible cells it is used only for the short stemming
# demos, not for the corpus itself.
ps = PorterStemmer()

In [27]:
# Stemming strips the "-ing" suffix: "learning" -> "learn".
ps.stem('learning')

'learn'

In [28]:
# Same suffix stripping: "going" -> "go".
ps.stem('going')

'go'

In [29]:
# A stem need not be a real word: "wasted" -> "wast".
ps.stem('wasted')

'wast'

In [30]:
# Irregular forms are not resolved by the stemmer: "bought" stays "bought".
ps.stem('bought')

'bought'

In [31]:
# WordNet-based lemmatizer; this is what the corpus is actually normalised with.
# NOTE(review): presumably requires the 'wordnet' corpus to be downloaded on a
# fresh environment; confirm.
wnet = WordNetLemmatizer()

In [33]:
# `pos` tells the lemmatizer which part of speech to treat the word as:
#   'v' - verb
#   'n' - noun
# As a verb, "going" is reduced to "go".
wnet.lemmatize('going',pos='v')

'go'

In [34]:
# Unlike the stemmer's "wast", the lemmatizer returns the real word "waste".
wnet.lemmatize('wasted',pos='v')

'waste'

In [35]:
# The irregular past tense is resolved: "bought" -> "buy" (the stemmer left
# it unchanged).
wnet.lemmatize('bought',pos='v')

'buy'

In [36]:
# Replace each remaining word with its WordNet lemma, treating every word as
# a verb. The inner lists are rewritten in place, so wordsList keeps its
# nested list-of-lists shape.
for words in wordsList:
    words[:] = [wnet.lemmatize(word, pos='v') for word in words]

In [37]:
# Lemmatised tokens, e.g. "learning" -> "learn", "raining" -> "rain",
# "lets" -> "let".
wordsList

[['hello', 'online', 'machine', 'learn', 'course', 'go'],
 ['yesterday', 'rain', 'cup', 'tea'],
 ['weather', 'good', 'play', 'cricket', 'let', 'play', 'cricket']]

In [41]:
# Collapse each document's token list back into one space-separated string,
# which is the form handed to the TF-IDF vectorizer.
# wordsList is rewritten in place, so running this cell a second time used to
# call ' '.join on an already-joined string and insert a space between every
# character. Entries that are already strings are now left untouched, which
# makes the cell safe to re-run.
for i, words in enumerate(wordsList):
    if not isinstance(words, str):
        wordsList[i] = ' '.join(words)

print(wordsList)

['hello online machine learn course go', 'yesterday rain cup tea', 'weather good play cricket let play cricket']


In [45]:
# TF-IDF vectorizer created with its default settings.
tfidf = TfidfVectorizer()

In [46]:
# Learn the vocabulary (and term weights) from the joined documents.
# NOTE(review): fit() hands back the fitted vectorizer, so `vect` is the
# vectorizer at this point; the same name is rebound to the TF-IDF matrix by
# the later transform cell.
vect = tfidf.fit(wordsList)

In [49]:
# Show the learned term -> column-index mapping.
# Read it from `tfidf` rather than `vect`: `vect` is rebound to the transformed
# sparse matrix by the transform cell, and that matrix has no `vocabulary_`,
# so this cell would raise AttributeError if re-run after it.
print(tfidf.vocabulary_)

{'hello': 5, 'online': 9, 'machine': 8, 'learn': 6, 'course': 0, 'go': 3, 'yesterday': 14, 'rain': 11, 'cup': 2, 'tea': 12, 'weather': 13, 'good': 4, 'play': 10, 'cricket': 1, 'let': 7}


In [50]:
# Encode the same documents as TF-IDF vectors. From here on `vect` holds the
# resulting sparse matrix (3 documents x 15 vocabulary terms), not the
# vectorizer.
vect = tfidf.transform(wordsList)

In [51]:
# Summary of the TF-IDF matrix: shape, dtype and sparse storage format.
vect

<3x15 sparse matrix of type '<class 'numpy.float64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [53]:
# The joined documents again, for reading alongside the rows of the matrix.
wordsList

['hello online machine learn course go',
 'yesterday rain cup tea',
 'weather good play cricket let play cricket']

In [52]:
# Dense view of the matrix: one row per document, one column per vocabulary
# index. Terms that occur twice in a document ("play", "cricket") get a
# larger weight than its single-occurrence terms.
vect.toarray()

array([[0.40824829, 0.        , 0.        , 0.40824829, 0.        ,
        0.40824829, 0.40824829, 0.        , 0.40824829, 0.40824829,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.5       , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.5       , 0.5       , 0.        , 0.5       ],
       [0.        , 0.60302269, 0.        , 0.        , 0.30151134,
        0.        , 0.        , 0.30151134, 0.        , 0.        ,
        0.60302269, 0.        , 0.        , 0.30151134, 0.        ]])