In [1]:
# importing Natural Language Tool Kit
import nltk

In [2]:
# Stemming
# Stemming simply lops off easily identified prefixes and suffixes to produce what is often the simplest version of a
# word. "Connection", for example, would have the -ion suffix removed and be correctly reduced to "connect". This kind of
# simple stemming is often all that is needed, but lemmatization, which actually looks up words and their roots (called
# lemmas) as described in a dictionary, is more precise (as long as the words exist in the dictionary).

from nltk.stem import PorterStemmer #importing porter stemmer algorithm

In [3]:
# Four inflected forms of the same verb, used as input for the stemming demo.
words = "wait waiting waited waits".split()

In [4]:
# Create an instance of the PorterStemmer class; `port` is used below to stem each entry of `words`.
port = PorterStemmer()

In [6]:
# Stem every entry of the list `words` and print one root per line.
for word in words:
    print(port.stem(word))

wait
wait
wait
wait


In [7]:
# We observed above that all four words share a common root, which is exactly what we wanted.
# Now for another example: here we start from a sentence, so we first have to do word tokenization and then stemming.

text = "Studying Studies Cries Cry" # sample sentence to tokenize and stem

In [8]:
# Split the sentence into word tokens.
# NOTE(review): word_tokenize relies on NLTK's Punkt tokenizer data, and no download call is visible in this
# notebook; presumably the data was installed beforehand. On a fresh environment an nltk.download(...) call
# would be needed first (confirm the resource name for your NLTK version).
tokenization = nltk.word_tokenize(text)

In [10]:
# A second PorterStemmer instance, used for the sentence example below.
porter_stemmer = PorterStemmer()

In [11]:
# Report the stem of each token produced from the sentence above.
for token in tokenization:
    stem = porter_stemmer.stem(token)
    print("Stemming for {} is {}".format(token, stem))

Stemming for Studying is studi
Stemming for Studies is studi
Stemming for Cries is cri
Stemming for Cry is cri


In [12]:
# In the stemming above, the roots produced ("studi", "cri") are not real words, which limits where stemming can be
# used. Hence we use lemmatization.
# Lemmatization deals with the fact that while words like connect, connection, connecting, connected, etc. aren't
# exactly the same, they all share the same essential meaning: connect. The differences in spelling serve grammatical
# functions in spoken language, but for machine processing those differences can be confusing, so we need a way to
# map every form of the word "connect" to the word "connect" itself.

from nltk.stem import WordNetLemmatizer  #importing Word Net Lemmatizer algorithm

In [13]:
# Create an instance of the WordNetLemmatizer class.
# NOTE(review): the lemmatizer looks words up in the NLTK `wordnet` corpus, and no download call is visible in
# this notebook; confirm the corpus is installed on a fresh environment.
word_net = WordNetLemmatizer()

In [14]:
# applying the lemmatizer algorithm in the same text as above after tokenization
for w in tokenization:
    print("Lemmatization for {} is {}".format(w,word_net.lemmatize(w)))

Lemmatization for Studying is Studying
Lemmatization for Studies is Studies
Lemmatization for Cries is Cries
Lemmatization for Cry is Cry


In [16]:
# The text must first be parsed to extract words, called tokenization. Then the words need to be encoded as integers or
# floating-point values for use as input to a machine learning algorithm, called feature extraction (or vectorization).
# COUNT VECTORIZATION = BAG OF WORDS
# The model is simple in that it throws away all of the order information in the words and focuses on the occurrence of
# words in a document. This can be done by assigning each word a unique number. Then any document we see can be encoded
# as a fixed-length vector with the length of the vocabulary of known words. The value in each position in the vector
# could be filled with a count or frequency of each word in the encoded document. This is the bag-of-words model, where
# we are only concerned with encoding schemes that represent what words are present, or the degree to which they are
# present, in encoded documents, without any information about order.

from sklearn.feature_extraction.text import CountVectorizer #importing our algorithm

In [33]:
# CountVectorizer provides a simple way to tokenize a collection of text documents, build a vocabulary of
# known words, and encode new documents using that vocabulary. You can use it as follows:
#   1. Create an instance of the CountVectorizer class.
#   2. Call the fit() function to learn a vocabulary from one or more documents.
#   3. Call the transform() function on one or more documents as needed to encode each as a vector.
# Below is an example of using CountVectorizer to tokenize, build a vocabulary, and then encode a document.
# The corpus here is a list containing a single document.
tex1 = ['Natural Language Processing! This is used for text classification and sentiment analysis']

In [34]:
# Create an object/instance of the CountVectorizer class with its default settings.
vect = CountVectorizer()

In [35]:
type(vect) # confirm that "vect" is a CountVectorizer instance

sklearn.feature_extraction.text.CountVectorizer

In [36]:
vect.fit(tex1) # call fit() to learn the vocabulary from tex1; the fitted vectorizer's repr is displayed

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [37]:
# vocabulary_ maps each learned token to its column index in the encoded matrix.
# The dict is displayed in order of first appearance in the text; the indices themselves follow the
# alphabetical order of the tokens ('analysis' -> 0 ... 'used' -> 11).
vect.vocabulary_

{'natural': 6,
 'language': 5,
 'processing': 7,
 'this': 10,
 'is': 4,
 'used': 11,
 'for': 3,
 'text': 9,
 'classification': 2,
 'and': 1,
 'sentiment': 8,
 'analysis': 0}

In [38]:
# The same mapping printed on a single line. We can see that all words were lower-cased by default and
# that the punctuation ("!") was ignored.
print(vect.vocabulary_)

{'natural': 6, 'language': 5, 'processing': 7, 'this': 10, 'is': 4, 'used': 11, 'for': 3, 'text': 9, 'classification': 2, 'and': 1, 'sentiment': 8, 'analysis': 0}


In [26]:
vector = vect.transform(tex1) # encode tex1 with the fitted vocabulary; the result is a sparse matrix

In [31]:
type(vector) # the encoded result is a SciPy CSR sparse matrix

scipy.sparse.csr.csr_matrix

In [27]:
vector.shape # (number of documents, vocabulary size)

(1, 12)

In [28]:
vector.toarray() # dense view: one row, 12 columns; every count is 1 because each word occurs once in tex1

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int64)

In [39]:
# One issue with simple counts is that some words like "the" will appear many times, and their large counts will not be
# very meaningful in the encoded vectors. That is why we use what is by far the most widely used method, "TF-IDF".
# TFIDF VECTORIZATION
# TF  - Term frequency = (number of occurrences of the word in a document) / (number of words in the document)
# IDF - Inverse document frequency = log((number of documents) / (number of documents containing the word))
# We multiply TF by IDF to obtain the weight of each word, and those weights form the encoded vector.
# Note: scikit-learn's TfidfVectorizer with its defaults (smooth_idf=True, norm='l2') uses the raw count as TF, computes
# IDF as ln((1 + n_documents) / (1 + document_frequency)) + 1, and scales each document vector to unit length, so its
# numbers differ slightly from the textbook formulas above.
# TF-IDF scores are word-frequency scores that try to highlight words that are more interesting, e.g. frequent in a
# document but not across documents. The same create, fit, and transform process is used as with the CountVectorizer.
# Below is an example of using the TfidfVectorizer to learn vocabulary and inverse document frequencies and then encode
# those documents.

from sklearn.feature_extraction.text import TfidfVectorizer # importing our algorithm cum class

In [40]:
# Create an object/instance of the TfidfVectorizer class with its default settings.
vec1 = TfidfVectorizer()

In [41]:
type(vec1) # confirm that "vec1" is a TfidfVectorizer instance

sklearn.feature_extraction.text.TfidfVectorizer

In [42]:
vec1.fit(tex1) # call fit() to learn the vocabulary and IDF weights from tex1; the fitted vectorizer's repr is displayed

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [44]:
# If you have a term and want the position of its column in the TF-IDF matrix, look it up in the fitted
# vectorizer's `vocabulary_` attribute (a term -> column-index dict).
vec1.vocabulary_

{'natural': 6,
 'language': 5,
 'processing': 7,
 'this': 10,
 'is': 4,
 'used': 11,
 'for': 3,
 'text': 9,
 'classification': 2,
 'and': 1,
 'sentiment': 8,
 'analysis': 0}

In [45]:
print(vec1.vocabulary_) # the same mapping, printed on a single line

{'natural': 6, 'language': 5, 'processing': 7, 'this': 10, 'is': 4, 'used': 11, 'for': 3, 'text': 9, 'classification': 2, 'and': 1, 'sentiment': 8, 'analysis': 0}


In [46]:
vector1 = vec1.transform(tex1) # encode tex1 as TF-IDF weights; the result is a sparse matrix

In [47]:
type(vector1) # a SciPy CSR sparse matrix, the same container type CountVectorizer produced

scipy.sparse.csr.csr_matrix

In [49]:
vector1.shape # one document, 12 vocabulary terms

(1, 12)

In [48]:
# Dense view of the TF-IDF row: all 12 weights are equal because each word occurs once, and the row has
# unit L2 length (12 * 0.28867513**2 is approximately 1), matching norm='l2' in the vectorizer's settings.
vector1.toarray()

array([[0.28867513, 0.28867513, 0.28867513, 0.28867513, 0.28867513,
        0.28867513, 0.28867513, 0.28867513, 0.28867513, 0.28867513,
        0.28867513, 0.28867513]])

In [52]:
vec1.idf_ # learned IDF weight for each vocabulary term; all 1.0 for this single-document corpus

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [65]:
# Three short documents: "the" occurs in all of them, "fox" and "dog" in two each, every other word in one.
# NOTE(review): `text` held a plain string earlier in the notebook and `vector` held the CountVectorizer
# result; both names are rebound here, so re-running earlier cells after this one will see the new values.
text = [
    "The quick brown fox jumped over the lazy dog.",
    "The dog.",
    "The fox",
]

# Create the transform and learn the vocabulary and IDF weights in one step (fit() returns the estimator).
vectorizer = TfidfVectorizer().fit(text)
print(vectorizer.vocabulary_)
print(vectorizer.idf_)

# Encode only the first document against the fitted vocabulary.
vector = vectorizer.transform(text[:1])
print(vector.shape)
print(vector.toarray())

{'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumped': 3, 'over': 5, 'lazy': 4, 'dog': 1}
[1.69314718 1.28768207 1.28768207 1.69314718 1.69314718 1.69314718
 1.69314718 1.        ]
(1, 8)
[[0.36388646 0.27674503 0.27674503 0.36388646 0.36388646 0.36388646
  0.36388646 0.42983441]]


In [67]:
# List the names of all features (vocabulary terms) in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(), so the list is derived from vocabulary_ (term -> column index) instead;
# this works on every scikit-learn version and yields the same plain list of strings.
sorted(vec1.vocabulary_, key=vec1.vocabulary_.get)

['analysis',
 'and',
 'classification',
 'for',
 'is',
 'language',
 'natural',
 'processing',
 'sentiment',
 'text',
 'this',
 'used']