### Overview of nltk package

In [1]:
import nltk

In [2]:
 from nltk.corpus import brown

In [3]:
print(brown.categories())
print(len(brown.categories()))

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
15


In [4]:
data = brown.sents(categories = 'adventure' )
print(len(data))

4637


In [5]:
data[0]

['Dan',
 'Morgan',
 'told',
 'himself',
 'he',
 'would',
 'forget',
 'Ann',
 'Turner',
 '.']

In [6]:
" ".join(data[0])

'Dan Morgan told himself he would forget Ann Turner .'

# Bag of Words Pipeline

- Get the Corpus
- Tokenization, Stopword Removal
- Stemming
- Building a Vocab
- Vectorization
- Classification

# Tokenization & Stopwords Removal

In [7]:
document = """Science can amuse and fascinate us all, but it is engineering that changes the world.
              The engineer has been, and is, a maker of history. 
              Scientists study the world as it is; engineers create the world that has never been.
              The way to succeed is to double your failure rate."""

sentence = "Send all the 50 documents and other data at jamesbond@001.com"

In [8]:
from nltk.tokenize import sent_tokenize, word_tokenize #one for conversion into token of sentences and another one for word tokens

In [9]:
sents = sent_tokenize(document)
print(sents)
print(len(sents))

['Science can amuse and fascinate us all, but it is engineering that changes the world.', 'The engineer has been, and is, a maker of history.', 'Scientists study the world as it is; engineers create the world that has never been.', 'The way to succeed is to double your failure rate.']
4


In [10]:
print(sentence.split(sep = " "))

['Send', 'all', 'the', '50', 'documents', 'and', 'other', 'data', 'at', 'jamesbond@001.com']


In [11]:
print(sent_tokenize(sentence))

['Send all the 50 documents and other data at jamesbond@001.com']


In [12]:
words = word_tokenize(sentence)
print(words)
print(len(words))

['Send', 'all', 'the', '50', 'documents', 'and', 'other', 'data', 'at', 'jamesbond', '@', '001.com']
12


# Removing Stopwords

In [13]:
from nltk.corpus import stopwords #Some pre-defined non useful words

In [14]:
eng_sw = set(stopwords.words("english")) #Stopwords in english

In [15]:
print(eng_sw)
print(len(eng_sw))

{'yourself', 'by', 'have', 'further', 'we', 'yourselves', 'can', 'only', "wouldn't", 'are', 'not', 'mustn', 'o', 'with', 'before', 'ours', 'wouldn', 'been', 'as', 'him', 'too', 'very', 'these', 'she', 'shouldn', 'above', "she's", "haven't", 'then', 'your', 'why', 'now', 'do', 'but', 'just', "don't", "mustn't", 'all', "hadn't", 'herself', 'because', 'under', 'here', 'each', 'them', 't', 'be', 'most', 'myself', 'during', "couldn't", 'is', 'should', 'there', "hasn't", 'for', 'yours', 'and', 'which', 'a', 'doing', 'when', 'more', 'other', 'ma', 'had', 'being', 'up', 'until', 'weren', 'at', 've', 'what', 'hasn', 'isn', 'won', "aren't", "needn't", "wasn't", 'their', 'such', 'm', 'having', 'how', 'both', 'ourselves', 'theirs', "you're", "shouldn't", 'our', 'doesn', 'will', 'that', 'they', 'down', 'has', 'on', 'don', "should've", 'didn', 'those', 'were', "isn't", 'any', 'itself', 'd', "shan't", "that'll", 'me', 'while', 'haven', 'himself', 'nor', 'wasn', 'to', "won't", 'hadn', "didn't", 's', '

In [16]:
def RemoveStopWords(text, StopWords):
    """Return the tokens of ``text`` that are not stopwords.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter, in their original order.
    StopWords : container of str
        Words to discard; membership is tested with ``in``.

    Returns
    -------
    list of str
        The surviving tokens, order preserved.
    """
    kept = []
    for token in text:
        if token in StopWords:
            continue
        kept.append(token)
    return kept

In [17]:
demo_text = "i am not bothered about her very much".split()
useful_text = RemoveStopWords(demo_text, eng_sw)
print(useful_text)

['bothered', 'much']


# Tokenization using Regex

In [18]:
sentence

'Send all the 50 documents and other data at jamesbond@001.com'

In [19]:
from nltk.tokenize import RegexpTokenizer 

In [20]:
tokenizer = RegexpTokenizer("[a-zA-Z]+") # Each token is a run of ASCII letters, so digits and symbols such as '@' and '.' are dropped

In [21]:
useful_text = tokenizer.tokenize(sentence)

In [22]:
useful_text

['Send',
 'all',
 'the',
 'documents',
 'and',
 'other',
 'data',
 'at',
 'jamesbond',
 'com']

# Stemming

- A process that reduces inflected words (verb forms, plurals) to their root (stem) form
- Preserve the semantics of the sentence without increasing the number of unique tokens
- Example - jumps, jumping, jumped, jump  ==> jump

In [23]:
text = "Foxes love to make jumps. The quick brown fox was seen jumping over the lovely dog from a 6ft high wall. This is how fox made his first jump"

## Three commonly used stemmers in nltk:
1. Snowball Stemmer
2. Porter Stemmer
3. Lancaster Stemmer

In [24]:
from nltk.stem.snowball import SnowballStemmer, PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

In [25]:
# Step 1: Create object of Stemmer class

PS = PorterStemmer()

In [26]:
# Demo 
PS.stem("jumping")

'jump'

In [27]:
PS.stem("Jumping")

'jump'

In [28]:
PS.stem("Jumps")

'jump'

In [29]:
## Using SnowballStemmer (It is a multilingual Stemmer) need to specify language

ss = SnowballStemmer(language = "english")

In [30]:
ss.stem("Jumping")

'jump'

In [31]:
ss.stem("Jumps")

'jump'

In [32]:
ls = LancasterStemmer()

In [33]:
ls.stem("Jumping")

'jump'

In [34]:
ls.stem("jumps")

'jump'

# Lemmatization

In [35]:
from nltk.stem import WordNetLemmatizer

wn = WordNetLemmatizer()

In [36]:
wn.lemmatize("jumps")

'jump'

In [37]:
# NOTE: the word comes back unchanged here. Per the NLTK docs, lemmatize()
# treats its input as a noun unless told otherwise; pass pos="v" to have it
# lemmatized as a verb.
wn.lemmatize("jumping")

'jumping'

# Building a Vocab & Vectorization

In [38]:
# Creating a demo corpus

corpus = [
        "As engineers, we were going to be in a position to change the world – not just study it.",
        "The scientist discovers a new type of material or energy and the engineer discovers a new use for it.",
        "This job is a great scientific adventure. But it’s also a great human adventure.",
        "Science can amuse and fascinate us all, but it is engineering that changes the world."
]

In [39]:
from sklearn.feature_extraction.text import CountVectorizer
cvt = CountVectorizer() #Object instantiation

In [40]:
vectorized_corpus = cvt.fit_transform(corpus)

In [41]:
vectorized_corpus

<4x46 sparse matrix of type '<class 'numpy.int64'>'
	with 55 stored elements in Compressed Sparse Row format>

In [42]:
# Build the dense matrix from the fitted vectorizer instead of converting
# vectorized_corpus in place: the in-place version fails when the cell is
# re-run, because the name is then bound to an ndarray with no .toarray().
vectorized_corpus = cvt.transform(corpus).toarray()
vectorized_corpus

array([[0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
        0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 2, 0, 0, 0, 1,
        1, 1],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 1, 0, 0, 1, 2, 0, 1, 1, 0, 0, 0, 1, 0, 0, 2, 0, 0, 1, 0, 1, 0,
        0, 0],
       [2, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0,
        1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0],
       [0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
        1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
        0, 1]], dtype=int64)

In [43]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) returns an ndarray, so convert it to a plain list.
print(cvt.get_feature_names_out().tolist())

['adventure', 'all', 'also', 'amuse', 'and', 'as', 'be', 'but', 'can', 'change', 'changes', 'discovers', 'energy', 'engineer', 'engineering', 'engineers', 'fascinate', 'for', 'going', 'great', 'human', 'in', 'is', 'it', 'job', 'just', 'material', 'new', 'not', 'of', 'or', 'position', 'science', 'scientific', 'scientist', 'study', 'that', 'the', 'this', 'to', 'type', 'us', 'use', 'we', 'were', 'world']


In [44]:
cvt.vocabulary_.keys()

dict_keys(['as', 'engineers', 'we', 'were', 'going', 'to', 'be', 'in', 'position', 'change', 'the', 'world', 'not', 'just', 'study', 'it', 'scientist', 'discovers', 'new', 'type', 'of', 'material', 'or', 'energy', 'and', 'engineer', 'use', 'for', 'this', 'job', 'is', 'great', 'scientific', 'adventure', 'but', 'also', 'human', 'science', 'can', 'amuse', 'fascinate', 'us', 'all', 'engineering', 'that', 'changes'])

In [45]:
cvt.vocabulary_.values()

dict_values([5, 15, 43, 44, 18, 39, 6, 21, 31, 9, 37, 45, 28, 25, 35, 23, 34, 11, 27, 40, 29, 26, 30, 12, 4, 13, 42, 17, 38, 24, 22, 19, 33, 0, 7, 2, 20, 32, 8, 3, 16, 41, 1, 14, 36, 10])

In [46]:
len(cvt.vocabulary_.keys())

46

In [47]:
# Dictionary

cvt.vocabulary_

{'as': 5,
 'engineers': 15,
 'we': 43,
 'were': 44,
 'going': 18,
 'to': 39,
 'be': 6,
 'in': 21,
 'position': 31,
 'change': 9,
 'the': 37,
 'world': 45,
 'not': 28,
 'just': 25,
 'study': 35,
 'it': 23,
 'scientist': 34,
 'discovers': 11,
 'new': 27,
 'type': 40,
 'of': 29,
 'material': 26,
 'or': 30,
 'energy': 12,
 'and': 4,
 'engineer': 13,
 'use': 42,
 'for': 17,
 'this': 38,
 'job': 24,
 'is': 22,
 'great': 19,
 'scientific': 33,
 'adventure': 0,
 'but': 7,
 'also': 2,
 'human': 20,
 'science': 32,
 'can': 8,
 'amuse': 3,
 'fascinate': 16,
 'us': 41,
 'all': 1,
 'engineering': 14,
 'that': 36,
 'changes': 10}

In [48]:
print(cvt.vocabulary_)

{'as': 5, 'engineers': 15, 'we': 43, 'were': 44, 'going': 18, 'to': 39, 'be': 6, 'in': 21, 'position': 31, 'change': 9, 'the': 37, 'world': 45, 'not': 28, 'just': 25, 'study': 35, 'it': 23, 'scientist': 34, 'discovers': 11, 'new': 27, 'type': 40, 'of': 29, 'material': 26, 'or': 30, 'energy': 12, 'and': 4, 'engineer': 13, 'use': 42, 'for': 17, 'this': 38, 'job': 24, 'is': 22, 'great': 19, 'scientific': 33, 'adventure': 0, 'but': 7, 'also': 2, 'human': 20, 'science': 32, 'can': 8, 'amuse': 3, 'fascinate': 16, 'us': 41, 'all': 1, 'engineering': 14, 'that': 36, 'changes': 10}


In [49]:
# Reverse Mapping!

numbers = vectorized_corpus[2]
print(numbers)

[2 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 2 1 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 0
 0 1 0 0 0 0 0 0 0]


In [50]:
# inverse_transform expects a 2-D (n_samples, n_features) input, so present the
# single document vector as a one-row matrix.
s = cvt.inverse_transform(numbers.reshape(1, -1))
print(type(s))
print(s)
print()
print("Original text: ", corpus[2])
# s holds one array of terms per input row; take the first (and only) one.
print("After inverse_transformation: ", " ".join(s[0]))

<class 'list'>
[array(['adventure', 'also', 'but', 'great', 'human', 'is', 'it', 'job',
       'scientific', 'this'], dtype='<U11')]

Original text:  This job is a great scientific adventure. But it’s also a great human adventure.
After inverse_transformation:  adventure also but great human is it job scientific this


# Vectorization with StopWord Removal

In [51]:
def MyTokenizer(document):
    """Lower-case ``document``, tokenize it and drop English stopwords.

    Relies on the notebook-level ``tokenizer`` (regex tokenizer), the
    ``eng_sw`` stopword set and ``RemoveStopWords``.

    Returns the list of remaining tokens.
    """
    lowered = document.lower()
    tokens = tokenizer.tokenize(lowered)

    # Discard stopwords before handing the tokens back
    return RemoveStopWords(tokens, eng_sw)

In [52]:
MyTokenizer("This is an ordinary function")

['ordinary', 'function']

In [53]:
sentence

'Send all the 50 documents and other data at jamesbond@001.com'

In [54]:
MyTokenizer(sentence)

['send', 'documents', 'data', 'jamesbond', 'com']

In [55]:
# A custom tokenizer replaces the default token_pattern; set it to None
# explicitly so scikit-learn does not warn about an unused parameter.
cvt = CountVectorizer(tokenizer=MyTokenizer, token_pattern=None)

In [56]:
corpus

['As engineers, we were going to be in a position to change the world – not just study it.',
 'The scientist discovers a new type of material or energy and the engineer discovers a new use for it.',
 'This job is a great scientific adventure. But it’s also a great human adventure.',
 'Science can amuse and fascinate us all, but it is engineering that changes the world.']

In [57]:
vectorized_corpus = cvt.fit_transform(corpus).toarray()

In [58]:
print(vectorized_corpus)

[[0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 1]
 [0 0 0 0 0 2 1 1 0 0 0 0 0 0 0 1 2 0 0 0 1 0 1 0 1 0]
 [2 1 0 0 0 0 0 0 0 0 0 0 2 1 1 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 1 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1]]


In [59]:
print(cvt.vocabulary_)

{'engineers': 9, 'going': 11, 'position': 17, 'change': 3, 'world': 25, 'study': 21, 'scientist': 20, 'discovers': 5, 'new': 16, 'type': 22, 'material': 15, 'energy': 6, 'engineer': 7, 'use': 24, 'job': 14, 'great': 12, 'scientific': 19, 'adventure': 0, 'also': 1, 'human': 13, 'science': 18, 'amuse': 2, 'fascinate': 10, 'us': 23, 'engineering': 8, 'changes': 4}


In [60]:
# Slice with [:1] to keep the row 2-D, as inverse_transform requires, then
# join the terms recovered for that single row.
" ".join(cvt.inverse_transform(vectorized_corpus[:1])[0])

'change engineers going position study world'

In [61]:
print(cvt.vocabulary_)

{'engineers': 9, 'going': 11, 'position': 17, 'change': 3, 'world': 25, 'study': 21, 'scientist': 20, 'discovers': 5, 'new': 16, 'type': 22, 'material': 15, 'energy': 6, 'engineer': 7, 'use': 24, 'job': 14, 'great': 12, 'scientific': 19, 'adventure': 0, 'also': 1, 'human': 13, 'science': 18, 'amuse': 2, 'fascinate': 10, 'us': 23, 'engineering': 8, 'changes': 4}


# For Test Data

In [62]:
TestCorpus = ["As engineer is trained to do any job in the world whether it could be joining ISIS or becoming Alphabet CEO in short they are best!!!."]


In [63]:
cvt.transform(TestCorpus).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1]], dtype=int64)

In [64]:
print(cvt.vocabulary_)

{'engineers': 9, 'going': 11, 'position': 17, 'change': 3, 'world': 25, 'study': 21, 'scientist': 20, 'discovers': 5, 'new': 16, 'type': 22, 'material': 15, 'energy': 6, 'engineer': 7, 'use': 24, 'job': 14, 'great': 12, 'scientific': 19, 'adventure': 0, 'also': 1, 'human': 13, 'science': 18, 'amuse': 2, 'fascinate': 10, 'us': 23, 'engineering': 8, 'changes': 4}


In [65]:
# Pass the transformed matrix straight through: star-unpacking its rows only
# works for a single document and feeds inverse_transform a 1-D vector.
cvt.inverse_transform(cvt.transform(TestCorpus))

[array(['engineer', 'job', 'world'], dtype='<U11')]

In [66]:
# Only use the already-fitted vectorizer on the test corpus, i.e. call transform() on test data instead of fit_transform().
# The call below is a counter-example showing what goes wrong when the vectorizer is refitted on the test data.


cvt.fit_transform(TestCorpus).toarray()
print(cvt.vocabulary_) # Refitting replaces the vocabulary learned from the training corpus with one built from the test corpus

{'engineer': 5, 'trained': 10, 'job': 7, 'world': 12, 'whether': 11, 'could': 4, 'joining': 8, 'isis': 6, 'becoming': 1, 'alphabet': 0, 'ceo': 3, 'short': 9, 'best': 2}
