### Overview of the NLTK package

In [1]:
import nltk

In [2]:
 from nltk.corpus import brown

In [3]:
# Show the Brown corpus category names, followed by how many there are.
print(brown.categories(), len(brown.categories()), sep="\n")

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
15


In [4]:
# Load the sentences of the 'adventure' category and report how many there are.
data = brown.sents(categories = 'adventure' )
print(len(data))

4637


In [5]:
# First sentence of the category; it is displayed as a list of word tokens.
data[0]

['Dan',
 'Morgan',
 'told',
 'himself',
 'he',
 'would',
 'forget',
 'Ann',
 'Turner',
 '.']

In [6]:
" ".join(data[0])

'Dan Morgan told himself he would forget Ann Turner .'

# Bag of Words Pipeline

- Get the Corpus
- Tokenization, Stopword Removal
- Stemming
- Building a Vocabulary
- Vectorization
- Classification

# Tokenization & Stopwords Removal

In [7]:
# Multi-sentence sample paragraph; used below for sentence tokenization.
document = """Science can amuse and fascinate us all, but it is engineering that changes the world.
              The engineer has been, and is, a maker of history. 
              Scientists study the world as it is; engineers create the world that has never been.
              The way to succeed is to double your failure rate."""

# Single sentence containing a number and an e-mail address; used below for word tokenization.
sentence = "Send all the 50 documents and other data at jamesbond@001.com"

In [8]:
from nltk.tokenize import sent_tokenize, word_tokenize #one for conversion into token of sentences and another one for word tokens

In [9]:
# Break the sample paragraph into sentences, then show them and their count.
sents = sent_tokenize(document)
print(sents, len(sents), sep="\n")

['Science can amuse and fascinate us all, but it is engineering that changes the world.', 'The engineer has been, and is, a maker of history.', 'Scientists study the world as it is; engineers create the world that has never been.', 'The way to succeed is to double your failure rate.']
4


In [10]:
# Naive split on single spaces, for comparison: the e-mail address stays one token.
print(sentence.split(sep = " "))

['Send', 'all', 'the', '50', 'documents', 'and', 'other', 'data', 'at', 'jamesbond@001.com']


In [11]:
# Tokenize the sample sentence into words, then show the tokens and their count.
words = word_tokenize(sentence)
print(words, len(words), sep="\n")

['Send', 'all', 'the', '50', 'documents', 'and', 'other', 'data', 'at', 'jamesbond', '@', '001.com']
12


# Removing Stopwords

In [12]:
from nltk.corpus import stopwords #Some pre-defined non useful words

In [13]:
# Collect the English stop words into a set so membership can be tested with `in`.
eng_sw = set(stopwords.words("english"))  # English stop words

In [14]:
# Inspect the stop-word set and its size (a set has no guaranteed display order).
print(eng_sw)
print(len(eng_sw))

{'i', 'hers', 't', 'when', "it's", 'are', 'his', 'will', 'be', 'have', "weren't", 'under', 'them', 'at', "aren't", 'himself', 'she', 'is', 'ourselves', 'against', 'only', 'in', 'own', "don't", 'isn', 'do', "wouldn't", 'he', 'yourselves', 'which', "mightn't", 'above', 'more', 'this', 'here', 've', 'those', "you've", 'him', 'o', "mustn't", 'theirs', 'other', 'had', 'it', 'my', "you'll", 'again', 'did', 'you', 'off', 'for', 'too', 'any', 'each', 'was', 'does', 'their', 'with', 'couldn', 'not', 'shouldn', 'am', 'through', 'then', 'been', 'who', 'of', 'below', 'y', 'until', 'so', 're', 'our', 'or', 'haven', 'mightn', 'weren', 'd', 's', 'its', "you'd", 'no', "hasn't", 'about', 'hasn', 'just', 'don', 'over', 'we', 'yourself', 'further', "she's", 'between', 'nor', 'doesn', "shan't", "couldn't", 'shan', 'whom', 'on', 'ain', 'myself', 'wasn', 'but', 'being', 'into', "hadn't", 'won', 'as', 'has', 'to', 'there', 'a', "wasn't", 'during', 'some', 'from', 'should', 'me', 'if', 'out', 'by', 'once', 'a

In [15]:
def RemoveStopWords(text, StopWords, ignore_case=False):
    """Return the tokens of ``text`` that are not stop words, in original order.

    Args:
        text: Iterable of word tokens (for example the result of ``str.split()``).
        StopWords: Collection of stop words that supports the ``in`` operator;
            passing a ``set`` keeps each lookup cheap.
        ignore_case: When True, each token is lower-cased before the lookup, so
            capitalised tokens such as "The" match a lower-case stop-word list.
            Defaults to False, which keeps the original exact-match behaviour.

    Returns:
        list: The surviving tokens, unmodified and in their original order.
    """
    if ignore_case:
        # Only the lookup key is lower-cased; the returned token keeps its casing.
        return [word for word in text if word.lower() not in StopWords]
    return [word for word in text if word not in StopWords]

In [16]:
# Demo: filter a whitespace-split sentence against the English stop words.
# Note that "not" is itself a stop word, so the negation is dropped from the result.
demo_text = "i am not bothered about her very much".split()
useful_text = RemoveStopWords(demo_text, eng_sw)
print(useful_text)

['bothered', 'much']


# Tokenization using Regex

In [17]:
# Re-display the sample sentence before tokenizing it with a regular expression.
sentence

'Send all the 50 documents and other data at jamesbond@001.com'

In [18]:
from nltk.tokenize import RegexpTokenizer 

In [19]:
# Tokens are runs of ASCII letters only, so digits and punctuation are dropped.
tokenizer = RegexpTokenizer("[a-zA-Z]+")  # words only, no numbers

In [20]:
# Tokenize the sample sentence with the regex tokenizer.
# NOTE(review): this rebinds `useful_text`, which earlier held the stop-word demo result.
useful_text = tokenizer.tokenize(sentence)

In [21]:
# Show the result: the number 50 is gone and the e-mail address is split at '@' and '.'.
useful_text

['Send',
 'all',
 'the',
 'documents',
 'and',
 'other',
 'data',
 'at',
 'jamesbond',
 'com']

# Stemming

- A process that reduces inflected words (verb forms, plurals) to their radical (root) form
- Preserves the semantics of the sentence without increasing the number of unique tokens
- Example - jumps, jumping, jumped, jump  ==> jump

In [22]:
# Sample passage containing several inflections of "jump" and "fox".
text = "Foxes love to make jumps. The quick brown fox was seen jumping over the lovely dog from a 6ft high wall. This is how fox made his first jump"

## Three commonly used stemmers in NLTK:
1. Snowball Stemmer
2. Porter Stemmer
3. Lancaster Stemmer

In [23]:
from nltk.stem.snowball import SnowballStemmer, PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

In [24]:
# Step 1: create an instance of the stemmer class.
# NOTE(review): PorterStemmer is imported from nltk.stem.snowball in this notebook;
# confirm whether the class in nltk.stem.porter was intended.

PS = PorterStemmer()

In [25]:
# Demo: the "-ing" suffix is stripped.
PS.stem("jumping")

'jump'

In [26]:
# Capitalised input comes back lower-cased as well as stemmed.
PS.stem("Jumping")

'jump'

In [27]:
# The plural "-s" is stripped too.
PS.stem("Jumps")

'jump'

In [28]:
# SnowballStemmer is a multilingual stemmer, so the language must be specified.

ss = SnowballStemmer(language = "english")

In [29]:
# Same result as the Porter stemmer for this word.
ss.stem("Jumping")

'jump'

In [30]:
# The plural form also reduces to the same stem.
ss.stem("Jumps")

'jump'

# Lemmatization

In [31]:
from nltk.stem import WordNetLemmatizer

# Create the WordNet lemmatizer used in the cells below.
wn = WordNetLemmatizer()

In [32]:
# The plural form is reduced to its lemma.
wn.lemmatize("jumps")

'jump'

In [33]:
# Returned unchanged: lemmatize() treats the word as a noun by default (pos="n"),
# so the verb form is not reduced. Pass pos="v" to lemmatize it as a verb.
wn.lemmatize("jumping")

'jumping'