### Overview of nltk package

In [1]:
import nltk

In [2]:
 from nltk.corpus import brown

In [3]:
# Show the category names available in the Brown corpus, then how many there are.
print(brown.categories())
print(len(brown.categories()))

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
15


In [4]:
# Load the sentences of the 'adventure' category and report how many there are.
# Each sentence is a sequence of word tokens (see data[0]).
data = brown.sents(categories = 'adventure' )
print(len(data))

4637


In [5]:
# Peek at the first sentence in its tokenized form.
data[0]

['Dan',
 'Morgan',
 'told',
 'himself',
 'he',
 'would',
 'forget',
 'Ann',
 'Turner',
 '.']

In [6]:
# Join the tokens with single spaces to get a readable sentence string.
" ".join(data[0])

'Dan Morgan told himself he would forget Ann Turner .'

# Bag of Words Pipeline

- Get the Corpus
- Tokenisation, Stopword Removal
- Stemming
- Building a Vocabulary
- Vectorization
- Classification

# Tokenization & Stopwords Removal

In [7]:
document = """Science can amuse and fascinate us all, but it is engineering that changes the world.
              The engineer has been, and is, a maker of history. 
              Scientists study the world as it is; engineers create the world that has never been.
              The way to succeed is to double your failure rate."""

# Single sentence containing a number and an e-mail address; it is used to
# compare plain str.split with word_tokenize.
sentence = "Send all the 50 documents and other data at jamesbond@001.com"

In [8]:
from nltk.tokenize import sent_tokenize, word_tokenize  # sent_tokenize splits text into sentences; word_tokenize splits text into word tokens

In [9]:
# Split the multi-line document into sentences, then show them and their count.
sents = sent_tokenize(document)
print(sents)
print(len(sents))

['Science can amuse and fascinate us all, but it is engineering that changes the world.', 'The engineer has been, and is, a maker of history.', 'Scientists study the world as it is; engineers create the world that has never been.', 'The way to succeed is to double your failure rate.']
4


In [10]:
# Naive tokenization: splitting on single spaces keeps the e-mail address as one token.
print(sentence.split(sep = " "))

['Send', 'all', 'the', '50', 'documents', 'and', 'other', 'data', 'at', 'jamesbond@001.com']


In [11]:
# word_tokenize also separates punctuation: here the e-mail address is broken
# into 'jamesbond', '@' and '001.com'.
words = word_tokenize(sentence)
print(words)
print(len(words))

['Send', 'all', 'the', '50', 'documents', 'and', 'other', 'data', 'at', 'jamesbond', '@', '001.com']
12


# Removing Stopwords

In [12]:
from nltk.corpus import stopwords #Some pre-defined non useful words

In [13]:
eng_sw = set(stopwords.words("english"))  # English stopwords, stored as a set for fast membership tests

In [14]:
# Inspect the stopword set and its size.
print(eng_sw)
print(len(eng_sw))

{'with', 'isn', "isn't", 'ourselves', 'under', 'above', "needn't", 'up', 'all', 'wouldn', 'didn', 'how', 'very', 'hers', "haven't", "wouldn't", 'was', 'which', 'your', 'them', "hadn't", 'on', 'mustn', 'each', 'whom', 'doesn', 'some', 'couldn', 'myself', 'herself', 'before', 'to', 'if', 'by', 't', 'had', 're', 'did', 'can', "wasn't", 'and', 'against', 'wasn', 'yourself', 'into', "that'll", 'my', 'him', "aren't", "mightn't", "should've", 'now', 'having', 'from', 'weren', 'both', "weren't", 'm', 'while', "hasn't", 'these', 'we', 'those', "don't", 'in', "didn't", 'is', 'after', 'this', 'he', 'theirs', 'over', 'her', 'most', 'll', 'as', 'further', 'don', "shan't", 'won', 'y', "mustn't", "won't", 'for', 'when', 'themselves', 'only', 'here', 'doing', 'yours', 'be', 'why', "you'd", "you're", 'because', 'of', 'once', 'no', 'just', 'who', "it's", 'hasn', 'their', 'will', 'our', 've', 'his', 'were', "shouldn't", 'shan', 'what', 'such', 'the', 'are', 'again', 'needn', 'am', 'shouldn', 'they', 'nor

In [15]:
def RemoveStopWords(text, StopWords):
    """Return the tokens of ``text`` that are not stopwords.

    Matching is case-insensitive: a token is dropped when either the token
    itself or its lowercase form is in ``StopWords``. Stopword lists are
    usually all lowercase, so without this a capitalised token such as
    "The" would be kept.

    Parameters
    ----------
    text : iterable of str
        Already-tokenized words. A raw string would be iterated character
        by character, so tokenize it first.
    StopWords : collection of str
        Words to remove. Any iterable works; a set or frozenset is used
        as-is, anything else is converted to a set once.

    Returns
    -------
    list of str
        The kept tokens, in their original order and original casing.
    """
    # Build the lookup set once so each membership test is O(1), even when
    # the caller passes a list.
    if isinstance(StopWords, (set, frozenset)):
        stop_set = StopWords
    else:
        stop_set = set(StopWords)

    useful_Words = [
        word
        for word in text
        # The exact-match check keeps the behaviour for stopword collections
        # that contain mixed-case entries.
        if word not in stop_set and word.lower() not in stop_set
    ]
    return useful_Words

In [16]:
# Demo: split a lowercase sentence on whitespace and filter out the English stopwords.
demo_text = "i am not bothered about her very much".split()
useful_text = RemoveStopWords(demo_text, eng_sw)
print(useful_text)

['bothered', 'much']
