In [1]:
!pip install --user -U nltk



In [1]:
import nltk
import numpy as np

In [2]:
import requests

# Download a plain-text file from Project Gutenberg with `requests`.
# NOTE: `text` is reassigned by the later urllib-based download cell.
url = "http://www.gutenberg.org/files/2554/2554.txt"

# Bound the wait and fail loudly on an HTTP error instead of silently
# decoding an error page as if it were the book.
response = requests.get(url, timeout=30)
response.raise_for_status()
raw_html = response.content
# 'utf-8-sig' drops a leading byte-order mark if the file has one.
text = raw_html.decode("utf-8-sig")

### Bag of Words

Bag-of-Words encoding, like TF-IDF vectorization, is a frequency-based approach to representing text in NLP applications.

In [3]:
import urllib.request

url = "https://www.gutenberg.org/files/829/829-0.txt" # gulliver's travels
#Alt
#url = 'https://www.gutenberg.org/files/2701/2701-0.txt' # Moby Dick

# The context manager closes the HTTP response even if reading fails.
# Decoding the whole payload with 'utf-8-sig' (as the requests-based cell does)
# strips a leading byte-order mark, if present, so it cannot leak into `text`.
with urllib.request.urlopen(url, timeout=30) as file:
    text = file.read().decode('utf-8-sig')

#### Tokenize

In [4]:
# Fetch the sentence/word tokenizer models. Recent NLTK releases load the
# tokenizer from the 'punkt_tab' resource rather than the pickled 'punkt'
# models; since the notebook upgrades NLTK to the latest version, request both
# so word_tokenize works on old and new releases alike. nltk.download() only
# reports (does not raise) when a resource id is unavailable.
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk import word_tokenize

# Split the raw book text into word and punctuation tokens.
tokens = word_tokenize(text)

[nltk_data] Downloading package punkt to /home/yashroff/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
import string
# Normalise the tokens in one pass: keep purely alphabetic tokens, strip any
# character listed in string.punctuation, and lower-case the result.
# Because the isalpha() filter runs first, the surviving tokens contain no
# punctuation, so the translate step leaves them unchanged.
table = str.maketrans('', '', string.punctuation)
tokens = [
    word.translate(table).lower()
    for word in tokens
    if word.isalpha()
]

Removing **stop-words** and applying **stemming**

In [6]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Make sure the stop-word lists are available locally before reading them.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
porter = PorterStemmer()

# Drop English stop words, then reduce every remaining token to its Porter stem.
tokens = [porter.stem(word) for word in tokens if word not in stop_words]

# Spot-check a couple of stemmed tokens.
tokens[200:202]

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/yashroff/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['fault', 'find']

**Understanding the vocabulary** 

* The vocabulary of a document is the set of all words that occur in it, together with how often each word appears.
* NLTK's `FreqDist` class builds this word-to-count mapping from a list of tokens.


In [7]:
from nltk.probability import FreqDist

# Count how many times each stemmed token occurs in the book.
word_counts = FreqDist(tokens)
# Echo the distribution as the cell's output.
word_counts

FreqDist({'could': 395, 'upon': 393, 'would': 370, 'great': 298, 'one': 288, 'two': 252, 'time': 240, 'countri': 231, 'made': 228, 'much': 212, ...})

**Scoring words with frequency**

In [8]:
# Number of most frequent tokens to keep as the vocabulary.
top = 100
# most_common() yields (token, count) pairs, highest count first.
vocabulary = word_counts.most_common(top)

# Peek at the first ten entries.
vocabulary[:10]

[('could', 395),
 ('upon', 393),
 ('would', 370),
 ('great', 298),
 ('one', 288),
 ('two', 252),
 ('time', 240),
 ('countri', 231),
 ('made', 228),
 ('much', 212)]

In [9]:
# One slot per vocabulary entry; this cell only allocates the zero vector
# and does not write into it.
voc_size = len(vocabulary)
doc_vector = np.zeros(voc_size)

# Pair every vocabulary position with the count of the token stored there.
word_vector = [
    (position, word_counts[term])
    for position, (term, _) in enumerate(vocabulary)
    if term in word_counts
]
# Inspect the entry at position 10.
word_vector[10]

(10, 191)

In [10]:
# Generating a model of Bag of Words

from nltk import sent_tokenize

# Treat three consecutive sentences of the book as a tiny document collection.
docs = sent_tokenize(text)[703:706]

from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary (minus English stop words) and build the sparse
# document-term count matrix: one row per sentence, one column per term.
count_vectorizer = CountVectorizer(stop_words='english')
word_count_vector = count_vectorizer.fit_transform(docs)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on old installations.
# list() keeps the displayed result a plain list of strings either way.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    feature_names = list(count_vectorizer.get_feature_names_out())
else:
    feature_names = count_vectorizer.get_feature_names()
feature_names

['advance',
 'beards',
 'betwixt',
 'body',
 'came',
 'clothes',
 'corn',
 'creep',
 'difficulty',
 'distance',
 'distant',
 'ears',
 'extreme',
 'fallen',
 'field',
 'flesh',
 'foot',
 'forced',
 'forward',
 'great',
 'hardly',
 'impossible',
 'interwoven',
 'kept',
 'laid',
 'pierced',
 'pointed',
 'rain',
 'shift',
 'squeeze',
 'stalks',
 'step',
 'strong',
 'till',
 'wind']