# Chapter 6 -  Data Sourcing via Web
## Segment 5 - Introduction to NLP

In [1]:
import nltk

In [2]:
text = "On Wednesday, the Association for Computing Machinery, the world’s largest society of computing professionals, announced that Hinton, LeCun and Bengio had won this year’s Turing Award for their work on neural networks. The Turing Award, which was introduced in 1966, is often called the Nobel Prize of computing, and it includes a $1 million prize, which the three scientists will share."

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/d022785/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

<h3>Sentence Tokenizer</h3>

In [4]:
from nltk.tokenize import sent_tokenize
sent_tk = sent_tokenize(text)
print('Sentence tokenizing the text : \n')
print(sent_tk)

Sentence tokenizing the text : 

['On Wednesday, the Association for Computing Machinery, the world’s largest society of computing professionals, announced that Hinton, LeCun and Bengio had won this year’s Turing Award for their work on neural networks.', 'The Turing Award, which was introduced in 1966, is often called the Nobel Prize of computing, and it includes a $1 million prize, which the three scientists will share.']


### Word Tokenizer

In [5]:
from nltk.tokenize import word_tokenize
word_tk = word_tokenize(text)
print('Sentence tokenizing the text : \n')
print(word_tk)

Sentence tokenizing the text : 

['On', 'Wednesday', ',', 'the', 'Association', 'for', 'Computing', 'Machinery', ',', 'the', 'world', '’', 's', 'largest', 'society', 'of', 'computing', 'professionals', ',', 'announced', 'that', 'Hinton', ',', 'LeCun', 'and', 'Bengio', 'had', 'won', 'this', 'year', '’', 's', 'Turing', 'Award', 'for', 'their', 'work', 'on', 'neural', 'networks', '.', 'The', 'Turing', 'Award', ',', 'which', 'was', 'introduced', 'in', '1966', ',', 'is', 'often', 'called', 'the', 'Nobel', 'Prize', 'of', 'computing', ',', 'and', 'it', 'includes', 'a', '$', '1', 'million', 'prize', ',', 'which', 'the', 'three', 'scientists', 'will', 'share', '.']


### Removing stop words

In [6]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/d022785/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
from multiprocessing.resource_sharer import stop
from nltk.corpus import stopwords
sw = set(stopwords.words('english'))
print("stop words in english lkanguage:", sw)

stop words in english lkanguage: {'below', 'is', 'once', 'all', 'for', "doesn't", 'aren', "that'll", 'where', "needn't", 'couldn', 'by', 'doesn', 'doing', 'am', 'y', "you'd", 'yourselves', 've', 'so', 'i', 'because', 'whom', 'does', 'you', 't', 'under', 'further', 'some', 'now', 'than', "wouldn't", 'not', "don't", 'her', 'yours', 'down', 'out', 'into', 'should', 'other', "mightn't", 'if', 'up', 'and', 'a', 'them', 'off', 'such', 'during', 'me', 'over', 'shouldn', 'to', 'then', 'very', "haven't", 'they', 'hadn', 'had', "aren't", 'll', 'theirs', "shouldn't", 'about', 'he', "it's", 'in', "couldn't", "shan't", "you'll", 'at', 'both', 'did', 'him', 're', 'haven', 'own', "isn't", 'shan', 'wasn', 'have', "hasn't", 'an', 'his', 'was', 'isn', 'we', 'won', 'will', 'needn', 'be', "wasn't", 'my', 'or', 'she', 'with', 'until', 'before', 'why', 'this', 'do', 'who', 'what', 'weren', "she's", 'here', 'on', "weren't", 'how', 'its', 'when', 'no', 'as', 'ain', 'there', 'can', "won't", 'same', 'ourselves'

In [8]:
filtered_words = [w for w in word_tk if not w in sw]

In [9]:
print('text after removing stop words_ \n')
print(filtered_words)

text after removing stop words_ 

['On', 'Wednesday', ',', 'Association', 'Computing', 'Machinery', ',', 'world', '’', 'largest', 'society', 'computing', 'professionals', ',', 'announced', 'Hinton', ',', 'LeCun', 'Bengio', 'year', '’', 'Turing', 'Award', 'work', 'neural', 'networks', '.', 'The', 'Turing', 'Award', ',', 'introduced', '1966', ',', 'often', 'called', 'Nobel', 'Prize', 'computing', ',', 'includes', '$', '1', 'million', 'prize', ',', 'three', 'scientists', 'share', '.']


In [10]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

port_stem = PorterStemmer()

In [14]:
stemmed_words = []
for w in filtered_words:
    stemmed_words.append(port_stem.stem(w))


print('filtered words: \n', filtered_words, '\n')
print('Stem sentence: \n', stemmed_words)

filtered words: 
 ['On', 'Wednesday', ',', 'Association', 'Computing', 'Machinery', ',', 'world', '’', 'largest', 'society', 'computing', 'professionals', ',', 'announced', 'Hinton', ',', 'LeCun', 'Bengio', 'year', '’', 'Turing', 'Award', 'work', 'neural', 'networks', '.', 'The', 'Turing', 'Award', ',', 'introduced', '1966', ',', 'often', 'called', 'Nobel', 'Prize', 'computing', ',', 'includes', '$', '1', 'million', 'prize', ',', 'three', 'scientists', 'share', '.'] 

Stem sentence: 
 ['on', 'wednesday', ',', 'associ', 'comput', 'machineri', ',', 'world', '’', 'largest', 'societi', 'comput', 'profession', ',', 'announc', 'hinton', ',', 'lecun', 'bengio', 'year', '’', 'ture', 'award', 'work', 'neural', 'network', '.', 'the', 'ture', 'award', ',', 'introduc', '1966', ',', 'often', 'call', 'nobel', 'prize', 'comput', ',', 'includ', '$', '1', 'million', 'prize', ',', 'three', 'scientist', 'share', '.']
