# Natural Language Processing - Basics

#### Import resources

In [1]:
import nltk
import urllib.request
from bs4 import BeautifulSoup

#### Import an HTML web page

In [2]:
# Download the raw HTML of the Wikipedia article on natural language processing.
# The context manager closes the HTTP connection as soon as the body has been
# read, instead of leaving the response object open.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/Natural_language_processing') as response:
    html = response.read()

#### Use BeautifulSoup to strip the HTML tags from the web page text

In [3]:
# Parse the downloaded page and keep only its text content.
b_soup = BeautifulSoup(html, 'html5lib')
# Join the text fragments with a single space. With strip=True and the default
# empty separator, text coming from adjacent tags is glued together
# ("wordAwordB"), which would corrupt the whitespace-based tokenization
# performed on `text` later in the notebook.
text = b_soup.get_text(separator=' ', strip=True)

#### Now that we have clean text from the crawled web page, let’s convert it into tokens

Tokenization example

In [4]:
# Run the same sample text through NLTK's sentence tokenizer and then its word
# tokenizer, printing each result with a label.
text_2 = "Good afternoon Mr. Doug, how are you? I hope everything is going well. I will call you soon, best regards."
for label, tokenize in (
    ("sentence =", nltk.tokenize.sent_tokenize),
    ("word =", nltk.tokenize.word_tokenize),
):
    print(label, tokenize(text_2))

sentence = ['Good afternoon Mr. Doug, how are you?', 'I hope everything is going well.', 'I will call you soon, best regards.']
word = ['Good', 'afternoon', 'Mr.', 'Doug', ',', 'how', 'are', 'you', '?', 'I', 'hope', 'everything', 'is', 'going', 'well', '.', 'I', 'will', 'call', 'you', 'soon', ',', 'best', 'regards', '.']


Other languages

In [5]:
# Sentence tokenization of a French text: the language is passed explicitly so
# that the French sentence model is used.
text_fr = "Bonjour M. Doug, comment allez-vous? J'espère que tout va bien. Je vous contacterai prochainement, cordialement."
french_sentences = nltk.tokenize.sent_tokenize(text_fr, language="french")
print("Fr =", french_sentences)

Fr = ['Bonjour M. Doug, comment allez-vous?', "J'espère que tout va bien.", 'Je vous contacterai prochainement, cordialement.']


#### Back to our main example

In [6]:
# Split the page text on runs of whitespace; str.split() already returns a list.
tokens = text.split()

#### Word frequencies

In [7]:
# Remove English stop words from the page tokens, then count what is left.
sr = nltk.corpus.stopwords.words('english')
# Build the lookup set once. Previously the stop-word list was fetched again
# for every token and matches were deleted with list.remove(), which made the
# filtering quadratic in the number of tokens.
stop_set = set(sr)
# Order and duplicates of the surviving tokens are preserved; the comparison
# stays case-sensitive (e.g. a capitalised "The" is not in the lower-case list).
clean_tokens = [token for token in tokens if token not in stop_set]

# Frequency distribution of the remaining tokens.
freq = nltk.FreqDist(clean_tokens)

In [8]:
# Plot the 25 most frequent remaining tokens. The title lets the figure stand
# on its own; the trailing ';' keeps the returned object from being echoed as
# cell output.
freq.plot(25, title="25 most frequent tokens (stop words removed)");

<Figure size 640x480 with 1 Axes>

### Stop words

Text may contain stop words like ‘the’, ‘is’, ‘are’. Stop words can be filtered from the text to be processed. 

There is no universal list of stop words in NLP research; however, the NLTK module ships with its own list of stop words.

In [9]:
from nltk.corpus import stopwords

In [10]:
# Sample text used to demonstrate stop-word filtering.
data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."
# A set makes the membership tests in the filtering step cheap.
stopWords = set(stopwords.words('english'))
# Word-level tokens of the sample text, shown before any filtering.
words = nltk.tokenize.word_tokenize(data)
# Container for the tokens that survive the stop-word filter.
wordsFiltered = list()
print(words)

['All', 'work', 'and', 'no', 'play', 'makes', 'jack', 'dull', 'boy', '.', 'All', 'work', 'and', 'no', 'play', 'makes', 'jack', 'a', 'dull', 'boy', '.']


In [11]:
# Keep only the tokens that are not stop words. The list is rebuilt here rather
# than appended to a list created in another cell, so this cell is idempotent:
# re-running it no longer duplicates the results.
wordsFiltered = [w for w in words if w not in stopWords]

print(wordsFiltered)

['All', 'work', 'play', 'makes', 'jack', 'dull', 'boy', '.', 'All', 'work', 'play', 'makes', 'jack', 'dull', 'boy', '.']


In [12]:
# Show the complete stop-word collection used for the filtering.
# stopWords is a set, so the printed order is not alphabetical.
print(stopWords)

{'where', 'before', 'because', 'doesn', 'off', 'most', 'am', 'they', 'with', 've', 'theirs', 'y', 'above', 'isn', 'his', 'mightn', 'himself', "weren't", 'hers', 'it', "couldn't", 'do', 'over', 'aren', 'themselves', 'so', 'same', 'yourselves', 'few', 'about', 're', 'shan', 'again', "she's", 'weren', 'won', 'that', 'doing', 'has', 'from', "mightn't", 'as', "that'll", 'how', 'nor', "hadn't", 'once', "mustn't", 'been', 'ours', 'the', 'or', 'while', 'by', "didn't", 'between', 'down', 'my', 'have', "you've", 'these', 's', 'm', 'into', "don't", "shan't", "you'd", 'couldn', "isn't", 'whom', 'this', 'below', 'them', "doesn't", 'didn', 'our', 'wasn', 'wouldn', 't', 'its', 'can', 'those', 'be', 'a', 'don', "hasn't", 'other', "it's", 'did', 'very', 'when', 'will', 'through', 'during', 'under', 'needn', "won't", 'd', 'o', 'then', 'of', 'both', 'after', 'are', 'ain', 'should', 'at', 'he', 'yours', 'in', 'further', 'any', 'out', "you'll", 'shouldn', 'myself', 'was', 'for', 'their', "aren't", "wouldn'

### Stemming

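A word stem is the base part of a word that remains once its inflectional endings are removed. Stemming is a form of normalization based on linguistic rules rather than on plain string operations. <br>
For example: a stemming algorithm reduces the words fishing, fished, and fisher to the stem fish.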

In [13]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

In [14]:
# Inflected forms of the same word, used as input for the stemmer below.
words = [
    "process",
    "processing",
    "processed",
    "processes",
]

In [15]:
# Stem every word with the Porter stemmer and print one stem per line.
ps = PorterStemmer()
for stem in map(ps.stem, words):
    print(stem)

process
process
process
process


### Part-of-speech tagging

In [16]:
from nltk.tokenize import PunktSentenceTokenizer

In [17]:
# Split the sample text into sentences, then print the (token, tag) pairs
# produced by the part-of-speech tagger for each sentence.
document = "Whether you're new to programming or an experienced developer, it's easy to learn and use Python."
sentences = nltk.sent_tokenize(document)
for sent in sentences:
    sent_tokens = nltk.word_tokenize(sent)
    print(nltk.pos_tag(sent_tokens))

[('Whether', 'IN'), ('you', 'PRP'), ("'re", 'VBP'), ('new', 'JJ'), ('to', 'TO'), ('programming', 'VBG'), ('or', 'CC'), ('an', 'DT'), ('experienced', 'JJ'), ('developer', 'NN'), (',', ','), ('it', 'PRP'), ("'s", 'VBZ'), ('easy', 'JJ'), ('to', 'TO'), ('learn', 'VB'), ('and', 'CC'), ('use', 'VB'), ('Python', 'NNP'), ('.', '.')]


Here are the meanings of these part-of-speech tags: <br>
<img src="images/codes.png" width="800">

Thus, you can filter the data based on the type of word you want.

In [18]:
# Gather the (token, tag) pairs of every sentence into one flat list.
# extend() appends in place; the previous `data = data + ...` form copied the
# whole list on every iteration.
data = []
for sent in sentences:
    data.extend(nltk.pos_tag(nltk.word_tokenize(sent)))

# Print every pair whose tag contains 'JJ' (the tags of 'new', 'experienced'
# and 'easy' in the sample sentence).
for token, tag in data:
    if 'JJ' in tag:
        print((token, tag))

('new', 'JJ')
('experienced', 'JJ')
('easy', 'JJ')
