## Introduction to Natural Language Processing

In [102]:
!pip3 install nltk

Collecting nltk
Collecting six (from nltk)
  Using cached https://files.pythonhosted.org/packages/65/eb/1f97cb97bfc2390a276969c6fae16075da282f5058082d4cb10c6c5c1dba/six-1.14.0-py2.py3-none-any.whl
Installing collected packages: six, nltk
Successfully installed nltk-3.4.5 six-1.14.0


In [103]:
import nltk

In [104]:
# Fetch the Brown corpus data; reports "already up-to-date" when it is present locally.
nltk.download('brown')

[nltk_data] Downloading package brown to /home/amit/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [105]:
# Corpus: a large collection of text (here, the Brown corpus).

In [106]:
from nltk.corpus import brown

In [107]:
# Look the category names up once, then show the list and its size.
brown_categories = brown.categories()
print(brown_categories)
print(len(brown_categories))

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
15


In [108]:
# Sentences from the "adventure" category of the Brown corpus.
data = brown.sents(categories="adventure")
# Join the tokens of the third sentence with spaces to display it as one string.
" ".join(data[2])

"He certainly didn't want a wife who was fickle as Ann ."

## Bag of Words Pipeline
* Get the data
* Tokenization, stopword removal
* Stemming, Lemmatization
* Building a vocabulary
* Vectorization
* Classification

In [109]:
# Stemming: reduces a word to its base (stem) form, e.g. "running", "runs" -> "run"

### Tokenization & Stopword Removal

In [110]:
from nltk.tokenize import sent_tokenize,word_tokenize
# Presumably the tokenizer data needed by sent_tokenize / word_tokenize below.
# NOTE(review): newer NLTK releases appear to expect the 'punkt_tab' resource
# instead of 'punkt' - confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/amit/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [111]:
# Three-sentence sample paragraph used for the tokenization examples below.
document = """Computers are used as control systems for a wide variety of industrial and consumer devices.
This includes simple special purpose devices like microwave ovens and remote controls, factory devices such as industrial robots and computer-aided design, and also general purpose devices like personal computers and mobile devices such as smartphones.
The Internet is run on computers and it connects hundreds of millions of other computers and their users."""

In [112]:
# Split the paragraph into sentences, then show them followed by their count.
sents = sent_tokenize(document)
print(sents, len(sents), sep="\n")

['Computers are used as control systems for a wide variety of industrial and consumer devices.', 'This includes simple special purpose devices like microwave ovens and remote controls, factory devices such as industrial robots and computer-aided design, and also general purpose devices like personal computers and mobile devices such as smartphones.', 'The Internet is run on computers and it connects hundreds of millions of other computers and their users.']
3


In [113]:
# Naive whitespace split: punctuation stays glued to the word (the last token is 'devices.').
sents[0].split()

['Computers',
 'are',
 'used',
 'as',
 'control',
 'systems',
 'for',
 'a',
 'wide',
 'variety',
 'of',
 'industrial',
 'and',
 'consumer',
 'devices.']

In [114]:
# word_tokenize emits punctuation (',' and '.') as separate tokens,
# while keeping the hyphenated 'computer-aided' as a single token.
words = word_tokenize(sents[1])
print(words)

['This', 'includes', 'simple', 'special', 'purpose', 'devices', 'like', 'microwave', 'ovens', 'and', 'remote', 'controls', ',', 'factory', 'devices', 'such', 'as', 'industrial', 'robots', 'and', 'computer-aided', 'design', ',', 'and', 'also', 'general', 'purpose', 'devices', 'like', 'personal', 'computers', 'and', 'mobile', 'devices', 'such', 'as', 'smartphones', '.']


### Stopwords

In [115]:
from nltk.corpus import stopwords

In [116]:
# Fetch the stopword lists read by stopwords.words(...) below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/amit/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [117]:
# English stopwords, held in a set so `w not in sw` membership tests are cheap.
sw = set(stopwords.words('english'))

In [118]:
# Inspect the stopword set; the entries shown are all lowercase.
print(sw)

{'isn', 'whom', 'he', 'because', 'needn', 'she', 'itself', 'i', 'haven', 'then', 'being', 'between', "hadn't", 'now', 'too', 'off', 'this', 'during', 'some', 'have', 'only', 'why', 'them', 'don', 'they', 'what', 'down', "don't", 'his', 'here', 'wasn', 'before', 'ourselves', 'but', 'nor', 'm', 'where', 'or', 'few', 've', 'from', "you're", 'the', "you'll", 'd', 'own', "didn't", 'over', 'hers', "you'd", 'under', 'these', 'hadn', 'are', "mightn't", "you've", 'same', 'which', 'doesn', 'each', 'it', 'will', "should've", 'in', 'if', 'y', 'as', 'him', "won't", 'both', 'on', 'at', 'after', 'again', 'about', 'for', 'while', 'having', 'who', 'just', 'an', 'weren', 'all', 'into', "shouldn't", 'do', "hasn't", 'shan', 'myself', 'a', 'when', 'by', 't', 'through', 'most', "mustn't", 'their', "wasn't", 'once', 'mightn', 'has', 'those', 'up', 'out', "shan't", 'than', 'against', 'to', 'yourselves', 'be', 'we', 'such', 'won', 'yourself', 'ours', 'you', 'wouldn', 's', 'there', 'didn', 'our', "isn't", 'was'

In [119]:
def remove_stopwords(text, stopwords, ignore_case=False):
    """Return the tokens of ``text`` that are not stopwords.

    Args:
        text: Iterable of word tokens (e.g. the result of ``str.split()``).
        stopwords: Container of stopwords to drop. Note that this parameter
            shadows any module-level ``stopwords`` name inside the function.
        ignore_case: When True, each token is lowercased before the lookup,
            so capitalised tokens such as "I" or "The" match a lowercase
            stopword list. Defaults to False, which keeps the original
            exact-match (case-sensitive) behaviour.

    Returns:
        A new list of the surviving tokens, in their original order and
        with their original casing.
    """
    if ignore_case:
        # Only the lookup is case-folded; the token itself is returned untouched.
        return [w for w in text if w.lower() not in stopwords]
    return [w for w in text if w not in stopwords]

In [120]:
# Note two effects visible in the result:
#  * "I" survives, because the lookup is case-sensitive and the list holds lowercase 'i';
#  * "not" is dropped as a stopword, so the negation is lost.
text = "I am not going to win the game at a very rapid pace".split()
useful_text = remove_stopwords(text,sw)
print(useful_text)

['I', 'going', 'win', 'game', 'rapid', 'pace']


### Tokenization using Regular Expression

In [121]:
# Sample sentence mixing words, numbers, commas and an e-mail address.
sentence = "Send all the 50 documents related to chapters 1,2,3 at prateek@cb.com"

In [122]:
from nltk.tokenize import RegexpTokenizer

In [123]:
# The pattern matches runs of letters, '@' and '.', so digits and commas are
# dropped while the e-mail address stays in one piece.
# (regexpal.com is handy for trying out regular expressions.)
tokenizer = RegexpTokenizer('[a-zA-Z@.]+')
useful_text = tokenizer.tokenize(sentence)

In [124]:
# Show the tokens produced by the regular-expression tokenizer.
useful_text

['Send',
 'all',
 'the',
 'documents',
 'related',
 'to',
 'chapters',
 'at',
 'prateek@cb.com']

### Stemming
* Reduces words to their root (stem) form
* E.g. plays, playing, play, played → play

In [125]:
# Sample text for the stemming section.
# NOTE: this rebinds `text`, which an earlier cell bound to a list of tokens.
text= """Foxes love to make jumps.The quick brown fox was seen jumping over the 
        lovely dog from a 6ft feet high wall"""

Three of the stemmers provided by NLTK:
* Snowball Stemmer
* Porter Stemmer
* Lancaster Stemmer

In [126]:
from nltk.stem.snowball import SnowballStemmer, PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

In [127]:
# Porter stemmer instance (the PorterStemmer name imported from nltk.stem.snowball above).
ps = PorterStemmer()

In [128]:
# The "-ing" suffix is stripped: "jumping" -> "jump".
ps.stem("jumping")

'jump'

In [129]:
# A past-tense form is reduced as well: "saved" -> "save".
ps.stem("saved")

'save'

In [130]:
# Snowball is the multilingual stemmer; the language is chosen at construction ("english" here).
ss = SnowballStemmer("english")

In [131]:
# Snowball reduces "typing" to "type".
ss.stem("typing")

'type'

### Lemmatization

In [132]:
from nltk.stem import WordNetLemmatizer

In [133]:
# Fetch the WordNet data; presumably required by WordNetLemmatizer below.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/amit/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [137]:
wn = WordNetLemmatizer()
# The word comes back unchanged ('seeing').
# NOTE(review): lemmatize() presumably treats the word as a noun when no
# part of speech is given; passing pos='v' should yield 'see' - confirm
# against the NLTK documentation.
wn.lemmatize('seeing')

'seeing'