## Introduction to Natural Language Processing

In [102]:
!pip3 install nltk

Collecting nltk
Collecting six (from nltk)
  Using cached https://files.pythonhosted.org/packages/65/eb/1f97cb97bfc2390a276969c6fae16075da282f5058082d4cb10c6c5c1dba/six-1.14.0-py2.py3-none-any.whl
Installing collected packages: six, nltk
Successfully installed nltk-3.4.5 six-1.14.0


In [157]:
import nltk

In [158]:
nltk.download('brown')

[nltk_data] Downloading package brown to /home/amit/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [159]:
# Corpus -  large collection of text

In [160]:
from nltk.corpus import brown

In [161]:
print(brown.categories())
print(len(brown.categories()))

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
15


In [162]:
data = brown.sents(categories="adventure")
" ".join(data[2])

"He certainly didn't want a wife who was fickle as Ann ."

## Bag of Words Pipeline
* Get the data
* Tokenization, stopword removal
* Stemming, Lemmatization
* Building a vocabulary
* Vectorization
* Classification

In [163]:
# Stemming - converts a word into base word. Eg. running,runs to run

### Tokenization & Stopword Removal

In [164]:
from nltk.tokenize import sent_tokenize,word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/amit/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [165]:
document = """Computers are used as control systems for a wide variety of industrial and consumer devices.
This includes simple special purpose devices like microwave ovens and remote controls, factory devices such as industrial robots and computer-aided design, and also general purpose devices like personal computers and mobile devices such as smartphones.
The Internet is run on computers and it connects hundreds of millions of other computers and their users."""

In [166]:
sents = sent_tokenize(document)
print(sents)
print(len(sents))

['Computers are used as control systems for a wide variety of industrial and consumer devices.', 'This includes simple special purpose devices like microwave ovens and remote controls, factory devices such as industrial robots and computer-aided design, and also general purpose devices like personal computers and mobile devices such as smartphones.', 'The Internet is run on computers and it connects hundreds of millions of other computers and their users.']
3


In [167]:
sents[0].split()

['Computers',
 'are',
 'used',
 'as',
 'control',
 'systems',
 'for',
 'a',
 'wide',
 'variety',
 'of',
 'industrial',
 'and',
 'consumer',
 'devices.']

In [168]:
words = word_tokenize(sents[1])
print(words)

['This', 'includes', 'simple', 'special', 'purpose', 'devices', 'like', 'microwave', 'ovens', 'and', 'remote', 'controls', ',', 'factory', 'devices', 'such', 'as', 'industrial', 'robots', 'and', 'computer-aided', 'design', ',', 'and', 'also', 'general', 'purpose', 'devices', 'like', 'personal', 'computers', 'and', 'mobile', 'devices', 'such', 'as', 'smartphones', '.']


### Stopwords

In [169]:
from nltk.corpus import stopwords

In [170]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/amit/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [171]:
sw = set(stopwords.words('english'))

In [172]:
print(sw)

{'any', 'both', "you're", 'off', 'each', 'is', 'have', 'just', 'i', "wouldn't", 'too', 'few', "won't", 'when', 'mightn', 'm', "she's", 'his', "mightn't", "mustn't", 'hers', 'because', 'itself', 'by', 'nor', 'very', 'yourself', 'more', 'after', 'until', 'haven', 'its', "shan't", "hasn't", 'which', 'then', 'does', 'now', 'were', 'being', "should've", "weren't", 'an', 'our', 'further', 'had', 'shan', 'me', 'will', 'himself', "don't", 'it', 'where', 'over', 'other', 'here', 'than', 'd', "hadn't", "isn't", "needn't", 'for', 'below', 'against', 'so', 'into', 't', "aren't", 'at', 'are', 'can', 'theirs', 'only', 'been', 'don', 'before', 'won', 'yourselves', 'their', 'those', 'myself', 'such', 'has', 'through', 'you', 'did', 'from', 'this', 'your', 'between', 'themselves', 'if', "shouldn't", "doesn't", 'we', 'do', 'mustn', 'ours', 'with', 'same', 'am', 'most', 'again', "you'll", 'on', 'about', 'during', 'she', 'wouldn', 'and', 'couldn', 'll', "haven't", 'to', 'having', 'doesn', 'hasn', 'some', 

In [173]:
def remove_stopwords(text,stopwords):
    useful_words = [w for w in text if w not in stopwords]
    return useful_words

In [174]:
text = "I am not going to win the game at a very rapid pace".split()
useful_text = remove_stopwords(text,sw)
print(useful_text)

['I', 'going', 'win', 'game', 'rapid', 'pace']


### Tokenization using Regular Expression

In [175]:
sentence = "Send all the 50 documents related to chapters 1,2,3 at prateek@cb.com"

In [176]:
from nltk.tokenize import RegexpTokenizer

In [177]:
# regexpal.com refer for Regular Expression
tokenizer = RegexpTokenizer('[a-zA-Z@.]+')
useful_text = tokenizer.tokenize(sentence)

In [178]:
useful_text

['Send',
 'all',
 'the',
 'documents',
 'related',
 'to',
 'chapters',
 'at',
 'prateek@cb.com']

### Stemming
* Transforms words to their radical forms
* Eg : plays, playing, play, played to play

In [179]:
text= """Foxes love to make jumps.The quick brown fox was seen jumping over the 
        lovely dog from a 6ft feet high wall"""

3 types of stemmer are provided by NLTK
* Snowball Stemmer
* Porter Stemmer
* Lancaster Stemmer

In [180]:
from nltk.stem.snowball import SnowballStemmer, PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

In [181]:
ps = PorterStemmer()

In [182]:
ps.stem("jumping")

'jump'

In [183]:
ps.stem("saved")

'save'

In [184]:
# Multilingual Stemmer
ss = SnowballStemmer("english")

In [185]:
ss.stem("typing")

'type'

### Lemmatization

In [186]:
from nltk.stem import WordNetLemmatizer

In [187]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/amit/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [188]:
wn = WordNetLemmatizer()
wn.lemmatize('found','v') # second parameter is the type of word, here it is verb (v)

'find'

### Building a vocab & Vectorization

In [189]:
corpus = [
        'Indian cricket team will wins World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.',
        'We will win next Lok Sabha Elections, says confident Indian PM',
        'The nobel laurate won the hearts of the people.',
        'The movie Raazi is an exciting Indian Spy thriller based upon a real story.'
]

In [190]:
from sklearn.feature_extraction.text import CountVectorizer

In [191]:
cv = CountVectorizer()

In [192]:
vectorized_corpus = cv.fit_transform(corpus)

In [193]:
vectorized_corpus = vectorized_corpus.toarray()
print(vectorized_corpus[0])
print(len(vectorized_corpus[0]))

[0 1 0 1 1 0 1 2 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0
 2 0 1 0 2]
42


In [194]:
print(cv.vocabulary_)
print(len(cv.vocabulary_.keys()))

{'indian': 12, 'cricket': 6, 'team': 31, 'will': 37, 'wins': 39, 'world': 41, 'cup': 7, 'says': 27, 'capt': 4, 'virat': 35, 'kohli': 14, 'be': 3, 'held': 11, 'at': 1, 'sri': 29, 'lanka': 15, 'we': 36, 'win': 38, 'next': 19, 'lok': 17, 'sabha': 26, 'elections': 8, 'confident': 5, 'pm': 23, 'the': 32, 'nobel': 20, 'laurate': 16, 'won': 40, 'hearts': 10, 'of': 21, 'people': 22, 'movie': 18, 'raazi': 24, 'is': 13, 'an': 0, 'exciting': 9, 'spy': 28, 'thriller': 33, 'based': 2, 'upon': 34, 'real': 25, 'story': 30}
42


In [195]:
# Reverse Mapping
numbers = vectorized_corpus[1]
print(numbers)
cv.inverse_transform(numbers)

[0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 1
 1 1 0 0 0]


[array(['confident', 'elections', 'indian', 'lok', 'next', 'pm', 'sabha',
        'says', 'we', 'will', 'win'], dtype='<U9')]

### Vectorization with stopword removal

In [196]:
def myTokenizer(document):
    words = tokenizer.tokenize(document.lower()) # convert to lower case
    words = remove_stopwords(words,sw) # Remove stopwords
    return words

In [197]:
cv = CountVectorizer(tokenizer=myTokenizer)
vectorized_corpus = cv.fit_transform(corpus).toarray()

In [198]:
print(vectorized_corpus)
print(len(vectorized_corpus[0]))

[[0 1 0 1 2 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 2]
 [0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 1 0 1 1 0 0 0 0]]
33


In [199]:
cv.inverse_transform(vectorized_corpus)

[array(['capt.', 'cricket', 'cup', 'held', 'indian', 'kohli.', 'lanka.',
        'says', 'sri', 'team', 'virat', 'wins', 'world'], dtype='<U9'),
 array(['confident', 'elections', 'indian', 'lok', 'next', 'pm', 'sabha',
        'says', 'win'], dtype='<U9'),
 array(['hearts', 'laurate', 'nobel', 'people.'], dtype='<U9'),
 array(['based', 'exciting', 'indian', 'movie', 'raazi', 'real', 'spy',
        'story.', 'thriller', 'upon'], dtype='<U9')]

In [200]:
# Test Data
test_corpus = ["India will win the match"]
cv.transform(test_corpus).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]])

### More ways to create Features
- Unigram -  every word as a feature
- Bigrams
- Trigrams
- n-grams
- TF-IDF Normalisation

In [201]:
sent_1 = ["this is good movie"]
sent_2 = ["this is good movie but actor is not present"]
sent_3 = ["this is not good movie"]

In [211]:
cv = CountVectorizer(ngram_range=(1,3))

In [212]:
docs = [sent_1[0],sent_2[0]]
cv.fit_transform(docs).toarray()

array([[0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
        1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1]])

In [213]:
cv.vocabulary_

{'actor': 0,
 'actor is': 1,
 'actor is not': 2,
 'but': 3,
 'but actor': 4,
 'but actor is': 5,
 'good': 6,
 'good movie': 7,
 'good movie but': 8,
 'is': 9,
 'is good': 10,
 'is good movie': 11,
 'is not': 12,
 'is not present': 13,
 'movie': 14,
 'movie but': 15,
 'movie but actor': 16,
 'not': 17,
 'not present': 18,
 'present': 19,
 'this': 20,
 'this is': 21,
 'this is good': 22}

### TF-IDF Normalisation
- Term Frequency - Inverse Document Frequency
- Avoid features that occur very often, becauase they contain less information
- Information decreases as the number of occurences increases across different type of documents
- So we define another term - term-document-frequency which associates a weight with every term

In [214]:
sent_1  = "this is good movie"
sent_2 = "this was good movie"
sent_3 = "this is not good movie"
corpus = [sent_1,sent_2,sent_3]

In [217]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [218]:
tfidf = TfidfVectorizer()

In [222]:
vc = tfidf.fit_transform(corpus).toarray()
print(vc)
tfidf.vocabulary_

[[0.46333427 0.59662724 0.46333427 0.         0.46333427 0.        ]
 [0.41285857 0.         0.41285857 0.         0.41285857 0.69903033]
 [0.3645444  0.46941728 0.3645444  0.61722732 0.3645444  0.        ]]


{'good': 0, 'is': 1, 'movie': 2, 'not': 3, 'this': 4, 'was': 5}