# **Introduction to Natural Language Processing**

In [1]:
import nltk

# 1. Get the Data
* Get the Data from NLTK Corpora
* or Scrape the Data/ Use API

In [5]:
nltk.download('brown')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

In [6]:
# Corpus - A large collection of text

from nltk.corpus import brown

In [7]:
print(brown.categories())
print(len(brown.categories()))

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
15


In [8]:
data = brown.sents(categories='fiction')

In [9]:
#1 sentence in fiction category
' '.join(data[1])

'Scotty did not go back to school .'

# **Bag of Words Pipeline**
* Get the Data/Corpus
* Tokenisation, Stopward Removal
* Stemming
* Building a Vocab
* Vectorization
* Classification

# Tokenisation & Stopword Removal

In [10]:
document = """It was a very pleasant day. The weather was cool and there were light showers. 
I went to the market to buy some fruits."""

sentence = "Send all the 50 documents related to chapters 1,2,3 at prateek@cb.com"

In [11]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [14]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [15]:
sents = sent_tokenize(document)
print(sents)
print(len(sents))

['It was a very pleasant day.', 'The weather was cool and there were light showers.', 'I went to the market to buy some fruits.']
3


In [16]:
sents[0]

'It was a very pleasant day.'

In [17]:
sentence.split()

['Send',
 'all',
 'the',
 '50',
 'documents',
 'related',
 'to',
 'chapters',
 '1,2,3',
 'at',
 'prateek@cb.com']

In [18]:
words = word_tokenize(sentence)

In [19]:
words

['Send',
 'all',
 'the',
 '50',
 'documents',
 'related',
 'to',
 'chapters',
 '1,2,3',
 'at',
 'prateek',
 '@',
 'cb.com']

# Stopwords

In [21]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [22]:
from nltk.corpus import stopwords

sw = set(stopwords.words('english'))

In [23]:
print(sw)

{'me', 's', 'so', 'them', 'yourself', 'it', 'couldn', 'his', 'how', 'then', 'has', 'again', 'a', "shan't", 'into', 'will', 'most', "weren't", 'those', 'ours', "won't", 'from', 'isn', 'these', 'hers', "wouldn't", 'mightn', 'her', 'hasn', "hasn't", 'own', 'being', 'before', 'that', 'shan', 'some', 'don', 'few', 'just', "it's", "you're", 'below', 'than', "you'll", 'himself', 'herself', 'for', 'shouldn', 'between', 'themselves', 'y', "that'll", "you'd", 'too', 'after', 'we', 'doesn', 'with', 'yourselves', 'to', "mustn't", 'through', 'was', "she's", 'myself', 'which', 'your', 'yours', 'there', "don't", 'because', 'him', 'itself', 'more', 'mustn', 'having', 'as', 'have', 'down', "wasn't", 'both', "you've", 'who', 'hadn', "shouldn't", 'won', 'same', 'its', 'once', 'ourselves', 'by', 've', 'ma', 'until', 'over', 'whom', 'this', "aren't", 'at', 'any', 'other', 'while', 'what', 'been', 'above', 'in', 'is', 'll', 'but', 'weren', 'are', 'does', 'ain', 'she', 'an', 'the', 'against', 'do', 'our', 'w

In [24]:
def remove_stopwords(text,stopwords):
    useful_words = [w for w in text if w not in stopwords]
    return useful_words

In [25]:
text = "i am not bothered about her very much".split()
useful_text = remove_stopwords(text,sw)
print(useful_text)

['bothered', 'much']


# Tokenization using Regular Expression

In [26]:
sentence = "Send all the 50 documents related to chapters 1,2,3 at prateek@cb.com"

In [27]:
from nltk.tokenize import RegexpTokenizer

In [28]:
tokenizer = RegexpTokenizer('[a-zA-Z@.]+') # Write expression here.
useful_text = tokenizer.tokenize(sentence)

In [29]:
useful_text

['Send',
 'all',
 'the',
 'documents',
 'related',
 'to',
 'chapters',
 'at',
 'prateek@cb.com']

# Stemming
* Process that transforms particular words(verbs,plurals)into their radical form
* Preserve the semantics of the sentence without increasing the number of unique tokens
* Example - jumps, jumping, jumped, jump ==> jump

In [30]:
text= """Foxes love to make jumps.The quick brown fox was seen jumping over the 
        lovely dog from a 6ft feet high wall"""

In [31]:
from nltk.stem.snowball import SnowballStemmer, PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
#Snowball Stemmer, Porter, Lancaster Stemmer

In [32]:
ps = PorterStemmer()

In [33]:
ps.stem('jumping')

'jump'

In [34]:
ps.stem('lovely')

'love'

In [35]:
# Snowball Stemmer
ss = SnowballStemmer('english') # it is available for various languages

In [36]:
ss.stem('lovely')

'love'

In [37]:
ss.stem('jumping')

'jump'

In [39]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [40]:
## Lemmatization
from nltk.stem import WordNetLemmatizer

wn = WordNetLemmatizer()
wn.lemmatize('jumping')

'jumping'

# Building a Vocab & Vectorization

In [41]:
# Sample Corpus - Contains 4 Documents, each document can have 1 or more sentences
corpus = [
        'Indian cricket team will wins World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.',
        'We will win next Lok Sabha Elections, says confident Indian PM',
        'The nobel laurate won the hearts of the people.',
        'The movie Raazi is an exciting Indian Spy thriller based upon a real story.'
]

In [42]:
from sklearn.feature_extraction.text import CountVectorizer

In [43]:
cv = CountVectorizer()

In [44]:
vectorized_corpus = cv.fit_transform(corpus)

In [45]:
vectorized_corpus = vectorized_corpus.toarray()

In [46]:
len(vectorized_corpus[0])
print(vectorized_corpus[0])

[0 1 0 1 1 0 1 2 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0
 2 0 1 0 2]


In [47]:
print(cv.vocabulary_)

{'indian': 12, 'cricket': 6, 'team': 31, 'will': 37, 'wins': 39, 'world': 41, 'cup': 7, 'says': 27, 'capt': 4, 'virat': 35, 'kohli': 14, 'be': 3, 'held': 11, 'at': 1, 'sri': 29, 'lanka': 15, 'we': 36, 'win': 38, 'next': 19, 'lok': 17, 'sabha': 26, 'elections': 8, 'confident': 5, 'pm': 23, 'the': 32, 'nobel': 20, 'laurate': 16, 'won': 40, 'hearts': 10, 'of': 21, 'people': 22, 'movie': 18, 'raazi': 24, 'is': 13, 'an': 0, 'exciting': 9, 'spy': 28, 'thriller': 33, 'based': 2, 'upon': 34, 'real': 25, 'story': 30}


In [48]:
len(cv.vocabulary_.keys())

42

In [49]:
# Reverse Mapping!
numbers = vectorized_corpus[2]
numbers

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0])

In [50]:
s = cv.inverse_transform(numbers)
print(s)

[array(['hearts', 'laurate', 'nobel', 'of', 'people', 'the', 'won'],
      dtype='<U9')]


**Vectorization with Stopword Removal**

In [51]:
def myTokenizer(document):
    words = tokenizer.tokenize(document.lower())
    # Remove Stopwords
    words = remove_stopwords(words,sw)
    return words

In [52]:
cv = CountVectorizer(tokenizer=myTokenizer) # we can pass our tokenizer also

In [53]:
vectorized_corpus = cv.fit_transform(corpus).toarray()

In [54]:
print(vectorized_corpus)

[[0 1 0 1 2 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 2]
 [0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 1 0 1 1 0 0 0 0]]


In [55]:
print(len(vectorized_corpus[0]))

33


In [56]:
cv.inverse_transform(vectorized_corpus)

[array(['capt.', 'cricket', 'cup', 'held', 'indian', 'kohli.', 'lanka.',
        'says', 'sri', 'team', 'virat', 'wins', 'world'], dtype='<U9'),
 array(['confident', 'elections', 'indian', 'lok', 'next', 'pm', 'sabha',
        'says', 'win'], dtype='<U9'),
 array(['hearts', 'laurate', 'nobel', 'people.'], dtype='<U9'),
 array(['based', 'exciting', 'indian', 'movie', 'raazi', 'real', 'spy',
        'story.', 'thriller', 'upon'], dtype='<U9')]

In [57]:
# For Test Data
test_corpus = [
        'Indian cricket rock !',        
]

In [58]:

cv.transform(test_corpus).toarray()

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

# More ways to Create Features
* Unigram - every word as a feature
* Bigrams
* Trigrams
* n-grams
* TF-IDF Normalisation

In [59]:
sent_1  = ["this is good movie"]
sent_2 = ["this is good movie but actor is not present"]
sent_3 = ["this is not good movie"]

In [60]:
cv = CountVectorizer(ngram_range=(1,3))

In [61]:
docs = [sent_1[0],sent_2[0]]
cv.fit_transform(docs).toarray()

array([[0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
        1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1]])

In [62]:
cv.vocabulary_

{'actor': 0,
 'actor is': 1,
 'actor is not': 2,
 'but': 3,
 'but actor': 4,
 'but actor is': 5,
 'good': 6,
 'good movie': 7,
 'good movie but': 8,
 'is': 9,
 'is good': 10,
 'is good movie': 11,
 'is not': 12,
 'is not present': 13,
 'movie': 14,
 'movie but': 15,
 'movie but actor': 16,
 'not': 17,
 'not present': 18,
 'present': 19,
 'this': 20,
 'this is': 21,
 'this is good': 22}

# Tf-idf Normalisation
* Avoid features that occur very often, becauase they contain less information
* Information decreases as the number of occurences increases across different type of documents
* So we define another term - term-document-frequency which associates a weight with every term

In [63]:
sent_1  = "this is good movie"
sent_2 = "this was good movie"
sent_3 = "this is not good movie"

corpus = [sent_1,sent_2,sent_3]

In [64]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [65]:
tfidf = TfidfVectorizer()

In [66]:
vc = tfidf.fit_transform(corpus).toarray()

In [67]:
print(vc)

[[0.46333427 0.59662724 0.46333427 0.         0.46333427 0.        ]
 [0.41285857 0.         0.41285857 0.         0.41285857 0.69903033]
 [0.3645444  0.46941728 0.3645444  0.61722732 0.3645444  0.        ]]


In [68]:
tfidf.vocabulary_

{'good': 0, 'is': 1, 'movie': 2, 'not': 3, 'this': 4, 'was': 5}