# 01. NLTK - Getting Started with Natural Language

### Natural Language ToolKit

# 02.NLTK - Working with Brown Corpus

### https://www.nltk.org/

## A. Data Collection

In [2]:
from nltk.corpus import brown

In [4]:
brown?

In [6]:
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [7]:
brown.sents?

In [8]:
brown.sents(categories='editorial')

[['Assembly', 'session', 'brought', 'much', 'good'], ['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.'], ...]

In [9]:
brown.sents()

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]

In [10]:
data=brown.sents(categories='editorial')
print(data)

[['Assembly', 'session', 'brought', 'much', 'good'], ['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.'], ...]


In [11]:
data=brown.sents(categories='editorial')[0]
print(data)

['Assembly', 'session', 'brought', 'much', 'good']


In [12]:
data=brown.sents(categories='editorial')[1]
print(data)

['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.']


In [13]:
data=brown.sents(categories='editorial')[:10]
print(data)

[['Assembly', 'session', 'brought', 'much', 'good'], ['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.'], ['It', 'was', 'faced', 'immediately', 'with', 'a', 'showdown', 'on', 'the', 'schools', ',', 'an', 'issue', 'which', 'was', 'met', 'squarely', 'in', 'conjunction', 'with', 'the', 'governor', 'with', 'a', 'decision', 'not', 'to', 'risk', 'abandoning', 'public', 'education', '.'], ['There', 'followed', 'the', 'historic', 'appropriations', 'and', 'budget', 'fight', ',', 'in', 'which', 'the', 'General', 'Assembly', 'decided', 'to', 'tackle', 'executive', 'powers', '.'], ['The', 'final', 'decision', 'went', 'to', 'the', 'executive', 'but', 'a', 'way', 'has', 'been', 'opened', 'for', 'strengthening', 'budgeting', 'procedures', 'and', 'to', 'provide', 'legislators', 'information', 'they', 'need', '.'], ['Long-range', 'planning', 'of', 'programs', 

In [14]:
print(len(data))

10


# 03. NLTK - Tokenization, Stopword Removal

## Basic NLP Pipeline
- Data Collection
- Tokenization, Stopword, Stemming, Lemmatization
- Building a common vocab
- Vectorizing the documents
- Performing Classification/Clustering

## B. Tokenization

In [16]:
text=data[1]
print(text)

['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.']


In [17]:
text='It was a very pleasant day, the weather was cool and there were light showers. I went to market to buy some fruits.'
print(text)

It was a very pleasant day, the weather was cool and there were light showers. I went to market to buy some fruits.


In [18]:
from nltk.tokenize import sent_tokenize

In [19]:
print(sent_tokenize(text))

['It was a very pleasant day, the weather was cool and there were light showers.', 'I went to market to buy some fruits.']


In [20]:
from nltk.tokenize import word_tokenize

In [21]:
print(word_tokenize(text))

['It', 'was', 'a', 'very', 'pleasant', 'day', ',', 'the', 'weather', 'was', 'cool', 'and', 'there', 'were', 'light', 'showers', '.', 'I', 'went', 'to', 'market', 'to', 'buy', 'some', 'fruits', '.']


In [22]:
sents=sent_tokenize(text)
print(sents)

['It was a very pleasant day, the weather was cool and there were light showers.', 'I went to market to buy some fruits.']


In [27]:
word_list=word_tokenize(sents[1])
print(word_list)

['I', 'went', 'to', 'market', 'to', 'buy', 'some', 'fruits', '.']


## Stopword Removal

In [24]:
from nltk.corpus import stopwords

In [25]:
sw=set(stopwords.words('english'))
print(sw)

{'her', 'didn', 'wouldn', "it's", 'before', 'it', 'ourselves', 'yourself', 'over', 'and', 's', 'them', 'me', 'has', 'few', "doesn't", 'those', 'y', 'more', 'he', 'just', 'these', 'were', 'now', "she's", 'above', "shan't", 're', 'the', 'have', 'where', 'nor', 'here', 'isn', 'through', 'to', "needn't", 'once', 'do', 'its', 'll', 'from', 'doesn', 'himself', 'yourselves', 'in', 'd', 'under', 'with', 'too', 'weren', "mightn't", 'myself', 'ours', 'him', "wouldn't", 'but', 'had', "should've", 'no', 'not', 'won', 'are', 'our', 'is', 'yours', 'your', 'each', 'ain', 'does', 'can', "haven't", 'a', 'how', 'or', 'further', 'out', "hadn't", 'so', "you'll", 'be', 'all', 'when', 'shouldn', 'shan', 'other', "shouldn't", 'she', 'my', 'then', "couldn't", 'there', 'herself', "didn't", 'what', 'you', 'down', 'between', 'off', 'should', 'was', 'both', 'about', 'been', 'during', 'we', 'having', 'on', 'because', 'such', 'themselves', 'ma', 'hers', "you're", "won't", 'again', "mustn't", 'whom', 'doing', 'being

In [26]:
print(len(sw))

179


### Filter The Words From Your Sentence

In [28]:
useful_words=[w for w in word_list if w not in sw]
print(useful_words)

['I', 'went', 'market', 'buy', 'fruits', '.']


In [30]:
word_list=word_tokenize(sents[1].lower())
print(word_list)
useful_words=[w for w in word_list if w not in sw]
print(useful_words)

['i', 'went', 'to', 'market', 'to', 'buy', 'some', 'fruits', '.']
['went', 'market', 'buy', 'fruits', '.']


In [31]:
word_list=word_tokenize(sents[0])
print(word_list)
useful_words=[w for w in word_list if w not in sw]
print(useful_words)

['It', 'was', 'a', 'very', 'pleasant', 'day', ',', 'the', 'weather', 'was', 'cool', 'and', 'there', 'were', 'light', 'showers', '.']
['It', 'pleasant', 'day', ',', 'weather', 'cool', 'light', 'showers', '.']


In [32]:
word_list=word_tokenize(sents[0].lower())
print(word_list)
useful_words=[w for w in word_list if w not in sw]
print(useful_words)

['it', 'was', 'a', 'very', 'pleasant', 'day', ',', 'the', 'weather', 'was', 'cool', 'and', 'there', 'were', 'light', 'showers', '.']
['pleasant', 'day', ',', 'weather', 'cool', 'light', 'showers', '.']


# 04. NLTK - Regex Tokenizer

### Tokenizer Using Regular Expression
- Problem With word_tokenize() :
-  Can't handle complex tokenizations!
- So we use a RegexpTokenizer class in NLTK

In [33]:
from nltk.tokenize import RegexpTokenizer 

### For regular expression
https://www.regexpal.com/

In [34]:
text='Send all the 50 documents related to clauses 1,2,3 at abc@xyz.com'
word_list=word_tokenize(text.lower())
print(word_list)

['send', 'all', 'the', '50', 'documents', 'related', 'to', 'clauses', '1,2,3', 'at', 'abc', '@', 'xyz.com']


In [35]:
# Regular expression: '[a-zA-Z@]+'
# Matches one or more (+) consecutive characters that are letters (a-z, A-Z) or '@'.
# Digits, commas, dots and spaces are not in the character class, so they are dropped
# and act as separators: '50' and '1,2,3' disappear, and 'abc@xyz.com' is split
# into 'abc@xyz' and 'com'.
tokenizer=RegexpTokenizer('[a-zA-Z@]+')
text='Send all the 50 documents related to clauses 1,2,3 at abc@xyz.com'
print(tokenizer.tokenize(text))

['Send', 'all', 'the', 'documents', 'related', 'to', 'clauses', 'at', 'abc@xyz', 'com']


# 05. NLTK - Stemming & Lemmatization

### Stemming
- Process that transforms inflected words (verb forms, plurals) into their root (stem) form
- Preserve the semantics of the sentence without increasing the number of unique tokens
- jump, jumped, jumping, jumps ==> jump

### Filter the words from the sentence

In [36]:
def filter_words(word_list, stop_words=None):
    """Return the tokens of ``word_list`` that are not stopwords, order preserved.

    Parameters
    ----------
    word_list : iterable of str
        Tokens to filter. Matching is exact and case-sensitive, so lowercase
        the tokens first when the stopword collection is lowercase.
    stop_words : collection of str, optional
        Words to drop. When omitted, the notebook-level ``sw`` set (built in
        the "Stopword Removal" section) is used, so that cell must have been
        run beforehand.

    Returns
    -------
    list of str
        The tokens that are not in the stopword collection.
    """
    if stop_words is None:
        stop_words = sw  # fall back to the notebook-level stopword set
    return [w for w in word_list if w not in stop_words]
useful_words=filter_words(word_list) # Remove the stopwords
print(useful_words)

['send', '50', 'documents', 'related', 'clauses', '1,2,3', 'abc', '@', 'xyz.com']


In [37]:
text='Foxes love to make jumps. The quick brown fox was seen jumping over the lovely dog from a 6ft high wall'

words_list=tokenizer.tokenize(text.lower())
print(words_list)

['foxes', 'love', 'to', 'make', 'jumps', 'the', 'quick', 'brown', 'fox', 'was', 'seen', 'jumping', 'over', 'the', 'lovely', 'dog', 'from', 'a', 'ft', 'high', 'wall']


In [39]:
word_list=filter_words(words_list)
print(word_list)

['foxes', 'love', 'make', 'jumps', 'quick', 'brown', 'fox', 'seen', 'jumping', 'lovely', 'dog', 'ft', 'high', 'wall']


### Stemming
- 1) Snowball Stemmer (Multilingual)
- 2) PorterStemmer 
- 3) LancasterStemmer

In [40]:
from nltk.stem.snowball import PorterStemmer
ps=PorterStemmer()

In [41]:
ps.stem('jump')

'jump'

In [44]:
ps.stem('jumped')

'jump'

In [45]:
ps.stem('jumping')

'jump'

In [46]:
ps.stem('jumps')

'jump'

In [47]:
ps.stem('jumpedd')

'jumpedd'

In [48]:
ps.stem('lovely')

'love'

In [49]:
ps.stem('awesome')

'awesom'

In [50]:
from nltk.stem.lancaster import LancasterStemmer
ls=LancasterStemmer()
ls.stem('teeth')

'tee'

In [52]:
print(ps.stem('Teenager'))#English
print(ls.stem('Teenager'))#English

teenag
teen


In [55]:
# SnowballStemmer
from nltk.stem.snowball import SnowballStemmer
ss=SnowballStemmer('english')
print(ss.stem('lovely'))
print(ss.stem('teenager'))

love
teenag


In [56]:
ss_french=SnowballStemmer('french')
print(ss_french.stem('courais')) # was running

cour


## Task (Assignment): Write one function which performs all of these steps - Tokenization, Stopword Removal, Stemming, and removal of any leading or trailing whitespace

In [73]:
from nltk.tokenize import RegexpTokenizer 
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
def removal(text):
    """Tokenize ``text``, drop English stopwords and stem the remaining words.

    Steps:
      1. Extract runs of ASCII letters with ``RegexpTokenizer('[a-zA-Z]+')``;
         digits, punctuation and whitespace are discarded at this point.
      2. Lowercase every token.
      3. Remove tokens that appear in NLTK's English stopword list.
      4. Stem the surviving tokens with the English Snowball stemmer.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    list of str
        Lowercase, stemmed tokens in their original order.
    """
    tokenizer=RegexpTokenizer('[a-zA-Z]+') # Tokenization using Regular Expression
    lower_list=[w.lower() for w in tokenizer.tokenize(text)] # Change to Lower case
    sw=set(stopwords.words('english')) # Stopword removal
    ss=SnowballStemmer('english') # Stemming
    # No separate whitespace-stripping pass is needed: every token is made of
    # letters only, so it can never carry leading or trailing whitespace.
    return [ss.stem(w) for w in lower_list if w not in sw]

In [74]:
text="The quick brown fox jumps right over the little jumping lazy lovely dog.And i    liked it."
print(removal(text))

['quick', 'brown', 'fox', 'jump', 'right', 'littl', 'jump', 'lazi', 'love', 'dog', 'like']


### Lemmatization

In [75]:
from nltk.stem import WordNetLemmatizer

In [77]:
l=WordNetLemmatizer()
l.lemmatize('crying')

'cry'

# 06. Bag of Words - Constructing Vocab

### Building Common Vocabulary and Vectorizing Documents (based upon Bag of Words Model)

In [78]:
corpus=[
    'Indian cricket team will win World Cup, says Capt. Virat Kohli',
    'We will win next Lok Sabha Elections, says confident Indian PM',
    'The Nobel Laureate won the hearts of the people',
    'The movie Razi is an exciting Indian Spy thriller based upon a real story'
]

In [88]:
from sklearn.feature_extraction.text import CountVectorizer

In [89]:
cv=CountVectorizer()

In [90]:
vectorized_corpus=cv.fit_transform(corpus)

In [91]:
vectorized_corpus

<4x36 sparse matrix of type '<class 'numpy.int64'>'
	with 42 stored elements in Compressed Sparse Row format>

In [83]:
vectorized_corpus?

In [84]:
vectorized_corpus=cv.fit_transform(corpus).todense()

In [85]:
vectorized_corpus

matrix([[0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1],
        [0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
         0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
         0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0],
        [1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
         1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [92]:
vectorized_corpus=cv.fit_transform(corpus).toarray()

In [93]:
vectorized_corpus

array([[0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1],
       [0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
        1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
        0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [96]:
cv.vocabulary

In [97]:
cv.vocabulary_ # Dictionary - Word -> Index

{'an': 0,
 'based': 1,
 'capt': 2,
 'confident': 3,
 'cricket': 4,
 'cup': 5,
 'elections': 6,
 'exciting': 7,
 'hearts': 8,
 'indian': 9,
 'is': 10,
 'kohli': 11,
 'laureate': 12,
 'lok': 13,
 'movie': 14,
 'next': 15,
 'nobel': 16,
 'of': 17,
 'people': 18,
 'pm': 19,
 'razi': 20,
 'real': 21,
 'sabha': 22,
 'says': 23,
 'spy': 24,
 'story': 25,
 'team': 26,
 'the': 27,
 'thriller': 28,
 'upon': 29,
 'virat': 30,
 'we': 31,
 'will': 32,
 'win': 33,
 'won': 34,
 'world': 35}

In [99]:
len(cv.vocabulary_)

36

In [100]:
# Reverse mapping:
# given a bag-of-words vector, recover the words it contains.
import numpy as np
# One entry per vocabulary word; sized from the fitted vocabulary instead of a
# hard-coded 36 so the cell still works if the corpus changes.
vector=np.ones(len(cv.vocabulary_))
vector[3:7]=0 # switch off the words at column indices 3..6
print(vector)

[1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [101]:
# inverse_transform expects a 2-D (n_samples, n_features) input, so present the
# single vector as a one-row array.
print(cv.inverse_transform(vector.reshape(1, -1)))

[array(['an', 'based', 'capt', 'exciting', 'hearts', 'indian', 'is',
       'kohli', 'laureate', 'lok', 'movie', 'next', 'nobel', 'of',
       'people', 'pm', 'razi', 'real', 'sabha', 'says', 'spy', 'story',
       'team', 'the', 'thriller', 'upon', 'virat', 'we', 'will', 'win',
       'won', 'world'], dtype='<U9')]


In [102]:
cv.vocabulary_['based']

1

In [107]:
cv.vocabulary_['exciting']

7

In [108]:
cv.vocabulary_['indian']

9

In [109]:
cv.vocabulary_

{'an': 0,
 'based': 1,
 'capt': 2,
 'confident': 3,
 'cricket': 4,
 'cup': 5,
 'elections': 6,
 'exciting': 7,
 'hearts': 8,
 'indian': 9,
 'is': 10,
 'kohli': 11,
 'laureate': 12,
 'lok': 13,
 'movie': 14,
 'next': 15,
 'nobel': 16,
 'of': 17,
 'people': 18,
 'pm': 19,
 'razi': 20,
 'real': 21,
 'sabha': 22,
 'says': 23,
 'spy': 24,
 'story': 25,
 'team': 26,
 'the': 27,
 'thriller': 28,
 'upon': 29,
 'virat': 30,
 'we': 31,
 'will': 32,
 'win': 33,
 'won': 34,
 'world': 35}

# 07. Bag of Words - Vectorization, Unigram Features

In [110]:
# Effectively reduce the size of the vector

from nltk.tokenize import RegexpTokenizer 
from nltk.corpus import stopwords

def filter_words(word_list, stop_words=None):
    """Return the tokens of ``word_list`` that are not stopwords, order preserved.

    Parameters
    ----------
    word_list : iterable of str
        Tokens to filter. Matching is exact and case-sensitive.
    stop_words : collection of str, optional
        Words to drop. When omitted, NLTK's English stopword list is loaded;
        pass a prebuilt set to avoid reloading it on every call or to use a
        different language.

    Returns
    -------
    list of str
        The tokens that are not in the stopword collection.
    """
    if stop_words is None:
        stop_words = set(stopwords.words('english'))
    return [w for w in word_list if w not in stop_words]

def myTokenizer(sentence):
    """Lowercase ``sentence``, split it into alphabetic tokens and drop stopwords.

    Tokens are runs of ASCII letters (regex ``[a-zA-Z]+``); the resulting list
    is passed through ``filter_words`` and returned.
    """
    letters_only=RegexpTokenizer('[a-zA-Z]+')
    lowered=sentence.lower()
    return filter_words(letters_only.tokenize(lowered))

myTokenizer(corpus[0])

['indian',
 'cricket',
 'team',
 'win',
 'world',
 'cup',
 'says',
 'capt',
 'virat',
 'kohli']

In [119]:
# Vectorize with the custom tokenizer: stopwords and non-letter characters are
# removed before counting, so the vocabulary is smaller than with the default settings.
cv=CountVectorizer(tokenizer=myTokenizer)
vectorized_corpus=cv.fit_transform(corpus)
vc=vectorized_corpus.todense()
print(vc)
print(len(vc))
print(vc[0])
print(len(vc[0])) # 1, not the feature count: indexing the dense matrix still gives a 2-D (1-row) matrix, so len() counts rows

[[0 1 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 1 1]
 [0 0 1 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 1 1 0 0 1 1 0 1 1 0 0 0]]
4
[[0 1 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 1 1]]
1


In [120]:
vc=vectorized_corpus.toarray()
print(vc)
print(len(vc))
print(vc[0])
print(len(vc[0])) # Individual elements

[[0 1 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 1 1]
 [0 0 1 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 1 1 0 0 1 1 0 1 1 0 0 0]]
4
[0 1 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 1 1]
29


In [121]:
v=vc[0]
# inverse_transform expects a 2-D (n_samples, n_features) input, so pass the
# row as a one-row array.
cv.inverse_transform(v.reshape(1, -1))

[array(['capt', 'cricket', 'cup', 'indian', 'kohli', 'says', 'team',
        'virat', 'win', 'world'], dtype='<U9')]

In [122]:
print(vc[0])

[0 1 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 1 1]


In [123]:
vc[0][0]=1
print(vc[0])

[1 1 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 1 1]


In [124]:
# inverse_transform expects a 2-D (n_samples, n_features) input, so pass the
# modified row as a one-row array.
cv.inverse_transform(vc[0].reshape(1, -1))

[array(['based', 'capt', 'cricket', 'cup', 'indian', 'kohli', 'says',
        'team', 'virat', 'win', 'world'], dtype='<U9')]

In [125]:
# This is known as Unigram : Bag of words model

# 08. Bag of Words Model - Bigrams, Trigrams, Ngrams as features

### Features of Bag of Words Model
- Unigrams
- Bigrams, Trigrams
- N-Grams

In [126]:
cv=CountVectorizer(tokenizer=myTokenizer,ngram_range=(1,2))# ngram_range=(1,2): unigrams and bigrams are both used as features
vectorized_corpus=cv.fit_transform(corpus)
vc=vectorized_corpus.toarray()

print(cv.vocabulary_) # Mapping: feature (unigram or bigram) -> column index

{'indian': 16, 'cricket': 6, 'team': 45, 'win': 53, 'world': 56, 'cup': 8, 'says': 39, 'capt': 2, 'virat': 51, 'kohli': 20, 'indian cricket': 17, 'cricket team': 7, 'team win': 46, 'win world': 55, 'world cup': 57, 'cup says': 9, 'says capt': 40, 'capt virat': 3, 'virat kohli': 52, 'next': 27, 'lok': 23, 'sabha': 37, 'elections': 10, 'confident': 4, 'pm': 32, 'win next': 54, 'next lok': 28, 'lok sabha': 24, 'sabha elections': 38, 'elections says': 11, 'says confident': 41, 'confident indian': 5, 'indian pm': 18, 'nobel': 29, 'laureate': 21, 'hearts': 14, 'people': 31, 'nobel laureate': 30, 'laureate hearts': 22, 'hearts people': 15, 'movie': 25, 'razi': 33, 'exciting': 12, 'spy': 42, 'thriller': 47, 'based': 0, 'upon': 49, 'real': 35, 'story': 44, 'movie razi': 26, 'razi exciting': 34, 'exciting indian': 13, 'indian spy': 19, 'spy thriller': 43, 'thriller based': 48, 'based upon': 1, 'upon real': 50, 'real story': 36}


In [127]:
print(vc) # Print frequency

[[0 0 1 1 0 0 1 1 1 1 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 1 1 0 0 0 0 1 1 0 0 0 0 1 1 1 0 1 1 1]
 [0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 0 1 0 1 0 0 0 0 1 1 0 0 1 1 0 0 0 1 0 0 0
  0 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1
  1 0 0 0 0 0 1 1 1 0 0 1 1 1 1 0 0 0 0 0 0 0]]


In [128]:
print(len(vc[0]))# The number of features has increased because bigrams are included along with unigrams

58


# 09. TF-IDF Normalisation (Term Freq - Inverse Doc Freq)

### Tf-idf Normalisation
- Avoid features that occur very often, because they contain less information
- Information decreases as the number of occurrences increases across different types of documents
- So we define another term - inverse document frequency (IDF) - which associates a weight with every term: the more documents a term appears in, the lower its weight

In [130]:
corpus=[
    'Indian cricket team will win World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.',
    'We will win next Lok Sabha Elections, says confident Indian PM',
    'The Nobel Laureate won the hearts of the people',
    'The movie Razi is an exciting Indian Spy thriller based upon a real story'
]

In [131]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [133]:
tfidf_vectorizer=TfidfVectorizer(tokenizer=myTokenizer,ngram_range=(1,2))
vectorized_corpus=tfidf_vectorizer.fit_transform(corpus).toarray()
print(vectorized_corpus)

[[0.         0.         0.17238665 0.17238665 0.         0.
  0.17238665 0.17238665 0.34477329 0.17238665 0.17238665 0.
  0.         0.         0.         0.         0.         0.17238665
  0.17238665 0.11003216 0.17238665 0.         0.         0.17238665
  0.17238665 0.17238665 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.13591161 0.17238665 0.         0.
  0.         0.17238665 0.17238665 0.         0.17238665 0.17238665
  0.         0.         0.         0.         0.17238665 0.17238665
  0.13591161 0.         0.17238665 0.34477329 0.34477329]
 [0.         0.         0.         0.         0.25277526 0.25277526
  0.         0.         0.         0.         0.         0.25277526
  0.25277526 0.         0.         0.         0.         0.
  0.         0.16134317 0.         0.25277526 0.         0.
  0.         0.         0.         0.         0.252775

In [134]:
print(tfidf_vectorizer.vocabulary_)

{'indian': 19, 'cricket': 6, 'team': 52, 'win': 60, 'world': 63, 'cup': 8, 'says': 44, 'capt': 2, 'virat': 58, 'kohli': 23, 'held': 17, 'sri': 49, 'lanka': 25, 'indian cricket': 20, 'cricket team': 7, 'team win': 53, 'win world': 62, 'world cup': 64, 'cup says': 10, 'says capt': 45, 'capt virat': 3, 'virat kohli': 59, 'kohli world': 24, 'cup held': 9, 'held sri': 18, 'sri lanka': 50, 'next': 32, 'lok': 28, 'sabha': 42, 'elections': 11, 'confident': 4, 'pm': 37, 'win next': 61, 'next lok': 33, 'lok sabha': 29, 'sabha elections': 43, 'elections says': 12, 'says confident': 46, 'confident indian': 5, 'indian pm': 21, 'nobel': 34, 'laureate': 26, 'hearts': 15, 'people': 36, 'nobel laureate': 35, 'laureate hearts': 27, 'hearts people': 16, 'movie': 30, 'razi': 38, 'exciting': 13, 'spy': 47, 'thriller': 54, 'based': 0, 'upon': 56, 'real': 40, 'story': 51, 'movie razi': 31, 'razi exciting': 39, 'exciting indian': 14, 'indian spy': 22, 'spy thriller': 48, 'thriller based': 55, 'based upon': 1,

In [135]:
tfidf_vectorizer=TfidfVectorizer(tokenizer=myTokenizer,ngram_range=(1,1))# Unigram
vectorized_corpus=tfidf_vectorizer.fit_transform(corpus).toarray()
print(vectorized_corpus)

[[0.         0.23802376 0.         0.23802376 0.47604753 0.
  0.         0.         0.23802376 0.15192748 0.23802376 0.23802376
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.18766067 0.
  0.23802376 0.         0.23802376 0.         0.         0.23802376
  0.18766067 0.47604753]
 [0.         0.         0.36153669 0.         0.         0.36153669
  0.         0.         0.         0.23076418 0.         0.
  0.         0.36153669 0.         0.36153669 0.         0.
  0.36153669 0.         0.         0.36153669 0.28503968 0.
  0.         0.         0.         0.         0.         0.
  0.28503968 0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.5        0.         0.         0.         0.
  0.5        0.         0.         0.         0.5        0.5
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]


In [136]:
print(tfidf_vectorizer.vocabulary_)

{'indian': 9, 'cricket': 3, 'team': 26, 'win': 30, 'world': 31, 'cup': 4, 'says': 22, 'capt': 1, 'virat': 29, 'kohli': 10, 'held': 8, 'sri': 24, 'lanka': 11, 'next': 15, 'lok': 13, 'sabha': 21, 'elections': 5, 'confident': 2, 'pm': 18, 'nobel': 16, 'laureate': 12, 'hearts': 7, 'people': 17, 'movie': 14, 'razi': 19, 'exciting': 6, 'spy': 23, 'thriller': 27, 'based': 0, 'upon': 28, 'real': 20, 'story': 25}


In [137]:
tfidf_vectorizer=TfidfVectorizer(tokenizer=myTokenizer,ngram_range=(1,1),norm='l2')# Each row normalised to unit L2 norm; 'l2' is TfidfVectorizer's default, so the output matches the previous unigram cell
vectorized_corpus=tfidf_vectorizer.fit_transform(corpus).toarray()
print(vectorized_corpus)

[[0.         0.23802376 0.         0.23802376 0.47604753 0.
  0.         0.         0.23802376 0.15192748 0.23802376 0.23802376
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.18766067 0.
  0.23802376 0.         0.23802376 0.         0.         0.23802376
  0.18766067 0.47604753]
 [0.         0.         0.36153669 0.         0.         0.36153669
  0.         0.         0.         0.23076418 0.         0.
  0.         0.36153669 0.         0.36153669 0.         0.
  0.36153669 0.         0.         0.36153669 0.28503968 0.
  0.         0.         0.         0.         0.         0.
  0.28503968 0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.5        0.         0.         0.         0.
  0.5        0.         0.         0.         0.5        0.5
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]


In [138]:
print(tfidf_vectorizer.vocabulary_)

{'indian': 9, 'cricket': 3, 'team': 26, 'win': 30, 'world': 31, 'cup': 4, 'says': 22, 'capt': 1, 'virat': 29, 'kohli': 10, 'held': 8, 'sri': 24, 'lanka': 11, 'next': 15, 'lok': 13, 'sabha': 21, 'elections': 5, 'confident': 2, 'pm': 18, 'nobel': 16, 'laureate': 12, 'hearts': 7, 'people': 17, 'movie': 14, 'razi': 19, 'exciting': 6, 'spy': 23, 'thriller': 27, 'based': 0, 'upon': 28, 'real': 20, 'story': 25}
