## Basic NLP pipeline

##--> Data Collection
##--> Tokenization , stopword , stemming
##--> Building a common vocab
##--> Vectorising the documents
##--> Performing classification/clustering

# 1. Data Collection

In [11]:
from nltk.corpus import brown

In [12]:
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [13]:
data = brown.sents(categories='editorial')

In [14]:
print(data)

[['Assembly', 'session', 'brought', 'much', 'good'], ['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.'], ...]


In [15]:
print(data[1])
data = data[:100]

['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.']


# 2. Tokenization
Converting sentences into lists of words and removing ambiguities such as contractions, e.g. converting "isn't" into "is not".

In [17]:
text = data[1]
print(text)

['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.']


In [26]:
mytext = "It was a very pleasant day , the weather was cool and there were light showers. I went to the market to buy some fruits."
print(mytext)

It was a very pleasant day , the weather was cool and there were light showers. I went to the market to buy some fruits.


In [27]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [30]:
sentences = sent_tokenize(mytext)
print(sentences)

['It was a very pleasant day , the weather was cool and there were light showers.', 'I went to the market to buy some fruits.']


In [48]:
wordsList = word_tokenize(mytext)
print(wordsList)
print(len(wordsList))

['It', 'was', 'a', 'very', 'pleasant', 'day', ',', 'the', 'weather', 'was', 'cool', 'and', 'there', 'were', 'light', 'showers', '.', 'I', 'went', 'to', 'the', 'market', 'to', 'buy', 'some', 'fruits', '.']
27


# 3. Stopwords Removal
Words such as "is", "and", "the", prepositions and articles are not important and don't tell us anything about the gist of a sentence. ==> Remove them.

In [41]:
from nltk.corpus import stopwords

In [42]:
sw = stopwords.words('english')
print(sw[:5])
type(sw)

['i', 'me', 'my', 'myself', 'we']


list

In [43]:
sw = set(sw)
print(sw)

{'so', "wasn't", 'nor', 'them', 'that', 'aren', 'o', 'herself', 'than', 's', 'if', "should've", 'y', "wouldn't", 'her', 'isn', 'wouldn', 'this', 'themselves', 'doing', 'above', 'having', 'ma', 'm', 'after', "isn't", 'she', "didn't", 'ours', 'hasn', 'won', 'being', 'other', 'is', "aren't", 'below', 'each', 'why', 'was', 'been', 'its', 'myself', 'again', 'd', 'from', 'most', "you're", 'am', "you'd", 'under', "weren't", 'their', 'hers', 'me', "she's", 'no', "needn't", 'which', 'at', 'couldn', 'or', 'shan', 'such', 'are', 'now', 'between', 'i', 'had', 't', 'with', 'few', "you've", 'my', "you'll", 'those', 'once', 'don', 'own', 'does', "don't", 'weren', 'same', 'didn', 'there', 'then', "won't", 'until', 'against', 'when', 'too', 'have', 'all', "mustn't", 'mustn', 'you', 'but', "couldn't", 'hadn', 'any', 'whom', 've', 'theirs', 'has', "hasn't", 'some', 'in', "it's", 'll', 'here', 'it', 'and', 'yours', 'because', 'just', 'these', 'the', 'before', 'shouldn', 'his', 'off', "shan't", 'over', 'do

In [44]:
print(len(sw))

179


# Filter useful words

In [51]:
useful_words = [word.lower() for word in wordsList if word.lower() not in sw]
print(useful_words)

# Note: stop-word filtering cannot remove punctuation tokens such as dots and commas.

['pleasant', 'day', ',', 'weather', 'cool', 'light', 'showers', '.', 'went', 'market', 'buy', 'fruits', '.']


# Tokenizer using Regular Expression
Problem with the word tokenizer -> it cannot handle complex, custom tokenization rules.

Hence, use the RegexpTokenizer class in NLTK.

In [52]:
from nltk.tokenize import RegexpTokenizer

In [66]:
tokenizer = RegexpTokenizer("[a-zA-Z@]+")

In [67]:
mytext2 = "Send all the 50 documents related to clause 1,2,3 to abqc@xyz.com"

In [68]:
print(tokenizer.tokenize(mytext2))

['Send', 'all', 'the', 'documents', 'related', 'to', 'clause', 'to', 'abqc@xyz', 'com']



## 4. Stemming 

- Converting all forms of a particular word into its root word.

- e.g. jump, jumps, jumped, jumping are all reduced to the root word jump.

- Stemming reduces the total number of words in the vocabulary and helps in capturing the intuition behind sentences better.

In [70]:
text = "Foxes love to make jumps.The quick brown fox was seen jumping over a lovely dog from a 6 feet high wall"

In [74]:
wordsList = tokenizer.tokenize(text.lower())

In [75]:
print(wordsList)

['foxes', 'love', 'to', 'make', 'jumps', 'the', 'quick', 'brown', 'fox', 'was', 'seen', 'jumping', 'over', 'a', 'lovely', 'dog', 'from', 'a', 'feet', 'high', 'wall']


In [78]:
def filter_words(wordsList):
    """Drop English stop words from a list of tokens.

    Each token is compared exactly as given against NLTK's English
    stop-word list, so lowercase the tokens first if the filtering
    should be case-insensitive.

    Parameters
    ----------
    wordsList : iterable of str
        Tokens to filter.

    Returns
    -------
    list of str
        The tokens that are not stop words, in their original order.
    """
    english_stopwords = set(stopwords.words('english'))
    kept = []
    for token in wordsList:
        if token in english_stopwords:
            continue
        kept.append(token)
    return kept


wordsList = filter_words(wordsList)
print(wordsList)

['foxes', 'love', 'make', 'jumps', 'quick', 'brown', 'fox', 'seen', 'jumping', 'lovely', 'dog', 'feet', 'high', 'wall']


## Stemmers :
1. Snowball Stemmer (supports multiple languages)
2. Porter Stemmer (supports only English)
3. Lancaster Stemmer (supports only English)

In [79]:
from nltk.stem.snowball import PorterStemmer

In [80]:
ps = PorterStemmer()

In [87]:
ps.stem('jumped')

'jump'

In [88]:
ps.stem('jumping')

'jump'

In [89]:
ps.stem('lovely')

'love'

In [90]:
ps.stem('awesome')

'awesom'

In [94]:
def stem_words_porter(wordsList):
    """Stem every token with the notebook-level Porter stemmer ``ps``.

    Parameters
    ----------
    wordsList : iterable of str
        Tokens to stem.

    Returns
    -------
    list of str
        One stem per input token, in the same order.
    """
    stems = []
    for token in wordsList:
        stems.append(ps.stem(token))
    return stems

stemmedWordsList = stem_words_porter(wordsList)
print(stemmedWordsList)

['fox', 'love', 'make', 'jump', 'quick', 'brown', 'fox', 'seen', 'jump', 'love', 'dog', 'feet', 'high', 'wall']


In [95]:
from nltk.stem.lancaster import LancasterStemmer

In [96]:
ls = LancasterStemmer()

In [98]:
ls.stem('lovable')

'lov'

In [184]:
ls.stem('teeth')
print(ls.stem('exciting'))

excit


In [102]:
print(ls.stem('teenager'))  # only english
print(ps.stem('teenager'))  # only english.

teen
teenag


In [104]:
#Snowball Stemmer

from nltk.stem.snowball import SnowballStemmer

In [109]:
ss = SnowballStemmer("english")

print(ss.stem('lovely'))
print(ss.stem('lovable'))
print(ss.stem('teenage'))

love
lovabl
teenag


TypeError: 'SnowballStemmer' object is not callable

In [110]:
# Task : Write one function which performs all 3 parts.

In [185]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer 
from nltk.stem.snowball import SnowballStemmer,PorterStemmer

from functools import lru_cache


@lru_cache(maxsize=1)
def _dataprep_resources():
    """Build the tokenizer, stop-word set and stemmer once, then reuse them."""
    return (
        RegexpTokenizer("[a-zA-Z@]+"),
        frozenset(stopwords.words('english')),
        PorterStemmer(),
    )


def DataPrep(text):
    """Tokenize, lowercase, remove stop words from, and stem ``text``.

    Parameters
    ----------
    text : str
        Raw text of a single document or sentence.

    Returns
    -------
    list of str
        Porter stems of the non-stop-word tokens, in order of appearance.
        Tokens are maximal runs of ASCII letters and '@'; digits and
        punctuation are discarded by the tokenizer pattern.
    """
    # The resources do not depend on ``text``; building them on every call
    # (in particular re-reading the stop-word list) is wasted work when this
    # function is used as a vectorizer's ``tokenizer`` callback.
    word_tokenizer, stop_words, stemmer = _dataprep_resources()

    # Step 1 : Tokenize, then lowercase so the stop-word lookup matches.
    tokens = [word.lower() for word in word_tokenizer.tokenize(text)]

    # Step 2 : Remove stop words.
    tokens = [word for word in tokens if word not in stop_words]

    # Step 3 : Stemming (Porter).
    return [stemmer.stem(word) for word in tokens]


In [186]:
mytext3 = "Foxes love to make jumps.The quick brown  fox was seen jumping over a lovely dog from a 6 feet high wall"

In [187]:
data = DataPrep(mytext3)
print(data)

['fox', 'love', 'make', 'jump', 'quick', 'brown', 'fox', 'seen', 'jump', 'love', 'dog', 'feet', 'high', 'wall']


In [188]:
from nltk.stem import WordNetLemmatizer

In [189]:
l = WordNetLemmatizer()
l.lemmatize("crying")

'cry'

## 5. Building Common Vocabulary & Vectorizing Documents -> BAG OF WORDS MODEL ( Unigram )

In [190]:
mycorpus = [
    "Indian men's cricket team will win World Cup , says Capt. Virat Kohli,",
    "We will win next Lok Sabha elections , says confident Indian PM" ,
    "The nobel laurate won the hearts of the people" ,
    "The movie Raazi is an exciting Indian spy thriller based upon a real story"
]

In [191]:
from sklearn.feature_extraction.text import CountVectorizer

In [192]:
cv = CountVectorizer()

In [193]:
vectorized_corpus = cv.fit_transform(mycorpus)
print(vectorized_corpus)
print()
print()
vectorized_corpus = vectorized_corpus.toarray()
print(vectorized_corpus)

  (0, 11)	1
  (0, 31)	1
  (0, 2)	1
  (0, 24)	1
  (0, 5)	1
  (0, 36)	1
  (0, 34)	1
  (0, 33)	1
  (0, 27)	1
  (0, 4)	1
  (0, 14)	1
  (0, 9)	1
  (1, 20)	1
  (1, 3)	1
  (1, 6)	1
  (1, 23)	1
  (1, 13)	1
  (1, 16)	1
  (1, 32)	1
  (1, 24)	1
  (1, 34)	1
  (1, 33)	1
  (1, 9)	1
  (2, 19)	1
  (2, 18)	1
  (2, 8)	1
  (2, 35)	1
  (2, 12)	1
  (2, 17)	1
  (2, 28)	3
  (3, 26)	1
  (3, 22)	1
  (3, 30)	1
  (3, 1)	1
  (3, 29)	1
  (3, 25)	1
  (3, 7)	1
  (3, 0)	1
  (3, 10)	1
  (3, 21)	1
  (3, 15)	1
  (3, 28)	1
  (3, 9)	1


[[0 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 1 1 0
  1]
 [0 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 1 1 0
  0]
 [0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 1
  0]
 [1 1 0 0 0 0 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 1 0 1 1 1 0 0 0 0 0
  0]]


In [194]:
print(cv.vocabulary_)  ## Dictionary in which word is mapped with index.

{'indian': 9, 'men': 14, 'cricket': 4, 'team': 27, 'will': 33, 'win': 34, 'world': 36, 'cup': 5, 'says': 24, 'capt': 2, 'virat': 31, 'kohli': 11, 'we': 32, 'next': 16, 'lok': 13, 'sabha': 23, 'elections': 6, 'confident': 3, 'pm': 20, 'the': 28, 'nobel': 17, 'laurate': 12, 'won': 35, 'hearts': 8, 'of': 18, 'people': 19, 'movie': 15, 'raazi': 21, 'is': 10, 'an': 0, 'exciting': 7, 'spy': 25, 'thriller': 29, 'based': 1, 'upon': 30, 'real': 22, 'story': 26}


## Given a vector , what is the sentence?

In [195]:
print(len(vectorized_corpus[0]))


37


In [196]:
import numpy as np
vector = np.ones((37,))
vector[1:3] = 0
print(vector)

[1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [197]:
# inverse_transform expects a 2-D (n_samples, n_features) input, so pass the
# single vector as a one-row matrix.
print(cv.inverse_transform(vector.reshape(1, -1)))

[array(['an', 'confident', 'cricket', 'cup', 'elections', 'exciting',
       'hearts', 'indian', 'is', 'kohli', 'laurate', 'lok', 'men',
       'movie', 'next', 'nobel', 'of', 'people', 'pm', 'raazi', 'real',
       'sabha', 'says', 'spy', 'story', 'team', 'the', 'thriller', 'upon',
       'virat', 'we', 'will', 'win', 'won', 'world'], dtype='<U9')]


In [198]:
cv.vocabulary_['based']

1

In [199]:
cv.vocabulary_['win']

34

In [200]:
## Effectively reduce size of the vocab vector.

In [201]:
def myTokenizer(sentence):
    """Split a sentence into lowercase tokens with stop words removed.

    Uses the notebook-level ``tokenizer`` to split ``sentence``, lowercases
    every token, and passes the result through ``filter_words``.

    Parameters
    ----------
    sentence : str
        Text to tokenize.

    Returns
    -------
    list of str
        Whatever ``filter_words`` returns for the lowercased tokens.
    """
    lowered = [token.lower() for token in tokenizer.tokenize(sentence)]
    return filter_words(lowered)
    
    
myTokenizer(mycorpus[0])
    
    
    

['indian',
 'men',
 'cricket',
 'team',
 'win',
 'world',
 'cup',
 'says',
 'capt',
 'virat',
 'kohli']

In [203]:
# Vectorize with DataPrep as the tokenizer so the vocabulary is built from the
# tokens DataPrep returns. token_pattern=None: the default regex is not used
# when a custom tokenizer is supplied, and recent scikit-learn versions warn
# if it is left set.
cv = CountVectorizer(tokenizer=DataPrep, token_pattern=None)
vectorized_corpus = cv.fit_transform(mycorpus)
vc = vectorized_corpus.toarray()
print(vc)
print()
print()
print(len(vc[0]))

# Preprocessed tokens of every sentence, for comparison with the vocabulary.
mywords = [DataPrep(sentence) for sentence in mycorpus]

print()
print()
print(mywords)

print()
print()
print(cv.vocabulary_)

[[0 1 0 1 1 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 1 1]
 [0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 1 1 0 1 1 0 0 0]]


30


[['indian', 'men', 'cricket', 'team', 'win', 'world', 'cup', 'say', 'capt', 'virat', 'kohli'], ['win', 'next', 'lok', 'sabha', 'elect', 'say', 'confid', 'indian', 'pm'], ['nobel', 'laurat', 'heart', 'peopl'], ['movi', 'raazi', 'excit', 'indian', 'spi', 'thriller', 'base', 'upon', 'real', 'stori']]


{'indian': 8, 'men': 12, 'cricket': 3, 'team': 24, 'win': 28, 'world': 29, 'cup': 4, 'say': 21, 'capt': 1, 'virat': 27, 'kohli': 9, 'next': 14, 'lok': 11, 'sabha': 20, 'elect': 5, 'confid': 2, 'pm': 17, 'nobel': 15, 'laurat': 10, 'heart': 7, 'peopl': 16, 'movi': 13, 'raazi': 18, 'excit': 6, 'spi': 22, 'thriller': 25, 'base': 0, 'upon': 26, 'real': 19, 'stori': 23}


In [204]:
tmp = vc[0]
# inverse_transform expects a 2-D (n_samples, n_features) input, so pass the
# first document's vector as a one-row matrix.
cv.inverse_transform(tmp.reshape(1, -1))

[array(['capt', 'cricket', 'cup', 'indian', 'kohli', 'men', 'say', 'team',
        'virat', 'win', 'world'], dtype='<U8')]

## Features Bag Of Words model ->

# 1. Unigram
Focus on the frequency of a single word.

# 2. Bigram , Trigrams
Clubbing 2 or 3 consecutive words together so that they make more sense.

e.g. "don't like" = hate, "isn't good" = disliked ==> Capture this behaviour

# 3. N-gram
Clubbing n consecutive words together (the general case; more than 2 words)

In [211]:
# token_pattern=None: the default regex is not used when a custom tokenizer is
# supplied, and recent scikit-learn versions warn if it is left set.
cv = CountVectorizer(tokenizer=myTokenizer, token_pattern=None, ngram_range=(1, 3))
# (1,1) --> only unigrams
# (2,2) --> only bigrams
# (1,2) --> both unigrams and bigrams(combination)
# (1,3) --> unigram , bigram , trigram

vectorized_corpus = cv.fit_transform(mycorpus)
vc = vectorized_corpus.toarray()

print(vc)
print()
print()
print(len(vc[0]))
print()
print()
print(cv.vocabulary_)

[[0 0 0 1 1 1 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 0 0 0 0 0
  1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 1 1 0 0
  0 0 0 0 1 1 1 0 0 1 1 1 1 1]
 [0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 1 1
  0 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 0 1 1 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 1 1 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0
  0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0
  0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 1 1
  1 1 1 1 0 0 0 0 0 0 0 0 0 0]]


86


{'indian': 23, 'men': 36, 'cricket': 9, 'team': 67, 'win': 78, 'world': 83, 'cup': 12, 'says': 58, 'capt': 3, 'virat': 76, 'kohli': 29, 'indian men': 24, 'men cricket': 37, 'cricket team': 10, 'team win': 68, 'win world': 81, 'world cup': 84, 'cup says': 13, 'says capt': 59, 

## Tf-idf Normalisation 
- Down-weight features that occur very often, as they carry very little information.
- The information a term carries decreases as the number of DIFFERENT DOCUMENTS it occurs in increases.
- So, we define another term, the inverse document frequency (idf), which associates a weight with every term.
- We want to give more weight to words which are more specific to a sentence (category).

e.g. "Indian" occurs three times in the above corpus, but it doesn't tell us the genre/gist of a sentence. If "Indian" is written, we cannot guess whether the sentence is related to cricket, politics or movies. So, it will have less weight.

In [212]:
# CountVectorizer works only on frequency part. We also need to focus on weighting factor.

from sklearn.feature_extraction.text import TfidfVectorizer

In [218]:
mycorpus = [
    "Indian men's cricket team will win World Cup , says Capt. Virat Kohli, World Cup is being held in England",
    "We will win next Lok Sabha elections , says confident Indian PM" ,
    "The nobel laurate won the hearts of the people" ,
    "The movie Raazi is an exciting Indian spy thriller based upon a real story"
]

# token_pattern=None: the default regex is not used when a custom tokenizer is
# supplied, and recent scikit-learn versions warn if it is left set.
tfidf_vectorizer = TfidfVectorizer(tokenizer=DataPrep, token_pattern=None, ngram_range=(1, 2), norm='l2')
vectorized_corpus_tfidf = tfidf_vectorizer.fit_transform(mycorpus).toarray()
print(vectorized_corpus_tfidf)

[[0.         0.         0.17238665 0.17238665 0.         0.
  0.17238665 0.17238665 0.34477329 0.17238665 0.17238665 0.
  0.         0.17238665 0.         0.         0.         0.
  0.17238665 0.17238665 0.11003216 0.17238665 0.         0.
  0.17238665 0.17238665 0.         0.         0.         0.
  0.17238665 0.17238665 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.13591161 0.17238665
  0.         0.         0.         0.         0.17238665 0.17238665
  0.         0.         0.         0.         0.17238665 0.17238665
  0.13591161 0.         0.17238665 0.34477329 0.34477329]
 [0.         0.         0.         0.         0.25277526 0.25277526
  0.         0.         0.         0.         0.         0.25277526
  0.25277526 0.         0.         0.         0.         0.
  0.         0.         0.16134317 0.         0.25277526 0.
  0.         0.         0.         0.         0.25277526 0.252

In [219]:
print(tfidf_vectorizer.vocabulary_)

{'indian': 20, 'men': 30, 'cricket': 6, 'team': 52, 'win': 60, 'world': 63, 'cup': 8, 'say': 46, 'capt': 2, 'virat': 58, 'kohli': 24, 'held': 18, 'england': 13, 'indian men': 21, 'men cricket': 31, 'cricket team': 7, 'team win': 53, 'win world': 62, 'world cup': 64, 'cup say': 10, 'say capt': 47, 'capt virat': 3, 'virat kohli': 59, 'kohli world': 25, 'cup held': 9, 'held england': 19, 'next': 34, 'lok': 28, 'sabha': 44, 'elect': 11, 'confid': 4, 'pm': 39, 'win next': 61, 'next lok': 35, 'lok sabha': 29, 'sabha elect': 45, 'elect say': 12, 'say confid': 48, 'confid indian': 5, 'indian pm': 22, 'nobel': 36, 'laurat': 26, 'heart': 16, 'peopl': 38, 'nobel laurat': 37, 'laurat heart': 27, 'heart peopl': 17, 'movi': 32, 'raazi': 40, 'excit': 14, 'spi': 49, 'thriller': 54, 'base': 0, 'upon': 56, 'real': 42, 'stori': 51, 'movi raazi': 33, 'raazi excit': 41, 'excit indian': 15, 'indian spi': 23, 'spi thriller': 50, 'thriller base': 55, 'base upon': 1, 'upon real': 57, 'real stori': 43}


In [220]:
print(vectorized_corpus_tfidf.shape)

(4, 65)
