In [1]:
from nltk.corpus import brown

In [2]:
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [3]:
# get all sentences as words in array of category editorial
data = brown.sents(categories = 'editorial')
print(data)

[['Assembly', 'session', 'brought', 'much', 'good'], ['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.'], ...]


In [4]:
print(data[0])

['Assembly', 'session', 'brought', 'much', 'good']


In [5]:
# Tokenization

In [6]:
text = "It was a very pleasant day, the weather was cool and there were light showers. I went to the market to buy some fruits."
print(text)

It was a very pleasant day, the weather was cool and there were light showers. I went to the market to buy some fruits.


In [7]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [9]:
# get sentences
sents = sent_tokenize(text)
print(sents)

['It was a very pleasant day, the weather was cool and there were light showers.', 'I went to the market to buy some fruits.']


In [22]:
# get words from 1st sentence
word_list = word_tokenize(sents[0].lower())
print(word_list)

['it', 'was', 'a', 'very', 'pleasant', 'day', ',', 'the', 'weather', 'was', 'cool', 'and', 'there', 'were', 'light', 'showers', '.']


In [23]:
# stopword removal
from nltk.corpus import stopwords

In [24]:
sw = set(stopwords.words('english'))

In [25]:
print(sw)

{'his', 'this', 'down', 'are', "she's", 'by', 'once', 't', 'wouldn', 'isn', 'were', 'hadn', 'those', 'a', 'her', "it's", 'be', 'did', 'wasn', 'herself', 'ma', 'can', 'whom', 'its', 'themselves', "mustn't", 'before', 'which', 'some', 'having', 'aren', "shouldn't", 'through', 'while', 'no', 'their', 'such', 'as', 'been', "wouldn't", 'nor', 'then', 'after', 'to', 'each', 'an', 'when', 'here', 'couldn', "that'll", 'am', 'other', 'weren', 'these', "you'd", "you've", "you'll", 'where', 'won', 'it', 'all', 'do', 'over', 'too', "shan't", 'into', 'has', 'against', 'from', "aren't", 'have', 'what', "couldn't", 'for', 'ourselves', 'm', 'at', 'mustn', 'doesn', "didn't", "won't", 'until', 'does', 'very', 'or', 'your', 'of', 'most', 'ours', 'because', 'don', 'who', 'mightn', "haven't", "needn't", 'again', 'just', "mightn't", 'yours', 'is', 'than', "wasn't", 'above', 'the', 'both', 'had', 'in', 'd', 'between', 'now', 're', 'we', 'll', 'itself', 'not', 'hasn', 'yourselves', 'so', 'yourself', "doesn't"

In [26]:
print(len(sw))

179


In [27]:
# get all the useful words from sentence not in stopwords
useful_words = [w for w in word_list if w not in sw]
print(useful_words)

['pleasant', 'day', ',', 'weather', 'cool', 'light', 'showers', '.']


In [33]:
def filter_words(word_list):
    useful_words = [w for w in word_list if w not in sw]
    return useful_words

    
useful_words= filter_words(word_list)
print(useful_words)

['pleasant', 'day', ',', 'weather', 'cool', 'light', 'showers', '.']


In [34]:
# word_tokenizer cannot remove every unwanted words like comma, fullstop
# Regex Tokenizer is used to handle such cases

In [35]:
from nltk.tokenize import RegexpTokenizer

In [36]:
tokenizer = RegexpTokenizer("[a-zA-Z@]+")

In [37]:
text = "Send all the 50 documents related to clauses 1,2,3 at abc@xyz.com"

print(tokenizer.tokenize(text))

['Send', 'all', 'the', 'documents', 'related', 'to', 'clauses', 'at', 'abc@xyz', 'com']


In [38]:
# Stemming 
# 1) Process that transforms particular words(verbs,plurals)into their radical form
# 2) Preserve the semantics of the sentence without increasing the number of unique tokens
# 3) jumps, jumping, jumped, jump ==> jump

In [39]:
text= """Foxes love to make jumps.The quick brown fox was seen jumping over the 
        lovely dog from a 6ft feet high wall"""

words_list = tokenizer.tokenize(text.lower())
print(words_list)

['foxes', 'love', 'to', 'make', 'jumps', 'the', 'quick', 'brown', 'fox', 'was', 'seen', 'jumping', 'over', 'the', 'lovely', 'dog', 'from', 'a', 'ft', 'feet', 'high', 'wall']


In [40]:
word_list = filter_words(words_list) #Remove the stopwords
print(word_list)

['foxes', 'love', 'make', 'jumps', 'quick', 'brown', 'fox', 'seen', 'jumping', 'lovely', 'dog', 'ft', 'feet', 'high', 'wall']


In [41]:
# Stemming is converting a word into its radical form
# 1) Snowball Stemmer (Multilingual)
# 2) Porter Stemmer  (English only)
# 3) LancasterStemmer  (English only)

In [42]:
from nltk.stem.snowball import PorterStemmer,SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

ps = PorterStemmer()

In [43]:
ps.stem("jumped")

'jump'

In [44]:
ps.stem("jumping")

'jump'

In [45]:
ps.stem("lovely")

'love'

In [46]:
ps.stem("awesome")

'awesom'

In [47]:
ls = LancasterStemmer()
ls.stem("teenager")

'teen'

In [48]:
ss = SnowballStemmer('english')
print(ss.stem("teenager"))
print(ps.stem("teenager"))

teenag
teenag


In [49]:
ss_french = SnowballStemmer('french')
print(ss.stem('courais'))

courai


In [50]:
# Lemmatization 
from nltk.stem import WordNetLemmatizer

l = WordNetLemmatizer()
l.lemmatize("crying")

'cry'

In [51]:
# steps:
# 1) Tokenization
# 2) StopWord removal
# 3) Stemming

In [52]:
# Building Common Vocabulary and Vectorizing Documents (based upon Bag of Words Model

In [59]:
corpus = [
        'Indian cricket team will wins World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.',
        'We will win next Lok Sabha Elections, says confident Indian PM',
        'The nobel laurate won the hearts of the people',
        'The movie Raazi is an exciting Indian Spy thriller based upon a real story'
]

In [60]:
from sklearn.feature_extraction.text import CountVectorizer

In [61]:
cv = CountVectorizer()

In [62]:
vectorized_corpus = cv.fit_transform(corpus).toarray()

In [63]:
print(vectorized_corpus)
print(len(vectorized_corpus[0]))

[[0 1 0 1 1 0 1 2 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1
  0 2 0 1 0 2]
 [0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0
  1 1 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 3 0 0 0
  0 0 0 0 1 0]
 [1 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 1 1 1 0
  0 0 0 0 0 0]]
42


In [64]:
print(cv.vocabulary_)           # Dictionary

{'indian': 12, 'cricket': 6, 'team': 31, 'will': 37, 'wins': 39, 'world': 41, 'cup': 7, 'says': 27, 'capt': 4, 'virat': 35, 'kohli': 14, 'be': 3, 'held': 11, 'at': 1, 'sri': 29, 'lanka': 15, 'we': 36, 'win': 38, 'next': 19, 'lok': 17, 'sabha': 26, 'elections': 8, 'confident': 5, 'pm': 23, 'the': 32, 'nobel': 20, 'laurate': 16, 'won': 40, 'hearts': 10, 'of': 21, 'people': 22, 'movie': 18, 'raazi': 24, 'is': 13, 'an': 0, 'exciting': 9, 'spy': 28, 'thriller': 33, 'based': 2, 'upon': 34, 'real': 25, 'story': 30}


In [65]:
# given a vector what is the sentence
import numpy as np
vector = np.ones((42, ))
vector[3:7] = 0

print(vector)

[1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [66]:
print(cv.inverse_transform(vector))

[array(['an', 'at', 'based', 'cup', 'elections', 'exciting', 'hearts',
       'held', 'indian', 'is', 'kohli', 'lanka', 'laurate', 'lok',
       'movie', 'next', 'nobel', 'of', 'people', 'pm', 'raazi', 'real',
       'sabha', 'says', 'spy', 'sri', 'story', 'team', 'the', 'thriller',
       'upon', 'virat', 'we', 'will', 'win', 'wins', 'won', 'world'],
      dtype='<U9')]


In [68]:
cv.vocabulary_["an"]

0

In [70]:
# myTokenizer
# from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer("[a-zA-Z@]+")
# from nltk.corpus import stopwords
sw = set(stopwords.words('english'))

def filter_words(word_list):
    useful_words = [w for w in word_list if w not in sw]
    return useful_words

def myTokenizer(sentence):
    words = tokenizer.tokenize(sentence.lower())
    return filter_words(words)

In [71]:
myTokenizer(corpus[0])

['indian',
 'cricket',
 'team',
 'wins',
 'world',
 'cup',
 'says',
 'capt',
 'virat',
 'kohli',
 'world',
 'cup',
 'held',
 'sri',
 'lanka']

In [72]:
# unigram model: only one word is stored as feature
cv = CountVectorizer(tokenizer = myTokenizer)
vectorized_corpus = cv.fit_transform(corpus)
vc = vectorized_corpus.toarray()
print(vc[0])
print(len(vc[0]))

[0 1 0 1 2 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 2]
33


In [73]:
v = vc[0]
cv.inverse_transform(v)

[array(['capt', 'cricket', 'cup', 'held', 'indian', 'kohli', 'lanka',
        'says', 'sri', 'team', 'virat', 'wins', 'world'], dtype='<U9')]

In [74]:
# this is unigram bag of words model
# the order of sentence is not preserved

In [75]:
print(cv.vocabulary_)

{'indian': 9, 'cricket': 3, 'team': 26, 'wins': 31, 'world': 32, 'cup': 4, 'says': 22, 'capt': 1, 'virat': 29, 'kohli': 10, 'held': 8, 'sri': 24, 'lanka': 11, 'win': 30, 'next': 15, 'lok': 13, 'sabha': 21, 'elections': 5, 'confident': 2, 'pm': 18, 'nobel': 16, 'laurate': 12, 'hearts': 7, 'people': 17, 'movie': 14, 'raazi': 19, 'exciting': 6, 'spy': 23, 'thriller': 27, 'based': 0, 'upon': 28, 'real': 20, 'story': 25}


In [76]:
# Features in Bag of Words Model:
# 1. Unigrams
# 2. Bigrams, Trigrams
# 3. N-Grams

In [81]:
cv = CountVectorizer(tokenizer = myTokenizer, ngram_range = (1, 2))
# here 1, 2 means feature contains 1 word and 2 words
# 1, 1 would mean only 1 word as feature
vc = cv.fit_transform(corpus).toarray()

print(cv.vocabulary_)

{'indian': 19, 'cricket': 6, 'team': 52, 'wins': 62, 'world': 64, 'cup': 8, 'says': 44, 'capt': 2, 'virat': 58, 'kohli': 23, 'held': 17, 'sri': 49, 'lanka': 25, 'indian cricket': 20, 'cricket team': 7, 'team wins': 53, 'wins world': 63, 'world cup': 65, 'cup says': 10, 'says capt': 45, 'capt virat': 3, 'virat kohli': 59, 'kohli world': 24, 'cup held': 9, 'held sri': 18, 'sri lanka': 50, 'win': 60, 'next': 32, 'lok': 28, 'sabha': 42, 'elections': 11, 'confident': 4, 'pm': 37, 'win next': 61, 'next lok': 33, 'lok sabha': 29, 'sabha elections': 43, 'elections says': 12, 'says confident': 46, 'confident indian': 5, 'indian pm': 21, 'nobel': 34, 'laurate': 26, 'hearts': 15, 'people': 36, 'nobel laurate': 35, 'laurate hearts': 27, 'hearts people': 16, 'movie': 30, 'raazi': 38, 'exciting': 13, 'spy': 47, 'thriller': 54, 'based': 0, 'upon': 56, 'real': 40, 'story': 51, 'movie raazi': 31, 'raazi exciting': 39, 'exciting indian': 14, 'indian spy': 22, 'spy thriller': 48, 'thriller based': 55, 'b

In [82]:
print(len(vc[0]))

66


In [83]:
# Tf-idf Normalisation
# 1. Avoid features that occur very often accross different documents, becauase they contain less information
# 2. Information decreases as the number of occurences increases across different type of documents
# 3. So we define another term - term-document-frequency which associates a weight with every term

In [84]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [86]:
tfidf = TfidfVectorizer(tokenizer = myTokenizer, ngram_range = (1, 1))
vectorised_corpus = tfidf.fit_transform(corpus).toarray()
print(vectorised_corpus)

[[0.         0.2355126  0.         0.2355126  0.4710252  0.
  0.         0.         0.2355126  0.15032464 0.2355126  0.2355126
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.18568084 0.
  0.2355126  0.         0.2355126  0.         0.         0.2355126
  0.         0.2355126  0.4710252 ]
 [0.         0.         0.35291425 0.         0.         0.35291425
  0.         0.         0.         0.22526059 0.         0.
  0.         0.35291425 0.         0.35291425 0.         0.
  0.35291425 0.         0.         0.35291425 0.27824164 0.
  0.         0.         0.         0.         0.         0.
  0.35291425 0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.5        0.         0.         0.         0.
  0.5        0.         0.         0.         0.5        0.5
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0. 

In [87]:
print(tfidf.vocabulary_)

{'indian': 9, 'cricket': 3, 'team': 26, 'wins': 31, 'world': 32, 'cup': 4, 'says': 22, 'capt': 1, 'virat': 29, 'kohli': 10, 'held': 8, 'sri': 24, 'lanka': 11, 'win': 30, 'next': 15, 'lok': 13, 'sabha': 21, 'elections': 5, 'confident': 2, 'pm': 18, 'nobel': 16, 'laurate': 12, 'hearts': 7, 'people': 17, 'movie': 14, 'raazi': 19, 'exciting': 6, 'spy': 23, 'thriller': 27, 'based': 0, 'upon': 28, 'real': 20, 'story': 25}
