In [4]:
## Use these two lines to download the Brown corpus with NLTK.
import nltk
nltk.download('brown')

In [26]:
from nltk.corpus import brown

In [27]:
## A text corpus is a large body of text. Many corpora are designed to contain a careful balance of material in different genres.
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [28]:
## brown.sents() returns the given file(s) as a list of sentences, each sentence being a list of word strings.
data = brown.sents(categories='editorial')[:100]

In [29]:
print(data)

[['Assembly', 'session', 'brought', 'much', 'good'], ['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.'], ...]


# NLP PIPELINE- BASIC STEPS

1. Data Collection
2. Tokenization, Stopword, Stemming
3. Building a common vocab
4. Vectorizing the documents
5. Performing Classification/ Clustering

# Tokenization


In [30]:
##Tokenization - To convert given texts into words or sentences.
text = "It was a very pleasant day, the weather was cool and there were light showers. I went to the market to buy some flowers."

In [33]:
##files required to use tokenizers.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Abhi
[nltk_data]     Bhatia\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [34]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [36]:
## Split the text into sentences.
sents = sent_tokenize(text)
print(sents)

['It was a very pleasant day, the weather was cool and there were light showers.', 'I went to the market to buy some flowers.']


In [46]:
## Tokenize the lower-cased text into words.
new_sents = word_tokenize(text.lower())
print(new_sents)

['it', 'was', 'a', 'very', 'pleasant', 'day', ',', 'the', 'weather', 'was', 'cool', 'and', 'there', 'were', 'light', 'showers', '.', 'i', 'went', 'to', 'the', 'market', 'to', 'buy', 'some', 'flowers', '.']


# Stopword Removal

In [65]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Abhi
[nltk_data]     Bhatia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [66]:
from nltk.corpus import stopwords

In [67]:
sw = set(stopwords.words('english'))

In [68]:
print(sw)

{'out', 'after', 'over', 'into', 'theirs', 'haven', 'those', 'now', 'being', 'mustn', 'nor', 'won', 'it', 's', 'themselves', 'itself', 'who', 'herself', 'hers', 'ourselves', 'again', 'own', "aren't", 'was', 't', 'of', 'they', 'and', 'have', "mightn't", 'off', "it's", "won't", 'yours', 'our', 'his', 'weren', 'their', 'these', 'because', 'from', "mustn't", 'same', 'shan', "wasn't", 'whom', 'doing', 'once', "you'll", 'most', 'with', 'further', "don't", 'between', 'through', 'ours', 'them', "you've", 'when', 'o', 'that', "shouldn't", 'ma', 'just', "isn't", 've', 'did', 'under', 'didn', 'wasn', 'below', 'before', 'do', 're', 'some', 'himself', 'up', "hasn't", 'until', 'ain', 'my', "should've", 'he', 'both', 'where', "doesn't", "you'd", "couldn't", 'm', 'or', 'hadn', 'mightn', 'aren', 'yourself', 'if', 'were', 'above', 'why', 'the', 'such', 'her', 'about', 'your', 'is', 'will', 'here', 'at', 'this', 'we', "shan't", 'an', 'each', 'yourselves', 'any', 'can', 'don', "needn't", "didn't", 'are', 

# Filtering out words from your sentence

In [69]:
def filter_words(new_sents, stop_words=None):
    """Return the tokens of ``new_sents`` that are not stopwords.

    Parameters
    ----------
    new_sents : iterable of str
        Tokens to filter. Matching is an exact, case-sensitive membership
        test, so lower-case the tokens first if the stopword set is lower-case.
    stop_words : collection of str, optional
        Words to drop. When omitted, the notebook-level ``sw`` set is used,
        which keeps existing one-argument calls working unchanged.

    Returns
    -------
    list of str
        The surviving tokens, in their original order (duplicates kept).
    """
    if stop_words is None:
        # Fall back to the notebook-level stopword set defined in an earlier cell.
        stop_words = sw
    return [w for w in new_sents if w not in stop_words]

In [70]:
without_sw = filter_words(new_sents)

In [71]:
print(without_sw)

['pleasant', 'day', ',', 'weather', 'cool', 'light', 'showers', '.', 'went', 'market', 'buy', 'flowers', '.']


Generally, we use a regular-expression word tokenizer, which also drops punctuation.

In [72]:
from nltk.tokenize import RegexpTokenizer

In [76]:
# Each token is a maximal run of ASCII letters (or '@'); digits, whitespace
# and punctuation never match, so they are dropped from the output.
tokenizer = RegexpTokenizer("[a-zA-Z@]+")
print(tokenizer.tokenize(text))

['It', 'was', 'a', 'very', 'pleasant', 'day', 'the', 'weather', 'was', 'cool', 'and', 'there', 'were', 'light', 'showers', 'I', 'went', 'to', 'the', 'market', 'to', 'buy', 'some', 'flowers']


# Stemming


1. Stemming is a process that transforms inflected words (verb forms, plurals) into their root form.

2. It preserves the semantics of the sentence without increasing the number of unique tokens.

3. Example: jumps, jumping, jumped, jump ==> jump

In [77]:
text= """Foxes love to make jumps.The quick brown fox was seen jumping over the 
        lovely dog from a 6ft feet high wall"""
words_list = tokenizer.tokenize(text.lower())
print(words_list)

['foxes', 'love', 'to', 'make', 'jumps', 'the', 'quick', 'brown', 'fox', 'was', 'seen', 'jumping', 'over', 'the', 'lovely', 'dog', 'from', 'a', 'ft', 'feet', 'high', 'wall']


In [85]:
word_list = filter_words(words_list) #Remove the stopwords
print(word_list)

['foxes', 'love', 'make', 'jumps', 'quick', 'brown', 'fox', 'seen', 'jumping', 'lovely', 'dog', 'ft', 'feet', 'high', 'wall']


Stemming

1) Snowball Stemmer (Multilingual)

2) Porter Stemmer

3) LancasterStemmer

In [86]:
from nltk.stem.snowball import PorterStemmer,SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
ps = PorterStemmer()

In [87]:
print(ps.stem('running'))

run


In [89]:
##FUNCTION TO PERFORM STEMMING ON A WHOLE LIST OF TOKENIZED WORDS.

def stemming(word_list):
    """Stem every token in ``word_list`` using the notebook-level stemmer ``ps``.

    Returns a new list holding ``ps.stem(token)`` for each token, in the
    same order as the input; the input itself is not modified.
    """
    return [ps.stem(token) for token in word_list]

In [92]:
stemming(word_list)

['fox',
 'love',
 'make',
 'jump',
 'quick',
 'brown',
 'fox',
 'seen',
 'jump',
 'love',
 'dog',
 'ft',
 'feet',
 'high',
 'wall']

# Lemmatization

In [94]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\Abhi
[nltk_data]     Bhatia\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [95]:
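## Lemmatization is similar in purpose to stemming, but it maps each word to a dictionary form (lemma) instead of stripping suffixes.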

from nltk.stem import WordNetLemmatizer

l = WordNetLemmatizer()
l.lemmatize("crying")

'cry'

# Building Common Vocabulary and Vectorizing Documents (based upon Bag of Words Model)

In [96]:
corpus = [
        'Indian cricket team will wins World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.',
        'We will win next Lok Sabha Elections, says confident Indian PM',
        'The nobel laurate won the hearts of the people',
        'The movie Raazi is an exciting Indian Spy thriller based upon a real story'
]

In [97]:
from sklearn.feature_extraction.text import CountVectorizer

In [101]:
cv = CountVectorizer()
# Tip: run `cv.fit_transform?` interactively to read its docstring. The
# introspection call is kept out of the saved cell because it is IPython-only
# syntax and produces no result the rest of the notebook uses.

In [102]:
vectorized_corpus = cv.fit_transform(corpus).toarray()

In [108]:
## Each row is one document; each column holds the count of one vocabulary word in that document.

print(vectorized_corpus)
print(len(vectorized_corpus[0]))

[[0 1 0 1 1 0 1 2 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1
  0 2 0 1 0 2]
 [0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0
  1 1 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 3 0 0 0
  0 0 0 0 1 0]
 [1 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 1 1 1 0
  0 0 0 0 0 0]]
42


In [116]:
print(cv.vocabulary_)

{'indian': 12, 'cricket': 6, 'team': 31, 'will': 37, 'wins': 39, 'world': 41, 'cup': 7, 'says': 27, 'capt': 4, 'virat': 35, 'kohli': 14, 'be': 3, 'held': 11, 'at': 1, 'sri': 29, 'lanka': 15, 'we': 36, 'win': 38, 'next': 19, 'lok': 17, 'sabha': 26, 'elections': 8, 'confident': 5, 'pm': 23, 'the': 32, 'nobel': 20, 'laurate': 16, 'won': 40, 'hearts': 10, 'of': 21, 'people': 22, 'movie': 18, 'raazi': 24, 'is': 13, 'an': 0, 'exciting': 9, 'spy': 28, 'thriller': 33, 'based': 2, 'upon': 34, 'real': 25, 'story': 30}


In [117]:
# Given a vector, what is the sentence?
import numpy as np

# Size the vector from the fitted vocabulary instead of hard-coding 42, so
# the cell keeps working if the corpus (and therefore the vocabulary) changes.
vector = np.ones((len(cv.vocabulary_),))
# Switch off vocabulary indices 3..6 ('be', 'capt', 'confident', 'cricket').
vector[3:7] = 0

print(vector)
print(len(vector))

[1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
42


In [119]:
# inverse_transform is documented to take a 2-D (n_samples, n_features)
# matrix, so present the single vector as a one-row matrix.
print(cv.inverse_transform(vector.reshape(1, -1)))

[array(['an', 'at', 'based', 'cup', 'elections', 'exciting', 'hearts',
       'held', 'indian', 'is', 'kohli', 'lanka', 'laurate', 'lok',
       'movie', 'next', 'nobel', 'of', 'people', 'pm', 'raazi', 'real',
       'sabha', 'says', 'spy', 'sri', 'story', 'team', 'the', 'thriller',
       'upon', 'virat', 'we', 'will', 'win', 'wins', 'won', 'world'],
      dtype='<U9')]


In [121]:
cv.vocabulary_["kohli"] ##this gives the index of the word in the vocabulary.

14

In [131]:
### Effectively reduce the size of the vector - by stopword removal

def myTokenizer(sentence):
    """Tokenize ``sentence`` and drop stopwords.

    The sentence is lower-cased, split with the notebook-level ``tokenizer``,
    and the resulting tokens are passed through ``filter_words``.
    Returns the list of remaining tokens.
    """
    lowered = sentence.lower()
    tokens = tokenizer.tokenize(lowered)
    kept = filter_words(tokens)  # stopword removal
    return kept

myTokenizer(corpus[0]) ## stopword-filtered tokens of the first document only (duplicates kept; not the full vocabulary)

['indian',
 'cricket',
 'team',
 'wins',
 'world',
 'cup',
 'says',
 'capt',
 'virat',
 'kohli',
 'world',
 'cup',
 'held',
 'sri',
 'lanka']


# Features in Bag of Words Model

Unigrams

Bigrams, Trigrams

N-Grams

In [129]:
# Refit the bag-of-words model using myTokenizer so stopwords never enter the
# vocabulary; ngram_range=(1,1) keeps unigram features only.
# Note: this rebinds `cv` and `vectorized_corpus` from the earlier cells.
cv = CountVectorizer(tokenizer=myTokenizer,ngram_range=(1,1))
vectorized_corpus = cv.fit_transform(corpus)
vc = vectorized_corpus.toarray()  # array form of the document-term counts
print(cv.vocabulary_) 

{'indian': 9, 'cricket': 3, 'team': 26, 'wins': 31, 'world': 32, 'cup': 4, 'says': 22, 'capt': 1, 'virat': 29, 'kohli': 10, 'held': 8, 'sri': 24, 'lanka': 11, 'win': 30, 'next': 15, 'lok': 13, 'sabha': 21, 'elections': 5, 'confident': 2, 'pm': 18, 'nobel': 16, 'laurate': 12, 'hearts': 7, 'people': 17, 'movie': 14, 'raazi': 19, 'exciting': 6, 'spy': 23, 'thriller': 27, 'based': 0, 'upon': 28, 'real': 20, 'story': 25}


In [130]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [133]:
# TF-IDF weighting over the same stopword-filtered unigram tokens.
# norm='l2' rescales each document row to unit Euclidean length.
tfidf_vectorizer = TfidfVectorizer(tokenizer=myTokenizer,ngram_range=(1,1),norm='l2')

# Note: this rebinds `vectorized_corpus`, replacing the raw-count matrix.
vectorized_corpus = tfidf_vectorizer.fit_transform(corpus).toarray()
print(vectorized_corpus)

[[0.         0.2355126  0.         0.2355126  0.4710252  0.
  0.         0.         0.2355126  0.15032464 0.2355126  0.2355126
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.18568084 0.
  0.2355126  0.         0.2355126  0.         0.         0.2355126
  0.         0.2355126  0.4710252 ]
 [0.         0.         0.35291425 0.         0.         0.35291425
  0.         0.         0.         0.22526059 0.         0.
  0.         0.35291425 0.         0.35291425 0.         0.
  0.35291425 0.         0.         0.35291425 0.27824164 0.
  0.         0.         0.         0.         0.         0.
  0.35291425 0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.5        0.         0.         0.         0.
  0.5        0.         0.         0.         0.5        0.5
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0. 

In [134]:
print(tfidf_vectorizer.vocabulary_)

{'indian': 9, 'cricket': 3, 'team': 26, 'wins': 31, 'world': 32, 'cup': 4, 'says': 22, 'capt': 1, 'virat': 29, 'kohli': 10, 'held': 8, 'sri': 24, 'lanka': 11, 'win': 30, 'next': 15, 'lok': 13, 'sabha': 21, 'elections': 5, 'confident': 2, 'pm': 18, 'nobel': 16, 'laurate': 12, 'hearts': 7, 'people': 17, 'movie': 14, 'raazi': 19, 'exciting': 6, 'spy': 23, 'thriller': 27, 'based': 0, 'upon': 28, 'real': 20, 'story': 25}
