In [5]:
#!pip install nltk

In [6]:
import nltk

In [7]:
#nltk.download()

In [8]:
# corpus - A large collection of text

from nltk.corpus import brown

In [9]:
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [10]:
len(brown.categories())

15

In [11]:
data = brown.sents(categories='adventure')

In [12]:
len(data)

4637

In [13]:
" ".join(data[3])

"If he had married her , he'd have been asking for trouble ."

# Bag of Words Pipeline
- Get the data/corpus
- Tokenisation, stopword removal
- stemming
- building a vocab
- vectorization
- classification

# Tokenisation and stopword removal

In [14]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [15]:
document = '''It was a very plesant day. The weather was cool and there were light showers. 
I went to the market to buy some fruits'''

sentence = "Send all the 50 documents related to chapters 1,2,3 at prateek@cb.com"

In [16]:
sents = sent_tokenize(document)

In [17]:
print(sents)

['It was a very plesant day.', 'The weather was cool and there were light showers.', 'I went to the market to buy some fruits']


In [18]:
sentence.split()

['Send',
 'all',
 'the',
 '50',
 'documents',
 'related',
 'to',
 'chapters',
 '1,2,3',
 'at',
 'prateek@cb.com']

In [19]:
words = word_tokenize(sentence)

In [20]:
words

['Send',
 'all',
 'the',
 '50',
 'documents',
 'related',
 'to',
 'chapters',
 '1,2,3',
 'at',
 'prateek',
 '@',
 'cb.com']

# Stopword removal

In [21]:
from nltk.corpus import stopwords

sw = set(stopwords.words('english'))

In [22]:
print(sw)

{'then', 'our', 'do', 'which', 'with', 'these', 'in', 'm', 'weren', 'having', 'after', 'out', 'does', 'ours', 'so', 'where', 'this', 'did', 'other', 'isn', 'what', 'the', 'o', "you've", "won't", 'himself', 'just', 'ain', 'hers', 'his', 'same', 'more', 'has', 'them', 'here', 'while', "shan't", 's', 'below', "aren't", 'are', 'doesn', 'both', 'against', "needn't", 'yourselves', 'herself', 'yourself', 'any', 'were', 'ourselves', 'to', 'very', 'under', 'didn', 'wouldn', 'until', "that'll", 'but', 'mustn', 'me', 'don', 'into', 'hadn', 'off', 'have', 'her', 'and', 'been', 'i', 'is', 'some', 'couldn', 'wasn', 'haven', 'can', 'doing', 'will', 'about', 'theirs', 'my', 'before', 'for', 'she', "wouldn't", 'ma', 'all', 't', 'a', 'whom', "haven't", "mustn't", 'no', 'own', 'not', 'they', 'on', 'at', "didn't", 'itself', 'during', 'nor', 'd', 'shan', "you'll", "don't", 'll', "weren't", 'once', 'or', "doesn't", 'above', 'between', 'it', 'those', 'was', 'from', "hadn't", "mightn't", 'be', 'up', 'than', '

In [23]:
type(sw)

set

In [24]:
def remove_stopwords(txt, stopword):
    """Return the tokens of ``txt`` that are not stopwords.

    Args:
        txt: Iterable of word tokens (e.g. the result of ``str.split()``
            or ``word_tokenize``).
        stopword: Collection of stopwords to drop (e.g. a ``set``).

    Returns:
        A new list holding the surviving tokens in their original order
        and with their original casing.
    """
    # Also test the lower-cased token: the stopword list used in this notebook
    # is all lower-case, so capitalised tokens such as "The" would otherwise
    # slip through the filter.
    return [
        w for w in txt
        if w not in stopword and w.lower() not in stopword
    ]

In [25]:
sentence = "i am not borthered about her very much".split()
remove_stopwords(sentence, sw)

['borthered', 'much']

# Tokenisation using Regular expression

In [26]:
sentance = "Send all the 50 documents related to chapters 1,2,3 at prateek@cb.com"

In [27]:
from nltk.tokenize import RegexpTokenizer

In [28]:
# Each token is a maximal run of ASCII letters, '@' and '.'; digits, commas and
# whitespace act as separators, so numbers are dropped while an e-mail address
# such as prateek@cb.com stays in one piece.
tokenizer = RegexpTokenizer('[a-zA-Z@.]+')

In [29]:
useful_text = tokenizer.tokenize(sentance)

In [30]:
useful_text  # Removed all the numbers

['Send',
 'all',
 'the',
 'documents',
 'related',
 'to',
 'chapters',
 'at',
 'prateek@cb.com']

# Website for testing regular expressions
- https://www.regexpal.com/

# Stemming
- Process that transforms particular words (verbs, plurals) into their root (radical) forms.
- Preserves the semantics of the sentence without increasing the number of unique tokens.
- Example- jumps,jumping, jumped, jump => jump

In [31]:
text = """Foxes love to make jumps. The quick brown fox was seen jumping over the lovely dog from a 6ft high wall."""

In [32]:
# Snowball stemmer, Porter, Lancaster Stemmer
from nltk.stem.snowball import SnowballStemmer, PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

ps = PorterStemmer()



In [33]:
ps.stem('jumping')

'jump'

In [34]:
ps.stem('jumps')

'jump'

In [35]:
ps.stem('lovely')

'love'

In [36]:
#Snowball stemmer
ss = SnowballStemmer('english')
ss.stem('lovely')

'love'

In [37]:
ss.stem('colding')

'cold'

In [38]:
# Lemmatisation
# NOTE(review): lemmatize() is called without a `pos` argument and returns
# 'jumping' unchanged. NLTK documents the default as pos='n' (noun), so the
# verb form is not reduced; pass pos='v' to get 'jump'.
from nltk.stem import WordNetLemmatizer

wn = WordNetLemmatizer()
wn.lemmatize('jumping')

'jumping'

# Building a Vocab and vectorization

In [39]:
# Sample corpus containing 4 documents (one string per document)
corpus = [
    'Indian cricket team will wins World Cup, says Capt. Virat Kohli.',
    'We will win next Lok Sabha Elections, says confident Indian PM',
    'The nobel laurate won the heart of the people',
    'The movie Raazi is an exciting Indian Spy thriller based upon a real story.'
]

In [40]:
from sklearn.feature_extraction.text import CountVectorizer

In [41]:
cv = CountVectorizer()

In [42]:
vectorized_corpus = cv.fit_transform(corpus)

In [43]:
# Build the dense count matrix from the already-fitted vectorizer rather than
# from the previous value of `vectorized_corpus`. The result is the same, but
# the cell is now idempotent: re-running it no longer fails because an ndarray
# has no .toarray().
vectorized_corpus = cv.transform(corpus).toarray()

In [44]:
vectorized_corpus

array([[0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1],
       [0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
        1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
        0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]])

In [45]:
cv.vocabulary_

{'indian': 9,
 'cricket': 4,
 'team': 26,
 'will': 32,
 'wins': 34,
 'world': 36,
 'cup': 5,
 'says': 23,
 'capt': 2,
 'virat': 30,
 'kohli': 11,
 'we': 31,
 'win': 33,
 'next': 15,
 'lok': 13,
 'sabha': 22,
 'elections': 6,
 'confident': 3,
 'pm': 19,
 'the': 27,
 'nobel': 16,
 'laurate': 12,
 'won': 35,
 'heart': 8,
 'of': 17,
 'people': 18,
 'movie': 14,
 'raazi': 20,
 'is': 10,
 'an': 0,
 'exciting': 7,
 'spy': 24,
 'thriller': 28,
 'based': 1,
 'upon': 29,
 'real': 21,
 'story': 25}

In [46]:
# Reverse Mapping
numbers = vectorized_corpus
numbers

array([[0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1],
       [0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
        1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
        0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]])

In [47]:
s = cv.inverse_transform(numbers)

In [48]:
s[0]

array(['capt', 'cricket', 'cup', 'indian', 'kohli', 'says', 'team',
       'virat', 'will', 'wins', 'world'], dtype='<U9')

# Tf-idf Normalisation(Term frequency-Inverse Document frequency)
- Avoid features that occur very often, because they carry less information.
- Information decreases as the number of occurrences increases across different types of documents.
- So we define another term, inverse document frequency (IDF), which associates a weight with every term: the more documents a term appears in, the lower its weight.

In [49]:
sent_1 = ["this is good movie"]
sent_2 = ["this was good movie"]
sent_3 = ["this is not good movie"]

# Each sent_* is a one-document list. Concatenate them so that `corpus` is a
# flat list of strings (one per document) instead of a list of lists, which a
# text vectorizer cannot consume.
corpus = sent_1 + sent_2 + sent_3

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [51]:
tfidf = TfidfVectorizer()

In [55]:
# NOTE(review): the vectorizer is fitted on sent_1 only, i.e. a single
# document, so every term receives the same weight (0.5 each after
# normalisation). Fit on all three sentences to see tf-idf down-weight the
# words they share.
vc= tfidf.fit_transform(sent_1).toarray()

In [56]:
vc

array([[0.5, 0.5, 0.5, 0.5]])

In [57]:
tfidf.vocabulary_

{'this': 3, 'is': 1, 'good': 0, 'movie': 2}