# Basic NLP Pipelining
- *Data Collection*
- *Tokenization, stopword removal, stemming*
- *Building a common vocab*
- *Vectorizing the documents*
- *Performing Classification/Clustering*

### 1. **Data Collection**

In [1]:
from nltk.corpus import brown

In [5]:
# List the category labels available in the Brown corpus.
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [6]:
# Load the 'editorial' category as pre-tokenized sentences and show the
# second sentence (index 1) as a sample.
data = brown.sents(categories='editorial')
print(data[1])

['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.']


### 2. **Tokenization**

In [7]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [135]:
# Sample paragraph for the tokenization demos. The missing space after
# "switzerland." is left as-is: it is what makes word_tokenize emit the fused
# token 'switzerland.a', which the regex tokenizer later avoids.
text='''It was a rainy day, I was not standing alone on the balcony of my house in switzerland.A flock of birds was flying in the sky.'''

In [136]:
# Split the paragraph into sentences, then word-tokenize the first sentence
# after lower-casing it.
sents= sent_tokenize(text)
words= word_tokenize(sents[0].lower())

In [137]:
# Show the tokens produced by word_tokenize.
print(words)

['it', 'was', 'a', 'rainy', 'day.day', ',', 'i', 'was', 'not', 'standing', 'alone', 'on', 'the', 'balcony', 'of', 'my', 'house', 'in', 'switzerland.a', 'flock', 'of', 'birds', 'was', 'flying', 'in', 'the', 'sky', '.']


In [11]:
## RegexpTokenizer splits text into tokens using a regular expression of our choice

from nltk.tokenize import RegexpTokenizer

In [12]:
# Tokens are runs of ASCII letters only, so punctuation and digits never
# become tokens.
tokenizer = RegexpTokenizer('[a-zA-Z]+')

# Tokenize the whole lower-cased paragraph.
words= tokenizer.tokenize(text.lower())
print(words)

['it', 'was', 'a', 'rainy', 'day', 'i', 'was', 'not', 'standing', 'alone', 'on', 'the', 'balcony', 'of', 'my', 'house', 'in', 'switzerland', 'a', 'flock', 'of', 'birds', 'was', 'flying', 'in', 'the', 'sky']


### 3. **Removing Stopwords**

In [13]:
from nltk.corpus import stopwords

In [14]:
# English stop-word list from NLTK, built once and reused by removeStopWords.
stopWords=list(stopwords.words('english'))
#print(len(stopWords))

def removeStopWords(sent, stop_words=None):
    """Return the tokens of ``sent`` that are not stop words.

    Parameters
    ----------
    sent : iterable of str
        Tokens to filter. Order and duplicates are preserved.
    stop_words : iterable of str, optional
        Words to drop. When omitted, the module-level ``stopWords`` list is
        used, which matches the original single-argument behaviour.

    Returns
    -------
    list of str
        The tokens of ``sent`` that do not appear in the stop-word collection.
    """
    if stop_words is None:
        stop_words = stopWords
    # Build the lookup once per call so each membership test is a hash lookup
    # instead of a linear scan over the stop-word list.
    blocked = set(stop_words)
    return [w for w in sent if w not in blocked]

In [15]:
# Drop stop words from the regex-tokenized paragraph.
useful_words=removeStopWords(words)

In [16]:
# Show the tokens that survive stop-word removal.
print(useful_words)

['rainy', 'day', 'standing', 'alone', 'balcony', 'house', 'switzerland', 'flock', 'birds', 'flying', 'sky']


### 4. **Stemming**
    1. Snowball Stemmer (Multilingual)
    2. Porter Stemmer
    3. Lancaster Stemmer

In [118]:
# A second sample sentence, tokenized with the letters-only tokenizer and
# stripped of stop words; the resulting `words` list is what gets stemmed
# further down.
text1='''It was a lovely day , a man came and jumped over a pile of garbage he seemed to be going a way fast somewhere'''
words=tokenizer.tokenize(text1.lower())
words=removeStopWords(words)
print(words)

['lovely', 'day', 'man', 'came', 'jumped', 'pile', 'garbage', 'seemed', 'going', 'way', 'fast', 'somewhere']


In [113]:
from nltk.stem.snowball import PorterStemmer,SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

# One instance of each stemmer under comparison.
ps = PorterStemmer()
ls = LancasterStemmer()
ss = SnowballStemmer('english')

# Each row holds the word handed to (Porter, Lancaster, Snowball) in turn.
# NOTE(review): the first row gives 'soundly' to Porter but 'teenager' to the
# other two -- confirm whether that asymmetry is intended.
comparison_rows = [
    ('soundly', 'teenager', 'teenager'),
    ('having', 'having', 'having'),
    ('awesome', 'awesome', 'awesome'),
]

for row_index, (porter_word, lancaster_word, snowball_word) in enumerate(comparison_rows):
    if row_index > 0:
        print()  # blank separator between groups, none after the last one
    print(ps.stem(porter_word))
    print(ls.stem(lancaster_word))
    print(ss.stem(snowball_word))

soundli
teen
teenag

have
hav
have

awesom
awesom
awesom


In [19]:
# Replace every token in `words` with its Porter stem.
# Note: this rebinds `words`, so re-running the cell stems the stems again.
words=[ps.stem(w) for w in words ]
print(words)

['love', 'day', 'man', 'came', 'jump', 'pile', 'garbag', 'seem', 'go', 'way', 'fast', 'somewher']


## Function to perform all these steps when given a text.

In [1]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
def prepare(text):
    """Tokenize ``text`` and drop English stop words.

    The text is lower-cased, split into runs of ASCII letters (so digits and
    punctuation never become tokens), and filtered against NLTK's English
    stop-word list.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The remaining tokens, in their original order with duplicates kept.
    """
    # Tokenize
    tokenizer = RegexpTokenizer('[a-zA-Z]+')
    wordsList = tokenizer.tokenize(text.lower())

    # Remove stop words. A set makes each membership test a hash lookup
    # instead of a linear scan over the whole stop-word list.
    stopWords = set(stopwords.words('english'))
    wordsList = [w for w in wordsList if w not in stopWords]

    # Stemming & lemmatization (currently disabled; un-comment to enable)
    # ps = PorterStemmer()
    # wordsList = [ps.stem(w) for w in wordsList]
    # lm = WordNetLemmatizer()
    # wordsList = [lm.lemmatize(w, pos='v') for w in wordsList]
    # wordsList = [lm.lemmatize(w, pos='a') for w in wordsList]
    # wordsList = [lm.lemmatize(w, pos='n') for w in wordsList]
    # wordsList = [lm.lemmatize(w, pos='s') for w in wordsList]
    # wordsList = [lm.lemmatize(w, pos='r') for w in wordsList]

    return wordsList

In [2]:
# Run prepare() on a sample containing irregular spacing and a line break.
text1='''It was a lovely day, a man   came and jumped   over
a pile of garbage he seemed to be going a way fastest somewhere.'''
print(prepare(text1))

['lovely', 'day', 'man', 'came', 'jumped', 'pile', 'garbage', 'seemed', 'going', 'way', 'fastest', 'somewhere']


## Building a common vocabulary and vectorizing documents (based on the bag-of-words model)

In [3]:
import numpy as np
# Toy corpus of four short documents; every one of them mentions "india",
# and the first repeats "raining" four times.
corpus=[
    'It was raining raining raining raining on the semifinals of the cricket world cup held in india,so the match was postponed.',
    'India has many mesmerizing historical monuments that every one should visit',
    'The teacher that was teaching in south asian university was from india unlike other teachers that were native.',
    'News editing team tried there level best to show india as a developing country with fast growing GDP.'
]

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer that delegates tokenization to prepare().
cv=CountVectorizer(tokenizer=prepare)

In [5]:
# Learn the vocabulary from the corpus and count term occurrences per document.
vectorized_corpus=cv.fit_transform(corpus)
#print(type(vectorized_corpus))
#print(vectorized_corpus)  # sparse matrix
# Convert to a dense array: one row per document, one column per vocabulary term.
vc=vectorized_corpus.toarray()   # dense matrix
print(vc)

[[0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 1 4 1 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 1 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 1 1 0 0]
 [0 1 1 0 0 1 1 0 1 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0]]


In [6]:
# get_feature_names() was removed in scikit-learn 1.2. Its replacement,
# get_feature_names_out() (scikit-learn >= 1.0), returns an ndarray, so it is
# converted to a plain list to keep the printed form unchanged.
print(cv.get_feature_names_out().tolist())

['asian', 'best', 'country', 'cricket', 'cup', 'developing', 'editing', 'every', 'fast', 'gdp', 'growing', 'held', 'historical', 'india', 'level', 'many', 'match', 'mesmerizing', 'monuments', 'native', 'news', 'one', 'postponed', 'raining', 'semifinals', 'show', 'south', 'teacher', 'teachers', 'teaching', 'team', 'tried', 'university', 'unlike', 'visit', 'world']


In [7]:
# Mapping from each term to its column index in the count matrix.
print(cv.vocabulary_)

{'raining': 23, 'semifinals': 24, 'cricket': 3, 'world': 35, 'cup': 4, 'held': 11, 'india': 13, 'match': 16, 'postponed': 22, 'many': 15, 'mesmerizing': 17, 'historical': 12, 'monuments': 18, 'every': 7, 'one': 21, 'visit': 34, 'teacher': 27, 'teaching': 29, 'south': 26, 'asian': 0, 'university': 32, 'unlike': 33, 'teachers': 28, 'native': 19, 'news': 20, 'editing': 6, 'team': 30, 'tried': 31, 'level': 14, 'best': 1, 'show': 25, 'developing': 5, 'country': 2, 'fast': 8, 'growing': 10, 'gdp': 9}


In [8]:
# Vectorize an unseen sentence with the already-fitted vocabulary. In the
# recorded output only the 'cricket' column (index 3) is non-zero.
newSentence='This is completely different sentence "cricket".'
cv.transform([newSentence]).toarray()

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [9]:
## Given a count vector, recover the vocabulary terms it marks as present.
# inverse_transform expects a 2-D (n_samples, n_features) array; current
# scikit-learn releases reject 1-D input. Build one row as wide as the fitted
# vocabulary instead of a bare 22-element vector.
vect = np.zeros((1, len(cv.vocabulary_)))
vect[0, :22] = 1       # switch on the first 22 vocabulary columns ...
vect[0, 3:5] = 0       # ... except columns 3-4
vect[0, 11:17] = 0     # ... and columns 11-16
print(cv.inverse_transform(vect))

[array(['asian', 'best', 'country', 'developing', 'editing', 'every',
       'fast', 'gdp', 'growing', 'mesmerizing', 'monuments', 'native',
       'news', 'one'], dtype='<U11')]


In [10]:
# Recover the terms present in the first document. The slice vc[:1] keeps the
# row 2-D (1, n_features), as inverse_transform requires; a bare vc[0] is 1-D
# and is rejected by current scikit-learn releases.
cv.inverse_transform(vc[:1])

[array(['cricket', 'cup', 'held', 'india', 'match', 'postponed', 'raining',
        'semifinals', 'world'], dtype='<U11')]

## This is the unigram bag-of-words model: each unique word goes into the bag and is used as a feature.
## In the bigram bag-of-words model, pairs of adjacent words are used as features. This captures a little of the sentence's meaning (or at least its sentiment, e.g. negative or positive).

In [11]:
## Bigrams
# ngram_range=(2,2): only pairs of adjacent tokens (after prepare()) are features.
cv1=CountVectorizer(tokenizer=prepare,ngram_range=(2,2))
vc1=cv1.fit_transform(corpus)
vc1=vc1.toarray()   # dense matrix
print(vc1,end='\n\n')
# get_feature_names() was removed in scikit-learn 1.2; use its replacement and
# convert to a list so the printed form is unchanged.
print(cv1.get_feature_names_out().tolist())

[[0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 3 1 1 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 1 1 0]
 [0 1 1 0 0 1 1 0 1 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 0]]

['asian university', 'best show', 'country fast', 'cricket world', 'cup held', 'developing country', 'editing team', 'every one', 'fast growing', 'growing gdp', 'held india', 'historical monuments', 'india developing', 'india many', 'india match', 'india unlike', 'level best', 'many mesmerizing', 'match postponed', 'mesmerizing historical', 'monuments every', 'news editing', 'one visit', 'raining raining', 'raining semifinals', 'semifinals cricket', 'show india', 'south asian', 'teacher teaching', 'teachers native', 'teaching south', 'team tried', 'tried level', 'university india', 'unlike teachers', 'world cup']


In [12]:
## Bigrams and unigrams both
# ngram_range=(1,2): single tokens and adjacent pairs are both features.
cv1=CountVectorizer(tokenizer=prepare,ngram_range=(1,2))
vc1=cv1.fit_transform(corpus)
vc1=vc1.toarray()   # dense matrix
print(vc1,end='\n\n')
# get_feature_names() was removed in scikit-learn 1.2; use its replacement and
# convert to a list so the printed form is unchanged.
print(cv1.get_feature_names_out().tolist())

[[0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 1 1
  0 0 0 0 0 0 0 0 0 1 4 3 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 1 0 1 0 0 0 0 1 1 0 0
  1 1 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0
  0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 1 0 0 0]
 [0 0 1 1 1 1 0 0 0 0 1 1 1 1 0 0 1 1 1 1 1 0 0 0 0 1 1 0 0 0 1 1 0 0 0 0
  0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0]]

['asian', 'asian university', 'best', 'best show', 'country', 'country fast', 'cricket', 'cricket world', 'cup', 'cup held', 'developing', 'developing country', 'editing', 'editing team', 'every', 'every one', 'fast', 'fast growing', 'gdp', 'growing', 'growing gdp', 'held', 'held india', 'historical', 'historical monuments', 'india', 'india developing', 'india many', 'india match', 'india unlike', '

In [13]:
## Trigrams
# ngram_range=(3,3): only runs of three adjacent tokens are features.
cv1=CountVectorizer(tokenizer=prepare,ngram_range=(3,3))
vc1=cv1.fit_transform(corpus)
vc1=vc1.toarray()   # dense matrix
print(vc1,end='\n\n')
# get_feature_names() was removed in scikit-learn 1.2; use its replacement and
# convert to a list so the printed form is unchanged.
print(cv1.get_feature_names_out().tolist())

[[0 0 0 1 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 2 1 1 1 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 1 0]
 [0 1 1 0 0 1 1 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0]]

['asian university india', 'best show india', 'country fast growing', 'cricket world cup', 'cup held india', 'developing country fast', 'editing team tried', 'every one visit', 'fast growing gdp', 'held india match', 'historical monuments every', 'india developing country', 'india many mesmerizing', 'india match postponed', 'india unlike teachers', 'level best show', 'many mesmerizing historical', 'mesmerizing historical monuments', 'monuments every one', 'news editing team', 'raining raining raining', 'raining raining semifinals', 'raining semifinals cricket', 'semifinals cricket world', 'show india developing', 'south asian university', 'teacher teaching south', 'teaching south asian', 'team tried level', 'trie

In [14]:
## Trigrams and bigrams
# ngram_range=(2,3): adjacent pairs and triples are both features.
cv1=CountVectorizer(tokenizer=prepare,ngram_range=(2,3))
vc1=cv1.fit_transform(corpus)
vc1=vc1.toarray()   # dense matrix
print(vc1,end='\n\n')
# get_feature_names() was removed in scikit-learn 1.2; use its replacement and
# convert to a list so the printed form is unchanged.
print(cv1.get_feature_names_out().tolist())

[[0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1
  0 0 0 0 0 0 0 3 2 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0 1 1 0 0 0 0 0 0 1 1 0
  1 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 1 1 1 1 0 0]
 [0 0 1 1 1 1 0 0 0 0 1 1 1 1 0 0 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 1 0 0 0
  0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0]]

['asian university', 'asian university india', 'best show', 'best show india', 'country fast', 'country fast growing', 'cricket world', 'cricket world cup', 'cup held', 'cup held india', 'developing country', 'developing country fast', 'editing team', 'editing team tried', 'every one', 'every one visit', 'fast growing', 'fast growing gdp', 'growing gdp', 'held india', 'held india match', 'historical monuments', 'historical

# TF-IDF Normalisation
**TF (Term frequency)** of a term is the number of times that term appears in the document.

Formula for calculating **Idf (term,corpus) = log ( (n+1)/ (1 + count(D,t) )) + 1**

Where,

count(D,t) = number of documents containing the term t

n= number of documents in corpus

**Weight** (for each feature) = **Tf * IDf** 

**The resulting tf-idf vectors are then normalized by the Euclidean norm (when norm='l2'):**

$v_{norm} = \frac{v}{\|v\|_2} = \frac{v}{\sqrt{v_1^2 + v_2^2 + \dots + v_n^2}}$

- It is used to down-weight features that occur very often, because they carry little information. For example, when a corpus contains many documents about different topics and almost every document contains words like books, water, etc., those words should have very little weight when running an ML algorithm such as clustering or classification on the documents.
- The information imparted by a particular word decreases as the number of documents it occurs in increases.
- So we use TF-IDF (Term Frequency - Inverse Document Frequency) normalisation to associate a weight with every feature or term in the dictionary.

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
# Unigram TF-IDF using the vectorizer's default tokenization (prepare() is not
# passed here), so stop words such as 'the' and 'was' stay in the vocabulary.
tfidf_vectorizer=TfidfVectorizer(ngram_range=(1,1))
tv=tfidf_vectorizer.fit_transform(corpus).toarray()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement and
# convert to a list so the printed form is unchanged.
print(tfidf_vectorizer.get_feature_names_out().tolist())

['as', 'asian', 'best', 'country', 'cricket', 'cup', 'developing', 'editing', 'every', 'fast', 'from', 'gdp', 'growing', 'has', 'held', 'historical', 'in', 'india', 'it', 'level', 'many', 'match', 'mesmerizing', 'monuments', 'native', 'news', 'of', 'on', 'one', 'other', 'postponed', 'raining', 'semifinals', 'should', 'show', 'so', 'south', 'teacher', 'teachers', 'teaching', 'team', 'that', 'the', 'there', 'to', 'tried', 'university', 'unlike', 'visit', 'was', 'were', 'with', 'world']


In [23]:
# Mapping from each term to its column index in the TF-IDF matrix.
print(tfidf_vectorizer.vocabulary_)

{'it': 18, 'was': 49, 'raining': 31, 'on': 27, 'the': 42, 'semifinals': 32, 'of': 26, 'cricket': 4, 'world': 52, 'cup': 5, 'held': 14, 'in': 16, 'india': 17, 'so': 35, 'match': 21, 'postponed': 30, 'has': 13, 'many': 20, 'mesmerizing': 22, 'historical': 15, 'monuments': 23, 'that': 41, 'every': 8, 'one': 28, 'should': 33, 'visit': 48, 'teacher': 37, 'teaching': 39, 'south': 36, 'asian': 1, 'university': 46, 'from': 10, 'unlike': 47, 'other': 29, 'teachers': 38, 'were': 50, 'native': 24, 'news': 25, 'editing': 7, 'team': 40, 'tried': 45, 'there': 43, 'level': 19, 'best': 2, 'to': 44, 'show': 34, 'as': 0, 'developing': 6, 'country': 3, 'with': 51, 'fast': 9, 'growing': 12, 'gdp': 11}


In [24]:
# Terms present in the third document. The slice tv[2:3] keeps the row 2-D,
# as inverse_transform requires; a bare tv[2] is 1-D and is rejected by
# current scikit-learn releases.
print(tfidf_vectorizer.inverse_transform(tv[2:3]))
# TF-IDF weight of column 38 ('teachers' in the vocabulary) for that document.
print(tv[2][38])

[array(['asian', 'from', 'in', 'india', 'native', 'other', 'south',
       'teacher', 'teachers', 'teaching', 'that', 'the', 'university',
       'unlike', 'was', 'were'], dtype='<U11')]
0.23912604584443742


In [25]:
# Dense TF-IDF matrix: one row per document, one column per vocabulary term.
print(tv)

[[0.         0.         0.         0.         0.16672546 0.16672546
  0.         0.         0.         0.         0.         0.
  0.         0.         0.16672546 0.         0.13144827 0.08700426
  0.16672546 0.         0.         0.16672546 0.         0.
  0.         0.         0.16672546 0.16672546 0.         0.
  0.16672546 0.66690183 0.16672546 0.         0.         0.16672546
  0.         0.         0.         0.         0.         0.
  0.3943448  0.         0.         0.         0.         0.
  0.         0.26289653 0.         0.         0.16672546]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.31791864 0.         0.         0.
  0.         0.31791864 0.         0.31791864 0.         0.16590314
  0.         0.         0.31791864 0.         0.31791864 0.31791864
  0.         0.         0.         0.         0.31791864 0.
  0.         0.         0.         0.31791864 0.         0.
  0.         0.         0.         0.         0.         0.250