In [1]:
import nltk
import numpy as np

In [2]:
# Fetch only the NLTK resources this notebook uses. A bare nltk.download()
# opens the interactive downloader, which stalls an unattended "Run All".
# NOTE(review): recent NLTK releases may also need "punkt_tab" for
# word_tokenize/sent_tokenize -- confirm against the installed version.
for resource in ("brown", "punkt", "stopwords", "wordnet"):
    nltk.download(resource, quiet=True)

In [2]:
from nltk.corpus import brown

In [3]:
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [4]:
words=brown.words(categories="mystery")

In [6]:
print(len(words))
words[:30]

57169


['There',
 'were',
 'thirty-eight',
 'patients',
 'on',
 'the',
 'bus',
 'the',
 'morning',
 'I',
 'left',
 'for',
 'Hanover',
 ',',
 'most',
 'of',
 'them',
 'disturbed',
 'and',
 'hallucinating',
 '.',
 'An',
 'interne',
 ',',
 'a',
 'nurse',
 'and',
 'two',
 'attendants',
 'were']

In [8]:
sentences=brown.sents(categories="news")
sentences[:2]

[['The',
  'Fulton',
  'County',
  'Grand',
  'Jury',
  'said',
  'Friday',
  'an',
  'investigation',
  'of',
  "Atlanta's",
  'recent',
  'primary',
  'election',
  'produced',
  '``',
  'no',
  'evidence',
  "''",
  'that',
  'any',
  'irregularities',
  'took',
  'place',
  '.'],
 ['The',
  'jury',
  'further',
  'said',
  'in',
  'term-end',
  'presentments',
  'that',
  'the',
  'City',
  'Executive',
  'Committee',
  ',',
  'which',
  'had',
  'over-all',
  'charge',
  'of',
  'the',
  'election',
  ',',
  '``',
  'deserves',
  'the',
  'praise',
  'and',
  'thanks',
  'of',
  'the',
  'City',
  'of',
  'Atlanta',
  "''",
  'for',
  'the',
  'manner',
  'in',
  'which',
  'the',
  'election',
  'was',
  'conducted',
  '.']]

In [9]:
" ".join(sentences[0])

"The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place ."

## Basic NLP Pipeline
* Data Collection
* Tokenization, stopword removal, stemming
* Building a common vocabulary
* Vectorize the documents
* Perform Classification

## Tokenization

In [9]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [10]:
from nltk.tokenize import word_tokenize,sent_tokenize

In [11]:
s="Dog is running on the grass"

In [12]:
word_tokenize(s)

['Dog', 'is', 'running', 'on', 'the', 'grass']

In [13]:
sen="Dog is running on the grass. Weather is awesome. Cat is bad"

In [14]:
sent_tokenize(sen)

['Dog is running on the grass.', 'Weather is awesome.', 'Cat is bad']

# Stopwords

In [15]:
from nltk.corpus import stopwords

In [16]:
sw=stopwords.words("english")

In [17]:
sents="I am not your yaar mind your langvez"

In [18]:
# The stopword list is all lowercase, so compare lower-cased tokens;
# otherwise capitalised stopwords such as "I" slip through the filter.
[i for i in word_tokenize(sents) if i.lower() not in sw]

['yaar', 'mind', 'langvez']

# Stemming/Lemmatization

In [19]:
from nltk.stem import PorterStemmer,SnowballStemmer,WordNetLemmatizer

In [20]:
# Instantiate the three normalizers compared in the cells below:
# two stemmers (Porter, English Snowball) and a WordNet lemmatizer.
ps=PorterStemmer()
ss=SnowballStemmer("english")
Wnl=WordNetLemmatizer()

In [21]:
ps.stem("jumps")

'jump'

In [22]:
ss.stem("roses")

'rose'

In [24]:
# lemmatize() treats words as nouns unless told otherwise, which leaves the
# superlative "simplest" unchanged; pass pos="a" to lemmatize it as an adjective.
Wnl.lemmatize("simplest", pos="a")

'simple'

In [25]:
def filter_words(s):
    """Drop English stopwords from a token sequence and stem the remainder.

    Parameters
    ----------
    s : iterable of str
        Tokens to filter, e.g. the output of a tokenizer.

    Returns
    -------
    list of str
        Stems (produced by the notebook-level Porter stemmer ``ps``) of every
        token that is not in the notebook-level stopword list ``sw``, in the
        order the tokens were given.
    """
    # Build a set once per call: membership tests on the raw list are linear.
    stop_words = set(sw)
    # The stopword list is lowercase, so compare lower-cased tokens; this way
    # capitalised stopwords ("The", "I") are removed as well.
    return [ps.stem(token) for token in s if token.lower() not in stop_words]

In [26]:
filter_words(["dog","is","running","on","grass"])

['dog', 'run', 'grass']

In [27]:
from nltk.tokenize import RegexpTokenizer

In [28]:
rg=RegexpTokenizer("[a-zA-Z0-9]+")

In [29]:
rg.tokenize("Dog is running ,1,2,3, $^#@(*( cat is also running))")

['Dog', 'is', 'running', '1', '2', '3', 'cat', 'is', 'also', 'running']

# Building Common Vocabulary

In [30]:
def my_tokenize(s):
    """Split ``s`` with the regexp tokenizer ``rg``, then clean the tokens via ``filter_words``."""
    return filter_words(rg.tokenize(s))

In [31]:
corpus=["Indian cricket Indian team will not win World Cup",
        "We will win next Lok Sabha Election ,says Indian PM",
        "Razzi is an exciting Indian spy movie based on real incident",
        "Apj won heart of many Indians"]

In [32]:
from sklearn.feature_extraction.text import CountVectorizer

In [33]:
cv=CountVectorizer(ngram_range=(1,2))

In [34]:
cv.fit(corpus)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [35]:
len(cv.vocabulary_)

63

In [36]:
cv.vocabulary_

{'indian': 16,
 'cricket': 6,
 'team': 49,
 'will': 53,
 'not': 32,
 'win': 56,
 'world': 61,
 'cup': 8,
 'indian cricket': 17,
 'cricket indian': 7,
 'indian team': 20,
 'team will': 50,
 'will not': 54,
 'not win': 33,
 'win world': 58,
 'world cup': 62,
 'we': 51,
 'next': 30,
 'lok': 24,
 'sabha': 43,
 'election': 9,
 'says': 45,
 'pm': 38,
 'we will': 52,
 'will win': 55,
 'win next': 57,
 'next lok': 31,
 'lok sabha': 25,
 'sabha election': 44,
 'election says': 10,
 'says indian': 46,
 'indian pm': 18,
 'razzi': 39,
 'is': 22,
 'an': 0,
 'exciting': 11,
 'spy': 47,
 'movie': 28,
 'based': 4,
 'on': 36,
 'real': 41,
 'incident': 15,
 'razzi is': 40,
 'is an': 23,
 'an exciting': 1,
 'exciting indian': 12,
 'indian spy': 19,
 'spy movie': 48,
 'movie based': 29,
 'based on': 5,
 'on real': 37,
 'real incident': 42,
 'apj': 2,
 'won': 59,
 'heart': 13,
 'of': 34,
 'many': 26,
 'indians': 21,
 'apj won': 3,
 'won heart': 60,
 'heart of': 14,
 'of many': 35,
 'many indians': 27}

In [37]:
vc=cv.transform(["I am an Indian"])

In [38]:
cv.inverse_transform(vc.toarray())

[array(['an', 'indian'], dtype='<U15')]

## With our own tokenizer

In [39]:
# A custom tokenizer replaces the built-in regex tokenization, so the default
# token_pattern is never used; set it to None to make that explicit and to
# avoid scikit-learn's "token_pattern will not be used" warning.
cv2=CountVectorizer(tokenizer=my_tokenize,token_pattern=None,ngram_range=(1,2))

In [40]:
vc=cv2.fit_transform(corpus)

In [41]:
print(vc[0])

  (0, 14)	2
  (0, 4)	1
  (0, 38)	1
  (0, 40)	1
  (0, 43)	1
  (0, 6)	1
  (0, 15)	1
  (0, 5)	1
  (0, 18)	1
  (0, 39)	1
  (0, 42)	1
  (0, 44)	1


In [42]:
len(cv2.vocabulary_)

45

# TF-IDF

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [44]:
# A custom tokenizer replaces the built-in regex tokenization, so the default
# token_pattern is never used; set it to None to make that explicit and to
# avoid scikit-learn's "token_pattern will not be used" warning.
tfidf=TfidfVectorizer(tokenizer=my_tokenize,token_pattern=None)

In [45]:
tfidf.fit_transform(corpus).toarray()

array([[0.        , 0.        , 0.41845521, 0.41845521, 0.        ,
        0.        , 0.        , 0.        , 0.43673458, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.41845521,
        0.3299149 , 0.41845521],
       [0.        , 0.        , 0.        , 0.        , 0.38086157,
        0.        , 0.        , 0.        , 0.19874937, 0.38086157,
        0.        , 0.        , 0.38086157, 0.38086157, 0.        ,
        0.        , 0.38086157, 0.38086157, 0.        , 0.        ,
        0.30027564, 0.        ],
       [0.        , 0.37082034, 0.        , 0.        , 0.        ,
        0.37082034, 0.        , 0.37082034, 0.19350944, 0.        ,
        0.        , 0.37082034, 0.        , 0.        , 0.37082034,
        0.37082034, 0.        , 0.        , 0.37082034, 0.        ,
        0.        , 0.        ],
       [0.55280532, 0.        , 0.        , 0.        , 0.        ,
        0.       