In [1]:
from nltk.corpus import brown

In [2]:
### data collection

In [3]:
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [4]:
data=brown.sents(categories='editorial')

In [5]:
print(data[0])

['Assembly', 'session', 'brought', 'much', 'good']


In [6]:
print(len(data))

2997


# Basic NLP Pipeline
- Data collection
- Tokenization, Stopword, Stemming
- Building a common vocab
- Vectorizing the documents
- Performing classification/clustering

### 2. Tokenization

In [7]:
text="It was a very pleasant day, the weather was cool and their were light showers. I went to the market to buy some fruits."
text=text.split(".")
print(text)

['It was a very pleasant day, the weather was cool and their were light showers', ' I went to the market to buy some fruits', '']


In [8]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [9]:
text="It was a very pleasant day, the weather was cool and their were light showers. I went to the market to buy some fruits."
sents=sent_tokenize(text)
print(sents)
words=word_tokenize(text)
print(words)

['It was a very pleasant day, the weather was cool and their were light showers.', 'I went to the market to buy some fruits.']
['It', 'was', 'a', 'very', 'pleasant', 'day', ',', 'the', 'weather', 'was', 'cool', 'and', 'their', 'were', 'light', 'showers', '.', 'I', 'went', 'to', 'the', 'market', 'to', 'buy', 'some', 'fruits', '.']


In [10]:
word_list=word_tokenize(sents[0].lower())

In [11]:
type(sents)

list

# Stopword removal

In [12]:
from nltk.corpus import stopwords
sw=set(stopwords.words('english'))

In [13]:
print(sw)

{'it', 'doesn', "mustn't", 'than', 'did', 'is', 'which', 'some', 'i', 'this', 'what', 'can', 'where', 'out', 'should', 'when', 'mustn', 'am', 'down', 'over', 'didn', 'will', "wouldn't", 'why', 'to', 'being', 'm', 'yours', 'both', 'most', 'these', 'but', "shouldn't", "didn't", 'won', 'we', 'hers', 'above', "hadn't", "you'd", 'himself', 'be', "aren't", 'that', 'them', 'does', 'who', 'are', 'at', 'whom', 'your', 'off', 'on', 'an', 'ours', 'between', 'their', 'any', 'you', 'there', "couldn't", 'ma', 'myself', 'shan', 'they', 'll', 'nor', 'more', "she's", 'about', 'no', "wasn't", 'up', 'not', "hasn't", "mightn't", "you've", 'my', 'only', "you'll", 'his', 'few', 'and', 'same', 'don', 'weren', 'for', 'has', "shan't", 'hadn', 'very', 't', "should've", 'by', 'then', 'aren', 'its', "doesn't", 'wouldn', 'of', 'ain', 'under', 'in', "isn't", 's', 'had', 'haven', 'each', 'how', 'isn', 'into', 'him', "that'll", 'me', "you're", "it's", 'through', 'yourself', 'having', 'just', 'theirs', 're', 'too', 'a

In [14]:
len(sw)

179

### Filter the words from the sentences

In [25]:
def filter_words(word_list):
    useful_words=[w for w in word_list if w not in sw]
    return useful_words

### Tokenization using Regular expression

Problem with word tokenizer- can't handle complex tokenizations. So we use Regexp tokenizer class in nltk

In [16]:
from nltk.tokenize import RegexpTokenizer

In [17]:
tokenizer= RegexpTokenizer("[a-zA-z]+")

In [19]:
text="Send all the 50 documents related to clauses 1,2,3 at abc@xyz.com"
print(tokenizer.tokenize(text))

['Send', 'all', 'the', 'documents', 'related', 'to', 'clauses', 'at', 'abc', 'xyz', 'com']


### Stemming
- Process that transforms particular words[verbs,plurals] into their radical form.
- Preserve the semantics of the sentence without increasing the number of unique tokens.
- jumps, jumping, jumped,jump==>jump

In [27]:
text="Foxes love to make jumps. The quick brown fox was seen jumping over the lovely dog from a 6ft high wall."
text=text.lower()
word_list=tokenizer.tokenize(text)

In [28]:
word_list=filter_words(word_list)
print(word_list)

['foxes', 'love', 'make', 'jumps', 'quick', 'brown', 'fox', 'seen', 'jumping', 'lovely', 'dog', 'ft', 'high', 'wall']


In [29]:
# Stemming - 1) Snowball Stemmer, Porter, Lancaster

In [41]:
from nltk.stem.snowball import PorterStemmer,SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
ps=PorterStemmer()

In [42]:
ps.stem("Jumps")

'jump'

In [43]:
ps.stem("Jumping")

'jump'

In [44]:
ps.stem("Lovely")

'love'

In [45]:
ps.stem("awesome")

'awesom'

In [46]:
ls=LancasterStemmer()
ls.stem("teeth")
print(ps.stem("teenager")) #english
print(ls.stem("teenager")) #english

teenag
teen


In [48]:
#Snowball stemmer
ss=SnowballStemmer('english')
print(ss.stem('Lovely'))
print(ss.stem('teenager'))

love
teenag


In [49]:
ss_french=SnowballStemmer('french')
print(ss_french.stem('Courias'))

couri


### Work: 
write a function which perfroms all the three steps- Tokenization, Stopword Removal,Stemming. Remove and leading or trailing white spaces 

In [66]:
def data_fixer(word_list):
    tokenizer= RegexpTokenizer("[a-zA-z]+")
    word_list=tokenizer.tokenize(word_list)
    sw=set(stopwords.words('english'))
    word_list=[w for w in word_list if w not in sw]
    ps=PorterStemmer()
    word_list=[ps.stem(w) for w in word_list ]
    word_list=set(word_list)
    word_list=list(word_list)
    return word_list
    

In [67]:
print(text)
print(data_fixer(text))

foxes love to make jumps. the quick brown fox was seen jumping over the lovely dog from a 6ft high wall.
['wall', 'seen', 'quick', 'dog', 'make', 'fox', 'jump', 'brown', 'love', 'high', 'ft']
