# Tokenizing Sentences-Word_tokenize() module

In [1]:
import nltk
from nltk import word_tokenize
word_tokenize("This module can be used for basic tokenizing of sentences into words.")


['This',
 'module',
 'can',
 'be',
 'used',
 'for',
 'basic',
 'tokenizing',
 'of',
 'sentences',
 'into',
 'words',
 '.']

# Tokenizing Sentences-TreebankWordTokenizer()

In [2]:
import nltk
from nltk.tokenize import TreebankWordTokenizer
sentence = '''Good vegan pizza cost $12.25\nin Pheonix, Arizona.  Please buy me\ntwo of them.\nThank you.'''
TreebankWordTokenizer().tokenize(sentence)



['Good',
 'vegan',
 'pizza',
 'cost',
 '$',
 '12.25',
 'in',
 'Pheonix',
 ',',
 'Arizona.',
 'Please',
 'buy',
 'me',
 'two',
 'of',
 'them.',
 'Thank',
 'you',
 '.']

In [3]:
sentence1 = "He'll save and invest for his retirement."
TreebankWordTokenizer().tokenize(sentence1)



['He', "'ll", 'save', 'and', 'invest', 'for', 'his', 'retirement', '.']

In [4]:
sentence2 = "Hello, he can't go to market,"
TreebankWordTokenizer().tokenize(sentence2)


['Hello', ',', 'he', 'ca', "n't", 'go', 'to', 'market', ',']

# Tokenizing Sentences-WordPunctTokenizer()

In [5]:
import nltk
from nltk.tokenize import WordPunctTokenizer
sentence = "He'll save and invest for his retirement."
WordPunctTokenizer().tokenize(sentence)


['He', "'", 'll', 'save', 'and', 'invest', 'for', 'his', 'retirement', '.']

# Tokenizing Sentences-RegexpTokenizer()

In [6]:
import nltk
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer("[\w']+")
sentence = "He'll save and invest for his retirement."
tokenizer.tokenize(sentence)


["He'll", 'save', 'and', 'invest', 'for', 'his', 'retirement']

# Tokenizing Paragraphs-sent_tokenize()

In [7]:
import nltk
from nltk.tokenize import sent_tokenize
text = "It shows the difference between word tokenizer and sentence tokenizer. It's a simple example."
sent_tokenize(text)


['It shows the difference between word tokenizer and sentence tokenizer.',
 "It's a simple example."]

# Stemming-PorterStemmer

In [8]:
import nltk
from nltk.stem import PorterStemmer
stemming_word = PorterStemmer()

stemming_word.stem('writing')


'write'

In [9]:
stemming_word.stem('working')

'work'

# Stemming-LancasterStemmer

In [10]:
import nltk
from nltk.stem import LancasterStemmer
stemming_Lanc = LancasterStemmer()

stemming_Lanc.stem('reads')


'read'

In [11]:
stemming_Lanc.stem('sweets')

'sweet'

# Stemming-RegexpStemmer

In [12]:
import nltk
from nltk.stem import RegexpStemmer
Regexp_stemmer = RegexpStemmer('ing')

Regexp_stemmer.stem('enjoying')


'enjoy'

In [13]:
Regexp_stemmer.stem('ingenjoy')

'enjoy'

# Stemming-SnowballStemmer

In [14]:
import nltk

from nltk.stem import SnowballStemmer

SnowballStemmer.languages #Languge supported by Snowball Stemmer


('arabic',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'hungarian',
 'italian',
 'norwegian',
 'porter',
 'portuguese',
 'romanian',
 'russian',
 'spanish',
 'swedish')

In [15]:
Language_French = SnowballStemmer('french')

Language_French.stem ('Bonjoura')


'bonjour'

In [16]:
Language_English = SnowballStemmer('english')


Language_English.stem ('Eating')


'eat'

In [17]:
Language_English.stem ('Reading')

'read'

# Lemmatization

In [18]:
import nltk 

from nltk.stem import WordNetLemmatizer

ex_lemmatizer = WordNetLemmatizer()


ex_lemmatizer.lemmatize('reading')


'reading'

In [19]:
ex_lemmatizer.lemmatize('sweets')

'sweet'

# Difference between Lemmatization and Stemming

In [20]:
#Implementing Stemming
import nltk

from nltk.stem import PorterStemmer

ex_wordstemmer = PorterStemmer()

ex_wordstemmer.stem('believe')


'believ'

In [21]:
#Implementing Lemmatization
import nltk

from nltk.stem import WordNetLemmatizer

ex_lemmatizer = WordNetLemmatizer()

ex_lemmatizer.lemmatize(' believe ')


' believe '

# Chunking

In [22]:
import nltk

S = [("This", "DT"),("book", "NN"),("has","VBZ"),("ten","JJ"),("chapters","NNS")]

chunker=nltk.RegexpParser(r''' 
NP:{<DT><NN.*><.*>*<NN.*>} 
}<VB.*>{ 
''')

chunker.parse(S)

Output=chunker.parse(S)

Output.draw()


# Bag-of-Words(BoW) model

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

Sentences=['Bag of Words model is very useful NLP technique.', 'Bag of Words model is used to extract the features from text.']

vector_count = CountVectorizer()

text_feature = vector_count.fit_transform(Sentences).todense()

print(vector_count.vocabulary_)



{'bag': 0, 'of': 7, 'words': 15, 'model': 5, 'is': 4, 'very': 14, 'useful': 13, 'nlp': 6, 'technique': 8, 'used': 12, 'to': 11, 'extract': 1, 'the': 10, 'features': 2, 'from': 3, 'text': 9}


In [25]:
print(text_feature)

[[1 0 0 0 1 1 1 1 1 0 0 0 0 1 1 1]
 [1 1 1 1 1 1 0 1 0 1 1 1 1 0 0 1]]


# Example1: Predicting the Category

In [30]:
#Import the required packages
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
#Defining five different category maps
c_map = {'talk.religion.misc': 'Religion', 'rec.autos': 'Autos','rec.sport.hockey':'Hockey','sci.electronics':'Electronics',        'sci.space': 'Space'} 
#Creating the training set
t_data = fetch_20newsgroups(subset='train', 
        categories=c_map.keys(), shuffle=True, random_state=5)
#Building a count vectorizer and extracting the term counts
v_count = CountVectorizer()
train_tc = v_count.fit_transform(t_data.data)
print("\nDimensions of training data:", train_tc.shape)
#Creating tf-idf transformer
tfidf = TfidfTransformer()
train_tfidf = tfidf.fit_transform(train_tc)
#Defining the test data
input_data = [
    'Columbia is the name of a space shuttle',
    'Hindu, isai, Sikh, Muslim all are religions',
    'We shoul drive safely',
    'Puck is a round disk made of hard rubber',
    'Television, Microwave, Mixer Grinder, Refrigrator, all uses electricity']
#Multimonial Naïve Bayes classifier training
classifier = MultinomialNB().fit(train_tfidf, t_data.target)
#Transforming input data by using count vectorizer
input_tc = v_count.transform(input_data)
#Transforming vectorized data by using tf-idf transformer
input_tfidf = tfidf.transform(input_tc)
#Predicting output categories
predictions = classifier.predict(input_tfidf)
for sent, category in zip(input_data, predictions):
    print('\nThe Input Data is:', sent, '\n Category:', \
            c_map[t_data.target_names[category]])   



Dimensions of training data: (2755, 39297)

The Input Data is: Columbia is the name of a space shuttle 
 Category: Space

The Input Data is: Hindu, isai, Sikh, Muslim all are religions 
 Category: Religion

The Input Data is: We shoul drive safely 
 Category: Autos

The Input Data is: Puck is a round disk made of hard rubber 
 Category: Hockey

The Input Data is: Television, Microwave, Mixer Grinder, Refrigrator, all uses electricity 
 Category: Electronics


# Example2: Gender Finding

In [49]:
#Import the required packages
import random
from nltk.corpus import names
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy
names_M = [(name, 'male') for name in names.words(r"C:/Users/Leekha/Desktop/malenames.txt")]
names_F = [(name, 'female') for name in names.words(r"C:/Users/Leekha/Desktop/femalenames.txt")]
names_labels = names_M + names_F
random.shuffle(names_labels)
#Defining the function to calculate features
def features(word):
      return {'last_letter': word[-1]}
featuresets = [(features(n), gender) for (n, gender) in names_labels]
# Splitting  the dataset into training set and testing set.
train_set, test_set = featuresets[5:], featuresets[:5]
# Training the Naive Bayes classifier
classifier = NaiveBayesClassifier.train(train_set)
male_gender = classifier.classify(features('Aarav'))
female_gender = classifier.classify(features('Shilpi'))
print("Aarav is a {}.".format(male_gender))
print("Shilpi is a {}.".format(female_gender))
#Getting the accuracy
print(accuracy(classifier, test_set))
#Printing first 15 feature sets
classifier.show_most_informative_features(15)


Aarav is a male.
Shilpi is a male.
0.8
Most Informative Features
             last_letter = 'k'              male : female =     14.3 : 1.0
             last_letter = 'o'              male : female =      5.5 : 1.0
             last_letter = 'z'              male : female =      4.8 : 1.0
             last_letter = 'f'              male : female =      4.3 : 1.0
             last_letter = 'm'              male : female =      4.0 : 1.0
             last_letter = 'j'              male : female =      3.8 : 1.0
             last_letter = 'g'              male : female =      3.1 : 1.0
             last_letter = 'r'              male : female =      2.9 : 1.0
             last_letter = 'p'              male : female =      2.6 : 1.0
             last_letter = 'd'              male : female =      2.6 : 1.0
             last_letter = 'v'              male : female =      2.1 : 1.0
             last_letter = 'u'              male : female =      2.1 : 1.0
             last_letter = 's'     