In [3]:
import nltk

In [4]:
#nltk.download()

# Tokenize words
A sentence or data can be split into words using the method word_tokenize():

In [5]:
data = "All work and no play makes jack a dull boy, all work and no play"

In [6]:
from nltk.tokenize import word_tokenize,sent_tokenize

In [7]:
print(word_tokenize(data))

['All', 'work', 'and', 'no', 'play', 'makes', 'jack', 'a', 'dull', 'boy', ',', 'all', 'work', 'and', 'no', 'play']


In [8]:
data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."

In [9]:
print(sent_tokenize(data))

['All work and no play makes jack dull boy.', 'All work and no play makes jack a dull boy.']


# Natural Language Processing: remove stop words

In [10]:
from nltk.corpus import stopwords

In [11]:
stopWords=set(stopwords.words('english'))

In [12]:
print(stopWords)

{'did', 'those', "don't", 'my', 'yourself', 'now', 'hers', 'is', 'ma', 'very', 'haven', 'couldn', 'both', 'itself', 'mightn', 'before', 'had', 'ourselves', 'then', 'further', 'having', 'has', 'most', 'by', 'hasn', 'myself', "you'd", 'doing', "didn't", 'he', 'such', 'during', 'when', 'shan', 'against', 'too', "it's", 'other', "hadn't", 'few', 'how', 'below', "that'll", "she's", 'themselves', 'just', 'are', 'ain', 'of', 'she', 'own', 'who', 'once', 'aren', 'her', 'down', 'will', 'these', 'there', 'here', 'after', 'wasn', 'nor', 'didn', 'theirs', 't', 'shouldn', 'or', "shouldn't", 'we', 'was', 'be', 'but', "mustn't", 'whom', 'only', "shan't", 'through', 'if', "doesn't", 'about', 'y', 'they', 'the', 'any', "wasn't", 'until', 'this', 'no', 'have', 'off', 'an', 'd', 'it', 'same', 'needn', 'won', 'each', "should've", "mightn't", 'in', "haven't", 'his', 'does', 'that', 'its', 's', 'wouldn', 'your', 'again', "wouldn't", 'weren', 'as', 'hadn', 'him', 'over', 'been', 'so', 'why', 've', 'll', 're'

In [13]:
words=word_tokenize(data)

In [14]:
wordfilter=[]

In [15]:
for word in words:
    if word not in stopWords:
        wordfilter.append(word)

In [16]:
print(wordfilter)

['All', 'work', 'play', 'makes', 'jack', 'dull', 'boy', '.', 'All', 'work', 'play', 'makes', 'jack', 'dull', 'boy', '.']


# NLTK Stemming

In [17]:
words = ["game","gaming","gamed","games"]

In [18]:
from nltk.stem import PorterStemmer

In [19]:
ps=PorterStemmer()

In [20]:
for word in words:
    print(word,':',ps.stem(word))

game : game
gaming : game
gamed : game
games : game


In [21]:
sentence = "gaming, the gamers play games"
words = word_tokenize(sentence)
 
for word in words:
    print(word + ":" + ps.stem(word))

gaming:game
,:,
the:the
gamers:gamer
play:play
games:game


# NLTK – speech tagging example
Given a sentence or paragraph, it can label words such as verbs, nouns and so on.

In [22]:
from nltk.tokenize import PunktSentenceTokenizer

In [23]:
document = 'Whether you\'re new to programming or an experienced developer, it\'s easy to learn and use Python.'

In [24]:
sentences = nltk.sent_tokenize(document)   
for sent in sentences:
    print(nltk.pos_tag(nltk.word_tokenize(sent)))

[('Whether', 'IN'), ('you', 'PRP'), ("'re", 'VBP'), ('new', 'JJ'), ('to', 'TO'), ('programming', 'VBG'), ('or', 'CC'), ('an', 'DT'), ('experienced', 'JJ'), ('developer', 'NN'), (',', ','), ('it', 'PRP'), ("'s", 'VBZ'), ('easy', 'JJ'), ('to', 'TO'), ('learn', 'VB'), ('and', 'CC'), ('use', 'VB'), ('Python', 'NNP'), ('.', '.')]


https://pythonspot.com/wp-content/uploads/2016/08/nltk-speech-codes.png

We can filter this data based on the type of word:

In [25]:
from nltk.corpus import state_union
document = 'Today the Netherlands celebrates King\'s Day. To honor this tradition, the Dutch embassy in San Francisco invited me to'
sentences = nltk.sent_tokenize(document)   
 
data = []
for sent in sentences:
    data = data + nltk.pos_tag(nltk.word_tokenize(sent))
 
for word in data: 
    if 'NNP' in word[1]: 
        print(word)

('Netherlands', 'NNP')
('King', 'NNP')
('Day', 'NNP')
('San', 'NNP')
('Francisco', 'NNP')


# NLP Prediction

To create our analysis program, we have several steps:

~Data preparation
~Feature extraction
~Training
~Prediction

In [26]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import names

In [27]:
def gender_features(word): 
    return {'last_letter': word[-1]} 

In [28]:
# Load data and training 
name = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')])

In [29]:
featuresets = [(gender_features(n), g) for (n,g) in name] 
train_set = featuresets
classifier = nltk.NaiveBayesClassifier.train(train_set) 

In [30]:
#predict
print(classifier.classify(gender_features('James')))

male


# Sentiment Analysis

In [31]:
#We start by defining 3 classes: positive, negative and neutral.
#Each of these is defined by a vocabulary:
positive_vocab = [ 'awesome', 'outstanding', 'fantastic', 'terrific', 'good', 'nice', 'great', ':)' ]
negative_vocab = [ 'bad', 'terrible','useless', 'hate', ':(' ]
neutral_vocab = [ 'movie','the','sound','was','is','actors','did','know','words','not' ]

In [32]:
#Every word is converted into a feature using a simplified bag of words model:
def word_feats(words):
    return dict([(word, True) for word in words])

In [33]:

positive_features = [(word_feats(pos), 'pos') for pos in positive_vocab]
negative_features = [(word_feats(neg), 'neg') for neg in negative_vocab]
neutral_features = [(word_feats(neu), 'neu') for neu in neutral_vocab]

In [34]:
#Our training set is then the sum of these three feature sets:
train_set = negative_features + positive_features + neutral_features

In [35]:
classifier=NaiveBayesClassifier.train(train_set)

In [36]:
# Predict
def Review(sentence):
    neg = 0
    pos = 0
    sentence = sentence.lower()
    words = sentence.split(' ')
    for word in words:
        classResult = classifier.classify( word_feats(word))
    if classResult == 'neg':
        neg = neg + 1
    if classResult == 'pos':
        pos = pos + 1
 
    print('Positive: ' + str(float(pos)/len(words)))
    print('Negative: ' + str(float(neg)/len(words)))

In [44]:
a='The movie was bad'

In [45]:
Review(a)

Positive: 0.0
Negative: 0.25
