# Basic introduction to NLTK#

*AUTHOR : Aditya Ojha*

**The prerequistes of this notebook is that you love Python :)**

In [20]:
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\ABC\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\movie_reviews.zip.


True

In [1]:
!pip install nltk



### TOKENIZING

Tokenizing means to group in words or sentences

*Importing necessary packages*

In [1]:
import nltk
from nltk import sent_tokenize, word_tokenize

**Sample Data**

In [2]:
para = "This is sample text. We are testing nltk packages. Do not disappoint us."

**Sentence Tokenization**

In [3]:
a = sent_tokenize(para)

In [4]:
a

['This is sample text.',
 'We are testing nltk packages.',
 'Do not disappoint us.']

In [5]:
for j in sent_tokenize(para):
    print(j)

This is sample text.
We are testing nltk packages.
Do not disappoint us.


**Words Tokenizatin**

In [6]:
b = word_tokenize(para)

In [7]:
b


['This',
 'is',
 'sample',
 'text',
 '.',
 'We',
 'are',
 'testing',
 'nltk',
 'packages',
 '.',
 'Do',
 'not',
 'disappoint',
 'us',
 '.']

In [8]:
for i in word_tokenize(para):
    print(i)

This
is
sample
text
.
We
are
testing
nltk
packages
.
Do
not
disappoint
us
.


### STOP_WORDS 





They are words that does not have an impact on sentence analysis. For example is, am, was, will, etc

In [9]:
from nltk.corpus import stopwords


In [10]:
example_sentence = "This is an example of stop words filtaration. Hope it will run"

In [11]:
stop_words = set(stopwords.words("english"))

In [12]:
print(stop_words)

{'where', 'have', 'than', 'or', "haven't", 'some', 'because', "mightn't", 'your', 'its', "weren't", "couldn't", 'been', 'did', 'these', 'it', 'for', 'wouldn', 'being', 'my', 'wasn', 'mightn', "shouldn't", 'his', 'ma', 'same', 'has', 'do', 'will', 'couldn', 'when', 'won', 'each', 'doing', 'over', "that'll", 'all', 'below', 're', 'any', 'you', 'which', "didn't", "wouldn't", 'as', 'once', "should've", "you'll", 'before', 'why', 'at', 'after', "won't", 'a', 'then', 'above', 'most', 'hadn', 'y', 'now', 'shouldn', 'only', 'itself', 'ours', 'aren', 'her', 'those', 'in', 'just', 'up', 'this', 'themselves', "doesn't", 'an', 'there', 'she', 'own', 'whom', 'how', 'not', 'were', "she's", 'should', 'd', 'so', "it's", 'himself', 'me', 'are', 'if', 'who', 'mustn', 'with', 'few', 'haven', 'that', 'by', "don't", 'm', 'theirs', 'the', 'between', 's', 'nor', "mustn't", "needn't", 'weren', 'to', 'out', 'here', 'we', 'until', 've', "shan't", 'further', 'into', 'is', "you're", 'our', 'yourselves', 'while', 

In [13]:
words = word_tokenize(example_sentence)

filtered_sentence = []

for w in words:
    if w not in stop_words:
        filtered_sentence.append(w)
print(filtered_sentence)        

['This', 'example', 'stop', 'words', 'filtaration', '.', 'Hope', 'run']


### STEMMING

In stemming we take root words. For example, root word of 'Riding' is 'Ride'.

**PorterStemming**

In [14]:
from nltk.stem import PorterStemmer

In [15]:
ps = PorterStemmer()

In [16]:
example_words = ["python","pythoner","pythoning","pythoned","pythonly"]

for w in example_words:
   print(ps.stem(w))

python
python
python
python
pythonli


In [17]:
sample = "You are not considering the considerable performance considerably.Its consequences will not be considerd"
words = sent_tokenize(sample)
sample = sample.split(' ')

for w in sample:
    print(ps.stem(w))


you
are
not
consid
the
consider
perform
considerably.it
consequ
will
not
be
considerd


**SnowBall Stemming**

In [18]:
from nltk.stem import SnowballStemmer
sb = SnowballStemmer("english")

In [19]:
example_words = ["python","pythoner","pythoning","pythoned","pythonly"]

for w in example_words:
    print(sb.stem(w))

python
python
python
python
python


In [20]:
sample = ("You are not considering the considerable performance considerably. Its consequences will not be considerd")
words = word_tokenize(sample)
sample = sample.split(' ')

for w in sample:
    print(sb.stem(w))


you
are
not
consid
the
consider
perform
considerably.
it
consequ
will
not
be
considerd


### PARTS OF SPEECH TAGGING

In [3]:
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import state_union #state union adresses by various america​PunktSentenceTokenizern presidents
#from nltk.tokenize import sent_tokenize 
from nltk.tokenize import PunktSentenceTokenizer

In [22]:
train_text = state_union.raw("2006-GWBush.txt")
sample_text = state_union.raw("2005-GWBush.txt")

In [23]:
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)


In [24]:



def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            print(tagged)
            
    except Exception as e:
        print(str(e))
        
process_content()        

[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('February', 'NNP'), ('2', 'CD'), (',', ','), ('2005', 'CD'), ('9:10', 'CD'), ('P.M', 'NNP'), ('.', '.')]
[('EST', 'IN'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('As', 'IN'), ('a', 'DT'), ('new', 'JJ'), ('Congress', 'NNP'), ('gathers', 'NNS'), (',', ','), ('all', 'DT'), ('of', 'IN'), ('us', 'PRP'), ('in', 'IN'), ('the', 'DT'), ('elected', 'JJ'), ('branches', 'NNS'), ('of', 'IN'), ('government', 'NN'), ('share', 'NN'), ('a'

***POS tag list:***

CC	coordinating conjunction

CD	cardinal digit

DT	determiner

EX	existential there (like: "there is" ... think of it like "there exists")

FW	foreign word

IN	preposition/subordinating conjunction

JJ	adjective	'big'

JJR	adjective, comparative	'bigger'

JJS	adjective, superlative	'biggest'

LS	list marker	1)

MD	modal	could, will

NN	noun, singular 'desk'

NNS	noun plural	'desks'

NNP	proper noun, singular	'Harrison'

NNPS	proper noun, plural	'Americans'

PDT	predeterminer	'all the kids'

POS	possessive ending	parent\'s

PRP	personal pronoun	I, he, she

PRP$	possessive pronoun	my, his, hers

RB	adverb	very, silently,

RBR	adverb, comparative	better

RBS	adverb, superlative	best

RP	particle	give up

TO	to	go 'to' the store.

UH	interjection	errrrrrrrm

VB	verb, base form	take

VBD	verb, past tense	took

VBG	verb, gerund/present participle	taking

VBN	verb, past participle	taken

VBP	verb, sing. present, non-3d	take

VBZ	verb, 3rd person sing. present	takes

WDT	wh-determiner	which

WP	wh-pronoun	who, what

WP$	possessive wh-pronoun	whose

WRB	wh-abverb	where, when




### CHUNKING

*One of the main goals of chunking is to group into what are known as "noun phrases." These are phrases of one or more words that contain a noun, maybe some descriptive words, maybe a verb, and maybe something like an adverb. The idea is to group nouns with the words that are in relation to them.*

*In order to chunk, we combine the part of speech tags with regular expressions. Mainly from regular expressions, we are going to utilize the following:*
    

*+ = match 1 or more*

? = match 0 or 1 repetitions.

** = match 0 or MORE repetitions*	

. = Any character except a new line
	  

In [25]:
train_text = state_union.raw("2006-GWBush.txt")
sample_text = state_union.raw("2005-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized[:3]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            
            print(chunked)
            for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
                print(subtree)

            chunked.draw()

    except Exception as e:
        print(str(e))

process_content()

(S
  (Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
  'S/POS
  (Chunk ADDRESS/NNP)
  BEFORE/IN
  (Chunk A/NNP JOINT/NNP SESSION/NNP)
  OF/IN
  (Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
  OF/IN
  (Chunk THE/NNP UNION/NNP February/NNP)
  2/CD
  ,/,
  2005/CD
  9:10/CD
  (Chunk P.M/NNP)
  ./.)
(Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
(Chunk ADDRESS/NNP)
(Chunk A/NNP JOINT/NNP SESSION/NNP)
(Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
(Chunk THE/NNP UNION/NNP February/NNP)
(Chunk P.M/NNP)
(S
  EST/IN
  (Chunk THE/NNP PRESIDENT/NNP)
  :/:
  (Chunk Mr./NNP Speaker/NNP)
  ,/,
  (Chunk Vice/NNP President/NNP Cheney/NNP)
  ,/,
  members/NNS
  of/IN
  (Chunk Congress/NNP)
  ,/,
  fellow/JJ
  citizens/NNS
  :/:
  As/IN
  a/DT
  new/JJ
  (Chunk Congress/NNP)
  gathers/NNS
  ,/,
  all/DT
  of/IN
  us/PRP
  in/IN
  the/DT
  elected/JJ
  branches/NNS
  of/IN
  government/NN
  share/NN
  a/DT
  great/JJ
  privilege/NN
  :/:
  We/PRP
  've/VBP
  been/VBN
  placed/VBN
  in/

### CHINKING

Chinking is a lot like chunking, it is basically a way for you to remove a chunk from a chunk. The chunk that you remove from your chunk is your chink.

The code is very similar, you just denote the chink, after the chunk, with }{ instead of the chunk's {}.


In [6]:
#train_text = state_union.raw("2005-GWBush.txt")
#sample_text = state_union.raw("2006-GWBush.txt")

#custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

#tokenized = custom_sent_tokenizer.tokenize(sample_text)

#def process_content():
#    try:
#        for i in tokenized[:5]:
#            words = nltk.word_tokenize(i)
#            tagged = nltk.pos_tag(words)

#            chunkGram = r"""Chunk: {<.*>+}
#                                    }<VB.?|IN|DT|TO>+{"""

#            chunkParser = nltk.RegexpParser(chunkGram)
#            chunked = chunkParser.parse(tagged)

#            chunked.draw()

#    except Exception as e:
#        print(str(e))

#process_content()

### NAMED ENTITY RECOGNITION

The idea is to have the machine immediately be able to pull out "entities" like people, places, things, locations, monetary figures, and more.

In [7]:
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import state_union #state union adresses by various america​PunktSentenceTokenizern presidents
from nltk.tokenize import PunktSentenceTokenizer

In [10]:
train_text = state_union.raw("2006-GWBush.txt")
sample_text = state_union.raw("2005-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized[:3]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt = nltk.ne_chunk(tagged, binary=False)
            namedEnt.draw()
    except Exception as e:
        print(str(e))


process_content()
            
            

### LEMMATIZING


Lemmatizing is similar to stemming. The major difference between these is, as you saw earlier, stemming can often create non-existent words, whereas lemmas are actual words.

In [52]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


In [53]:
print(lemmatizer.lemmatize("cats"))
print(lemmatizer.lemmatize("cacti"))
print(lemmatizer.lemmatize("geese"))
print(lemmatizer.lemmatize("rocks"))
print(lemmatizer.lemmatize("python"))

cat
cactus
goose
rock
python


In [56]:
#(lemmatizer.lemmatize("better"))
print(lemmatizer.lemmatize("worse", pos='a')) # a= adjective
#print(lemmatizer.lemmatize("best", pos='a'))

bad


### WORDNET

WordNet is a lexical database for the English language. 

We can use WordNet alongside the NLTK module to find the meanings of words, synonyms, antonyms, and more. Let's cover some examples.

In [1]:
from nltk.corpus import wordnet

In [20]:
syns = wordnet.synsets("good")
print(syns)

[Synset('good.n.01'), Synset('good.n.02'), Synset('good.n.03'), Synset('commodity.n.01'), Synset('good.a.01'), Synset('full.s.06'), Synset('good.a.03'), Synset('estimable.s.02'), Synset('beneficial.s.01'), Synset('good.s.06'), Synset('good.s.07'), Synset('adept.s.01'), Synset('good.s.09'), Synset('dear.s.02'), Synset('dependable.s.04'), Synset('good.s.12'), Synset('good.s.13'), Synset('effective.s.04'), Synset('good.s.15'), Synset('good.s.16'), Synset('good.s.17'), Synset('good.s.18'), Synset('good.s.19'), Synset('good.s.20'), Synset('good.s.21'), Synset('well.r.01'), Synset('thoroughly.r.02')]


In [13]:
print(syns[0].name())

good.n.01


In [14]:
print(syns[0].lemmas()[0].name())

good


In [22]:
print(syns[0].definition())

benefit


In [21]:
print(syns[0].examples())

['for your own good', "what's the good of worrying?"]


We might be discern synonyms and antonyms to a word. The lemmas will be synonyms, and then we can use .antonyms to find the antonyms to the lemmas. As such, we can populate some lists like

In [23]:
synonyms = []

antonyms = []

for syn in wordnet.synsets("good"):
    for l in syn.lemmas():
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())
            
print(set(synonyms))
print(set(antonyms))

{'serious', 'goodness', 'near', 'estimable', 'undecomposed', 'effective', 'upright', 'full', 'skillful', 'thoroughly', 'ripe', 'proficient', 'salutary', 'good', 'commodity', 'in_force', 'honorable', 'unspoiled', 'just', 'in_effect', 'beneficial', 'secure', 'expert', 'well', 'dependable', 'trade_good', 'honest', 'safe', 'adept', 'practiced', 'respectable', 'dear', 'unspoilt', 'soundly', 'sound', 'skilful', 'right'}
{'evilness', 'ill', 'evil', 'badness', 'bad'}


Next, we can also easily use WordNet to compare the similarity of two words and their tenses, by incorporating the **Wu and Palmer** method for semantic related-ness.

Let's compare the noun of "ship" and "boat:"


In [24]:
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('boat.n.01')
print(w1.wup_similarity(w2))

0.9090909090909091


In [27]:
w1 = wordnet.synset('sheep.n.01')
w2 = wordnet.synset('dog.n.01')
print(w1.wup_similarity(w2))

0.7333333333333333


In [6]:
#w1 = wordnet.synset('ship.n.01')
#w2 = wordnet.synset('cat.n.01')
#print(w1.wup_similarity(w2))

### TEXT CLASSIFICATION

We're going to start by trying to use the movie reviews database that is part of the NLTK corpus. From there we'll try to use words as "features" which are a part of either a positive or negative movie review. The NLTK corpus movie_reviews data set has the reviews, and they are labeled already as positive or negative.

In [3]:
import nltk
import random
from nltk.corpus import movie_reviews

In [4]:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

print(documents[1])

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)
print(all_words.most_common(15))
print(all_words["stupid"])

(['he', 'has', 'spent', 'his', 'entire', 'life', 'in', 'an', 'awful', 'little', 'apartment', ',', 'raised', 'and', 'cared', 'for', 'and', 'imprisoned', 'by', 'his', 'domineering', 'mother', '.', 'she', 'inspires', 'his', 'love', 'and', 'his', 'fear', ',', 'and', 'instills', 'in', 'him', 'a', 'similar', 'love', 'and', 'fear', 'of', 'jesus', '.', 'he', 'has', 'a', 'rudimentary', 'grasp', 'of', 'language', ',', 'mouthing', 'monosyllables', 'and', 'repetitions', 'of', 'his', 'mother', "'", 's', 'phrases', '.', 'he', 'is', 'taught', 'that', 'the', 'world', 'outside', 'is', 'fatally', 'poisonous', ';', 'his', 'mother', 'dons', 'a', 'gasmask', 'whenever', 'she', 'goes', 'out', 'the', 'door', '.', 'he', 'is', '35', '-', 'years', '-', 'old', 'in', 'body', ',', 'but', 'a', 'child', 'in', 'mind', 'and', 'spirit', '.', 'he', 'is', 'the', 'premise', 'for', 'bad', 'boy', 'bubby', ',', 'a', 'defiantly', 'original', 'australian', 'movie', 'about', 'a', 'man', 'called', 'bubby', '(', 'nicholas', 'hope'

In [5]:
all_words = nltk.FreqDist(all_words)
#print(all_words.most_common(15))
print(all_words["python"])

15


### Words as Feature for Learning

We're going to be building and compiling feature lists of words from positive reviews and words from the negative reviews to hopefully see trends in specific types of words in positive or negative reviews.

In [6]:
import nltk
import random
from nltk.corpus import movie_reviews

In [7]:


documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:2500]

In [8]:
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

In [9]:
print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))

{'plot': True, ':': True, 'two': True, 'teen': True, 'couples': True, 'go': True, 'to': True, 'a': True, 'church': True, 'party': True, ',': True, 'drink': True, 'and': True, 'then': True, 'drive': True, '.': True, 'they': True, 'get': True, 'into': True, 'an': True, 'accident': True, 'one': True, 'of': True, 'the': True, 'guys': True, 'dies': True, 'but': True, 'his': True, 'girlfriend': True, 'continues': True, 'see': True, 'him': True, 'in': True, 'her': True, 'life': True, 'has': True, 'nightmares': True, 'what': True, "'": True, 's': True, 'deal': True, '?': True, 'watch': True, 'movie': True, '"': True, 'sorta': True, 'find': True, 'out': True, 'critique': True, 'mind': True, '-': True, 'fuck': True, 'for': True, 'generation': True, 'that': True, 'touches': True, 'on': True, 'very': True, 'cool': True, 'idea': True, 'presents': True, 'it': True, 'bad': True, 'package': True, 'which': True, 'is': True, 'makes': True, 'this': True, 'review': True, 'even': True, 'harder': True, 'wri

In [10]:
featuresets = [(find_features(rev), category) for (rev, category) in documents]

### NAIVE BAYES CLASSIFIER 

The algorithm that we're going to use first is the Naive Bayes classifier. Before we can train and test our algorithm, we need to go ahead and split up the data into a training set and a testing set. This is called supervised machine learning, because we're showing the machine data, and telling it "this data is positive," or "this data is negative." Then, after that training is done, we show the machine some new data and ask the computer, based on what we taught the computer before, what the computer thinks the category of the new data is.

In [11]:
training_set =featuresets[:1900]
testing_set = featuresets[1900:]

In [12]:
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [13]:
print("Naive Bayes Algo accuracy:", (nltk.classify.accuracy(classifier, testing_set))*100)

Naive Bayes Algo accuracy: 78.0


In [14]:
classifier.show_most_informative_features(15)

Most Informative Features
                  annual = True              pos : neg    =      9.0 : 1.0
           unimaginative = True              neg : pos    =      7.7 : 1.0
                 frances = True              pos : neg    =      7.6 : 1.0
              schumacher = True              neg : pos    =      7.0 : 1.0
                  shoddy = True              neg : pos    =      7.0 : 1.0
                    mena = True              neg : pos    =      7.0 : 1.0
               atrocious = True              neg : pos    =      7.0 : 1.0
                  suvari = True              neg : pos    =      7.0 : 1.0
                  regard = True              pos : neg    =      7.0 : 1.0
                  turkey = True              neg : pos    =      6.4 : 1.0
                 kidding = True              neg : pos    =      6.4 : 1.0
                 singers = True              pos : neg    =      6.3 : 1.0
                  stinks = True              neg : pos    =      5.8 : 1.0

*This tells is the ratio of occurences in negative to positive, or visa versa, for every word. So here, we can see that the term "insulting" appears 10.6 more times as often in negative reviews as it does in positive reviews. Ludicrous, 10.1*

### Saving Classifiers with NLTK

We dont want to use the classifier everytime we want to use it. what we can do is use the Pickle module to go ahead and serialize our classifier object, so that all we need to do is load that file.

The first step is to save the object. To do this, first we need to import pickle at the top of our script, then, after we have trained with .train() the classifier we can then call the following lines:

In [15]:
import pickle

In [16]:
save_classifier = open("naivebayes.pickle","wb") #wb- write in bytes
pickle.dump(classifier, save_classifier)
save_classifier.close()

This opens up a pickle file, preparing to write in bytes some data. Then, we use pickle.dump() to dump the data. The first parameter to pickle.dump() is what we are dumping, the second parameter is where are we dumping it.

In [17]:
classifier_f = open("naivebayes.pickle", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()

Here, we do a very similar process. We open the file to read as bytes. Then, we use pickle.load() to load the file, and we save the data to the classifier variable. Then we close the file, and that is that. We now have the same classifier object as before!

### SCIKIT LEARN WITH NLTK

Below I have mentioned few Classifiers from Scikit learn library

In [18]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [19]:
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)

SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)

Original Naive Bayes Algo accuracy percent: 78.0
Most Informative Features
                  annual = True              pos : neg    =      9.0 : 1.0
           unimaginative = True              neg : pos    =      7.7 : 1.0
                 frances = True              pos : neg    =      7.6 : 1.0
              schumacher = True              neg : pos    =      7.0 : 1.0
                  shoddy = True              neg : pos    =      7.0 : 1.0
                    mena = True              neg : pos    =      7.0 : 1.0
               atrocious = True              neg : pos    =      7.0 : 1.0
                  suvari = True              neg : pos    =      7.0 : 1.0
                  regard = True              pos : neg    =      7.0 : 1.0
                  turkey = True              neg : pos    =      6.4 : 1.0
                 kidding = True              neg : pos    =      6.4 : 1.0
                 singers = True              pos : neg    =      6.3 : 1.0
                  stinks 



LogisticRegression_classifier accuracy percent: 81.0




SGDClassifier_classifier accuracy percent: 81.0




SVC_classifier accuracy percent: 82.0




LinearSVC_classifier accuracy percent: 82.0
NuSVC_classifier accuracy percent: 85.0
