In [1]:
from nltk import ngrams
from nltk.corpus import stopwords 
import string
 
stopwords_english = stopwords.words('english')
 
# clean words, i.e. remove stopwords and punctuation
def clean_words(words, stopwords_english):
    """Lowercase tokens and drop stopwords and punctuation-only tokens.

    Args:
        words: Iterable of string tokens.
        stopwords_english: Iterable of lowercase stopwords to exclude.

    Returns:
        list: The lowercased tokens that are neither in ``stopwords_english``
        nor made up solely of ``string.punctuation`` characters, in their
        original order.
    """
    # Build the lookup sets once so every membership test is O(1) instead of
    # a scan over the whole stopword list for each token.
    stopword_set = set(stopwords_english)
    punctuation_chars = set(string.punctuation)

    words_clean = []
    for word in words:
        word = word.lower()
        if word in stopword_set:
            continue
        # ``word in string.punctuation`` is a substring test, so it lets
        # multi-character punctuation tokens such as '--', '...' or "''"
        # through. Check character by character instead; the empty string is
        # still dropped, as before, because all() over nothing is True.
        if all(char in punctuation_chars for char in word):
            continue
        words_clean.append(word)
    return words_clean
 
# feature extractor function for unigram
def bag_of_words(words):
    """Build unigram presence features.

    Returns a dict that maps every distinct item of ``words`` to ``True``.
    """
    return {token: True for token in words}
 
# feature extractor function for ngrams (bigram)
def bag_of_ngrams(words, n=2):
    """Build n-gram presence features (bigrams by default).

    Returns a dict that maps every distinct n-gram yielded by
    ``ngrams(words, n)`` to ``True``.
    """
    return {gram: True for gram in ngrams(words, n)}

from nltk.tokenize import word_tokenize
# Walk through the feature extractors on a single example sentence.
text = "It was a very good movie."
words = word_tokenize(text.lower())

print("Words of the text: ", words)
print("\nAfter using feature extractor function for bigram: ", bag_of_ngrams(words))

# Unigrams are built from tokens cleaned with the full stopword list.
words_clean = clean_words(words, stopwords_english)
print("\nAfter cleaning the words of text: ", words_clean)

# Stopwords listed here are removed from the stopword set used for bigrams,
# so they survive cleaning and can appear inside bigram features.
important_words = [
    'above', 'below', 'off', 'over', 'under', 'more', 'most', 'such', 'no', 'nor',
    'not', 'only', 'so', 'than', 'too', 'very', 'just', 'but', 'really', 'non',
]
stopwords_english_for_bigrams = set(stopwords_english).difference(important_words)

words_clean_for_bigrams = clean_words(words, stopwords_english_for_bigrams)
print("\nCleaned words for bigrams: ", words_clean_for_bigrams)

unigram_features = bag_of_words(words_clean)
print("\nUnigram features: ", unigram_features)

bigram_features = bag_of_ngrams(words_clean_for_bigrams)
print("\nBigram features: ", bigram_features)

# Merge both feature dicts: unigram keys first, then bigram keys.
all_features = {**unigram_features, **bigram_features}
print("\nAll features : ", all_features)

def bag_of_all_words(words, n=2):
    """Combine unigram and n-gram presence features for a token sequence.

    Unigrams come from the tokens cleaned with the module-level
    ``stopwords_english``; n-grams come from the tokens cleaned with the
    module-level ``stopwords_english_for_bigrams``.

    Args:
        words: Iterable of string tokens.
        n: Size of the n-grams to extract (default 2, i.e. bigrams).

    Returns:
        dict: Unigram strings and n-gram keys, each mapped to ``True``.
    """
    words_clean = clean_words(words, stopwords_english)
    words_clean_for_bigrams = clean_words(words, stopwords_english_for_bigrams)

    unigram_features = bag_of_words(words_clean)
    # Forward ``n`` to the extractor; it used to be dropped here, so callers
    # always got bigrams no matter which value they passed.
    ngram_features = bag_of_ngrams(words_clean_for_bigrams, n)

    all_features = unigram_features.copy()
    all_features.update(ngram_features)

    return all_features
 
# Same example sentence, this time through the combined extractor.
print("\nBag of all words: ", bag_of_all_words(words))


Words of the text:  ['it', 'was', 'a', 'very', 'good', 'movie', '.']

After using feature extractor function for bigram:  {('it', 'was'): True, ('was', 'a'): True, ('a', 'very'): True, ('very', 'good'): True, ('good', 'movie'): True, ('movie', '.'): True}

After cleaning the words of text:  ['good', 'movie']

Cleaned words for bigrams:  ['very', 'good', 'movie']

Unigram features:  {'good': True, 'movie': True}

Bigram features:  {('very', 'good'): True, ('good', 'movie'): True}

All features :  {'good': True, 'movie': True, ('very', 'good'): True, ('good', 'movie'): True}

Bag of all words:  {'good': True, 'movie': True, ('very', 'good'): True, ('good', 'movie'): True}


In [2]:
from nltk.corpus import movie_reviews 
 
# One word sequence per corpus file, grouped by the corpus category.
pos_reviews = [movie_reviews.words(fileid) for fileid in movie_reviews.fileids('pos')]
neg_reviews = [movie_reviews.words(fileid) for fileid in movie_reviews.fileids('neg')]

In [3]:

# Each entry is a (feature dict, label) pair, the shape the classifier is trained on.

# positive reviews feature set
pos_reviews_set = [
    (bag_of_all_words(review_words), 'The review for this movie is POSITIVE')
    for review_words in pos_reviews
]

# negative reviews feature set
neg_reviews_set = [
    (bag_of_all_words(review_words), 'The review for this movie is NEGATIVE')
    for review_words in neg_reviews
]

In [4]:
from random import shuffle

print("Length of positive review set:", len(pos_reviews_set))
print("Length of negitive review set:", len(neg_reviews_set))

# Randomize pos_reviews_set and neg_reviews_set in place. No seed is fixed,
# so the split (and the accuracy measured on it) differs every time the
# program runs.
for labelled_set in (pos_reviews_set, neg_reviews_set):
    shuffle(labelled_set)

# The first 125 reviews of each class form the test set; the rest are used for training.
test_set = pos_reviews_set[:125] + neg_reviews_set[:125]
train_set = pos_reviews_set[125:] + neg_reviews_set[125:]

print("Length of testing set:", len(test_set))
print("Length of training set:", len(train_set))

Length of positive review set: 1000
Length of negitive review set: 1000
Length of testing set: 250
Length of training set: 1750


In [5]:
from nltk import classify
from nltk import NaiveBayesClassifier
 
# Fit the Naive Bayes model on the labelled training pairs.
classifier = NaiveBayesClassifier.train(train_set)

# Score it on the held-out test pairs.
accuracy = classify.accuracy(classifier, test_set)
print("Accuracy: ", accuracy)

Accuracy:  0.8


In [6]:
from nltk.tokenize import word_tokenize


sample_reviews = (
    # Negative review
    "I hated the film. It was a disaster. Poor direction, bad acting.",
    # Positive review correctly classified as positive
    "It was a wonderful and amazing movie. I loved it. it was quite thrilling and interesting.",
)

for custom_review in sample_reviews:
    custom_review_tokens = word_tokenize(custom_review)
    custom_review_set = bag_of_all_words(custom_review_tokens)
    print(classifier.classify(custom_review_set))

    # probability result: most likely label, then the probability of each label
    prob_result = classifier.prob_classify(custom_review_set)
    print(prob_result.max())
    print(prob_result.prob("The review for this movie is NEGATIVE"))
    print(prob_result.prob("The review for this movie is POSITIVE"))

The review for this movie is NEGATIVE
The review for this movie is NEGATIVE
0.8058020534880993
0.19419794651190214
The review for this movie is POSITIVE
The review for this movie is POSITIVE
0.0044838034248832915
0.9955161965751139


In [7]:
from nltk.tokenize import word_tokenize


def clicked():
    """Classify the text currently in the entry box and show the label.

    Reads the review from the module-level ``txt`` widget, turns it into
    features with ``bag_of_all_words`` and writes the label returned by
    ``classifier`` into the module-level ``l2`` label.
    """
    review_text = txt.get()
    review_features = bag_of_all_words(word_tokenize(review_text))
    l2.configure(text=classifier.classify(review_features))

import tkinter
from tkinter import *
# Main window
window = tkinter.Tk()
window.title("TEXT BASED SENTIMENT ANALYSIS-USING BIGRAM")
window.geometry('500x300')

# Prompt shown at the top of the window.
# NOTE(review): the original ``font=(40)`` is the plain integer 40, not a
# tuple — confirm which font spec was intended.
label = tkinter.Label(window, text=" Enter your movie review-", font=40)
label.pack()

# Empty label that clicked() fills with the predicted sentiment.
l2 = tkinter.Label(window)
l2.place(relx=0.5, rely=0.6, anchor='s')

# Text field the review is typed into.
txt = tkinter.Entry(window, width=40)
txt.place(relx=0.5, rely=0.2, anchor='center')

# Button that triggers classification of the entered text.
bt = tkinter.Button(window, text='Enter', bg="black", fg="white", command=clicked)
bt.place(relx=0.5, rely=0.4, anchor='center')

# Hand control to Tk; this call blocks until the window is closed.
window.mainloop()

