In [1]:
import os 
import nltk 
import random
from sklearn.metrics import confusion_matrix

In [2]:
os.chdir("/home/sagar/cpee/sentiment analysis")

### Reading the text file

In [3]:
# Read the file
f1 = open("positive.txt","r",encoding='latin-1')   # "r" is for reading
short_pos = f1.read() 
f2 = open("negative.txt","r",encoding='latin-1')
short_neg = f2.read()

### Converting to lower case

In [4]:
short_pos = short_pos.lower()
short_neg = short_neg.lower()

### Converting a long string into list of string with new line as delimiter

In [5]:
posidocuments = short_pos.split('\n')
print(posidocuments[0])
negadocuments = short_neg.split('\n')
print(negadocuments[0])

the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . 
simplistic , silly and tedious . 


### Reducing data size for faster processing

In [6]:
posidocuments = posidocuments[:1000]
negadocuments = negadocuments[:1000]

### Making common document database

In [7]:
documents = []
for p in posidocuments:
    documents.append((p, "pos"))
for n in negadocuments:
    documents.append((n, "neg"))

### Initializing Tokenizer and Stop-Words configuration

In [8]:
from nltk.corpus import RegexpTokenizer as regextoken
tokenizer = regextoken(r'\w+')

from nltk.corpus import stopwords
stopwords = stopwords.words('english')

### POS Tagging - Parts of speech tagging

In [9]:
words = tokenizer.tokenize(posidocuments[0])
print(nltk.pos_tag(words))

[('the', 'DT'), ('rock', 'NN'), ('is', 'VBZ'), ('destined', 'VBN'), ('to', 'TO'), ('be', 'VB'), ('the', 'DT'), ('21st', 'JJ'), ('century', 'NN'), ('s', 'VBD'), ('new', 'JJ'), ('conan', 'NN'), ('and', 'CC'), ('that', 'IN'), ('he', 'PRP'), ('s', 'VBZ'), ('going', 'VBG'), ('to', 'TO'), ('make', 'VB'), ('a', 'DT'), ('splash', 'NN'), ('even', 'RB'), ('greater', 'JJR'), ('than', 'IN'), ('arnold', 'RB'), ('schwarzenegger', 'JJ'), ('jean', 'JJ'), ('claud', 'NN'), ('van', 'NN'), ('damme', 'NN'), ('or', 'CC'), ('steven', 'JJ'), ('segal', 'NN')]


### Collecting adjectives

In [11]:
allowed_word_types = ["JJ"]
all_words = []
for doc, label in documents:
    # tokens with one or more alpha-numerics (special characters will not be included)
    words = tokenizer.tokenize(doc)
    tagged_words = nltk.pos_tag(words)
    for word in tagged_words:
        if word[1] in allowed_word_types:
            # extracting the adjectives
            all_words.append(word[0])

In [12]:
print(len(documents),'Documents')
print(len(all_words),'Adjectives')

2000 Documents
4579 Adjectives


### Checking structure of documents variable

In [14]:
documents[998:1002]

[("it's a bittersweet and lyrical mix of elements . ", 'pos'),
 ('subversive , meditative , clinical and poetic , the piano teacher is a daring work of genius . ',
  'pos'),
 ('simplistic , silly and tedious . ', 'neg'),
 ("it's so laddish and juvenile , only teenage boys could possibly find it funny . ",
  'neg')]

### Creating Frequency Distribution of word-features

In [15]:
all_words = nltk.FreqDist(all_words)

In [16]:
all_words.most_common(10)

[('s', 92),
 ('good', 84),
 ('bad', 51),
 ('little', 50),
 ('much', 48),
 ('new', 41),
 ('other', 41),
 ('funny', 38),
 ('t', 37),
 ('old', 36)]

### Creating list of Features

In [17]:
word_features = list(all_words.keys())
## Alternative method using set()
# print(len(set(all_words)))
print(len(word_features))
print(word_features[:10])

1751
['21st', 'new', 'schwarzenegger', 'jean', 'steven', 'elaborate', 'huge', 'describe', 'co', 'middle']


### Removing stopwords and creating Feature-Set from Documents

In [18]:
def find_features(document):
    document_tokens = tokenizer.tokenize(document)
    
    features = {}
    for w in word_features:
        features[w] = (w in document_tokens and w not in stopwords)     
    return features

featuresets  = [(find_features(rev), category) for (rev, category) in documents]
print(len(featuresets),'Documents converted into Feature-Set')

2000 Documents converted into Feature-Set


### Shuffling the documents and creating Train-Test split

In [19]:
random.shuffle(featuresets)
training_set = featuresets[:1800] 
testing_set =  featuresets[1800:]

### Building a Naive Bayes Classifier

In [20]:
classifier = nltk.NaiveBayesClassifier.train(training_set)
train_acc = nltk.classify.accuracy(classifier, training_set)*100
print('Training Accuracy',train_acc)

Training Accuracy 87.8888888888889


### Most Informative Features

In [21]:
classifier.show_most_informative_features(20)

Most Informative Features
                  stupid = True              neg : pos    =      6.8 : 1.0
                personal = True              pos : neg    =      5.8 : 1.0
                animated = True              pos : neg    =      5.8 : 1.0
           psychological = True              pos : neg    =      5.1 : 1.0
               adventure = True              pos : neg    =      5.1 : 1.0
                     bad = True              neg : pos    =      5.0 : 1.0
               contrived = True              neg : pos    =      4.9 : 1.0
                   crazy = True              neg : pos    =      4.9 : 1.0
                  strong = True              pos : neg    =      4.5 : 1.0
                    nice = True              pos : neg    =      4.5 : 1.0
                  deeply = True              pos : neg    =      4.5 : 1.0
                   aware = True              pos : neg    =      4.5 : 1.0
                possible = True              pos : neg    =      4.5 : 1.0

### Making predictions using Naive Bayes Classifier

In [22]:
predicted=classifier.classify_many([x[0] for x in testing_set])
conf_matrix = confusion_matrix(predicted, [x[1] for x in testing_set])
print(conf_matrix) 

print("Testing Accuracy:", (nltk.classify.accuracy(classifier, testing_set))*100)

[[59 44]
 [29 68]]
Testing Accuracy: 63.5
