In [1]:
from nltk.corpus import movie_reviews

In [2]:
import nltk
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Ananthapadmanabha\AppData\Roaming\nltk_data..
[nltk_data]     .
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [4]:
# to access all the words present in the file 
movie_reviews.words(movie_reviews.fileids('pos'))

['films', 'adapted', 'from', 'comic', 'books', 'have', ...]

In [5]:
movie_reviews.words(movie_reviews.fileids()[1])

['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...]

What is stemming and lemmatization?

Stemming: The process of reducing the words to their stem. Its basically converting all the words into their nearest meaning root word which will help us in doing analysis.
Using the stem word we will be able to determine wheather a word is positive or negative.
Application: detecting spam words
Lemmatization : Same as stemming but the stem will have meaning that can be understood by a human.

In [6]:
from nltk.corpus import wordnet

def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN


In [7]:
# to append all the words present in the files to a document 
document = []
for category in movie_reviews.categories():
    for files in movie_reviews.fileids(category):
        document.append((movie_reviews.words(files),category))
print(document[0:4])


[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'), (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'), (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'), (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg')]


In [8]:
#lets shuffle the documents because we will have a mix of positive and negative documents
import random
random.shuffle(document)

In [9]:
print(document[0:4])

[(['bad', '.', 'bad', '.', 'bad', '.', 'that', 'one', ...], 'neg'), (['that', 'is', ',', 'unless', 'you', "'", 're', 'one', ...], 'neg'), (['two', 'party', 'guys', 'bob', 'their', 'heads', 'to', ...], 'neg'), (['say', ',', 'tell', 'me', 'if', 'you', "'", 've', ...], 'neg')]


In [42]:
# to remove the stopwords and apply lemmatization and to get the part of speech of each of the words that the data belongs to
def clean_code(words):
    output_words = []
    for word in words:
        if word.lower() not in stops:
            pos = pos_tag([word])
            clean_word = lemmatizer.lemmatize(word, get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words






In [37]:
#importing the pos_tag 
#note: the input to this pos tag shouls always be an array []
from nltk import pos_tag

In [38]:
#example of getting pos tag for a word 
word = "enemy"
pos_tag([word])

[('enemy', 'NN')]

In [39]:
#importing a lemmatizer 
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


In [28]:
#creating a list of stop words and also punctuation 
from nltk.corpus import stopwords
import string 
stops = set(stopwords.words('english'))
puntuations = list(string.punctuation)
stops.update(puntuations)

In [43]:
stops
import time 

In [44]:
#nltk.download('wordnet')
start = time.time()
document = [(clean_code(documents), category) for documents,category in document]
end = time.time()
print("Cleaning time: ",end - start)

Cleaning time:  351.25865745544434


In [46]:
training_documents = document[0:1500]
testing_documents = document[1500:]

In [47]:
#making a list of all the words in the training document 
all_words = []
for doc in training_documents:
    all_words += doc[0]

In [48]:
import nltk

In [53]:
#creating a list of most common features present in the training data set
freq = nltk.FreqDist(all_words)
common = freq.most_common(3000)
features = [feature for feature, number in common]


In [59]:
#creating a dictionary to specify if the words in the features are present in the input given
def get_dictionary(words):
    dict = {}
    set_words = set(words)
    for w in features:
        dict[w] = w in set_words
    return dict


In [58]:
#get_dictionary(training_documents[0][0])

{'film': True,
 'movie': True,
 'one': True,
 'make': True,
 'like': True,
 'character': False,
 'get': True,
 'see': True,
 'go': True,
 'time': True,
 'well': True,
 'scene': True,
 'even': True,
 'good': True,
 'story': False,
 'take': True,
 'would': True,
 'much': True,
 'also': True,
 'come': True,
 'life': True,
 'way': True,
 'two': True,
 'give': True,
 'bad': True,
 'end': False,
 'look': True,
 'first': True,
 'know': True,
 'seem': True,
 '--': True,
 'year': False,
 'thing': True,
 'work': False,
 'say': True,
 'play': True,
 'really': True,
 'plot': False,
 'show': False,
 'little': False,
 'people': False,
 'man': True,
 'u': False,
 'love': False,
 'star': False,
 'best': False,
 'director': True,
 'could': False,
 'never': True,
 'try': True,
 'great': False,
 'action': False,
 'performance': False,
 'big': True,
 'new': False,
 'want': True,
 'many': True,
 'actor': False,
 'watch': True,
 'find': True,
 'think': True,
 'act': True,
 'role': True,
 'another': True,
 '

In [63]:
training_data = [(get_dictionary(data),category)for data, category in training_documents]
testing_data = [(get_dictionary(data),category)for data, category in testing_documents]

# Using the naive bayes from nltk classifier

In [66]:
from nltk import NaiveBayesClassifier

In [67]:
classifier = NaiveBayesClassifier.train(training_data)

In [69]:
nltk.classify.accuracy(classifier, testing_data)

0.772

In [73]:
classifier.show_most_informative_features(10)

Most Informative Features
               ludicrous = True              neg : pos    =     19.5 : 1.0
                   anger = True              pos : neg    =     11.5 : 1.0
            breathtaking = True              pos : neg    =     11.1 : 1.0
                 idiotic = True              neg : pos    =     10.9 : 1.0
                   jolie = True              neg : pos    =      9.9 : 1.0
             outstanding = True              pos : neg    =      9.5 : 1.0
             beautifully = True              pos : neg    =      9.3 : 1.0
                  random = True              neg : pos    =      8.7 : 1.0
                 balance = True              pos : neg    =      8.5 : 1.0
                ordinary = True              pos : neg    =      7.4 : 1.0


In [74]:
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier

In [75]:
svc = SVC()
classifier_sklearn = SklearnClassifier(svc)

In [76]:
classifier_sklearn.train(training_data)

<SklearnClassifier(SVC())>

In [77]:
nltk.classify.accuracy(classifier_sklearn, testing_data)

0.834

# Using modules from sklearn to train the data

In [78]:
from sklearn.ensemble import RandomForestClassifier

In [79]:
random_forest = RandomForestClassifier()
classy_random = SklearnClassifier(random_forest)

In [83]:
classy_random.train(training_data)

<SklearnClassifier(RandomForestClassifier())>

In [84]:
nltk.classify.accuracy(classy_random, testing_data)

0.788

In [85]:
from sklearn.feature_extraction.text import CountVectorizer

In [119]:
train_set = {"the sky is blue ", "sun is dark"}
cv = CountVectorizer(max_features=3, ngram_range=(2,3))
a = cv.fit_transform(train_set)


In [120]:
cv.get_feature_names()

['is blue', 'is dark', 'sky is']

In [92]:
categories = [category for documents, category in document]

In [94]:
text_doc = [" ".join(documents) for documents, categories in document]

In [100]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(text_doc, categories)

In [121]:
cv = CountVectorizer(max_features=2000, ngram_range=(1,2))
train_features = cv.fit_transform(X_train)


In [122]:
test_features = cv.transform(X_test)

In [123]:
from sklearn.svm import SVC

In [124]:
scc = SVC()
scc.fit(train_features,y_train)

SVC()

In [125]:
scc.score(test_features, y_test)

0.824