# loading dataset:

In [1]:
from nltk.corpus import movie_reviews

In [2]:
import nltk
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [3]:
movie_reviews.categories()

['neg', 'pos']

In [4]:
movie_reviews.fileids()[0:5]

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt']

In [5]:
len(movie_reviews.fileids())

2000

In [6]:
#printing all neg docs:
movie_reviews.fileids('neg')[0:5]

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt']

In [7]:
#therefore we have all the files

In [8]:
#to read first doc:
print(movie_reviews.words(movie_reviews.fileids()[1]))

['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...]


# cleaning data:

In [9]:
#first adding all the documents in the array in form of tupples where tupple hold words and category.
documents=[]
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid),category))

In [10]:
#printing first 5 docs:
documents[0:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [11]:
#shuffling them for train_test_split:
import random
random.shuffle(documents)

In [12]:
#getting all stopwords and punctuations:
from nltk.corpus import stopwords
import string
stops=set(stopwords.words('english'))
punctuations=string.punctuation
stops.update(punctuations)

In [13]:
from nltk import pos_tag

In [14]:
#defining function which will convert pos_tag parts of speech into wordnet part of speech:
from nltk import WordNetLemmatizer as wnl
from nltk.corpus import wordnet
lemmatizer=wnl()
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [15]:
#defining function to clean documents:
def clean_review(words):
    output_words=[]
    for w in words:
        if w.lower() not in stops:
            pos=pos_tag([w])
            clean_word=lemmatizer.lemmatize(w,pos=get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words


In [16]:
documents=[(clean_review(document),category) for document,category in documents]

# buildind features dataset:

In [17]:
#now we will create array of tuples where each tuple will hold dictionary having all features and their value if in form of true or false and the category for each document in documents.


In [18]:
#splititng the data in 75-25 split.
dtrain=documents[0:1500]
dtest=documents[1500:]

In [19]:
#creating array of all words:
all_words=[]
for doc in dtrain:
    all_words+=doc[0]

In [20]:
#getting freq distribution:
freq=nltk.FreqDist(all_words)
common=freq.most_common(3000)
#therefore we took top 3k most occuring words
features=[i[0] for i in common]

In [21]:
features[0:5]
#top 5 features

['film', 'movie', 'one', 'make', 'like']

In [22]:
#for each doc in documents we will create a dictionary having features and their value:
def get_feature_dict(words):
    current_features={}
    words_set=set(words)
    for w in features:
        current_features[w]=w in words_set
    #it will return true or false value according to if word in feature or not:
    return current_features

In [23]:
#storing dic and category for each documents in tuple in the array:
training_data=[(get_feature_dict(doc),category) for doc,category in dtrain]

In [24]:
test_data=[(get_feature_dict(doc),category) for doc,category in dtest]

# building classifier

In [25]:
#using nltk inbuilt classifier naive bayes:
from nltk import NaiveBayesClassifier as nbc

In [26]:
classifier=nbc.train(training_data)

In [27]:
nltk.classify.accuracy(classifier,test_data)

0.788

In [28]:
#thereofore we got decent accuracy.

In [29]:
classifier.show_most_informative_features(15)

Most Informative Features
                   poker = True              pos : neg    =      9.1 : 1.0
                   anger = True              pos : neg    =      8.5 : 1.0
                   mulan = True              pos : neg    =      8.4 : 1.0
             beautifully = True              pos : neg    =      8.3 : 1.0
             outstanding = True              pos : neg    =      7.8 : 1.0
                  belief = True              pos : neg    =      7.8 : 1.0
                lifeless = True              neg : pos    =      7.8 : 1.0
                  seagal = True              neg : pos    =      6.9 : 1.0
              schumacher = True              neg : pos    =      6.5 : 1.0
                   ideal = True              pos : neg    =      6.4 : 1.0
                 idiotic = True              neg : pos    =      6.3 : 1.0
                 comfort = True              pos : neg    =      6.0 : 1.0
                 pattern = True              pos : neg    =      5.9 : 1.0

In [32]:
#applying randomforestclassifier from sklearn:
from sklearn.ensemble import RandomForestClassifier as rf
from nltk.classify.scikitlearn import SklearnClassifier as sc

  from numpy.core.umath_tests import inner1d


In [33]:
clf=rf()
classifier=sc(clf)

In [34]:
classifier.train(training_data)

<SklearnClassifier(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))>

In [37]:
nltk.classify.accuracy(classifier,test_data)

0.674

# applying count vectorizer:

In [38]:
#here we gonna use documents and apply count vectorizer on them and then apply sklearn classifier.

In [39]:
categories=[category for document,category in documents]

In [40]:
documents_text=[' '.join(document) for document,category in documents]

In [41]:
from sklearn.feature_extraction.text import CountVectorizer as cv

In [42]:
from sklearn.model_selection import train_test_split as tt

In [43]:
x_train,x_test,y_train,y_test=tt(documents_text,categories)

In [49]:
count_vec=cv(max_features=3000,ngram_range=(1,2))
train_data=count_vec.fit_transform(x_train)
test_data=count_vec.transform(x_test)

In [46]:
#applying svm
from sklearn.svm import SVC


In [50]:
svc=SVC()
svc.fit(train_data,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [51]:
svc.score(test_data,y_test)

0.778