In [1]:
import pandas as pd
import nltk 
from nltk.corpus import movie_reviews as mr      # taking nltk inbuilt dataset
from nltk.corpus import stopwords
from sklearn.utils import shuffle

In [2]:
stopword= set(stopwords.words('english'))

In [3]:
pos_raw= mr.raw(mr.fileids('pos'))       # raw positive sentences from corpus (unformated form)
neg_raw= mr.raw(mr.fileids('neg'))       # raw negtive sentences from corpus

In [6]:
pos= pos_raw.split('\n')              # splliting sentences from raw data file
neg= pos_raw.split('\n')

In [7]:
pos_tag= []       # giving each sentence label 
neg_tag= []
for i in pos:
    pos_tag.append([i,1])
for i in neg:
    neg_tag.append([i,0])

In [8]:
pos= pd.DataFrame(pos_tag,columns=['txt','status'])
neg= pd.DataFrame(neg_tag,columns=['txt','status'])

In [9]:
data= pd.concat([pos,neg],axis=0)          # creating processesable dataframe
data=shuffle(data)

In [10]:
data['status'].value_counts()

1    32938
0    32938
Name: status, dtype: int64

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer as tfid
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score



In [12]:
vectorizer= tfid(stop_words= stopword,max_features=21)       # using TfIdf to make words as features by making word vectors
x= vectorizer.fit_transform(data['txt'])
y= data.status

In [13]:
x_train,x_test,y_train,y_test= train_test_split(x,y,random_state= 42)         # splitting data for cross validation

In [15]:
from sklearn.naive_bayes import MultinomialNB        # using multiNomial Naive Bayes as classifier

clf= MultinomialNB()

clf.fit(x_train,y_train)
pred= clf.predict(x_test)
print(roc_auc_score(pred,y_test))

0.4966238635292005


In [16]:
x.shape

(65876, 21)

In [17]:
for i in range(20,40):
    vectorizer= tfid(stop_words= stopword,max_features=i)          # checking for optimum max features
    x= vectorizer.fit_transform(data['txt'])
    y= data.status
    x_train,x_test,y_train,y_test= train_test_split(x,y,random_state= 42)
    clf= MultinomialNB()
    clf.fit(X=x_train,y=y_train)
    pred= clf.predict(x_test)
    print(i,roc_auc_score(pred,y_test))


20 0.4951045347972237
21 0.4966238635292005
22 0.49416746519807087
23 0.49510524278776513
24 0.4934397185873044
25 0.4942005374568356
26 0.49182745136393036
27 0.4922044015054945
28 0.49339790097915387
29 0.493979901834336
30 0.4930821160074844
31 0.49318418962573746
32 0.49164338919925515
33 0.4906535152544381
34 0.48984682061218426
35 0.49000221733765237
36 0.4906749758821638
37 0.48933792764931416
38 0.4884977410255855
39 0.4888862013813785


### Thus 21 is optimum no. of max features for this dataset and TfIdf takes most informative word by setting higher weights to unique words in a corpus.