In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('Train.csv')
df.head()

Unnamed: 0,review,label
0,mature intelligent and highly charged melodram...,pos
1,http://video.google.com/videoplay?docid=211772...,pos
2,Title: Opera (1987) Director: Dario Argento Ca...,pos
3,I think a lot of people just wrote this off as...,pos
4,This is a story of two dogs and a cat looking ...,pos


In [3]:
X_train = df['review'].values
Y_train = df['label'].values

In [4]:
df_test = pd.read_csv('Test.csv')
X_test = list(df_test.review)

In [5]:
# Cleaning
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer

In [6]:
en_stopwords = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')
ps = PorterStemmer()

In [7]:
def getCleanReview(review):
    review = review.lower()
    review = review.replace('<br /><br />',' ')
    
    #Tokenize
    tokens = tokenizer.tokenize(review)
    new_tokens = [token for token in tokens if token not in en_stopwords]
    stemmed_tokens = [ps.stem(token) for token in new_tokens]
    
    cleaned_review = ' '.join(stemmed_tokens)
    return cleaned_review

In [8]:
X_train = [getCleanReview(review) for review in X_train]
X_test = [getCleanReview(review) for review in X_test]

In [9]:
# Vectorization
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
cv = CountVectorizer(ngram_range = (1,2))
x_vec = cv.fit_transform(X_train)
x_test_vec = cv.transform(X_test)
x_vec.shape

(40000, 2270363)

In [11]:
# MultinomialNB
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(x_vec,Y_train)
pred = mnb.predict(x_test_vec)
pred

array(['neg', 'neg', 'neg', ..., 'pos', 'pos', 'neg'], dtype='<U3')

In [12]:
Id = np.arange(10000)
f = pd.DataFrame(pred,columns = ['label'])
f['Id'] = np.arange(10000)
f = f[['Id','label']]
f.to_csv('pred_multi.csv',index = False)

In [13]:
# BernoulliNB
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
bnb.fit(x_vec,Y_train)
pred2 = bnb.predict(x_test_vec)
pred

array(['neg', 'neg', 'neg', ..., 'pos', 'pos', 'neg'], dtype='<U3')

In [14]:
f2 = pd.DataFrame(pred2,columns = ['label'], index = Id)
f2.to_csv('pred_bernoulli',index_label = 'Id')