In [1]:
import re

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer

In [2]:
# Load the training data from Train.csv in the working directory into a DataFrame.
df = pd.read_csv("Train.csv")

In [7]:
df.head()

Unnamed: 0,review,label
0,mature intelligent and highly charged melodram...,pos
1,http://video.google.com/videoplay?docid=211772...,pos
2,Title: Opera (1987) Director: Dario Argento Ca...,pos
3,I think a lot of people just wrote this off as...,pos
4,This is a story of two dogs and a cat looking ...,pos


In [4]:
# Tokenizer that keeps only runs of ASCII letters; digits, punctuation and
# whitespace act as separators and are discarded.
tokenizer = RegexpTokenizer('[a-zA-Z]+')
# Porter stemmer used to reduce each kept token to its stem.
ps = PorterStemmer()
# English stop words held in a set so membership tests are cheap.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
# downloaded locally — confirm on a fresh environment.
en_stopwords = set(stopwords.words('english'))


In [5]:
def getStemmedReview(review):
    """Normalise a raw review into a space-separated string of stemmed tokens.

    Steps: lowercase the text, blank out HTML line-break tags, split into
    alphabetic tokens with the module-level ``tokenizer``, drop tokens found
    in ``en_stopwords``, and stem the remainder with the module-level ``ps``.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces (an empty
        string when no token survives).
    """
    review = review.lower()
    # Strip every line-break tag variant (<br>, <br/>, <br />), not just the
    # doubled '<br /><br />' form: a lone tag would otherwise be tokenized
    # into a spurious 'br' token by the letters-only tokenizer.
    review = re.sub(r'<br\s*/?>', ' ', review)
    tokens = tokenizer.tokenize(review)
    kept_tokens = [token for token in tokens if token not in en_stopwords]
    return ' '.join(ps.stem(token) for token in kept_tokens)

In [6]:
# Sanity-check the cleaner on the second training review. The text is
# selected by column label rather than positional column index 0, so the
# check keeps working if the column order of the frame ever changes.
s = df['review'].iloc[1]
getStemmedReview(s)

'http video googl com videoplay docid hl en distribut tri opt mass appeal want best possibl view rang forgo profit continu manual labor job gladli entertain work view texa tale pleas write like like alex like stuie texa texa tale write opinion rule'

In [12]:
# Pull the 'review' column out of the training frame as an array of raw texts.
X = df['review'].values

In [13]:
X.shape

(40000,)

In [14]:
X[0]

"mature intelligent and highly charged melodrama unbelivebly filmed in China in 1948. wei wei's stunning performance as the catylast in a love triangle is simply stunning if you have the oppurunity to see this magnificent film take it"

In [16]:
# Clean every training review, keeping the same order as X.
document = [getStemmedReview(raw_review) for raw_review in X]

In [18]:
# Load the test data from Test.csv in the working directory into a DataFrame.
df_test = pd.read_csv('Test.csv')

In [19]:
df_test.head(n = 10)

Unnamed: 0,review
0,Remember those old kung fu movies we used to w...
1,This movie is another one on my List of Movies...
2,How in the world does a thing like this get in...
3,"""Queen of the Damned"" is one of the best vampi..."
4,The Caprica episode (S01E01) is well done as a...
5,I usually really enjoy Steven Seagal movies. T...
6,JiÃ¸Ã­ Trnka made his last animated short an i...
7,This is so bad it will be my contribution to t...
8,Watching this hilariously retro but very enter...
9,"Excellent political thriller, played much quie..."


In [20]:
# Pull the 'review' column out of the test frame as an array of raw texts.
X_test = df_test['review'].values

In [21]:
X_test.shape

(10000,)

In [22]:
# Apply the same cleaning function to every test review, in order.
test_doc = list(map(getStemmedReview, X_test))

### Vectorization

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

In [55]:
# Count-based bag-of-words over unigrams and bigrams (ngram_range=(1, 2)).
cv = CountVectorizer(ngram_range=(1, 2))

# Learn the vocabulary from the cleaned training documents and vectorize them
# in one pass; the printed shape is (number of documents, vocabulary size).
X_vec = cv.fit_transform(document)
print(X_vec.shape)

(40000, 2235661)


In [56]:
# Vectorize the test documents with the already-fitted vectorizer (transform,
# not fit_transform) so the columns line up with X_vec.
X_test_vec = cv.transform(test_doc)

In [57]:
print(X_test_vec.shape)

(10000, 2235661)


### Multinomial Naive Bayes

In [59]:
from sklearn.naive_bayes import MultinomialNB

In [60]:
# Multinomial Naive Bayes classifier with its default hyper-parameters.
mnb = MultinomialNB()

In [61]:
# Raw string labels ('pos' / 'neg') for the training reviews.
Y_train = df['label']

In [62]:
Y_train

0        pos
1        pos
2        pos
3        pos
4        pos
        ... 
39995    neg
39996    neg
39997    neg
39998    pos
39999    pos
Name: label, Length: 40000, dtype: object

In [64]:
# Encode the labels as 1 for 'pos' and 0 for anything else. The encoding is
# derived from df['label'] rather than from Y_train itself so that re-running
# this cell cannot compare already-encoded integers against 'pos' and
# silently produce an all-zero target.
Y_train = np.array(df['label'] == 'pos', dtype = 'int32')

In [65]:
Y_train

array([1, 1, 1, ..., 0, 1, 1])

In [66]:
# Fit the classifier on the vectorized training reviews and their 0/1 labels.
mnb.fit(X_vec, Y_train)

MultinomialNB()

In [69]:
# Predict a 0/1 label for every vectorized test review.
Y_test = mnb.predict(X_test_vec)

In [70]:
print(Y_test.shape)

(10000,)


In [72]:
# Translate each numeric prediction back to its text label:
# 1 -> "pos", anything else -> "neg".
ans_doc = ["pos" if prediction == 1 else "neg" for prediction in Y_test]

In [74]:
# Rebind Y_test from the numeric predictions to their string labels.
# NOTE(review): after this rebinding, re-running the cell that builds ans_doc
# would compare strings against 1 and map every entry to "neg" — run top to
# bottom only, or consider a separate name for the string labels.
Y_test = np.array(ans_doc)

In [75]:
Y_test

array(['neg', 'neg', 'neg', ..., 'pos', 'pos', 'neg'], dtype='<U3')

In [76]:
# Empty frame that the following cells fill with the 'id' and 'label' columns.
new_df = pd.DataFrame()

In [77]:
# One sequential id per prediction. The count comes from the predictions
# themselves instead of a hard-coded 10000, so the id and label columns stay
# aligned if the test set size ever changes.
new_df['id'] = np.arange(len(Y_test))

In [78]:
# Attach the predicted string labels next to the ids.
new_df['label'] = Y_test

In [79]:
new_df.head(10)

Unnamed: 0,id,label
0,0,neg
1,1,neg
2,2,neg
3,3,pos
4,4,pos
5,5,neg
6,6,pos
7,7,neg
8,8,pos
9,9,pos


In [80]:
# Write the id/label table to MovieReview.csv in the working directory;
# index=False keeps the DataFrame index out of the file.
new_df.to_csv('MovieReview.csv', index=False)