In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('Train.csv')

In [3]:
data.head(n = 10)

Unnamed: 0,review,label
0,mature intelligent and highly charged melodram...,pos
1,http://video.google.com/videoplay?docid=211772...,pos
2,Title: Opera (1987) Director: Dario Argento Ca...,pos
3,I think a lot of people just wrote this off as...,pos
4,This is a story of two dogs and a cat looking ...,pos
5,Steve Carell comes into his own in his first s...,pos
6,I'm only going to write more because it's requ...,neg
7,"OK, it was a ""risky"" move to rent this flick, ...",neg
8,"Cannibalism, a pair of cinematic references to...",pos
9,This is one of the great modern kung fu films....,pos


In [4]:
from sklearn.preprocessing import LabelEncoder

In [5]:
le = LabelEncoder()

In [6]:
data['label'] = le.fit_transform(data['label'])

In [7]:
data.head(n = 10)

Unnamed: 0,review,label
0,mature intelligent and highly charged melodram...,1
1,http://video.google.com/videoplay?docid=211772...,1
2,Title: Opera (1987) Director: Dario Argento Ca...,1
3,I think a lot of people just wrote this off as...,1
4,This is a story of two dogs and a cat looking ...,1
5,Steve Carell comes into his own in his first s...,1
6,I'm only going to write more because it's requ...,0
7,"OK, it was a ""risky"" move to rent this flick, ...",0
8,"Cannibalism, a pair of cinematic references to...",1
9,This is one of the great modern kung fu films....,1


In [8]:
data = data.values

In [9]:
x = data[:, 0]
y = data[:, 1]
print(x.shape, y.shape)

(40000,) (40000,)


In [10]:
# CLEANING THE DATA

In [11]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [12]:
tokenizer = RegexpTokenizer(r'\w+')
en_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

In [13]:
def getCleanedReview(review):
    review = review.lower()
    review = review.replace('<br /><br />', ' ')
    review = review.replace('http://video.google.com/videoplay?docid=211772166650071408&hl=en', ' ')
    
    tokens = tokenizer.tokenize(review)
    new_tokens = [token for token in tokens if token not in en_stopwords]
    stemmed_tokens = [ps.stem(token) for token in new_tokens]
    
    cleaned_review = ' '.join(stemmed_tokens)
    return cleaned_review

In [14]:
rev = x[1]
print(rev)

http://video.google.com/videoplay?docid=211772166650071408&hl=en Distribution was tried.<br /><br />We opted for mass appeal.<br /><br />We want the best possible viewing range so, we forgo profit and continue our manual labor jobs gladly to entertain you for working yours.<br /><br />View Texas tale, please write about it... If you like it or not, if you like Alex or not, if you like Stuie, Texas or Texas tale... Just write about it.<br /><br />Your opinion rules.


In [15]:
rev = getCleanedReview(rev)
print(rev)

distribut tri opt mass appeal want best possibl view rang forgo profit continu manual labor job gladli entertain work view texa tale pleas write like like alex like stuie texa texa tale write opinion rule


In [16]:
for i in range(x.shape[0]):
    x[i] = getCleanedReview(x[i])

In [17]:
print(x[:3])

['matur intellig highli charg melodrama unbelivebl film china 1948 wei wei stun perform catylast love triangl simpli stun oppurun see magnific film take'
 'distribut tri opt mass appeal want best possibl view rang forgo profit continu manual labor job gladli entertain work view texa tale pleas write like like alex like stuie texa texa tale write opinion rule'
 'titl opera 1987 director dario argento cast cristina masillach ian charleson urbano barberini daria nicolodi review argento movi seen suspiria one blew away style color spooki stori line next decid go opera told one best man think discov ultim one favorit horror director opera young opera singer get big break main star creepi modern opera take mc beth get hit car betti understudi get part bad psycho make watch brutal murder friend co worker wow id heard good thing flick prepar level great film would take yeah movi shortcom ill get later part movi blew away first movi fill lot color suspiria expect bit like suspiria depart surpri

In [18]:
# TOKENIZING DATA

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
# 1st TOKEN
cv1 = CountVectorizer()
x_vec_1 = cv1.fit_transform(x[:1000]).toarray()
# print(x_vec[:5])
print(x_vec_1.shape)

(1000, 12585)


In [21]:
# 2nd TOKEN
cv2 = CountVectorizer()
x_vec_2 = cv2.fit_transform(x[1000:2000]).toarray()
print(x_vec_2.shape)

(1000, 12807)


In [22]:
# 3rd TOKEN
cv3 = CountVectorizer()
x_vec_3 = cv3.fit_transform(x[2000:3000]).toarray()
print(x_vec_3.shape)

(1000, 12723)


In [23]:
# NAIVE BAYES

In [24]:
from sklearn.naive_bayes import MultinomialNB

In [25]:
mnb1 = MultinomialNB()
mnb2 = MultinomialNB()
mnb3 = MultinomialNB()

In [26]:
y_new_1 = y[:1000].tolist()
y_new_2 = y[1000:2000].tolist()
y_new_3 = y[2000:3000].tolist()

In [28]:
mnb1.fit(x_vec_1, y_new_1)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [29]:
mnb2.fit(x_vec_2, y_new_2)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [30]:
mnb3.fit(x_vec_3, y_new_3)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [31]:
# PREDICTING TRAINING DATA

In [32]:
# mnb1.predict(x_vec_1)

In [33]:
mnb1.score(x_vec_1, y_new_1)

0.988

In [34]:
# TESTING DATA

In [35]:
x_test = pd.read_csv('Test.csv')

In [36]:
x_test.head(n = 10)

Unnamed: 0,review
0,Remember those old kung fu movies we used to w...
1,This movie is another one on my List of Movies...
2,How in the world does a thing like this get in...
3,"""Queen of the Damned"" is one of the best vampi..."
4,The Caprica episode (S01E01) is well done as a...
5,I usually really enjoy Steven Seagal movies. T...
6,JiÃ¸Ã­ Trnka made his last animated short an i...
7,This is so bad it will be my contribution to t...
8,Watching this hilariously retro but very enter...
9,"Excellent political thriller, played much quie..."


In [37]:
x_test = x_test.values
x_test = x_test.reshape((-1, ))

In [38]:
print(x_test.shape)

(10000,)


In [39]:
for i in range(x_test.shape[0]):
    x_test[i] = getCleanedReview(x_test[i])

In [41]:
x_test_vec_1 = cv1.transform(x_test).toarray()
# print(x_test_vec[:5])
print(x_test_vec_1.shape)

(10000, 12585)


In [42]:
pred1 = mnb1.predict(x_test_vec_1)

In [43]:
x_test_vec_2 = cv2.transform(x_test).toarray()
print(x_test_vec_2.shape)

(10000, 12807)


In [44]:
pred2 = mnb2.predict(x_test_vec_2)

In [45]:
x_test_vec_3 = cv3.transform(x_test).toarray()
print(x_test_vec_3.shape)

(10000, 12723)


In [46]:
pred3 = mnb3.predict(x_test_vec_3)

In [88]:
print(pred.shape)

(10000,)


In [89]:
print(pred[:10])

[0 1 0 1 1 0 0 0 1 1]


In [48]:
new_pred = []
for i in range(pred1.shape[0]):
    if pred1[i] + pred2[i] + pred3[i] >= 2:
        new_pred.append('pos')
    else:
        new_pred.append('neg')

In [49]:
print(new_pred[:10])

['neg', 'neg', 'neg', 'pos', 'pos', 'neg', 'pos', 'neg', 'pos', 'pos']


In [51]:
ids = np.arange(pred1.shape[0])

In [52]:
ans = np.stack((ids, new_pred))

In [53]:
print(ans.shape)

(2, 10000)


In [54]:
ans = ans.T
print(ans.shape)

(10000, 2)


In [55]:
df = pd.DataFrame(ans, columns = ['Id', 'label'])

In [56]:
df.head()

Unnamed: 0,Id,label
0,0,neg
1,1,neg
2,2,neg
3,3,pos
4,4,pos


In [57]:
df.to_csv('answer.csv', index = False)