In [1]:
import numpy as np
import pandas as pd

In [2]:
from nltk.tokenize import RegexpTokenizer
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


In [3]:
encoder = LabelEncoder()
tokenizer = RegexpTokenizer(r'\w+')
en_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

In [4]:
df = pd.read_csv('Train.csv')
data = df.values

In [5]:
data.shape

(40000, 2)

In [6]:
X = data[:,0]
y = data[:,1]

In [7]:
X_list = list(X)

In [8]:
y = encoder.fit_transform(y)

In [9]:
y.shape

(40000,)

## Data Preprocessing
 - Tokenization
 - Stopword Removal
 - Stemming
 - Vectorization

In [10]:

def getCleanReview(X):
    

    clean_doc = []
    #Tokenize
    for i in range(len(X)):
        tokens = tokenizer.tokenize(X[i])
        new_tokens = [token for token in tokens if token not in en_stopwords]
        stemmed_tokens = [ps.stem(token) for token in new_tokens]
        clean_doc.append(' '.join(stemmed_tokens))

    
    return clean_doc

In [11]:
clean = getCleanReview(X_list)

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
cv = CountVectorizer()
X_vect = cv.fit_transform(clean)
X_vect

<40000x65804 sparse matrix of type '<class 'numpy.int64'>'
	with 3983866 stored elements in Compressed Sparse Row format>

In [14]:
X_vect.shape

(40000, 65804)

### Test data

In [15]:
test = pd.read_csv('Test.csv').values.reshape((-1,))
test = list(test)

In [16]:
clean_test = getCleanReview(test) 
test_vect = cv.transform(clean_test)

### Model Training 

In [17]:
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB

### Multinomial Naive Bayes, Gaussian Navie Bayes

In [18]:
mnb = MultinomialNB()
gnb = GaussianNB()

In [19]:
mnb.fit(X_vect,y)

MultinomialNB()

In [20]:
prediction = mnb.predict(test_vect)

In [21]:
pred_list = list(prediction)


In [22]:
pred_list2 = []
for x in pred_list:
    if x == 0:
        pred_list2.append('neg')
    else:
        pred_list2.append('pos')

In [23]:
type(pred_list2)

list

In [24]:
df = pd.DataFrame(data=pred_list2,columns=['label'])
df.to_csv('test_predication.csv',index_label='Id')

### Bernoulli Naive Bayes 

In [25]:
bnb = BernoulliNB()

In [26]:
bnb.fit(X_vect,y)

BernoulliNB()

In [27]:
pred_bnb = bnb.predict(test_vect)

In [28]:
pred_bnb = list(pred_bnb)
pred_bernoulli = []
for x in pred_bnb:
    if x == 0:
        pred_bernoulli.append('neg')
    else:
        pred_bernoulli.append('pos')

In [29]:
df = pd.DataFrame(data=pred_bernoulli,columns=['label'])
df.to_csv('test_predication_bnb.csv',index_label='Id')