## Create a NLP pipeline to 'clean' reviews data
- Load Input files and read reviews
- Tokenize
- Remove stopwords
- perform stemming
- Write clean data to output file

In [None]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from tensorflow.keras.datasets import imdb

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
tokenizer = RegexpTokenizer(r'\w+') # regular expression tokenizer
en_stopwwords = set(stopwords.words("english"))
ps = PorterStemmer()

In [None]:
def get_stemmed_review(review):
    review = review.lower()
    review = review.replace("br br "," ")
    #tokenize
    tokens = tokenizer.tokenize(review)
    new_tokens = [token for token in tokens if token not in en_stopwwords]
    stemmed_token = [ps.stem(token) for token in new_tokens]
    
    cleaned_review = " ".join(stemmed_token)
    return cleaned_review


In [None]:
(x_train,y_train),(x_test,y_test) = imdb.load_data()

In [None]:
vocab = imdb.get_word_index()

In [None]:
inv_vocab = dict(zip(vocab.values(),vocab.keys()))

In [None]:
train_data = []
for row in x_train:
    line = [inv_vocab[index-3] for index in row if index >=3]
    line = " ".join(line)
    train_data.append(line)

In [None]:
test_data = []
for row in x_test:
    line = [inv_vocab[index-3] for index in row if index >=3]
    line = " ".join(line)
    test_data.append(line)

In [None]:
clean_train_data = []
for row in train_data:
    line = get_stemmed_review(row)
    clean_train_data.append(line)

In [11]:
print(train_data[0])
print(clean_train_data[0])

this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert redford's is an amazing actor and now the same being director norman's father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for retail and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also congratulations to the two little boy's that played the part's of norman and paul they were just brilliant children are often left out of the praising list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be p

In [None]:
clean_test_data = []
for row in test_data:
    line = get_stemmed_review(row)
    clean_test_data.append(line)

## Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
cv = CountVectorizer()
X_train_vec = cv.fit_transform(clean_train_data[:10000]).toarray()
print(X_train_vec.shape)

(10000, 35162)


In [15]:
X_test_vec = cv.transform(clean_test_data[:2]).toarray()
print(X_test_vec.shape)

(2, 35162)


## Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB

In [None]:
mnb =MultinomialNB()

In [19]:
mnb.fit(X_train_vec,y_train[:10000])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [20]:
mnb.predict(X_test_vec)

array([0, 1])

In [21]:
y_test[:2]

array([0, 1])

In [29]:
mnb.score(X_train_vec,y_train[:10000])

0.9212

## Bernoulli Naive Bayes

In [None]:
bnb = BernoulliNB()

In [25]:
bnb.fit(X_train_vec,y_train[:10000])

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [26]:
bnb.predict(X_test_vec)

array([0, 1])

In [27]:
bnb.score(X_train_vec,y_train[:10000])

0.9191