## NLP PIPELINE TO CLEAN "REVIEWS"
- LOAD INPUT AND READ REVIEW
- TOKENIZE
- REMOVE STOPWORDS
- PERFORM STEMMING
- WRITE CLEANED DATA TO OUTPUT FILE

In [1]:
SAMPLE_TEXT= """ The first horror movie I ever saw was “Jaws”–an all-time classic filmed in 1975 by Steven Spielberg. My parents did not let me watch “Alien,” “Friday 13,” or any other cult thrillers, so I desperately wanted to see one. Kids in my class were bragging about watching this or that scary movie, and I remember envying them greatly. One day when my parents were out, and I was staying home, lazily clicking the TV remote, when all of a sudden a huge maw with razor-sharp teeth almost jumped out of the screen. It was already the middle of the movie and I did not get all the premise, but I still watched it till the end. Next day, I could finally boast of watching a horror film."""

In [3]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer

In [5]:
rexp=RegexpTokenizer(r'\w+')
sw=set(stopwords.words("english"))
ps=PorterStemmer()

In [8]:
def get_cleaned_review(review):
    review=review.lower()
    review=review.replace("<br/><br>"," ")
    tokenized_review=rexp.tokenize(review)
    cleaned_review=[ token for token in tokenized_review if token not in sw ]
    stemmed_review=[ ps.stem(token) for token in cleaned_review]
    cleaned_review=" ".join(stemmed_review)
    return cleaned_review

In [9]:
get_cleaned_review(SAMPLE_TEXT)

'first horror movi ever saw jaw time classic film 1975 steven spielberg parent let watch alien friday 13 cult thriller desper want see one kid class brag watch scari movi rememb envi greatli one day parent stay home lazili click tv remot sudden huge maw razor sharp teeth almost jump screen alreadi middl movi get premis still watch till end next day could final boast watch horror film'

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
x = ["This was an awesome movie",
     "Great movie! I liked it a lot",
     "Happy Ending! awesome acting by the hero",
     "loved it! truly great",
     "bad not upto the mark",
     "could have better",
     "Surely a Disappointing movie"]
y= [1,1,1,1,0,0,0]

In [18]:
# Textual cleaning
x_cleaned=[get_cleaned_review(i) for i in x]

cv=CountVectorizer()

x_vec=cv.fit_transform(x_cleaned).toarray()



In [19]:
print(cv.get_feature_names())
print(x_vec)
print(x_vec.shape)


['act', 'awesom', 'bad', 'better', 'could', 'disappoint', 'end', 'great', 'happi', 'hero', 'like', 'lot', 'love', 'mark', 'movi', 'sure', 'truli', 'upto']
[[0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 0 0]
 [1 1 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1]
 [0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0]]
(7, 18)


# Multinomial Event Model

In [29]:
from sklearn.naive_bayes import MultinomialNB

In [30]:
nv=MultinomialNB()

In [31]:
# training 
nv.fit(x_vec,y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [32]:
# testing

# test data
test_x = ["I was happy happy and I loved the acting in the movie",
          "The movie I saw was bad"]
test_clean=[ get_cleaned_review(i) for i in test_x]

xt_vec=cv.transform(test_x)

nv.predict(xt_vec)


array([1, 0])

# Bernouli naive bayes

In [33]:
from sklearn.naive_bayes import BernoulliNB

In [34]:
bnb=BernoulliNB()

In [37]:
# training
bnb.fit(x_vec,y)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [38]:
# testing
bnb.predict(xt_vec)

array([1, 0])

In [42]:
bnb.predict_proba(xt_vec)

array([[0.49842269, 0.50157731],
       [0.76810888, 0.23189112]])

In [43]:
nv.predict_proba(xt_vec)

array([[0.42857143, 0.57142857],
       [0.64864865, 0.35135135]])