In [2]:
X_train = ["This was awesome an awesome movie",
	     "Great movie! I liked it a lot",
		"Happy Ending! awesome acting by the hero",
		"loved it! truly great",
		"bad not up to the mark",
		"could have been better",
		"Surel a dissappointing movie"]

y_train = [1,1,1,1,0,0,0] # 1->Positive, 0->Negative

X_test = ["I was happy & happy and I loved the acting in the movie",
		"The movie I saw was bad",
    "The movie was the best thing to watch"]

In [3]:
X_train

['This was awesome an awesome movie',
 'Great movie! I liked it a lot',
 'Happy Ending! awesome acting by the hero',
 'loved it! truly great',
 'bad not up to the mark',
 'could have been better',
 'Surel a dissappointing movie']

##Data Cleaning

In [4]:
from nltk.tokenize import RegexpTokenizer

In [5]:
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [6]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
tokenizer = RegexpTokenizer(r'\w+')
en_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

In [8]:
def getCleanedText(text):
  text.lower()

  #tokenize
  tokens = tokenizer.tokenize(text)
  new_tokens = [token for token in tokens if token not in en_stopwords] #adds words only if not present in stopwords

  #stemming
  stemmed_tokens = [ps.stem(tokens) for tokens in new_tokens]

  clean_text = " ".join(stemmed_tokens)
  return clean_text


In [9]:
X_clean = [getCleanedText(i) for i in X_train]
Xt_clean = [getCleanedText(i) for i in X_test]

In [10]:
X_clean

['thi awesom awesom movi',
 'great movi i like lot',
 'happi end awesom act hero',
 'love truli great',
 'bad mark',
 'could better',
 'surel dissappoint movi']

##Vectorization

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
cv = CountVectorizer(ngram_range = (1,2))

In [13]:
X_vec = cv.fit_transform(X_clean).toarray()
X_vec

array([[0, 0, 2, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1,
        1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0]])

In [14]:
print(cv.get_feature_names_out())

['act' 'act hero' 'awesom' 'awesom act' 'awesom awesom' 'awesom movi'
 'bad' 'bad mark' 'better' 'could' 'could better' 'dissappoint'
 'dissappoint movi' 'end' 'end awesom' 'great' 'great movi' 'happi'
 'happi end' 'hero' 'like' 'like lot' 'lot' 'love' 'love truli' 'mark'
 'movi' 'movi like' 'surel' 'surel dissappoint' 'thi' 'thi awesom' 'truli'
 'truli great']


In [22]:
Xt_vect = cv.transform(Xt_clean).toarray()

##Multinomial Naive Bayes

In [16]:
from sklearn.naive_bayes import MultinomialNB

In [17]:
mn = MultinomialNB()

In [18]:
mn.fit(X_vec,y_train)

In [19]:
y_pred = mn.predict(Xt_vect)

In [24]:
for i ,cnt in enumerate(y_pred):
  if i==1:
    print(f"{X_test[cnt]}. is a Positive sentence")
    print(i)
  else:
    print(f"{X_test[cnt]}. is a Negative sentence")
    print(i)

The movie I saw was bad. is a Negative sentence
0
I was happy & happy and I loved the acting in the movie. is a Positive sentence
1
The movie I saw was bad. is a Negative sentence
2
