In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
# Patterns are compiled once when the cell runs and reused for every review.
_HTML_TAG_RE = re.compile(r'<[^<>]+>')
_DIGIT_RUN_RE = re.compile(r'[0-9]+')
_STOP_WORDS = None  # populated lazily by _get_stop_words()


def _get_stop_words():
    """Return the English stop-word set without "not", building it on first use."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        words = set(stopwords.words("english"))
        # "not" must survive filtering; discard() does not raise if the
        # corpus list happens not to contain it (remove() would).
        words.discard("not")
        _STOP_WORDS = frozenset(words)
    return _STOP_WORDS


def cleaning_data(review):
    """Normalise one raw review string for vectorisation.

    Steps, in order:
      1. replace every ``<...>`` HTML tag with a single space;
      2. replace every run of ASCII digits with the token ``number``;
      3. lower-case the text and split it on whitespace;
      4. drop English stop words, except ``not``, which is kept.

    Punctuation is not stripped, so a token such as ``dumb.`` is kept as is
    and a stop word followed by punctuation (``the,``) is not filtered.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    without_tags = _HTML_TAG_RE.sub(" ", review)
    with_number_token = _DIGIT_RUN_RE.sub("number", without_tags)
    tokens = with_number_token.lower().split()
    # The stop-word set is loaded once and cached instead of being rebuilt
    # from the corpus for every review.
    stop_words = _get_stop_words()
    return " ".join(word for word in tokens if word not in stop_words)

In [3]:
# Load the labelled training set; the file is tab-separated.
data_train = pd.read_csv('labeledTrainData.tsv', sep="\t")

In [4]:
# Inputs are the review texts; targets are the sentiment column.
x, y = data_train['review'], data_train['sentiment']
print(x.shape, y.shape)

(25000,) (25000,)


In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
y_train

(20000,) (5000,) (20000,) (5000,)


23311    0
23623    0
1020     0
12645    1
1533     1
        ..
21575    0
5390     1
860      1
15795    1
23654    0
Name: sentiment, Length: 20000, dtype: int64

In [6]:
# Show the stock NLTK English stop-word list for reference.
# A set has no defined order, so the printed order can differ between runs.
print("List of stop words!")
a = set(stopwords.words("english"))
print(a, "---Ended---\n", sep="\n")

List of stop words!
{'that', 'did', 'we', "mightn't", 'should', 'won', 'myself', 're', 'll', 'why', 'are', 'my', 's', "didn't", 'i', 'but', 'most', "hadn't", 'by', "isn't", 'some', 'because', 'mightn', 'does', 'the', 'yourself', 'while', 'few', "should've", 'under', 'he', 'same', 'has', 'again', 'her', 'further', 'each', "shouldn't", "you'd", "don't", 'themselves', 'had', 'such', 'm', 'after', 'to', 'than', 'own', 'an', 'more', 'our', 'with', 'here', 'until', 'for', "shan't", 'hers', "needn't", 'of', 'down', "won't", 'have', 'before', 'shouldn', 'their', 'at', 'there', 'both', 'below', 've', 'were', 'himself', "she's", 'herself', 'am', 'about', 'being', 'on', 'be', 'and', 'when', 'they', 'o', "that'll", 'who', 'now', 'your', 'between', 'those', "aren't", 'doing', 'don', 'against', 'needn', 'whom', 'wouldn', 'what', 'ourselves', 'is', 'during', 'wasn', "you'll", 'isn', 'from', 'over', 'where', 'no', 'd', "doesn't", 'shan', 'ain', "wouldn't", 'so', 'you', 'them', 'these', 'then', 'his', 

In [7]:
# Run every training review through cleaning_data, keeping the row order of x_train.
cleanWords = [cleaning_data(review) for review in x_train]
print(cleanWords[0])

movie plain dumb. casting ralph meeker mike hammer fatuous climax, film exercise wooden predictability. mike hammer one detective fiction's true sociopaths. unlike marlow spade, put pieces together solve mystery, hammer breaks things apart get truth. film turns hammer boob surrounding bad guys ... well, dumb get away anything. one poorly drawn succumbs popcorn attack. parts movie right three stooges play book. velda's dance barre, instance, bad guy accidentally stabs boss back. continuity breaks shameful: frau blucher running centerline road camera tight lower legs way side camera pulls back wider shot. worst break, however, precedes popcorn attack. bad guy stalking hammer passes clock seconds hero, except clock shows seven minutes behind guy. fair, interesting camera angles lighting, grand finale bad must seen, reason gets two points number.


In [8]:
# Build TF-IDF features from the cleaned training reviews. The vocabulary is
# learned from the training split only and capped at 5000 terms.
vectorizer = TfidfVectorizer(analyzer = "word", tokenizer = None, preprocessor = None, stop_words = None, max_features = 5000)
# fit_transform returns a scipy sparse matrix with one row per training review.
# It is kept sparse on purpose: a dense copy would hold rows x 5000 floats
# (mostly zeros), and scikit-learn's naive Bayes estimators accept sparse input.
data_train_features = vectorizer.fit_transform(cleanWords)


In [9]:
# Fit a Bernoulli naive Bayes classifier on the training features.

print("Training the classifier\n")
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
# NOTE(review): per the scikit-learn docs BernoulliNB binarizes its input
# (default threshold 0.0), so the TF-IDF weights would reduce to term presence — confirm.
nb_model = BernoulliNB()
nb_model.fit(data_train_features, y_train)
# This score is computed on the same data the model was fitted on (training accuracy).
print("accuracy: ",nb_model.score(data_train_features, y_train))

Training the classifier

accuracy:  0.86655


In [10]:
# Apply the same cleaning to the held-out reviews, keeping the row order of x_test.
testcleanWords = [cleaning_data(review) for review in x_test]
print("---Review Processing Done!---\n")

---Review Processing Done!---



In [11]:
# Project the cleaned test reviews onto the vocabulary already fitted on the
# training data: transform() only, so nothing is re-learned from the test set.
data_test_features = vectorizer.transform(testcleanWords)
print("Test Features Created!!!\n")

Test Features Created!!!



In [12]:
from sklearn.metrics import confusion_matrix, classification_report

# Score the classifier on the held-out split.
predicted = nb_model.predict(data_test_features)

# Confusion matrix of true labels vs. predictions, then per-class precision/recall/F1.
matrix = confusion_matrix(y_test, predicted)
report = classification_report(y_test, predicted)
print(matrix, report, sep="\n")

[[2124  357]
 [ 363 2156]]
              precision    recall  f1-score   support

           0       0.85      0.86      0.86      2481
           1       0.86      0.86      0.86      2519

    accuracy                           0.86      5000
   macro avg       0.86      0.86      0.86      5000
weighted avg       0.86      0.86      0.86      5000

