In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the tab-separated review file. quoting=3 is csv.QUOTE_NONE, i.e. quote
# characters inside the reviews are kept as ordinary text instead of being
# interpreted as field delimiters.
data = pd.read_csv(
    "Hotel-sentiments.tsv",
    sep="\t",
    quoting=3,
)

# Cleaning the text

In [3]:
import re 
import nltk
from nltk.corpus import stopwords

In [4]:
# Fetch the NLTK stop-word corpus read later via stopwords.words("english").
# download() signals failure by returning False rather than raising; the
# recorded run could not reach the server, yet the later cells still ran, so a
# local copy presumably already existed on that machine.
nltk.download("stopwords")

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [5]:
# Peek at the review stored under index label 0 to see what raw text looks like.
data["Review"][0]


'I loved the promising staff.'

In [6]:
# Work through the cleaning steps on a single example first.
# Every character that is not an ASCII letter (punctuation, digits, ...) is
# replaced by a space, so only words remain.
first_review_raw = data.Review[0]
review = re.sub("[^A-Za-z]", " ", first_review_raw)
review

'I loved the promising staff '

In [7]:
# Lower-case the text so that e.g. "Loved" and "loved" map to the same token.
review = review.lower()

In [8]:
# Tokenise on whitespace: `review` changes from a str to a list of words here,
# so this cell cannot be re-run on its own (a list has no .split()).
review = review.split()

In [9]:
# Show the token list produced by the split above.
review

['i', 'loved', 'the', 'promising', 'staff']

# Removing stop words

In [10]:
# Build the English stop-word set once. The original expression re-read the
# word list and rebuilt the set for every token being tested, which is pure
# repeated work with the same result.
english_stopwords = set(stopwords.words("english"))

# Keep only the tokens that are not stop words (order is preserved).
review_without_stopwords = [word for word in review if word not in english_stopwords]

In [11]:
# Show the tokens that survived stop-word removal.
review_without_stopwords

['loved', 'promising', 'staff']

In [12]:
# Reference only (intentionally commented out): the same stop-word filtering
# written as an explicit for loop instead of a list comprehension. Note that it
# tests membership against the list returned by stopwords.words("english")
# directly, without converting it to a set.
# reviews = []
# for word in review:
#     if word not in stopwords.words("english"):
#         reviews.append(word)
# print(reviews)

# Using Stemmer

In [13]:
from nltk.stem.porter import PorterStemmer

In [14]:
# Single Porter stemmer instance, reused by every stemming step below.
ps = PorterStemmer()

In [15]:
# Build the English stop-word set once instead of once per token (the original
# rebuilt it inside the comprehension's condition).
english_stopwords = set(stopwords.words("english"))

# Drop stop words and reduce every remaining token to its Porter stem,
# e.g. "loved" -> "love", "promising" -> "promis".
review = [ps.stem(word) for word in review if word not in english_stopwords]

In [16]:
# Show the stemmed, stop-word-free tokens.
review

['love', 'promis', 'staff']

# Convert the review tokens back to a single string

In [17]:
# Re-assemble the tokens into one space-separated string; `review` is a str
# again after this cell.
review = " ".join(review)

# Applying the same process to the entire "Review" series

In [19]:
def clean_review(text, stemmer, stop_words):
    """Normalise one raw review into a space-joined string of stemmed tokens.

    Non-letter characters are replaced by spaces, the text is lower-cased and
    split on whitespace, tokens found in ``stop_words`` are dropped, and each
    remaining token is passed through ``stemmer.stem``.

    Parameters
    ----------
    text : str
        Raw review text.
    stemmer : object
        Anything exposing a ``stem(word)`` method (here the Porter stemmer).
    stop_words : container of str
        Tokens to discard; membership is tested with ``in``, so a ``set`` is
        the efficient choice.

    Returns
    -------
    str
        The cleaned review, tokens separated by single spaces (empty string if
        nothing survives the filtering).
    """
    tokens = re.sub("[^A-Za-z]", " ", text).lower().split()
    return " ".join(stemmer.stem(token) for token in tokens if token not in stop_words)


# Build the stop-word set once; the original loop rebuilt it for every token of
# every review.
english_stopwords = set(stopwords.words("english"))

# Iterate over the column itself instead of a hard-coded range(0, 1001): the
# corpus then always has exactly one entry per row of `data`, so it cannot fall
# out of step with the label column if the file grows or shrinks (a shorter file
# used to raise KeyError, a longer one was silently truncated).
corpus = [clean_review(text, ps, english_stopwords) for text in data["Review"]]
    
    


# Creating the bag-of-words model

In [34]:
from sklearn.feature_extraction.text import CountVectorizer

In [35]:
# Bag-of-words vectoriser with all default settings.
cv = CountVectorizer()

In [36]:
# Learn the vocabulary from the cleaned corpus and count term occurrences per
# review, then densify the result into a regular array for the classifier.
bag_of_words = cv.fit_transform(corpus)
X = bag_of_words.toarray()

In [43]:
# Target vector: the column at position 1 of the frame.
# NOTE(review): presumably this is the sentiment label — confirm against the TSV header.
Y = data.iloc[:, 1]

# Split the data

In [46]:
from sklearn.model_selection import train_test_split

In [59]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

# Model Building

In [60]:
from sklearn.naive_bayes import GaussianNB

In [49]:
# Gaussian naive Bayes with default settings.
# NOTE(review): the features here are word counts; a multinomial naive Bayes
# model is the more conventional choice for count data — worth comparing.
classifier = GaussianNB()

In [50]:
# Fit the model on the training split only; the test split stays unseen.
classifier.fit(X_train, Y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [51]:
# Predicted labels for the held-out test rows.
Y_pred = classifier.predict(X_test)

In [65]:
from sklearn.metrics import confusion_matrix

In [66]:
# Confusion matrix on the test split: rows are the true classes, columns the
# predicted classes.
cm = confusion_matrix(y_true=Y_test, y_pred=Y_pred)

In [71]:
# Display the confusion matrix computed above.
cm

array([[55, 46],
       [17, 83]], dtype=int64)

# Check Accuracy on Test Set

In [None]:
from sklearn.metrics import accuracy_score

In [73]:
# Fraction of held-out reviews whose predicted label matches the true label.
accuracy_score(y_true=Y_test, y_pred=Y_pred)

0.6865671641791045

# Check Accuracy on Train Set

In [74]:
# Accuracy on the data the model was fitted on; comparing it with the test
# accuracy above shows how much the model overfits.
Y_train_pred = classifier.predict(X_train)
accuracy_score(Y_train, Y_train_pred)

0.92375