In [14]:
# Importing the libraries
import re

import nltk
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# Fetch the NLTK corpora this notebook reads: 'stopwords' backs
# stopwords.words('english') and 'wordnet' backs WordNetLemmatizer. Without the
# latter, lemmatize() raises LookupError on a machine that never downloaded it.
nltk.download('stopwords')
nltk.download('wordnet')



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
# To count the iterations 
from tqdm import tqdm

In [26]:
# Importing the dataset
# NOTE(review): absolute, machine-specific path -- the cell only runs where this
# exact file exists.
reviewsCsvPath = r'C:\Users\HP\Desktop\Reviews1.csv'
dataset = pd.read_csv(reviewsCsvPath)

In [27]:
# Dropping the dups in dataset: rows that share the same UserId, ProfileName,
# Time and Text are collapsed to their first occurrence (the default `keep`).
# The key columns are passed as a list, the documented form for `subset`.
duplicateKeyColumns = ["UserId", "ProfileName", "Time", "Text"]
dataset = dataset.drop_duplicates(subset=duplicateKeyColumns)

In [28]:
def removeHTMLTags(review):
    """Return the text content of `review` with markup removed.

    The string is parsed by BeautifulSoup using the 'lxml' parser and only the
    result of get_text() is kept.
    """
    return BeautifulSoup(review, 'lxml').get_text()

In [29]:
# Ordered (pattern, replacement) pairs. The specific contractions come first so
# the generic "n't" / "'t" rules only see what the earlier rules left behind.
_CONTRACTION_RULES = (
    (r"won't", "will not"),
    (r"can\'t", "can not"),
    (r"n\'t", " not"),
    (r"\'re", " are"),
    (r"\'s", " is"),
    (r"\'d", " would"),
    (r"\'ll", " will"),
    (r"\'t", " not"),
    (r"\'ve", " have"),
    (r"\'m", " am"),
)


def removeApostrophe(review):
    """Expand common English contractions in `review`.

    Each rule is applied to the output of the previous one, so all of them
    take effect (e.g. "won't" -> "will not", "they're" -> "they are").

    Args:
        review: The text to process.

    Returns:
        The text with every matching contraction expanded.
    """
    phrase = review
    for pattern, replacement in _CONTRACTION_RULES:
        # Feed the running result forward; substituting on `review` each time
        # would discard every earlier expansion.
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

def removeAlphaNumericWords(review):
    """Delete every whitespace-delimited token that contains a digit.

    Args:
        review: The text to process.

    Returns:
        The text with digit-bearing tokens removed and leading/trailing
        whitespace stripped. Whitespace between the remaining tokens is left
        as it was.
    """
    # Raw string so "\S" and "\d" reach the regex engine as character classes
    # instead of being treated as (invalid) Python string escapes.
    return re.sub(r"\S*\d\S*", "", review).strip()
 
def removeSpecialChars(review):
    """Replace each character that is not an ASCII letter with one space."""
    nonLetter = re.compile('[^a-zA-Z]')
    return nonLetter.sub(' ', review)

def scorePartition(x):
    """Binarize a score: 0 when `x` is below 3, otherwise 1."""
    return 0 if x < 3 else 1

# Stopword sets keyed by language, filled on first use so the NLTK corpus is
# read once rather than once per token.
_STOP_WORDS_BY_LANGUAGE = {}


def _stopWordSet(language='english'):
    """Return NLTK's stopword list for `language` as a set, built once and reused."""
    if language not in _STOP_WORDS_BY_LANGUAGE:
        _STOP_WORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_BY_LANGUAGE[language]


def doTextCleaning(review):
    """Normalize one raw review into a space-joined string of lemmatized tokens.

    Steps, in order: strip HTML, expand contractions, drop tokens containing
    digits, replace non-letters with spaces, lower-case, split on whitespace,
    remove English stopwords, and lemmatize each remaining token as a verb.

    Args:
        review: The raw review text.

    Returns:
        The cleaned tokens joined by single spaces ('' when nothing remains).
    """
    review = removeHTMLTags(review)
    review = removeApostrophe(review)
    review = removeAlphaNumericWords(review)
    review = removeSpecialChars(review)
    # Lower casing and tokenization
    tokens = review.lower().split()
    # Removing stopwords and lemmatization; the stopword set is looked up once
    # per call instead of being rebuilt for every token.
    stopWords = _stopWordSet()
    lmtzr = WordNetLemmatizer()
    keptTokens = [lmtzr.lemmatize(word, 'v') for word in tokens if word not in stopWords]
    return " ".join(keptTokens)


In [30]:
# Generalizing the score: collapse the 'Score' column to a binary label via
# scorePartition (values below 3 become 0, everything else becomes 1).
# NOTE(review): this overwrites 'Score' in place, so the cell is not idempotent --
# running it a second time maps the already-binarized 0/1 values to 0. Re-load
# the dataset before re-running this cell.
actualScore = dataset['Score']
positiveNegative = actualScore.map(scorePartition) 
dataset['Score'] = positiveNegative

In [31]:
# creating the document corpus
# Iterating the 'Text' column directly avoids iterrows() building a Series for
# every row, and gives tqdm a length so the bar reports progress against a total.
corpus = [doTextCleaning(text) for text in tqdm(dataset['Text'])]



0it [00:00, ?it/s][A[A

3it [00:00, 28.39it/s][A[A

9it [00:00, 33.28it/s][A[A

12it [00:00, 31.29it/s][A[A

18it [00:00, 36.49it/s][A[A

23it [00:00, 38.94it/s][A[A

29it [00:00, 43.41it/s][A[A

34it [00:00, 37.13it/s][A[A

38it [00:00, 37.41it/s][A[A

42it [00:01, 32.24it/s][A[A

48it [00:01, 37.07it/s][A[A

53it [00:01, 36.63it/s][A[A

59it [00:01, 41.27it/s][A[A

64it [00:01, 40.41it/s][A[A

69it [00:01, 40.07it/s][A[A

74it [00:01, 33.43it/s][A[A

79it [00:01, 36.80it/s][A[A

84it [00:02, 30.84it/s][A[A

88it [00:02, 31.70it/s][A[A

93it [00:02, 32.57it/s][A[A

97it [00:02, 32.58it/s][A[A

103it [00:02, 37.35it/s][A[A

109it [00:02, 38.66it/s][A[A

114it [00:03, 34.62it/s][A[A

118it [00:03, 30.70it/s][A[A

122it [00:03, 29.94it/s][A[A

126it [00:03, 30.48it/s][A[A

131it [00:03, 34.51it/s][A[A

135it [00:03, 33.96it/s][A[A

139it [00:03, 33.71it/s][A[A

144it [00:03, 35.71it/s][A[A

148it [00:04, 33.00it/s][A[A

15

In [32]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer

In [33]:
# Creating a transform: bag-of-words counts over 1- to 3-word n-grams, with the
# vocabulary capped at 500 features.
cv = CountVectorizer(ngram_range=(1,3), max_features = 500)
# Fit the vocabulary on the cleaned corpus and materialize the counts as an array.
X = cv.fit_transform(corpus).toarray()
# NOTE(review): the label column is selected by position (index 6) -- presumably
# the binarized 'Score' column; verify against the CSV column order.
y = dataset.iloc[:,6].values

In [35]:
# Splitting the dataset into the Training set and Test set
# (test_size = 0.20 holds out 20% for testing; random_state is fixed at 0 so the
# split is the same on every run)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)


In [36]:
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
# Default-constructed Gaussian Naive Bayes model, trained on the training split
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [37]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix (true labels passed first, predictions second)
# NOTE(review): `cm` is only assigned here; this cell does not display it.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

In [38]:
# Predict the sentiment for new review
def predictNewReview():
    """Prompt for a review on stdin and print its predicted sentiment.

    Reads one line with input(). Empty or whitespace-only input prints
    'Invalid Review'. Otherwise the text is cleaned with doTextCleaning,
    vectorized with the fitted `cv`, classified by `classifier`, and the raw
    prediction array is printed, followed by 'Positive Review' when the label
    is 1 or 'Negative Review' for any other label.

    Returns:
        None. All results are printed.
    """
    newReview = input("Type the Review: ")

    # Whitespace-only text carries no content; reject it like the empty string
    # instead of classifying it.
    if not newReview.strip():
        print('Invalid Review')
        return

    cleanedReview = doTextCleaning(newReview)
    features = cv.transform([cleanedReview]).toarray()
    prediction = classifier.predict(features)
    print(prediction)
    if prediction[0] == 1:
        print( "Positive Review" )
    else:
        print( "Negative Review")