# Sentimental Analysis - NLP

1. Sentiment analysis, also referred to as opinion mining, is an approach to natural language processing (NLP) that identifies the emotional tone behind a body of text. This is a popular way for organizations to determine and categorize opinions about a product, service, or idea.
2. Sentiment analysis is often performed on textual data to help businesses monitor brand and product sentiment in customer feedback, and understand customer needs.

In [1]:
# Importing Libraries
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the data
data = pd.read_csv('Dataset/Restaurant_Reviews.tsv',sep='\t')

In [4]:
data.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [5]:
# Information of the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [6]:
# Importing essential libraries for performing Natural Language Processing
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [7]:
# we are removing the words from the stop words list: 'no', 'nor', 'not',isn't,"doesn't", "won't"
stopwords= set(['br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn','hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn','ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won','wouldn', "wouldn't"])

In [8]:
# Cleaning the reviews
corpus = []
for i in range(len(data)):
    #Cleaning Special Character from the reviews
    review = re.sub('[^a-zA-Z]', ' ', data['Review'][i])
    #Converting the entire review into lower case
    review = review.lower()
    
    #Tokenizing the reivew by word
    review = review.split()
    # Removing the stop words and Stemming the words
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if word not in stopwords]
    
    # joining the stemmed words
    review = ' '.join(review)
    
    # creating a corpus
    corpus.append(review)

In [10]:
corpus[:10]

['wow love place',
 'crust not good',
 'not tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price',
 'get angri want damn pho',
 'honeslti tast fresh',
 'potato like rubber could tell made ahead time kept warmer',
 'fri great',
 'great touch']

In [11]:
# Converting text into vector using tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(corpus).toarray()
y=data['Liked'].values

## Model Building

In [13]:
# split the data set into train and test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

# split the train data set into cross validation 
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=1)

In [14]:
#Traing the model using NBC, Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(x_train, y_train)

In [15]:
# Model Summary
y_pred = model.predict(x_test)

from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
print ("Model Accuracy :", accuracy_score(y_test, y_pred))

Model Accuracy : 0.725


In [16]:
# confusion Matrix
confusion_matrix(y_test, y_pred)

array([[74, 34],
       [21, 71]], dtype=int64)

## Hyperparameter Tuning

hyperparameter optimization or tuning is the problem of choosing a set of optimal hyperparameters for a learning algorithm. A hyperparameter is a parameter whose value is used to control the learning process. By contrast, the values of other parameters (typically node weights) are learned.

In [18]:
best_accuracy = 0.0
alpha_val = 0.0
for i in np.arange(0.0, 1.1, 0.1):
    temp_classifier = MultinomialNB(alpha=i)
    temp_classifier.fit(x_train, y_train)
    # Evaluate CV acccuracy
    temp_y_pred = temp_classifier.predict(x_val)
    score = accuracy_score(y_val, temp_y_pred)
    print("Accuray Score for alpha={} is: {}%".format(round(i,1), round(score*100,2)))
    if score>best_accuracy:
        best_accuracy = score
        alpha_val =1
print('----------------------------------------')
print('Best accuracy is {}% with alpha value as {}'. format(round(best_accuracy*100, 2), round(alpha_val,1)))

Accuray Score for alpha=0.0 is: 75.0%
Accuray Score for alpha=0.1 is: 81.25%
Accuray Score for alpha=0.2 is: 82.5%
Accuray Score for alpha=0.3 is: 82.5%
Accuray Score for alpha=0.4 is: 81.88%
Accuray Score for alpha=0.5 is: 81.25%
Accuray Score for alpha=0.6 is: 81.88%
Accuray Score for alpha=0.7 is: 83.12%
Accuray Score for alpha=0.8 is: 83.12%
Accuray Score for alpha=0.9 is: 83.75%
Accuray Score for alpha=1.0 is: 83.75%
----------------------------------------
Best accuracy is 83.75% with alpha value as 1


In [19]:
# Train the model with tuned parameter
model = MultinomialNB(alpha = 0.9)
model.fit(x_train, y_train)
pred = model.predict(x_test)
# Final Accuracy check on test data
print("Model Accuracy:", accuracy_score(y_test, pred))

Model Accuracy: 0.73


In [22]:
# Predictions
def predict_sentiment(sample_review):
    sample_review = re.sub('[^a-zA-Z]', ' ', string=sample_review)
    sample_review = sample_review.lower()
    sample_review = sample_review.split()
    
    ps = PorterStemmer()
    sample_review = [ps.stem(word) for word in sample_review if word not in stopwords]
    sample_review = ' '.join(sample_review)
    
    temp = vectorizer.transform([sample_review]).toarray()
    return model.predict(temp)

In [23]:
# 1 means Positive review
predict_sentiment('Wow... Loved this place')

array([1], dtype=int64)

In [24]:
# 0 means Negative Review
predict_sentiment('The food quality is very very bad had order some soup it was so terrible could eat more than a spoonful.')

array([0], dtype=int64)

In [25]:
sample = 'The food quality is very very bad had order some soup it was so terrible could eat more than a spoonful.'
if predict_sentiment(sample)==True:
    print('Review is Positive')
else:
    print('Review is Negative')

Review is Negative


In [27]:

# Creating a pickle file for the CountVectorizer
import pickle
pickle.dump(cv, open('cv-transform', 'wb'))

import pickle
pickle.dump(cv, open('cv-transform.pkl', 'wb'))

# save the model to disk
import pickle
pickle.dump(model, open('Sentiment_Prediction_model', 'wb'))

# load the model from disk
loaded_model = pickle.load(open('Sentiment_Prediction_model','rb'))


NameError: name 'cv' is not defined