In [4]:
#This project performs sentiment analysis on Amazon product reviews, classifying each review as either positive or negative.

#The dataset contains a `review` column holding the review text and a `sentiment` column labelling each review as positive or negative.

In [5]:
import pandas as pd
#Load the labelled reviews from a CSV file located relative to the notebook's working directory
amz_review = pd.read_csv('amazon_review.csv')  #Importing dataset csv
#NOTE(review): the 'Unnamed: 0' column in the displayed frame looks like a row index that was saved with the file; reading with index_col=0 would likely absorb it -- confirm
amz_review

Unnamed: 0,review,sentiment
0,So there is no way for me to plug it in here i...,negative
1,"Good case, Excellent value.",positive
2,Great for the jawbone.,positive
3,Tied to charger for conversations lasting more...,negative
4,The mic is great.,positive
...,...,...
995,The screen does get smudged easily because it ...,negative
996,What a piece of junk.. I lose more calls on th...,negative
997,Item Does Not Match Picture.,negative
998,The only thing that disappoint me is the infra...,negative


In [8]:
#Importing the CountVectorizer class (it is instantiated in the next cell)

from sklearn.feature_extraction.text import CountVectorizer

In [9]:
#Create the bag-of-words vectorizer with its default settings; it is fitted on the reviews in the next cell
words_as_numbers = CountVectorizer()

In [10]:
#Fitting the vectorizer on every review so that it learns the vocabulary (the distinct words across all reviews).
#Per-review word counts are not produced here; they come from the later transform() call.
words_as_numbers.fit( amz_review['review'] )

In [11]:
#Obtain the learned vocabulary: every unique word the fitted vectorizer found in the reviews

words = words_as_numbers.get_feature_names_out()
#Convert to a plain Python list for display (only the last expression of a cell is shown,
#so a bare `words` line before this one would have no effect)
words.tolist()

['10',
 '100',
 '11',
 '12',
 '13',
 '15',
 '15g',
 '18',
 '20',
 '2000',
 '2005',
 '2160',
 '24',
 '2mp',
 '325',
 '350',
 '375',
 '3o',
 '42',
 '44',
 '45',
 '4s',
 '50',
 '5020',
 '510',
 '5320',
 '680',
 '700w',
 '8125',
 '8525',
 '8530',
 'abhor',
 'ability',
 'able',
 'abound',
 'about',
 'above',
 'absolutel',
 'absolutely',
 'ac',
 'accept',
 'acceptable',
 'access',
 'accessable',
 'accessing',
 'accessory',
 'accessoryone',
 'accidentally',
 'accompanied',
 'according',
 'activate',
 'activated',
 'activesync',
 'actually',
 'ad',
 'adapter',
 'adapters',
 'add',
 'addition',
 'additional',
 'address',
 'adhesive',
 'adorable',
 'advertised',
 'advise',
 'after',
 'again',
 'against',
 'aggravating',
 'ago',
 'alarm',
 'all',
 'allot',
 'allow',
 'allowing',
 'allows',
 'almost',
 'alone',
 'along',
 'alot',
 'also',
 'although',
 'aluminum',
 'always',
 'am',
 'amazed',
 'amazing',
 'amazon',
 'amp',
 'ample',
 'an',
 'and',
 'angeles',
 'angle',
 'another',
 'answer',
 'ant

In [12]:
#Transforming the reviews into a numerical matrix for further processing: one row per review and one
#column per vocabulary word, where each entry is the number of times that word occurs in that review
#NOTE(review): .toarray() converts the vectorizer output to a dense array; fine for a dataset of this size, but worth revisiting for larger corpora -- confirm

words_as_numbers_matrix = words_as_numbers.transform( amz_review['review'] ).toarray() 
words_as_numbers_matrix


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [13]:
#Using logistic regression to classify the reviews by sentiment

from sklearn.linear_model import LogisticRegression #importing appropriate libraries

In [14]:
model = LogisticRegression() #initializing the classifier with its default hyper-parameters

In [15]:
#Training on the full word-count matrix with the `sentiment` column as the target; no rows are held out for testing
model.fit(words_as_numbers_matrix, amz_review['sentiment'])  #Fitting the model

In [16]:
#For a classifier, score() returns the mean accuracy (fraction of correctly labelled reviews), not an adjusted R-squared.
#NOTE(review): the score is computed on the same data the model was fitted on, so it is training accuracy and probably overstates performance on unseen reviews
model.score(words_as_numbers_matrix, amz_review['sentiment']) #Mean accuracy on the training data

0.992

In [17]:
#Analyzing if new review is positive or negative

#predict() expects a collection of documents, hence the one-element list
new_review = ['I love my new purchase from Amazon!']    
#Encode with the already-fitted vectorizer (transform only, no re-fitting) so the columns match the training matrix
new_review_as_number = words_as_numbers.transform(new_review)

print(model.predict(new_review_as_number))

['positive']


In [18]:
#Alternative review: a negatively worded example, scored the same way as the previous one

new_review = ['terrible investment. I regret purchasing it!']    
 
#Encode with the already-fitted vectorizer so the columns match the training matrix
new_review_as_number = words_as_numbers.transform(new_review)
print(model.predict(new_review_as_number))
#Unrelated to the single review above: predicted labels for every review in the training matrix, stored for later evaluation
predictions=model.predict(words_as_numbers_matrix)

['negative']


In [19]:
#Creating confusion matrix to evaluate model accuracy of classifications

from sklearn import metrics   #provides confusion_matrix

#Compute the predictions inside this cell so it does not depend on `predictions`
#having been assigned as a side effect of an earlier, unrelated cell
predictions = model.predict(words_as_numbers_matrix)

#Rows are the true sentiment and columns the predicted sentiment; passing `labels`
#pins both axes to the order negative, positive instead of relying on implicit sorting
#NOTE(review): these are predictions on the training data, so the matrix reflects fit rather than generalisation
cm = metrics.confusion_matrix(
    amz_review['sentiment'],
    predictions,
    labels=['negative', 'positive'],
)
print(cm)

[[496   4]
 [  4 496]]


In [48]:
#Updating classification threshold from 0.5 to new value for Logistic Regression


import numpy as np

# A review is labelled 'positive' only when its predicted probability of being
# positive is strictly greater than this cut-off
threshold = 0.7

# Re-encode the reviews so this cell can be re-run on its own
words_as_numbers_matrix = words_as_numbers.transform( amz_review['review'] ).toarray()

# predict the probability estimates for the input data
prob = model.predict_proba(words_as_numbers_matrix)

# The columns of predict_proba follow the order of model.classes_, so look up the
# 'positive' column explicitly rather than assuming it is column 1
positive_col = list(model.classes_).index('positive')

# apply the threshold to obtain the predicted classes
pred = np.where(prob[:, positive_col] > threshold, 'positive', 'negative')

# Confusion matrix for the thresholded predictions: rows are the true sentiment,
# columns the predicted sentiment, both ordered negative, positive
cm = metrics.confusion_matrix(
    amz_review['sentiment'],
    pred,
    labels=['negative', 'positive'],
)
print(cm)



In [50]:
#End of Code