In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df=pd.read_csv("amazon_reviews.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,reviewerName,overall,reviewText,reviewTime,day_diff,helpful_yes,helpful_no,total_vote,score_pos_neg_diff,score_average_rating,wilson_lower_bound
0,0,,4,No issues.,23-07-2014,138,0,0,0,0,0.0,0.0
1,1,0mie,5,"Purchased this for my device, it worked as adv...",25-10-2013,409,0,0,0,0,0.0,0.0
2,2,1K3,4,it works as expected. I should have sprung for...,23-12-2012,715,0,0,0,0,0.0,0.0
3,3,1m2,5,This think has worked out great.Had a diff. br...,21-11-2013,382,0,0,0,0,0.0,0.0
4,4,2&amp;1/2Men,5,"Bought it with Retail Packaging, arrived legit...",13-07-2013,513,0,0,0,0,0.0,0.0


In [3]:
print(df.shape)

(4916, 12)


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import string
import pickle

In [5]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [6]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [7]:
lemmatizer = WordNetLemmatizer()

In [8]:
df.dropna(subset=['reviewText','overall'], inplace=True)

In [9]:
def get_sentiment(rating):
    if rating >=5:
        return 'positive'
    elif rating >=3:
        return 'neutral'
    else:
        return 'negative'

df['sentiment'] = df['overall'].apply(get_sentiment)
df['sentiment'].value_counts()

sentiment
positive    3921
neutral      669
negative     325
Name: count, dtype: int64

In [10]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r"http\S+", "",text)
    text = re.sub(r"<.*?>", "",text)
    text = re.sub(r"[^\w\s]", "",text)
    text = re.sub(r"\d+", "",text)
    text = re.sub(r"\s+", " ",text).strip()

    tokens = word_tokenize(text)
    lemmatized = [lemmatizer.lemmatize(token) for token in tokens]
    return " ".join(lemmatized)

In [11]:
df['cleanreviews'] = df['reviewText'].apply(preprocess)

In [12]:
maplabel= {'negative': 0, 'neutral': 1, 'positive': 2}
df['label'] = df['sentiment'].map(maplabel)

In [13]:
tfidf=TfidfVectorizer(max_features=5000,ngram_range=(1, 2),lowercase=True, stop_words='english', min_df=2)
X=tfidf.fit_transform(df['cleanreviews'])
y = df['label']

In [14]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2, random_state=42)

In [15]:
from sklearn.svm import LinearSVC
model = LinearSVC(class_weight='balanced')
model.fit(X_train, y_train)

In [16]:
# model = LogisticRegression(multi_class='multinomial', solver='lbfgs', class_weight='balanced', max_iter=1000)
# model.fit(X_train,y_train)

In [17]:
y_pred=model.predict(X_test)
accuracy=accuracy_score(y_test,y_pred)
print(f"Accuracy of the model: {accuracy}")

pre=precision_score(y_test,y_pred,average='macro')
re=recall_score(y_test,y_pred,average='macro')
f1=f1_score(y_test,y_pred,average='macro')
print(f"Precision Score: {pre}")
print(f"Recall Score: {re}")
print(f"F1 Score: {f1}")

Accuracy of the model: 0.7955239064089522
Precision Score: 0.5855327592029719
Recall Score: 0.5431594945632126
F1 Score: 0.5610100240747004


In [18]:
cr=classification_report(y_test,y_pred,target_names=['negative','neutral','positive'])
print(f"Classification Report of the model:\n {cr}")

Classification Report of the model:
               precision    recall  f1-score   support

    negative       0.66      0.54      0.60        57
     neutral       0.23      0.17      0.20       127
    positive       0.87      0.91      0.89       799

    accuracy                           0.80       983
   macro avg       0.59      0.54      0.56       983
weighted avg       0.77      0.80      0.78       983



In [19]:
import pickle

with open("SentModel.pkl",'wb') as file:
    pickle.dump(model,file)

with open('SentVectorizer.pkl','wb') as file:
    pickle.dump(tfidf,file)

print("Model and Vectorizerr is dumped successfully")

Model and Vectorizerr is dumped successfully


In [20]:
df.head(50)

Unnamed: 0.1,Unnamed: 0,reviewerName,overall,reviewText,reviewTime,day_diff,helpful_yes,helpful_no,total_vote,score_pos_neg_diff,score_average_rating,wilson_lower_bound,sentiment,cleanreviews,label
0,0,,4,No issues.,23-07-2014,138,0,0,0,0,0.0,0.0,neutral,no issue,1
1,1,0mie,5,"Purchased this for my device, it worked as adv...",25-10-2013,409,0,0,0,0,0.0,0.0,positive,purchased this for my device it worked a adver...,2
2,2,1K3,4,it works as expected. I should have sprung for...,23-12-2012,715,0,0,0,0,0.0,0.0,neutral,it work a expected i should have sprung for th...,1
3,3,1m2,5,This think has worked out great.Had a diff. br...,21-11-2013,382,0,0,0,0,0.0,0.0,positive,this think ha worked out greathad a diff bran ...,2
4,4,2&amp;1/2Men,5,"Bought it with Retail Packaging, arrived legit...",13-07-2013,513,0,0,0,0,0.0,0.0,positive,bought it with retail packaging arrived legit ...,2
5,5,2Cents!,5,It's mini storage. It doesn't do anything els...,29-04-2013,588,0,0,0,0,0.0,0.0,positive,it mini storage it doesnt do anything else and...,2
6,6,2K1Toaster,5,I have it in my phone and it never skips a bea...,19-10-2013,415,0,0,0,0,0.0,0.0,positive,i have it in my phone and it never skip a beat...,2
7,7,"35-year Technology Consumer ""8-tracks to 802.11""",5,It's hard to believe how affordable digital ha...,07-10-2014,62,0,0,0,0,0.0,0.0,positive,it hard to believe how affordable digital ha b...,2
8,8,4evryoung,5,Works in a HTC Rezound. Was running short of ...,24-03-2014,259,1,0,1,1,1.0,0.206549,positive,work in a htc rezound wa running short of spac...,2
9,9,53rdcard,5,"in my galaxy s4, super fast card, and am total...",10-11-2013,393,0,0,0,0,0.0,0.0,positive,in my galaxy s super fast card and am totally ...,2
