Model to classify product reviews as positive, negative, or neural

In [1]:
# Import Libraries
import pandas as pd


In [2]:
#Load the dataset
df=pd.read_csv("amazon_reviews.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,reviewerName,overall,reviewText,reviewTime,day_diff,helpful_yes,helpful_no,total_vote,score_pos_neg_diff,score_average_rating,wilson_lower_bound
0,0,,4.0,No issues.,2014-07-23,138,0,0,0,0,0.0,0.0
1,1,0mie,5.0,"Purchased this for my device, it worked as adv...",2013-10-25,409,0,0,0,0,0.0,0.0
2,2,1K3,4.0,it works as expected. I should have sprung for...,2012-12-23,715,0,0,0,0,0.0,0.0
3,3,1m2,5.0,This think has worked out great.Had a diff. br...,2013-11-21,382,0,0,0,0,0.0,0.0
4,4,2&amp;1/2Men,5.0,"Bought it with Retail Packaging, arrived legit...",2013-07-13,513,0,0,0,0,0.0,0.0


In [3]:
# check the NAN values
df.isna().sum()

Unnamed: 0              0
reviewerName            1
overall                 0
reviewText              1
reviewTime              0
day_diff                0
helpful_yes             0
helpful_no              0
total_vote              0
score_pos_neg_diff      0
score_average_rating    0
wilson_lower_bound      0
dtype: int64

In [4]:
#info of the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4915 entries, 0 to 4914
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            4915 non-null   int64  
 1   reviewerName          4914 non-null   object 
 2   overall               4915 non-null   float64
 3   reviewText            4914 non-null   object 
 4   reviewTime            4915 non-null   object 
 5   day_diff              4915 non-null   int64  
 6   helpful_yes           4915 non-null   int64  
 7   helpful_no            4915 non-null   int64  
 8   total_vote            4915 non-null   int64  
 9   score_pos_neg_diff    4915 non-null   int64  
 10  score_average_rating  4915 non-null   float64
 11  wilson_lower_bound    4915 non-null   float64
dtypes: float64(3), int64(6), object(3)
memory usage: 460.9+ KB


In [5]:
#drop the row which contain nan value
df=df.dropna()

In [6]:
df.isna().sum()

Unnamed: 0              0
reviewerName            0
overall                 0
reviewText              0
reviewTime              0
day_diff                0
helpful_yes             0
helpful_no              0
total_vote              0
score_pos_neg_diff      0
score_average_rating    0
wilson_lower_bound      0
dtype: int64

In [7]:
#Extract the dataset in X and y varible as dependent and independent
X=df[['reviewText']] #independent
y=df['overall'] #dependent
X,y

(                                             reviewText
 1     Purchased this for my device, it worked as adv...
 2     it works as expected. I should have sprung for...
 3     This think has worked out great.Had a diff. br...
 4     Bought it with Retail Packaging, arrived legit...
 5     It's mini storage.  It doesn't do anything els...
 ...                                                 ...
 4910  I bought this Sandisk 16GB Class 10 to use wit...
 4911  Used this for extending the capabilities of my...
 4912  Great card that is very fast and reliable. It ...
 4913  Good amount of space for the stuff I want to d...
 4914  I've heard bad things about this 64gb Micro SD...
 
 [4913 rows x 1 columns],
 1       5.0
 2       4.0
 3       5.0
 4       5.0
 5       5.0
        ... 
 4910    1.0
 4911    5.0
 4912    5.0
 4913    5.0
 4914    5.0
 Name: overall, Length: 4913, dtype: float64)

In [8]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [9]:
#Download tokenize
nltk.download('stopwords') # to remove the punctuation
nltk.download('punkt')

[nltk_data] Downloading package stopwords to C:\Users\Ayush
[nltk_data]     Pathak\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Ayush
[nltk_data]     Pathak\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [25]:
#adding a  stopwords
stop_words=set(stopwords.words('english'))
lem=WordNetLemmatizer()

In [26]:
#cleaning the text
def clean_text(text):
    #to keep input is string to avoid errors (though .astype(str) handles this)
    text = str(text)

    #  Convert to lowercase
    text = text.lower()

    # remove the punctuation
    text=re.sub(r'[^\w\s]', '',text)

    # tokenize the text
    tokens=word_tokenize(text)

    # remove stop_words and lemmetize the text
    clr_tokens = [lem.lemmatize(word) for word in tokens if word not in stop_words]
    # Join the cleaned tokens back into a single string
    return " ".join(clr_tokens)

In [27]:
#updating the reviewText Columns with the clean text
df[['clean_text']]=df[['reviewText']].astype(str).apply(clean_text)
df[['clean_text']]

  df[['clean_text']]=df[['reviewText']].astype(str).apply(clean_text)


Unnamed: 0,clean_text
1,1 purchased device worked adv 2 work expected ...
2,1 purchased device worked adv 2 work expected ...
3,1 purchased device worked adv 2 work expected ...
4,1 purchased device worked adv 2 work expected ...
5,1 purchased device worked adv 2 work expected ...
...,...
4910,1 purchased device worked adv 2 work expected ...
4911,1 purchased device worked adv 2 work expected ...
4912,1 purchased device worked adv 2 work expected ...
4913,1 purchased device worked adv 2 work expected ...


In [28]:
#Tf- IDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['clean_text'])


In [29]:
def mapping(label):
    if label in [1, 2]:
        return 'Negative'
    elif label == 3:
        return 'Neutral'
    else:
        return 'Positive'

# Apply mapping
df['overall'] = df['overall'].apply(mapping)

In [30]:
#Splitting into train-test
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.2, random_state=42)

In [31]:
#Train the model
from sklearn.linear_model import LogisticRegression
model=LogisticRegression()
model.fit(X_train,y_train)

In [32]:
#Evqluate the model
from sklearn.metrics import accuracy_score, precision_score , recall_score, f1_score
y_pred=model.predict(X_test)
print("Accuracy:",accuracy_score(y_test,y_pred))
print("Precision",precision_score(y_test,y_pred, average='micro'))
print("Recall",recall_score(y_test,y_pred, average='micro'))
print("F1-score",f1_score(y_test,y_pred, average='micro'))


#print("\nClassification_Report:\n",classification_report(y_test,y_pred))


Accuracy: 0.7934893184130214
Precision 0.7934893184130214
Recall 0.7934893184130214
F1-score 0.7934893184130214


In [33]:
import pickle
pickle.dump(model,open("sentiment_model.pkl", "wb"))
pickle.dump(vectorizer,open("tfidf_vectorizer.pkl", "wb"))