In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [2]:
# Load the dataset
df = pd.read_csv("output.csv")
df

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1
...,...,...,...,...,...
3145,5,30-Jul-18,Black Dot,"Perfect for kids, adults and everyone in betwe...",1
3146,5,30-Jul-18,Black Dot,"Listening to music, searching locations, check...",1
3147,5,30-Jul-18,Black Dot,"I do love these things, i have them running my...",1
3148,5,30-Jul-18,White Dot,Only complaint I have is that the sound qualit...,1


In [3]:
# Clean the text data
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    text = re.sub(r"\s+", " ", text)
    words = text.split()
    words = [word for word in words if word not in stopwords.words("english")]
    text = " ".join(words)
    return text

df["verified_reviews"] = df["verified_reviews"].apply(clean_text)

In [4]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df["verified_reviews"], df["verified_reviews"], test_size=0.2)

In [5]:
# Vectorize the text data
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [6]:
# Train the models
nb_model = MultinomialNB().fit(X_train, y_train)


In [7]:
# Evaluate the models
nb_preds = nb_model.predict(X_test)


In [8]:
from sklearn.metrics import precision_score, recall_score
print("Naive Bayes")
print(f"Accuracy: {accuracy_score(y_test, nb_preds)}")
print(f"Precision: {precision_score(y_test, nb_preds, zero_division=1, average='macro')}")
print(f"Recall: {recall_score(y_test, nb_preds, zero_division=1, average='macro')}")


Naive Bayes
Accuracy: 0.18253968253968253
Precision: 0.8958553428812265
Recall: 0.24749163879598662


In [18]:
from textblob import TextBlob

def getPolarity(text):
    return TextBlob(text).polarity
df['Polarity']= df['verified_reviews'].apply(getPolarity)

# apply TextBlob's sentiment analysis function to each review and extract polarity score
df['Polarity'] = [TextBlob(review).polarity for review in df['verified_reviews']]

# create a function to categorize polarity scores as positive, neutral, or negative
def getAnalysis(score):
    if score < 0:
        return 'negative'
    elif score == 0:
        return 'neutral'
    else:
        return 'positive'

# apply the analysis function to the polarity column to create a new column called Analysis
df['Analysis'] = df['Polarity'].apply(getAnalysis)

# show the updated dataframe with the new columns
df



Unnamed: 0,rating,date,variation,verified_reviews,feedback,Polarity,Analysis
0,5,31-Jul-18,Charcoal Fabric,love echo,1,0.500000,positive
1,5,31-Jul-18,Charcoal Fabric,loved,1,0.700000,positive
2,4,31-Jul-18,Walnut Finish,sometimes playing game answer question correct...,1,-0.133333,negative
3,5,31-Jul-18,Charcoal Fabric,lot fun thing 4 yr old learns dinosaurs contro...,1,0.350000,positive
4,5,31-Jul-18,Charcoal Fabric,music,1,0.000000,neutral
...,...,...,...,...,...,...,...
3145,5,30-Jul-18,Black Dot,perfect kids adults everyone,1,1.000000,positive
3146,5,30-Jul-18,Black Dot,listening music searching locations checking t...,1,0.250000,positive
3147,5,30-Jul-18,Black Dot,love things running entire home tv lights ther...,1,0.237662,positive
3148,5,30-Jul-18,White Dot,complaint sound quality great mostly use comma...,1,0.356250,positive
