In [51]:
import re
import pandas as pd 
import nltk
from nltk.corpus import stopwords , opinion_lexicon
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import plotly.graph_objects as go




# Fetch every NLTK resource this notebook relies on so it runs on a fresh
# environment:
#   - opinion_lexicon : positive/negative word lists used for the lexicon scores
#   - stopwords       : English stop-word list used when cleaning comments
#   - punkt/punkt_tab : tokenizer models behind nltk.word_tokenize
#                       (NOTE(review): recent NLTK releases look for punkt_tab,
#                       older ones for punkt -- both are requested to be safe)
#   - vader_lexicon   : kept from the original cell
# nltk.download() is a no-op for packages that are already up to date.
for nltk_resource in ('vader_lexicon', 'opinion_lexicon', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


True

In [63]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Location of the raw review export.
# NOTE(review): this is an absolute path on one machine; the notebook will not
# run elsewhere until it is pointed at a local copy of reviews.csv.
reviews_csv_path = "C:/Users/Lenovo/Desktop/ds_tools_api/reviews.csv"

df = pd.read_csv(reviews_csv_path)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,asin,title,comment,rate,helpful_votes,review_date,is_verified_purchase
0,B0CL5KNB9M,PlayStation 5 Digital Edition Fortnite Cobalt...,I recently purchased the PlayStation 5 Pro Di...,5,6 people found this helpful,"Reviewed in the United States on December 7, 2024",True
1,B0CL5KNB9M,Fives No Jive,A longtime PlayStation fan I had been putting ...,5,551 people found this helpful,"Reviewed in the United States on June 8, 2024",True
2,B0CL5KNB9M,PlayStation5 Console Slim Review,The PlayStation5 Console Slim is a fantastic e...,5,94 people found this helpful,"Reviewed in the United States on November 15, ...",True
3,B0CL5KNB9M,Compact Sleek and Reliable,The PS5 Digital Slim has worked flawlessly Its...,5,,"Reviewed in the United States on December 11, ...",True
4,B0CL5KNB9M,works great,turn it on clean shiny fast load the only thin...,5,19 people found this helpful,"Reviewed in the United States on November 29, ...",True


In [3]:
# clean reviews content
def clean_comment(comment, stop_words):
    """Normalise one raw review comment into a space-separated token string.

    Steps: replace HTML-like tags, runs of non-word characters and runs of
    digits with a space, lowercase, tokenize with NLTK, drop stop words, and
    join the remaining tokens with single spaces.

    Parameters
    ----------
    comment : object
        Raw comment value; anything that is not missing is passed through
        ``str()`` first.
    stop_words : collection of str
        Lowercase tokens to discard.

    Returns
    -------
    str
        The cleaned text, or ``''`` when ``comment`` is missing (None/NaN).
    """
    # A missing comment must stay empty: str(NaN) would otherwise inject the
    # literal token "nan" into the corpus and the TF-IDF vocabulary.
    if pd.isna(comment):
        return ''
    text = re.sub(r'<[^>]+>|\W+|\d+', ' ', str(comment)).lower()
    return ' '.join(word for word in nltk.word_tokenize(text) if word not in stop_words)


# a set gives O(1) membership checks while filtering tokens
stop_words = set(stopwords.words('english'))

# overwrite the raw comment column with its cleaned form (downstream cells read df['comment'])
df['comment'] = df['comment'].apply(clean_comment, args=(stop_words,))

df.head()

Unnamed: 0,asin,title,comment,rate,helpful_votes,review_date,is_verified_purchase
0,B0CL5KNB9M,PlayStation 5 Digital Edition Fortnite Cobalt...,recently purchased playstation pro digital edi...,5,6 people found this helpful,"Reviewed in the United States on December 7, 2024",True
1,B0CL5KNB9M,Fives No Jive,longtime playstation fan putting getting ps th...,5,551 people found this helpful,"Reviewed in the United States on June 8, 2024",True
2,B0CL5KNB9M,PlayStation5 Console Slim Review,playstation console slim fantastic evolution o...,5,94 people found this helpful,"Reviewed in the United States on November 15, ...",True
3,B0CL5KNB9M,Compact Sleek and Reliable,ps digital slim worked flawlessly compact desi...,5,,"Reviewed in the United States on December 11, ...",True
4,B0CL5KNB9M,works great,turn clean shiny fast load thing astros playro...,5,19 people found this helpful,"Reviewed in the United States on November 29, ...",True


In [4]:

# TF-IDF over unigrams and bigrams, vocabulary capped at 2000 features
tfidf = TfidfVectorizer(max_features=2000, ngram_range=(1, 2))
rfidf_sparse = tfidf.fit_transform(df['comment'])

feature_names = tfidf.get_feature_names_out()


positive_words = set(opinion_lexicon.positive())
negative_words = set(opinion_lexicon.negative())

# Boolean masks over the vocabulary: True where the feature is a lexicon word.
# Computed once instead of searching the feature list for every word of every row.
positive_mask = pd.Index(feature_names).isin(positive_words)
negative_mask = pd.Index(feature_names).isin(negative_words)

# Sparse matrix @ 0/1 vector = per-document sum of TF-IDF weights over the
# masked columns, without densifying the whole matrix.
positive_scores = rfidf_sparse @ positive_mask.astype(float)
negative_scores = rfidf_sparse @ negative_mask.astype(float)

df['positive_score'] = positive_scores
df['negative_score'] = negative_scores


def label_sentiment(balance):
    """Map a (positive - negative) score difference to a sentiment label."""
    if balance > 0:
        return 'positive'
    if balance < 0:
        return 'negative'
    return 'neutral'


df['sentiment'] = (df['positive_score'] - df['negative_score']).apply(label_sentiment)

# positive_score / negative_score stay on the frame: the time-series cells read them.

# Let pandas open the file itself: it writes UTF-8 and handles line endings,
# whereas writing the to_csv() string through a text-mode handle uses the
# locale encoding and can double the line terminators on Windows.
df.to_csv('reviews_labeled.csv', index=False)


df.head()

Unnamed: 0,asin,title,comment,rate,helpful_votes,review_date,is_verified_purchase,positive_score,negative_score,sentiment
0,B0CL5KNB9M,PlayStation 5 Digital Edition Fortnite Cobalt...,recently purchased playstation pro digital edi...,5,6 people found this helpful,"Reviewed in the United States on December 7, 2024",True,1.431058,0.10331,positive
1,B0CL5KNB9M,Fives No Jive,longtime playstation fan putting getting ps th...,5,551 people found this helpful,"Reviewed in the United States on June 8, 2024",True,0.533968,0.26036,positive
2,B0CL5KNB9M,PlayStation5 Console Slim Review,playstation console slim fantastic evolution o...,5,94 people found this helpful,"Reviewed in the United States on November 15, ...",True,0.671529,0.0,positive
3,B0CL5KNB9M,Compact Sleek and Reliable,ps digital slim worked flawlessly compact desi...,5,,"Reviewed in the United States on December 11, ...",True,1.432418,0.0,positive
4,B0CL5KNB9M,works great,turn clean shiny fast load thing astros playro...,5,19 people found this helpful,"Reviewed in the United States on November 29, ...",True,0.548697,0.29762,positive


In [5]:
# X holds the cleaned text, Y the lexicon-derived sentiment label per review.
X, Y = df['comment'], df['sentiment']

# Hold out 25% of the rows; the fixed seed keeps the partition reproducible.
# NOTE(review): the labels in Y were computed from this same TF-IDF matrix, so
# accuracy on the held-out rows measures how well a model re-learns the lexicon
# rule rather than agreement with an independent ground truth.
split_parts = train_test_split(
    rfidf_sparse,
    Y,
    test_size=0.25,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [6]:
# Multinomial Naive Bayes baseline trained on the TF-IDF training rows.
classifier = MultinomialNB().fit(X_train, y_train)

# Predicted labels for the held-out rows.
predictions = classifier.predict(X_test)

# Mean accuracy on the held-out rows, displayed as the cell output.
# For per-class metrics instead:
# print(classification_report(y_test, predictions, zero_division=1))
acccuracy = classifier.score(X_test, y_test)
acccuracy

0.9324324324324325

In [7]:
# Random forest trained on the same split.
# A fixed random_state makes the bootstrap samples and feature sub-sampling
# deterministic, so the accuracy below is reproducible on re-run (same seed as
# the train/test split).
classifier = RandomForestClassifier(random_state=42)

classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Mean accuracy on the held-out rows, displayed as the cell output.
acccuracy = classifier.score(X_test, y_test)

acccuracy

0.9459459459459459

In [8]:
def extract_date_from_text(text):
    """Extract the first "<Month> <day>, <year>" date found in ``text``.

    Parameters
    ----------
    text : object
        Free text such as "Reviewed in the United States on December 7, 2024".

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        The parsed date, or ``pd.NaT`` when ``text`` is not a string (e.g. a
        missing value) or contains no date in the expected form.
    """
    # Missing cells arrive as NaN/None; re.search would raise TypeError on them.
    if not isinstance(text, str):
        return pd.NaT

    date_pattern = r"(January|February|March|April|May|June|July|August|September|October|November|December)\s+(\d{1,2}),\s+(\d{4})"
    match = re.search(date_pattern, text)

    # No recognisable date: report "not a time" instead of failing on None.group().
    if match is None:
        return pd.NaT

    return pd.to_datetime(match.group(0))

# Time series

In [68]:
# Columns needed for the time-series view. The explicit .copy() makes this an
# independent frame, so assigning to its columns later neither triggers
# SettingWithCopyWarning nor risks writing through to `df`.
df_time_series = df[['review_date','asin', 'positive_score', 'negative_score', 'sentiment']].copy()

In [None]:

# Parse the free-text review_date into timestamps, order chronologically and
# index by date (resample() needs a datetime index).
# Built as one non-mutating chain: no column assignment on a frame derived from
# `df`, and no inplace=True.
# NOTE: the cell consumes the 'review_date' column, so it is meant to run once
# per fresh df_time_series.
df_time_series = (
    df_time_series
    .assign(review_date=lambda frame: frame['review_date'].apply(extract_date_from_text))
    .sort_values(by='review_date')
    .set_index('review_date')
)

In [70]:

# Aggregate per calendar year ('YE' = year-end bins).
# NOTE(review): the 'YE' alias is only understood by recent pandas releases;
# older versions spell it 'Y' -- confirm against the installed version.
# The name `month_df` is kept for compatibility even though the bins are yearly.
month_df = df_time_series.resample('YE').agg({
    'positive_score': 'mean',
    'negative_score': 'mean',
    # number of reviews labelled 'negative' within the bin
    'sentiment': lambda labels: (labels == 'negative').sum()
})

fig = go.Figure()

# Single trace: yearly count of negative reviews.
fig.add_trace(go.Scatter(
    x=month_df.index,
    y=month_df['sentiment'],
    name='Negative count',
    line=dict(color='red')
))

# The plotted quantity is a count, so the axis and title say so
# (the y-axis was previously labelled 'Score').
fig.update_layout(
    title='Negative Reviews per Year',
    xaxis_title='Year',
    yaxis_title='Number of negative reviews',
    showlegend=True,
    width=800,
    height=400,
    template='plotly_dark'
)

fig.show()