In [3]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.preprocessing import LabelEncoder

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('vader_lexicon')

# Load the data
df = pd.read_csv('merged_data.csv')

# Data cleaning function
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

# Clean review text
df['cleaned_review'] = df['review_title'].apply(clean_text)

# Sentiment analysis function using VADER
sid = SentimentIntensityAnalyzer()

def get_sentiment(text):
    scores = sid.polarity_scores(text)
    return 'positive' if scores['compound'] >= 0.05 else ('negative' if scores['compound'] <= -0.05 else 'neutral')

# Apply sentiment analysis to get sentiment labels
df['sentiment'] = df['cleaned_review'].apply(get_sentiment)

# Encode sentiment labels
le = LabelEncoder()
df['sentiment_encoded'] = le.fit_transform(df['sentiment'])

# Prepare data for training
X = df['cleaned_review']
y = df['sentiment_encoded']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize text data
vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Train logistic regression model
model = LogisticRegression()
model.fit(X_train_vect, y_train)

# Predict on test data
y_pred = model.predict(X_test_vect)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Predict sentiment for new reviews
new_reviews = vectorizer.transform(X)
df['predicted_sentiment'] = model.predict(new_reviews)

# Convert predicted sentiments back to original labels
df['predicted_sentiment_label'] = le.inverse_transform(df['predicted_sentiment'])

display(df.columns)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aniqa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aniqa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\aniqa\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Accuracy: 0.9788486444796927


Index(['rating', 'review_title', 'parent_asin', 'user_id', 'date',
       'helpful_vote', 'verified_purchase', 'main_category', 'item_title',
       'price', 'cleaned_review', 'sentiment', 'sentiment_encoded',
       'predicted_sentiment', 'predicted_sentiment_label'],
      dtype='object')

In [4]:
#remove the columns that are not needed
df = df.drop(columns=['rating', 'review_title','helpful_vote', 'verified_purchase', 'main_category', 'item_title',
       'price'])


# Save the dataset with predicted sentiments
df.to_csv('predicted_sentiments.csv', index=False)