In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.calibration import LabelEncoder

# Fetch the NLTK resources used further down: the stop-word list,
# the punkt tokenizer data and the VADER lexicon
for resource in ('stopwords', 'punkt', 'vader_lexicon'):
    nltk.download(resource)

# Read the merged review data into a DataFrame
df = pd.read_csv(filepath_or_buffer='merged_data.csv')

# Data cleaning function
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def clean_text(text):
    """Normalise a piece of review text.

    Removes every character that is neither a word character nor whitespace,
    lower-cases the result, tokenises it with ``word_tokenize`` and drops
    English stop words.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example ``None`` or the float NaN
        that pandas uses for a missing cell) is treated as empty text instead
        of raising ``TypeError`` inside ``re.sub``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; ``''`` if nothing is left.
    """
    if not isinstance(text, str):
        return ''
    text = re.sub(r'[^\w\s]', '', text).lower()
    # Fetch the stop-word set once per call (cached across calls) instead of
    # re-evaluating stopwords.words('english') for every single token.
    stop_words = _english_stopwords()
    return ' '.join(token for token in word_tokenize(text) if token not in stop_words)

# Shared VADER analyser; get_sentiment reads it as a module-level name
sid = SentimentIntensityAnalyzer()

# Run every review title through clean_text and store the result in a new column
df['cleaned_review'] = df['review_title'].map(clean_text)

def get_sentiment(text):
    """Classify text as 'positive', 'negative' or 'neutral'.

    Uses the 'compound' value from ``sid.polarity_scores``: at or above 0.05
    is positive, at or below -0.05 is negative, anything in between is neutral.
    """
    compound = sid.polarity_scores(text)['compound']
    if compound >= 0.05:
        return 'positive'
    if compound <= -0.05:
        return 'negative'
    return 'neutral'

# Apply sentiment analysis to get sentiment labels
df['sentiment'] = df['cleaned_review'].apply(get_sentiment)

# Encode sentiment labels
# LabelEncoder is documented under sklearn.preprocessing. The name imported
# from sklearn.calibration at the top of this cell is only available there
# because that module happens to import it for its own use, so bind the
# public one here before using it.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['sentiment_encoded'] = le.fit_transform(df['sentiment'])

# Display the first few rows to verify the results
df[['user_id', 'sentiment', 'cleaned_review']].head()



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aniqa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aniqa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\aniqa\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,user_id,sentiment,cleaned_review
0,AGTJAM6NFSYTR5KB6DQNHZ3ZIFFQ,positive,awesome
1,AE7BV6IMNPZ3F266H7PXMH3BZQNQ,positive,best elvis golden records series
2,AE7BV6IMNPZ3F266H7PXMH3BZQNQ,neutral,exgi elvis tearing studio b
3,AE7BV6IMNPZ3F266H7PXMH3BZQNQ,neutral,elvis rhythm country essential absolutely
4,AE7BV6IMNPZ3F266H7PXMH3BZQNQ,positive,late great christine mcvie remembered celebrated


In [6]:
# Feature Engineering: character length and whitespace-separated word count
# of the cleaned text, plus normalised vote / purchase columns
df['review_length'] = df['cleaned_review'].map(len)
df['word_count'] = df['cleaned_review'].map(lambda review: len(review.split()))
df['helpful_vote'] = df['helpful_vote'].fillna(0)
df['verified_purchase'] = df['verified_purchase'].astype(int)

# Split the reviews into one frame per sentiment label
positive_reviews, neutral_reviews, negative_reviews = (
    df.loc[df['sentiment'] == label]
    for label in ('positive', 'neutral', 'negative')
)

# Descriptive statistics for each segment
positive_summary, neutral_summary, negative_summary = (
    segment.describe()
    for segment in (positive_reviews, neutral_reviews, negative_reviews)
)

# Print each summary under its heading
print("Positive Reviews Summary:", positive_summary, sep='\n')
print("\nNeutral Reviews Summary:", neutral_summary, sep='\n')
print("\nNegative Reviews Summary:", negative_summary, sep='\n')

# Preview the engineered columns for the first rows
display(df.head()[['user_id', 'sentiment', 'review_length', 'word_count', 'helpful_vote', 'verified_purchase']])


Positive Reviews Summary:
              rating   helpful_vote  verified_purchase          price  \
count  413701.000000  413701.000000      413701.000000  413701.000000   
mean        4.613922       0.298704           0.850252      44.060923   
std         0.900461       1.440988           0.356825      92.121921   
min         1.000000       0.000000           0.000000       0.010000   
25%         5.000000       0.000000           1.000000      13.990000   
50%         5.000000       0.000000           1.000000      22.000000   
75%         5.000000       0.000000           1.000000      39.990000   
max         5.000000     274.000000           1.000000   15499.950000   

       sentiment_encoded  review_length     word_count  
count           413701.0  413701.000000  413701.000000  
mean                 2.0      17.015685       2.795599  
std                  0.0      11.372258       1.674108  
min                  2.0       2.000000       1.000000  
25%                  2.0       

Unnamed: 0,user_id,sentiment,review_length,word_count,helpful_vote,verified_purchase
0,AGTJAM6NFSYTR5KB6DQNHZ3ZIFFQ,positive,7,1,1,1
1,AE7BV6IMNPZ3F266H7PXMH3BZQNQ,positive,32,5,0,0
2,AE7BV6IMNPZ3F266H7PXMH3BZQNQ,neutral,27,5,0,0
3,AE7BV6IMNPZ3F266H7PXMH3BZQNQ,neutral,41,5,0,0
4,AE7BV6IMNPZ3F266H7PXMH3BZQNQ,positive,48,6,0,0


In [5]:
# Write the frame to CSV without the index column.
# NOTE(review): this cell's execution count (5) is lower than that of the
# feature-engineering cell above it (6), so the file on disk may predate the
# latest feature columns; re-run the notebook top to bottom before relying on it.
df.to_csv(
    path_or_buf='../AmazonReviewDatasets/Final/segmentedReviews.csv',
    index=False,
)


In [7]:
# Show the column index of df to check which columns are currently present
display(df.columns)

Index(['rating', 'review_title', 'parent_asin', 'user_id', 'date',
       'helpful_vote', 'verified_purchase', 'main_category', 'item_title',
       'price', 'cleaned_review', 'sentiment', 'sentiment_encoded',
       'review_length', 'word_count'],
      dtype='object')