In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from collections import Counter

# Load the data
df = pd.read_csv('merged_data.csv')

# Clean the review text
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('vader_lexicon')

def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

df['cleaned_review'] = df['review_title'].apply(clean_text)

# Sentiment analysis
sid = SentimentIntensityAnalyzer()

def get_sentiment(text):
    scores = sid.polarity_scores(text)
    return scores['compound']

df['sentiment_score'] = df['cleaned_review'].apply(get_sentiment)
df['sentiment'] = df['sentiment_score'].apply(lambda x: 'positive' if x >= 0.05 else ('negative' if x <= -0.05 else 'neutral'))

# Keyword frequency analysis
tokens = [word_tokenize(text) for text in df['cleaned_review']]
all_tokens = [token for sublist in tokens for token in sublist]
freq_dist = Counter(all_tokens)
freq_df = pd.DataFrame(freq_dist.items(), columns=['keyword', 'frequency']).sort_values(by='frequency', ascending=False)

# Save the results to a CSV file
df.to_csv('sentiment_analysis.csv', index=False)
freq_df.to_csv('keyword_frequency.csv', index=False)

# Display the first few rows of the sentiment analysis DataFrame
df[['cleaned_review', 'sentiment_score', 'sentiment']].head()

# Display the top 20 keywords
freq_df.head(20)



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aniqa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aniqa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\aniqa\AppData\Roaming\nltk_data...


Unnamed: 0,keyword,frequency
15,great,98076
96,good,47254
272,fun,24334
74,love,24133
323,quality,23929
476,perfect,20141
354,cute,18401
93,nice,18056
478,works,17816
234,product,16979


In [2]:
#exsport the csv of keyword frequency and its pasrent_asin involve?
import pandas as pd
df2 = pd.read_csv('merged_data.csv')

df2 = df2[['parent_asin', 'review_title']]
df2['cleaned_review'] = df2['review_title'].apply(clean_text)
df2['tokens'] = df2['cleaned_review'].apply(word_tokenize)
df2['keyword'] = df2['tokens'].apply(Counter)
df2['keyword'] = df2['keyword'].apply(lambda x: [word for word in x if freq_dist[word] > 1])
df2 = df2.explode('keyword')
df2 = df2.groupby(['parent_asin', 'keyword']).size().reset_index(name='frequency')
df2 = df2.sort_values(by='frequency', ascending=False)
# df2.to_csv('keyword_frequency_by_asin.csv', index=False)
df2.head(20)


NameError: name 'clean_text' is not defined