In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from matplotlib.dates import YearLocator, DateFormatter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from langdetect import detect
from wordcloud import WordCloud
from langdetect.lang_detect_exception import LangDetectException
import re
import warnings
warnings.filterwarnings("ignore")

In [2]:

def plot_category_stats(data, col):
    '''
    Plots bar charts for the mean, median, and sum of ratings grouped by a specified category.

    One figure with three side-by-side panels is drawn (mean, median, total),
    each with its own y-axis scale, and then shown with plt.show().

    Parameters:
    data (pd.DataFrame): DataFrame containing the grouping column `col` and a 'rating' column.
    col (str): Column name to group by (e.g., 'primary_category').

    Returns:
    None
    '''
    # Calculate mean, median, and sum of ratings for each category
    category_stats = data.groupby(col)['rating'].agg(['mean', 'median', 'sum']).reset_index()

    # Axis label derived from the grouping column so it stays correct for any `col`
    # ('primary_category' -> 'Primary Category').
    x_label = str(col).replace('_', ' ').title()

    # (aggregate column, human-readable label) for each panel, left to right
    panels = [
        ('mean', 'Mean Rating'),
        ('median', 'Median Rating'),
        ('sum', 'Total Rating'),
    ]

    fig, axes = plt.subplots(1, len(panels), figsize=(18, 6), sharey=False)

    for ax, (stat, stat_label) in zip(axes, panels):
        sns.barplot(x=col, y=stat, data=category_stats, palette='viridis', ax=ax)
        ax.set_xlabel(x_label)
        ax.set_ylabel(stat_label)
        ax.set_title(f'{stat_label} by {col}')
        ax.tick_params(axis='x', rotation=45)

        # Leave 10% headroom above the tallest bar. Skip when there is no
        # positive maximum (empty or all-NaN data), since NaN or zero-height
        # limits are not valid axis limits.
        tallest = category_stats[stat].max()
        if pd.notna(tallest) and tallest > 0:
            ax.set_ylim(0, tallest * 1.1)

    plt.tight_layout()
    plt.show()


In [3]:
# Product catalogue (provides product_id -> primary_category).
product_df = pd.read_csv("product_info.csv")

# Only the first review chunk is loaded. The other chunks
# (reviews_250-500.csv, reviews_500-750.csv, reviews_1250-end.csv) were previously
# combined with pd.concat(...), dropping 'Unnamed: 0' and sorting by submission_time.
t1 = pd.read_csv("reviews_0-250.csv")
review_df = t1

# Tag each review with the primary category of the product it refers to.
category_by_product_id = dict(zip(product_df.product_id, product_df.primary_category))
review_df['primary_category'] = review_df.product_id.map(category_by_product_id)

# Parse submission timestamps so they can be used on time axes and with .dt accessors.
review_df['submission_time'] = pd.to_datetime(review_df['submission_time'])

FileNotFoundError: [Errno 2] No such file or directory: 'product_info.csv'

In [None]:
# List the available product columns, then chart mean / median / total rating
# for every primary category.
print(product_df.columns)
plot_category_stats(product_df, 'primary_category')

In [None]:
# Unique primary categories. Missing values are dropped: `== NaN` never matches,
# so a NaN category would only yield an empty panel.
categories = product_df.primary_category.dropna().unique()

# One stacked panel per category. squeeze=False always returns a 2-D array of
# axes, so the single-category case needs no special handling.
fig, axes = plt.subplots(len(categories), 1, figsize=(14, 2 + 5 * len(categories)), squeeze=False)
axes = axes.ravel()

# Loop over each category and plot its 20 best-rated products
for ax, category in zip(axes, categories):
    product_df_i = product_df[product_df.primary_category == category]
    product_df_i = (
        product_df_i.groupby('product_name')[['rating']]
        .mean()
        .reset_index()
        .sort_values(by='rating', ascending=False)
        .iloc[:20]
    )
    bar_plot = sns.barplot(x='product_name', y='rating', data=product_df_i, palette='viridis', ax=ax)
    ax.set_title(f'Ratings by Product for {category}')
    ax.set_ylabel(category)  # Set y-label to the current category
    # Rotate x-axis labels for better readability
    plt.setp(ax.get_xticklabels(), rotation=30, ha='right')

    # Annotate each bar with its height
    for p in bar_plot.patches:
        height = p.get_height()
        if pd.isna(height):
            # Product without any rating: nothing meaningful to print
            continue
        ax.annotate(f'{height:.2f}',
                    (p.get_x() + p.get_width() / 2., height),
                    ha='center', va='bottom',  # text sits above the bar instead of straddling its top edge
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords='offset points')

# Adjust layout to make room for labels; the explicit hspace leaves space
# between panels for the rotated product names.
plt.tight_layout()
plt.subplots_adjust(hspace=2)
plt.show()

In [None]:
# Get the top 15 brands by review count
top_brand = list(review_df.brand_name.value_counts().iloc[:15].index)

# Independent copy: new columns are added below, which must not be written
# through a filtered view of review_df.
top_brand_review_df = review_df[review_df.brand_name.isin(top_brand)].copy()

# Convert submission_time to datetime and extract the year
top_brand_review_df['submission_time'] = pd.to_datetime(top_brand_review_df['submission_time'])
top_brand_review_df['year'] = top_brand_review_df['submission_time'].dt.year

# Group by year and brand_name and calculate the mean rating
mean_rating_yearly = top_brand_review_df.groupby(['year', 'brand_name'])['rating'].mean().reset_index()

# One distinct colour per brand. The brand -> colour mapping is passed to seaborn
# as a dict so that each line and its text label are guaranteed to share a colour;
# a plain list would be assigned to brands in seaborn's own hue order instead.
palette = sns.color_palette("tab20", len(top_brand))
brand_colors = dict(zip(top_brand, palette))

# Create the line plot
plt.figure(figsize=(12, 8))
sns.lineplot(data=mean_rating_yearly, x='year', y='rating', hue='brand_name',
             hue_order=top_brand, palette=brand_colors, marker='o')

# Set plot title and labels
# NOTE(review): the title mentions the Skin Care category but no category filter
# is applied in this cell — confirm the loaded reviews are skin-care only.
plt.title('Mean Rating Yearly of Each Brand in Skin Care Category')
plt.xlabel('Year')
plt.ylabel('Mean Rating')
# plt.legend(title='Brand', bbox_to_anchor=(1.05, 1), loc='upper left')

# Label each line with the brand name at the end
for brand in top_brand:
    brand_data = mean_rating_yearly[mean_rating_yearly['brand_name'] == brand].sort_values('year')
    if brand_data.empty:
        # No usable (year, rating) rows for this brand, so there is no line to label
        continue
    plt.text(brand_data['year'].max() + 0.1,
             brand_data['rating'].iloc[-1],  # mean rating in the brand's latest year
             brand,
             horizontalalignment='left',
             size='medium',
             color=brand_colors[brand])

# Show plot
plt.tight_layout()
plt.show()

In [None]:
# A running total over time is only meaningful in chronological order, so the
# reviews are sorted by submission time before the cumulative sums are taken.
feedback_by_time = review_df.sort_values('submission_time')

# (column, line colour, legend label) for each curve, drawn in this order
feedback_series = [
    ('total_feedback_count', 'b', 'Total Feedback'),
    ('total_neg_feedback_count', 'r', 'Total Negative Feedback'),
    ('total_pos_feedback_count', 'g', 'Total Positive Feedback'),
]

# Plot the data
plt.figure(figsize=(10, 6))
for column, color, label in feedback_series:
    plt.plot(feedback_by_time['submission_time'], feedback_by_time[column].cumsum(),
             linestyle='-', color=color, label=label)

# Add title and labels
plt.title('Cumulative Feedback Over Time')
plt.xlabel('Submission Time')
plt.ylabel('Cumulative Feedback Count')

# Set major locator and formatter for yearly labels
plt.gca().xaxis.set_major_locator(YearLocator())
plt.gca().xaxis.set_major_formatter(DateFormatter('%Y'))

# Rotate x-axis ticks
plt.xticks(rotation=45)

# Add grid
plt.grid(True)

# Add legend
plt.legend()

# Show the plot
plt.tight_layout()
plt.show()

In [None]:
# Total positive and negative feedback per brand, restricted to the top brands
# already collected in top_brand_review_df.
feedback_columns = ['total_pos_feedback_count', 'total_neg_feedback_count']
gb_top_brand_review_df = top_brand_review_df.groupby(['brand_name'], as_index=False)[feedback_columns].sum()

fig, ax = plt.subplots(figsize=(12, 8))

# Two semi-transparent horizontal bar layers on the same axes:
# negative feedback first, positive feedback drawn over it.
bar_layers = (
    ('total_neg_feedback_count', 'red', 'Negative Feedback'),
    ('total_pos_feedback_count', 'blue', 'Positive Feedback'),
)
for count_column, bar_color, legend_label in bar_layers:
    sns.barplot(data=gb_top_brand_review_df, y='brand_name', x=count_column,
                color=bar_color, label=legend_label, alpha=0.5, ax=ax)

ax.set_title('Feedback Counts by Brand')
ax.set_xlabel('Count')
ax.set_ylabel('Brand')
ax.legend(title='Feedback Type')
plt.show()

In [None]:
# --------- Review text preprocessing and word analysis ---------

In [None]:
# Work on the first 2000 reviews only. The explicit copy keeps the later column
# assignments on df from touching (or warning about) review_df.
df = review_df.iloc[:2000].copy()
def simple_stemmer(text):
    """Return `text` with every whitespace-separated word replaced by its Porter stem.

    Parameters:
    text (str): Text to stem; it is split on whitespace.

    Returns:
    str: The stemmed words joined by single spaces.
    """
    # Use the stemmer's public location (nltk.stem) rather than `nltk.porter`,
    # which is only reachable as a side effect of NLTK's star imports.
    ps = nltk.stem.PorterStemmer()
    return ' '.join(ps.stem(word) for word in text.split())
# English stop words from the NLTK corpus, kept both as the original list and as
# a set built from that same list.
stopword_list = stopwords.words('english')
stop = set(stopword_list)

# Shared tokenizer used to split review text into tokens
tokenizer = ToktokTokenizer()
def remove_stopwords(text, is_lower_case=False):
    """Drop English stop words from `text`.

    The text is tokenized with the module-level `tokenizer`, each token is
    stripped of surrounding whitespace, and tokens found in `stopword_list`
    are removed.

    Parameters:
    text (str): Text to filter.
    is_lower_case (bool): When True, tokens are compared to the stop-word list
        as they are; otherwise each token is lower-cased for the comparison
        (the token itself is kept in its original case).

    Returns:
    str: Remaining tokens joined by single spaces.
    """
    def _is_stopword(token):
        probe = token if is_lower_case else token.lower()
        return probe in stopword_list

    stripped_tokens = (raw.strip() for raw in tokenizer.tokenize(text))
    return ' '.join(token for token in stripped_tokens if not _is_stopword(token))

# Preprocess the data: replace missing review text with an empty string.
# The filled column is assigned back explicitly; an in-place fillna on the
# selected column is not guaranteed to update df (with pandas Copy-on-Write it
# never does), which would leave NaN values for the text functions below.
df = df.assign(review_text=df['review_text'].fillna(''))

# Function to clean text
def clean_text(text):
    """Normalise a review to lower-case ASCII letters separated by single spaces.

    Every character that is not an ASCII letter or whitespace is removed
    (digits, punctuation, accented and non-Latin characters), so "it's"
    becomes "its". The result is lower-cased and whitespace runs are collapsed.

    Parameters:
    text (str): Raw review text. Non-string values (e.g. None or NaN from a
        missing review) are treated as empty text.

    Returns:
    str: The cleaned text, or '' for non-string input.
    """
    if not isinstance(text, str):
        # Missing values reach this function as None / float NaN; re.sub would
        # raise TypeError on them.
        return ''
    text = re.sub(r'[^A-Za-z\s]', '', text)  # Keep ASCII letters and whitespace only
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Build the cleaned review column in one pass: normalise the text with
# clean_text, then drop English stop words (e.g. "the", "a", "an", "so", "what").
# Stemming with simple_stemmer is currently disabled.
df['cleaned_review_text'] = (
    df['review_text']
    .apply(clean_text)
    .apply(remove_stopwords)
)
# Function to detect language and filter out non-English reviews
from langdetect import DetectorFactory

# langdetect's detection is randomized, so short or ambiguous texts can be
# classified differently on each run. A fixed seed makes the filter reproducible.
DetectorFactory.seed = 0


def is_english(text):
    """Return True when langdetect classifies `text` as English ('en').

    Parameters:
    text (str): Text to classify.

    Returns:
    bool: True if the detected language code is 'en'; False otherwise,
        including when detection fails with LangDetectException.
    """
    try:
        return detect(text) == 'en'
    except LangDetectException:
        # Detection could not be performed on this text; treat it as non-English
        return False

# Filter out non-English reviews. The result is copied so that columns added to
# df in later cells are written to an independent frame, not to a filtered view.
english_mask = df['cleaned_review_text'].apply(is_english)
df = df[english_mask].copy()

# df.to_csv('cleaned_review_text.csv',index=False)


In [None]:
# Number of whitespace-separated words in each cleaned review
df['word_count'] = df['cleaned_review_text'].apply(lambda review: len(review.split()))

# Average review length (in words) for each rating value
word_count_by_rating = df.groupby('rating', as_index=False)['word_count'].mean()

print('the number of word is not correlated with rating')
display(word_count_by_rating)


In [None]:
# df has the columns cleaned_review_text and brand_name; count how often each
# word occurs in the reviews of every brand.
from collections import Counter

# brand name -> Counter of words over all of that brand's cleaned reviews
brand_word_frequencies = {}
for brand, group in df.groupby('brand_name'):
    # Concatenate all review texts for the current brand and count the words
    brand_word_frequencies[brand] = Counter(' '.join(group['cleaned_review_text']).split())

selected_brand = 'Caudalie'
# for selected_brand in top_brand
# Word frequencies for the selected brand (an empty Counter if the brand is absent)
word_freq = brand_word_frequencies.get(selected_brand, Counter())

# Full frequency table, kept for inspection
word_freq_df = pd.DataFrame(list(word_freq.items()), columns=['Word', 'Frequency'])

# Ten most frequent words. Counter.most_common also works for an empty Counter,
# whereas nlargest on an empty (object-dtype) 'Frequency' column raises TypeError.
top_words = pd.DataFrame(word_freq.most_common(10), columns=['Word', 'Frequency'])

# Date range shown in the title. It is taken over all reviews in df, not only
# the selected brand's. A local Series is used so df itself is not modified.
submission_times = pd.to_datetime(df['submission_time'])
earliest_time = submission_times.min().strftime('%Y-%m-%d')
latest_time = submission_times.max().strftime('%Y-%m-%d')

# Plot the top 10 most frequent words
if top_words.empty:
    print(f'No reviews found for brand {selected_brand!r}; nothing to plot.')
else:
    plt.figure(figsize=(10, 6))
    sns.barplot(x='Frequency', y='Word', data=top_words)
    plt.title(f'Top 10 Words for {selected_brand} from {earliest_time} to {latest_time}')
    plt.show()

In [None]:
# brand name -> all cleaned review texts of that brand joined into one string
brand_texts = {
    brand: ' '.join(group['cleaned_review_text'])
    for brand, group in df.groupby('brand_name')
}

# Brands to draw: at most the first 10 (adjust as needed). Brands whose text is
# blank are skipped because a word cloud cannot be generated from no words.
brands_to_plot = [(brand, text) for brand, text in brand_texts.items() if text.strip()][:10]
num_brands_to_display = len(brands_to_plot)

if num_brands_to_display == 0:
    print('No review text available to build word clouds.')
else:
    # squeeze=False always returns a 2-D array of axes, so a single brand works
    # too (a bare Axes object has no flatten()).
    fig, axes = plt.subplots(nrows=num_brands_to_display, ncols=1,
                             figsize=(10, 2 * num_brands_to_display), squeeze=False)
    axes = axes.ravel()

    # Generate and plot one word cloud per brand
    for ax, (brand, text) in zip(axes, brands_to_plot):
        wordcloud = WordCloud(width=1200, height=600, background_color='white').generate(text)
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis('off')
        ax.set_title(brand)

    # Adjust layout
    plt.tight_layout()
    plt.show()
