In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.api as sm
from plotly.subplots import make_subplots

In [3]:
import kagglehub

# Download the latest version of the Amazon sales dataset through kagglehub.
# The return value is kept in `path`; presumably it is the local directory that
# holds the downloaded files -- confirm against the kagglehub documentation.
path = kagglehub.dataset_download("karkavelrajaj/amazon-sales-dataset")

In [5]:
# Load the raw product/review table.
# Read from the directory returned by kagglehub.dataset_download() (stored in
# `path` by the download cell) instead of a hard-coded /kaggle/input/... location,
# so the notebook also runs outside the Kaggle runtime.
# NOTE(review): assumes amazon.csv sits at the root of the downloaded dataset.
df = pd.read_csv(f"{path}/amazon.csv")
print('Data loaded. Shape:')
print(df.shape)
print(df.head())

Data loaded. Shape:
(1465, 16)
   product_id                                       product_name  \
0  B07JW9H4J1  Wayona Nylon Braided USB to Lightning Fast Cha...   
1  B098NS6PVG  Ambrane Unbreakable 60W / 3A Fast Charging 1.5...   
2  B096MSW6CT  Sounce Fast Phone Charging Cable & Data Sync U...   
3  B08HDJ86NZ  boAt Deuce USB 300 2 in 1 Type-C & Micro USB S...   
4  B08CF3B7N1  Portronics Konnect L 1.2M Fast Charging 3A 8 P...   

                                            category discounted_price  \
0  Computers&Accessories|Accessories&Peripherals|...             ₹399   
1  Computers&Accessories|Accessories&Peripherals|...             ₹199   
2  Computers&Accessories|Accessories&Peripherals|...             ₹199   
3  Computers&Accessories|Accessories&Peripherals|...             ₹329   
4  Computers&Accessories|Accessories&Peripherals|...             ₹154   

  actual_price discount_percentage rating rating_count  \
0       ₹1,099                 64%    4.2       24,269   
1    

In [6]:
# Split the pipe-delimited 'category' path into one column per hierarchy level.
# With expand=True, shorter paths are padded with None, so the width of the
# result is the depth of the deepest path. Deriving the level count from that
# width avoids the separate regex count (whose '\|' pattern was an invalid
# escape in a non-raw string) and always yields a plain int.
category_split = df['category'].str.split('|', expand=True)
max_levels = category_split.shape[1]
category_cols = ['category_level_' + str(i + 1) for i in range(max_levels)]
category_split.columns = category_cols

# Attach the level columns to the original dataframe. Any level columns left
# over from a previous run of this cell are dropped first, so re-running the
# cell does not create duplicate column names.
df = pd.concat([df.drop(columns=category_cols, errors='ignore'), category_split], axis=1)

print('Category columns added:')
print(df[category_cols].head())

# Convert 'rating' to numeric; values that cannot be parsed become NaN.
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')

# Clean 'rating_count': missing counts become 0, thousands separators are
# stripped, and the result is parsed as a number.
df['rating_count'] = df['rating_count'].fillna(0)
df['rating_count'] = df['rating_count'].astype(str).str.replace(',', '')
df['rating_count'] = pd.to_numeric(df['rating_count'], errors='coerce')

print('Cleaned review_content and ratings:')
print(df[['review_content', 'rating', 'rating_count']].head())

Category columns added:
        category_level_1         category_level_2    category_level_3  \
0  Computers&Accessories  Accessories&Peripherals  Cables&Accessories   
1  Computers&Accessories  Accessories&Peripherals  Cables&Accessories   
2  Computers&Accessories  Accessories&Peripherals  Cables&Accessories   
3  Computers&Accessories  Accessories&Peripherals  Cables&Accessories   
4  Computers&Accessories  Accessories&Peripherals  Cables&Accessories   

  category_level_4 category_level_5 category_level_6 category_level_7  
0           Cables        USBCables             None             None  
1           Cables        USBCables             None             None  
2           Cables        USBCables             None             None  
3           Cables        USBCables             None             None  
4           Cables        USBCables             None             None  
Cleaned review_content and ratings:
                                      review_content  rating  rating_

In [7]:
#Sentiment analysis
def classify_sentiment(compound_score):
    """Map a compound sentiment score to a coarse label.

    Scores of 0.05 or more are 'Positive', scores of -0.05 or less are
    'Negative', and everything else (including NaN, which fails both
    comparisons) is 'Neutral'.
    """
    # Guard clauses: check the positive band first, then the negative band.
    if compound_score >= 0.05:
        return 'Positive'
    if compound_score <= -0.05:
        return 'Negative'
    return 'Neutral'
# Fetch the VADER lexicon (a no-op when it is already present) and build the analyzer.
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

# Score every review once and build a single DataFrame from the resulting dicts.
# This replaces constructing one pd.Series per row inside .apply, which is slow,
# and selects VADER's keys by name instead of relying on dict ordering.
# str() keeps the original handling of missing reviews (they are scored as 'nan').
vader_scores = pd.DataFrame(
    [sid.polarity_scores(str(text)) for text in df['review_content']],
    index=df.index,
    columns=['neg', 'neu', 'pos', 'compound'],
)
df[['negative', 'neutral', 'positive', 'compound']] = vader_scores.to_numpy()

# Coarse label per review, plus the overall label counts.
df['sentiment'] = df['compound'].apply(classify_sentiment)
sentiment_dist = df['sentiment'].value_counts()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [8]:
# Compare sentiment across top-level categories with a stacked bar chart.
# The crosstab is row-normalised, so each category's shares add up to 100.
sentiment_by_category = 100 * pd.crosstab(
    index=df['category_level_1'],
    columns=df['sentiment'],
    normalize='index',
)

fig = px.bar(
    sentiment_by_category,
    title='Sentiment Distribution by Category',
    labels={'value': 'Percentage', 'category_level_1': 'Category'},
    height=600,
)
fig.update_layout({'barmode': 'stack'})
fig.show()

In [9]:
# Sentiment versus star rating: a scatter of every review's compound score,
# overlaid with the mean compound score per rating and an OLS trendline fitted
# to those per-rating means.
rating_sentiment = df.groupby('rating', as_index=False)['compound'].mean()

fig = px.scatter(
    df,
    x='rating',
    y='compound',
    title='Sentiment Scores vs Rating',
    labels={'compound': 'Sentiment Score', 'rating': 'Rating'},
    opacity=0.6,
)

# The helper figure contributes two traces: the mean markers and the fitted line.
mean_overlay = px.scatter(rating_sentiment, x='rating', y='compound', trendline="ols")
fig.add_traces(mean_overlay.data)
fig.show()

In [10]:
# Sentiment density: overlaid histograms of the compound score, one per class.
# The class is taken from df['sentiment'] (produced by classify_sentiment with
# its +/-0.05 thresholds) rather than re-binning the score at +/-0.1, so this
# chart agrees with every other sentiment chart in the notebook. It also avoids
# pd.cut silently dropping a score that sits exactly on the lowest bin edge.
sentiment_labels = ['Negative', 'Neutral', 'Positive']
df['sentiment_category'] = pd.Categorical(df['sentiment'],
                                          categories=sentiment_labels,
                                          ordered=True)
fig = go.Figure()

for sentiment in sentiment_labels:
    mask = df['sentiment_category'] == sentiment
    fig.add_trace(go.Histogram(
        x=df.loc[mask, 'compound'],  # single .loc lookup instead of chained indexing
        name=sentiment,
        opacity=0.75
    ))

fig.update_layout(
    title='Distribution of Sentiment Scores',
    xaxis_title='Sentiment Score',
    yaxis_title='Count',
    barmode='overlay'
)

fig.show()

In [11]:
# Review length (in characters) versus compound sentiment score.
df['review_length'] = df['review_content'].str.len()

# Request the OLS trendline in the same px.scatter call. Previously a second
# px.scatter figure was built and all of its traces were added, which re-plotted
# every point at full opacity on top of the semi-transparent markers.
fig = px.scatter(df, x='review_length', y='compound',
                 title='Review Length vs Sentiment Score',
                 labels={'review_length': 'Review Length (characters)',
                        'compound': 'Sentiment Score'},
                 opacity=0.6,
                 trendline="ols",
                 trendline_color_override='red')

# Limit the visible x-range to the 95th percentile so a few very long reviews
# do not squash the rest of the plot.
fig.update_layout(xaxis_range=[0, df['review_length'].quantile(0.95)])

fig.show()

In [13]:
# Positive vs. negative review counts per top-level category (stacked bar chart).
# Rows labelled anything other than 'Positive' or 'Negative' are left out.
df_filtered = df.loc[df['sentiment'].isin(['Positive', 'Negative'])]

sentiment_count = pd.crosstab(
    index=df_filtered['category_level_1'],
    columns=df_filtered['sentiment'],
)

fig = px.bar(
    sentiment_count,
    title='Reviews positivos/negativos por categoría',
    labels={'value': 'Count', 'category_level_1': 'Category'},
    height=600,
)
fig.update_layout({'barmode': 'stack'})

# This figure is written to an HTML file rather than shown inline.
fig.write_html('stacked_bar_sentiment_by_category.html')
print("Grafico guardado como 'stacked_bar_sentiment_by_category.html'")

# Sentiment share per product for the 50 product names with the most rows.
products = df['product_name'].value_counts().head(50).index

df_top = df[df['product_name'].isin(products)].copy()

# Shorten long names for the axis labels. The ellipsis is appended only when a
# name was actually cut; previously every name got '...' even if it was short.
# NOTE(review): names sharing the same first 50 characters collapse into one bar.
full_names = df_top['product_name']
df_top['product_name_short'] = full_names.where(
    full_names.str.len() <= 50,
    full_names.str[:50] + '...',
)

# Row-normalised crosstab: each product's sentiment shares add up to 100.
sentiment_by_product = pd.crosstab(df_top['product_name_short'],
                                 df_top['sentiment'],
                                 normalize='index') * 100

fig = px.bar(sentiment_by_product,
             title='Proporción de Sentimiento por Producto (Top 50)',
             labels={'value': 'Porcentaje',
                    'product_name_short': 'Producto',
                    'sentiment': 'Sentimiento'},
             height=800)

fig.update_layout(
    barmode='stack',
    xaxis_tickangle=45,
    showlegend=True
)

fig.show()

Grafico guardado como 'stacked_bar_sentiment_by_category.html'
