# Generate Missing Dashboard Files

Result: Creates disagreements.csv and sample_reviews.csv from existing ABSA results

In [1]:
import pandas as pd
import os

# Constants
SEED = 42

## File Paths

In [2]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

!ls /content/drive/MyDrive/Cadence2B/absa_results/

Mounted at /content/drive
aspect_sentiments.csv  aspect_summary.csv


In [10]:
ORIGINAL_REVIEWS = "/content/drive/MyDrive/Cadence2B/datasets/balanced_reviews_no_labels.parquet"
ASPECT_SENTIMENTS = "/content/drive/MyDrive/Cadence2B/absa_results/aspect_sentiments.csv"
ASPECT_SUMMARY_FILE = "/content/drive/MyDrive/Cadence2B/absa_results/aspect_summary.csv"

# Output directory for dashboard files
OUTPUT_DIR = "/content/drive/MyDrive/Cadence2B/dashboard_data/"

# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load Data

### Load original reviews

In [9]:
# Load only needed columns
original_df = pd.read_parquet(ORIGINAL_REVIEWS, columns=['rating'])

# Create review_id from index
original_df['review_id'] = original_df.index

print(f"Loaded {len(original_df):,} reviews with ratings")
print(f"\nRating distribution:")
print(original_df['rating'].value_counts().sort_index())

Loaded 1,068,800 reviews with ratings

Rating distribution:
rating
1.0    213760
2.0    213760
3.0    213760
4.0    213760
5.0    213760
Name: count, dtype: int64


### Load Aspect Sentiments

In [12]:
print("\nLoading aspect sentiments...")
aspects_df = pd.read_csv(ASPECT_SENTIMENTS)

print(f"Loaded {len(aspects_df):,} aspect-sentiment pairs")
print(f"Current columns: {aspects_df.columns.tolist()}")


Loading aspect sentiments...
Loaded 918,167 aspect-sentiment pairs
Current columns: ['review_id', 'aspect', 'sentiment', 'sentiment_score', 'text']


### Merge Ratings

In [13]:
# Merge on review_id
aspects_with_ratings = aspects_df.merge(
    original_df[['review_id', 'rating']],
    on='review_id',
    how='left'
)

In [15]:
# Check for missing ratings
missing = aspects_with_ratings['rating'].isna().sum()
if missing > 0:
    print(f"(X) {missing:,} rows missing ratings (review_id mismatch)")
    # Drop rows without ratings
    aspects_with_ratings = aspects_with_ratings.dropna(subset=['rating'])
    print(f"Kept {len(aspects_with_ratings):,} rows with valid ratings")
else:
    print(f"All {len(aspects_with_ratings):,} rows matched successfully!")

# Show rating distribution in merged data
print(f"\nRating distribution in merged data:")
print(aspects_with_ratings['rating'].value_counts().sort_index())

All 918,167 rows matched successfully!

Rating distribution in merged data:
rating
1.0    146297
2.0    194978
3.0    200211
4.0    216971
5.0    159710
Name: count, dtype: int64


In [19]:
aspects_with_ratings.head(5)


Unnamed: 0,review_id,aspect,sentiment,sentiment_score,text,rating
0,1,What a beautiful TV,positive,0.947066,What a beautiful TV. The price was fair. I o...,5.0
1,2,cables,neutral,0.748858,Husband travels 300 days a year. If you trave...,5.0
2,2,every adapter,neutral,0.817963,Husband travels 300 days a year. If you trave...,5.0
3,2,backpack,positive,0.611,Husband travels 300 days a year. If you trave...,5.0
4,5,camera,neutral,0.656025,We recently got a bird feeder with a built in ...,5.0


### Save in aspect_sentiments.csv

In [20]:
print("\nSaving aspect_sentiments.csv with ratings...")
output_file = os.path.join(OUTPUT_DIR, 'aspect_sentiments.csv')
aspects_with_ratings.to_csv(output_file, index=False)
print(f"Saved: aspect_sentiments.csv ({len(aspects_with_ratings):,} rows)")


Saving aspect_sentiments.csv with ratings...
Saved: aspect_sentiments.csv (918,167 rows)


### Save aspect_summary.csv

In [21]:
aspect_summary = aspects_with_ratings.groupby('aspect').agg({
    'sentiment': [
        lambda x: (x == 'negative').sum(),
        lambda x: (x == 'neutral').sum(),
        lambda x: (x == 'positive').sum(),
        'count'
    ]
}).reset_index()

aspect_summary.columns = ['aspect', 'negative', 'neutral', 'positive', 'total']
aspect_summary = aspect_summary.sort_values('total', ascending=False).reset_index(drop=True)

output_file = os.path.join(OUTPUT_DIR, 'aspect_summary.csv')
aspect_summary.to_csv(output_file, index=False)
print(f"Saved: aspect_summary.csv ({len(aspect_summary):,} unique aspects)")

Saved: aspect_summary.csv (93,033 unique aspects)


# Generate Disagreements

In [22]:
# High-rated (4-5 stars) but negative sentiment
high_rated_negative = aspects_with_ratings[
    (aspects_with_ratings['rating'] >= 4) &
    (aspects_with_ratings['sentiment'] == 'negative')
]

# Low-rated (1-2 stars) but positive sentiment
low_rated_positive = aspects_with_ratings[
    (aspects_with_ratings['rating'] <= 2) &
    (aspects_with_ratings['sentiment'] == 'positive')
]

# Extreme ratings (1 or 5 stars) but neutral sentiment
extreme_rated_neutral = aspects_with_ratings[
    ((aspects_with_ratings['rating'] == 5) | (aspects_with_ratings['rating'] == 1)) &
    (aspects_with_ratings['sentiment'] == 'neutral')
]

In [23]:
# Combine all disagreement types
disagreements = pd.concat([
    high_rated_negative,
    low_rated_positive,
    extreme_rated_neutral
]).drop_duplicates()

In [24]:
output_file = os.path.join(OUTPUT_DIR, 'disagreements.csv')
disagreements.to_csv(output_file, index=False)

In [25]:
print(f"- Saved: disagreements.csv")
print(f"- Disagreement Breakdown:")
print(f"  • High-rated negative (4-5★, negative):   {len(high_rated_negative):>8,}")
print(f"  • Low-rated positive (1-2★, positive):    {len(low_rated_positive):>8,}")
print(f"  • Extreme-rated neutral (1/5★, neutral):  {len(extreme_rated_neutral):>8,}")
print(f"  • Total disagreement cases:               {len(disagreements):>8,}")
print(f"  • Disagreement rate:                      {len(disagreements)/len(aspects_with_ratings)*100:>8.2f}%")


- Saved: disagreements.csv
- Disagreement Breakdown:
  • High-rated negative (4-5★, negative):     44,791
  • Low-rated positive (1-2★, positive):      55,840
  • Extreme-rated neutral (1/5★, neutral):   101,855
  • Total disagreement cases:                194,295
  • Disagreement rate:                         21.16%


In [26]:
# Show examples
print(f"\nExample Disagreement Cases:")
print("─" * 80)
print("\nHigh-rated negative (customer gave high rating but mentioned negative aspect):")
if len(high_rated_negative) > 0:
    sample = high_rated_negative.sample(min(3, len(high_rated_negative)), random_state=SEED)
    for i, (idx, row) in enumerate(sample.iterrows(), 1):
        print(f"\n  {i}. Rating: {row['rating']}★ | Aspect: {row['aspect']} | Sentiment: {row['sentiment']}")
        print(f"     Review: {row['text'][:150]}...")


Example Disagreement Cases:
────────────────────────────────────────────────────────────────────────────────

High-rated negative (customer gave high rating but mentioned negative aspect):

  1. Rating: 5.0★ | Aspect: RAM | Sentiment: negative
     Review: If you don't know how to seat RAM into a computer, and don't feel confident that you could google some instructions on the internet and follow them, r...

  2. Rating: 4.0★ | Aspect: Battery | Sentiment: negative
     Review: Battery doesn't last as long and not the most comfortable good for the value...

  3. Rating: 4.0★ | Aspect: 1600 Mhz | Sentiment: negative
     Review: I purchased three sets of Komputerbay 32GB DDR3 RAM on three different occasions. Two of those sets failed to pass Memtest x86 v4.20. One set was repl...


In [27]:
print("\n" + "─" * 80)
print("\nLow-rated positive (customer gave low rating but mentioned positive aspect):")
if len(low_rated_positive) > 0:
    sample = low_rated_positive.sample(min(3, len(low_rated_positive)), random_state=SEED)
    for i, (idx, row) in enumerate(sample.iterrows(), 1):
        print(f"\n  {i}. Rating: {row['rating']}★ | Aspect: {row['aspect']} | Sentiment: {row['sentiment']}")
        print(f"     Review: {row['text'][:150]}...")


────────────────────────────────────────────────────────────────────────────────

Low-rated positive (customer gave low rating but mentioned positive aspect):

  1. Rating: 2.0★ | Aspect: stereo audio | Sentiment: positive
     Review: This was a great little camera to get nice and excited over.  Waterproof to 20 feet, all those great features, HD video, stereo audio... all that stuf...

  2. Rating: 1.0★ | Aspect: 5ghz | Sentiment: positive
     Review: I use this router to repeat a 5ghz signal so it's always in repeater mode, I play Battlefield 4 on xbox one, so I need fast wifi with no bs, well it w...

  3. Rating: 1.0★ | Aspect: Sound quality | Sentiment: positive
     Review: I like the look and feel in my ears. Sound quality is probably good if bought buds worked. I guess I shouldn't be complaining for the price I paid for...


# Sample Dataset

In [28]:
print("Create sample reviews dataset...")

# Get top 100 aspects
top_100_aspects = aspect_summary.head(100)['aspect'].tolist()

# Sample 10 reviews per aspect
sample_dfs = []
for aspect in top_100_aspects:
    aspect_data = aspects_with_ratings[aspects_with_ratings['aspect'] == aspect]
    sample_size = min(10, len(aspect_data))
    if sample_size > 0:
        sample_dfs.append(aspect_data.sample(sample_size, random_state=SEED))

# Combine and shuffle
sample_reviews = pd.concat(sample_dfs, ignore_index=True)
sample_reviews = sample_reviews.sample(frac=1, random_state=SEED).reset_index(drop=True)

output_file = os.path.join(OUTPUT_DIR, 'sample_reviews.csv')
sample_reviews.to_csv(output_file, index=False)
print(f"Saved: sample_reviews.csv ({len(sample_reviews):,} rows)")

Create sample reviews dataset...
Saved: sample_reviews.csv (1,000 rows)
