In [1]:
# Install the core library for Transformers
!pip install transformers

# Install PyTorch or TensorFlow (the backend framework)
!pip install torch



In [None]:
from pathlib import Path
import pandas as pd
from transformers import pipeline
from tqdm import tqdm

# -----------------------------
# Configuration
# -----------------------------
# All locations are resolved relative to the parent of the current working
# directory, so the notebook is expected to be run from a sub-folder of it.
BASE_DIR = Path.cwd().parent
_REVIEWS_ROOT = BASE_DIR.joinpath("Reviews")
REVIEWS_DIR = _REVIEWS_ROOT.joinpath("All")        # folder scanned for review CSVs
OVERVIEW_DIR = _REVIEWS_ROOT.joinpath("Overview")  # folder holding the summary CSVs

# Input: existing VADER summary for the top 20 outlets.
VADER_FILE = OVERVIEW_DIR.joinpath("top_20_outlets_with_sentiment.csv")

# Output: the same summary with the BERT sentiment columns added.
OUTPUT_FILE = OVERVIEW_DIR.joinpath("top_20_outlets_with_BERT_sentiment.csv")

# Model and inference settings.
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment"
TEXT_COLUMN = "text"  # column of each review CSV that holds the review text
BATCH_SIZE = 32       # number of texts per pipeline batch

# -----------------------------
# Step 1: Load top 20 outlets from VADER
# -----------------------------
# The VADER summary lists the outlets to score; its 'outlet' column is the
# key that review file names are matched against later on.
vader_df = pd.read_csv(VADER_FILE)
top20_outlets = vader_df['outlet'].tolist()

# Normalise names (trimmed, lower-case) for case-insensitive matching.
# Missing cells are read by pandas as float NaN and have no .strip(), so
# only real strings are normalised instead of crashing on them.
top20_normalized = [x.strip().lower() for x in top20_outlets if isinstance(x, str)]
print(f"📊 Loaded {len(top20_outlets)} top 20 outlets from VADER summary")

# -----------------------------
# Step 2: Collect all review CSVs
# -----------------------------
# Path.glob yields entries in an unspecified, filesystem-dependent order;
# sorting makes the processing order (and the progress log) reproducible.
csv_files = sorted(REVIEWS_DIR.glob("*.csv"))
if not csv_files:
    # Fail early: without input files there is nothing to score.
    raise FileNotFoundError(f"❌ No CSV files found in {REVIEWS_DIR}")

print(f"📂 Found {len(csv_files)} outlet CSV files in '{REVIEWS_DIR}'")

# -----------------------------
# Step 3: Load BERT sentiment model
# -----------------------------
# Builds a Hugging Face sentiment-analysis pipeline; model weights and the
# tokenizer are both taken from the checkpoint named by MODEL_NAME.
print(f"⚙️ Loading BERT model: {MODEL_NAME} ...")
sentiment_pipeline = pipeline("sentiment-analysis", model=MODEL_NAME, tokenizer=MODEL_NAME)

# -----------------------------
# Step 4: Process each outlet
# -----------------------------
# For every review CSV that belongs to a top-20 outlet, classify each review
# and collect one summary dict (counts and percentages) per outlet.
bert_summaries = []

MAX_LENGTH = 512  # max tokens for Roberta models

# Map raw model labels to sentiment categories. Built once, outside the loop,
# because it never changes; labels not listed here are kept unchanged.
label_map = {
    'LABEL_0': 'negative',
    'LABEL_1': 'neutral',
    'LABEL_2': 'positive',
    'NEGATIVE': 'negative',
    'NEUTRAL': 'neutral',
    'POSITIVE': 'positive'
}

for csv_path in tqdm(csv_files, desc="Processing outlets"):
    outlet_name = csv_path.stem
    outlet_name_clean = outlet_name.replace("_reviews", "").strip().lower()

    # Only score outlets that appear in the VADER top-20 summary.
    if outlet_name_clean not in top20_normalized:
        continue

    try:
        df = pd.read_csv(csv_path)
        if TEXT_COLUMN not in df.columns:
            # Report the skip: this outlet will have empty BERT columns
            # in the merged output.
            print(f"⚠️ Skipping {outlet_name}: no '{TEXT_COLUMN}' column")
            continue
        df = df.dropna(subset=[TEXT_COLUMN])
        if df.empty:
            print(f"⚠️ Skipping {outlet_name}: no non-empty reviews")
            continue

        texts = df[TEXT_COLUMN].astype(str).tolist()

        # Run BERT sentiment. Truncation happens in the tokenizer: a
        # character-count cut-off cannot guarantee <= MAX_LENGTH tokens, and a
        # single over-long review would otherwise make the model raise and the
        # whole outlet be dropped by the except-branch below. max_length is
        # passed explicitly so the limit does not depend on the checkpoint's
        # tokenizer configuration.
        results = sentiment_pipeline(
            texts,
            batch_size=BATCH_SIZE,
            truncation=True,
            max_length=MAX_LENGTH,
        )
        df['bert_label'] = [r['label'] for r in results]
        df['bert_score'] = [r['score'] for r in results]

        # Normalize labels
        df['bert_cat'] = df['bert_label'].map(label_map).fillna(df['bert_label'])

        # Compute counts and percentages (total_reviews > 0 is guaranteed by
        # the empty-frame check above).
        total_reviews = len(df)
        pos = df['bert_cat'].eq('positive').sum()
        neu = df['bert_cat'].eq('neutral').sum()
        neg = df['bert_cat'].eq('negative').sum()

        bert_summary = {
            "outlet_name": outlet_name_clean,
            "total_reviews_bert": total_reviews,
            "positive_bert": int(pos),
            "neutral_bert": int(neu),
            "negative_bert": int(neg),
            "pct_positive_bert": round(pos / total_reviews * 100, 2),
            "pct_neutral_bert": round(neu / total_reviews * 100, 2),
            "pct_negative_bert": round(neg / total_reviews * 100, 2)
        }
        bert_summaries.append(bert_summary)

    except Exception as e:
        # Best-effort: one broken file must not abort the whole run, but the
        # failure is reported so the missing outlet can be investigated.
        print(f"❌ Error processing {outlet_name}: {e}")


# -----------------------------
# Step 5: Merge with VADER summary
# -----------------------------
bert_df = pd.DataFrame(bert_summaries)
if bert_df.empty:
    # No outlet produced a summary, so there is nothing to merge.
    raise ValueError("❌ No BERT results were generated.")

# Ensure consistent column for merge: the BERT summaries are keyed by the
# trimmed, lower-cased outlet name, so build the same key on the VADER side.
vader_df['outlet_clean'] = vader_df['outlet'].str.strip().str.lower()

# Left join keeps every VADER row; outlets without a BERT summary get NaN in
# the BERT columns. Both helper key columns are dropped from the result.
merged_df = pd.merge(
    vader_df,
    bert_df,
    left_on='outlet_clean',
    right_on='outlet_name',
    how='left'
).drop(columns=['outlet_clean', 'outlet_name'])

# -----------------------------
# Step 6: Save final CSV
# -----------------------------
# Write the merged VADER + BERT summary without the DataFrame index.
merged_df.to_csv(OUTPUT_FILE, index=False)
# The file holds the top-20 outlets, so the message says "top 20".
print(f"✅ Saved BERT-augmented top 20 outlets CSV: {OUTPUT_FILE}")

# Preview
print("\nSample rows:")
print(merged_df.head())