### Environment Setup

In [52]:
# !pip install openai python-dotenv pandas tqdm
import ast
import json
import os

import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm

# Load key from .env, then read it back out of the process environment.
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")
if api_key is None or api_key == "":
    # Fail here rather than on the first API call.
    raise RuntimeError("OPENAI_API_KEY is not set in your .env")

# Shared client object used by the cells below.
client = OpenAI(api_key=api_key)

def resolve_model(alias: str, fallback: str | None = None) -> str:
    models = client.models.list()
    ids = [m.id for m in models.data]
    if alias in ids:
        return alias
    cand = [m for m in ids if m.startswith(alias)]
    if cand:
        cand.sort(reverse=True)
        return cand[0]
    if fallback:
        return fallback
    raise ValueError(f"No model found for alias '{alias}'. Sample IDs: {ids[:5]}")

# The alias comes from the LLM_MODEL environment variable (default "gpt-5-nano")
# and is resolved against the model IDs the API client lists.
MODEL_ALIAS = os.environ.get("LLM_MODEL", "gpt-5-nano")
MODEL_NAME = resolve_model(alias=MODEL_ALIAS)
print(f"Using model: {MODEL_NAME}")

Using model: gpt-5-nano


In [63]:
def env_float(names, default: float) -> float:
    """Return the first environment variable in ``names`` that parses as a float.

    Args:
        names: Environment variable names, tried in order.
        default: Value returned when no name yields a number.

    Returns:
        ``float(value)`` of the first variable that is set and parseable;
        unset variables and values ``float()`` rejects are skipped.
    """
    for name in names:
        raw = os.environ.get(name)
        if raw is None:
            continue
        try:
            return float(raw)
        except ValueError:
            # Not numeric (e.g. a directory path): try the next name.
            continue
    return default


# Input data: path and column names can be overridden through the environment.
CSV_PATH = os.getenv("REVIEWS_CSV", "merged_digital_2012.csv.zip")
TEXT_COL = os.getenv("TEXT_COL", "review_body")
STAR_COL = os.getenv("STAR_COL", "star_rating")

# Output location; the directory is created up front.
EXPORT_DIR = "data/ab_outputs"
os.makedirs(EXPORT_DIR, exist_ok=True)
EXPORT_TSV = os.path.join(EXPORT_DIR, "sentiment_dictstyle.tsv")

# Number of rows to sample; zero or a negative value means "use every row".
SAMPLE_N = int(os.getenv("SAMPLE_N", "100"))
if SAMPLE_N <= 0:
    SAMPLE_N = None

# Sampling temperature, default 0.2. `TEMP` is also one of the variables the
# standard library's tempfile module consults for the temp directory (and is
# normally a directory path on Windows), so read the unambiguous `TEMPERATURE`
# first and keep `TEMP` only as a backward-compatible fallback.
TEMPERATURE = env_float(("TEMPERATURE", "TEMP"), 0.2)

# Upper bound on the number of review characters placed into a prompt.
MAX_CHARS = int(os.getenv("MAX_CHARS", "8000"))

### Method 1

In [64]:
# System message: fixes the output contract (a single dict literal with an
# overall label, seven per-aspect labels, a main problem, a short reason and a
# numeric text_rating) and forbids code fences or extra prose.
# NOTE(review): the schema spells missing values as `null`, which is JSON rather
# than Python syntax, so whatever parses the reply must accept both spellings.
# NOTE(review): "text_rating" appears only as the literal 0 and has no entry
# under "Rules"; the recorded run shows 0 for the rows displayed, so the
# placeholder is presumably being echoed back — TODO define its scale.
SYSTEM_PROMPT = """You are an aspect-based sentiment labeling assistant for Amazon digital goods reviews.
Return ONLY a valid Python dictionary (no code fences, no extra text) with this structure:

{
  "overall_sentiment": "Positive|Negative|Neutral",
  "aspect_sentiment": {
    "Product Quality": "Positive|Negative|Neutral|N/A",
    "Price & Value": "Positive|Negative|Neutral|N/A",
    "Usability": "Positive|Negative|Neutral|N/A",
    "Access & Compatibility": "Positive|Negative|Neutral|N/A",
    "Customer Support": "Positive|Negative|Neutral|N/A",
    "Content Variety & Features": "Positive|Negative|Neutral|N/A",
    "Other": "Positive|Negative|Neutral|N/A"
  },
  "main_problem": "string|null",
  "reason": "string|null",
  "text_rating": 0
}

Rules:
- If an aspect is not mentioned, set its label to "N/A".
- Keep "reason" ≤ 25 words.
- Do NOT include backticks, code fences, or any explanation; output the dict only."""

# Per-review user message. It has two str.format placeholders: {stars} and
# {review_text}.
USER_TEMPLATE = """Review (stars = {stars}):
{review_text}

Return ONLY the Python dict described; nothing else."""

In [67]:
def _parse_model_dict(raw: str | None) -> dict | None:
    """Best-effort conversion of a model reply into a ``dict``.

    Several views of the reply are tried in order: the stripped text, the text
    without surrounding backticks, that text without its first line (a fence
    language tag such as ``python`` or ``json``), and finally the span from the
    first ``{`` to the last ``}``. Each view is parsed as a Python literal and,
    failing that, as JSON, because the prompt's schema advertises ``null``,
    which only JSON accepts.

    Returns:
        The first parsed value that is a ``dict``; ``None`` when the reply is
        empty or nothing parses to a dict.
    """
    if not raw:
        return None

    text = raw.strip()
    candidates = [text]
    if text.startswith("```"):
        unfenced = text.strip("`").strip()
        candidates.append(unfenced)
        _tag, newline, remainder = unfenced.partition("\n")
        if newline:
            candidates.append(remainder)
    first_brace, last_brace = text.find("{"), text.rfind("}")
    if 0 <= first_brace < last_brace:
        # Covers replies that wrap the dict in explanatory text.
        candidates.append(text[first_brace:last_brace + 1])

    for candidate in candidates:
        for parse in (ast.literal_eval, json.loads):
            try:
                value = parse(candidate)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                continue
            if isinstance(value, dict):
                return value
    return None


def call_model_dict(text: str, stars=None, model: str = MODEL_NAME, temperature: float = TEMPERATURE) -> dict | None:
    """Send one review to the chat model and parse its reply into a dict.

    Args:
        text: Review text; converted with ``str`` and cut to ``MAX_CHARS``
            characters before being placed in the prompt.
        stars: Star rating shown to the model; rendered as "NA" when ``None``.
        model: Model ID used for the request.
        temperature: Sampling temperature. It is left out of the request when
            it is ``None``, equal to 1, or when ``model`` starts with
            "gpt-5-nano" (the original code treats those models as accepting
            only the default temperature).

    Returns:
        The parsed dict, or ``None`` when the reply is empty or cannot be
        parsed into a dict. Errors raised by the API client propagate.
    """
    msg = USER_TEMPLATE.format(
        stars=str(stars) if stars is not None else "NA",
        review_text=str(text)[:MAX_CHARS],
    )
    kwargs = {
        "model": model,
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": msg},
        ]
    }
    # Prefix match, so dated snapshots such as "gpt-5-nano-2025-08-07" are covered too.
    models_requiring_default_temp = ("gpt-5-nano",)
    if (temperature is not None and temperature != 1
            and not model.startswith(models_requiring_default_temp)):
        kwargs["temperature"] = temperature

    resp = client.chat.completions.create(**kwargs)
    # `content` can be None (no text in the reply); the parser maps that to None
    # instead of failing on `.strip()`.
    return _parse_model_dict(resp.choices[0].message.content)

### Read data and run analysis

In [68]:
# Read the review table; the explicit compression hint is only passed for .zip paths.
df = pd.read_csv(CSV_PATH, compression="zip") if CSV_PATH.endswith(".zip") else pd.read_csv(CSV_PATH)
if TEXT_COL not in df.columns:
    raise ValueError(f"Column '{TEXT_COL}' not found. Available: {list(df.columns)[:15]}")

# Work on a seeded random sample when SAMPLE_N is set, otherwise on a copy of
# every row. The index is reset so the loop counter below is also the row label.
df_run = df.sample(min(len(df), SAMPLE_N), random_state=42).reset_index(drop=True) if SAMPLE_N else df.copy().reset_index(drop=True)

# Prepare columns to append (keep your original frame intact)
for col in ["overall_sentiment","main_problem","reason","text_rating","aspect_sentiment"]:
    if col not in df_run.columns:
        df_run[col] = None

# Read the inputs by column name. Looking them up with getattr() on itertuples()
# rows fails when a column name is not a valid Python identifier, because pandas
# renames such fields positionally.
texts = df_run[TEXT_COL].tolist()
star_values = df_run[STAR_COL].tolist() if STAR_COL in df_run.columns else [None] * len(texts)

errs = 0
for i, (text, stars) in enumerate(tqdm(zip(texts, star_values), total=len(texts), desc="Infer (dict)")):
    res = call_model_dict(text, stars=stars)
    # NOTE(review): normalize_result is not defined in any cell shown here; it
    # must already exist in the kernel or this line raises NameError on a fresh
    # run — TODO restore its definition above this cell.
    res = normalize_result(res)
    if res is None:
        # Unusable reply: leave this row's result columns as None.
        errs += 1
        continue
    # Write back
    df_run.at[i, "overall_sentiment"] = res.get("overall_sentiment")
    df_run.at[i, "main_problem"] = res.get("main_problem")
    df_run.at[i, "reason"] = res.get("reason")
    df_run.at[i, "text_rating"] = res.get("text_rating")
    # Store aspect_sentiment as a compact JSON string; fall back to str() only
    # when the value cannot be serialised.
    aspects = res.get("aspect_sentiment", {})
    try:
        df_run.at[i, "aspect_sentiment"] = json.dumps(aspects, ensure_ascii=False)
    except (TypeError, ValueError):
        df_run.at[i, "aspect_sentiment"] = str(aspects)

print(f"Done. Errors: {errs}")
print(df_run.head(3)[[TEXT_COL, 'overall_sentiment', 'aspect_sentiment', 'main_problem', 'reason', 'text_rating']])

Infer (dict): 100%|██████████| 100/100 [17:28<00:00, 10.49s/it]

Done. Errors: 0
                                         review_body overall_sentiment  \
0  This game has a few bugs I've encounter but no...          Positive   
1  This is a poor substitute for the Real Blues B...          Negative   
2  For its price, this DLC gives you what amounts...          Negative   

                                    aspect_sentiment  \
0  {"Product Quality": "Positive", "Price & Value...   
1  {"Product Quality": "Negative", "Price & Value...   
2  {"Product Quality": "Negative", "Price & Value...   

                                        main_problem  \
0             Minor bugs present but non-disruptive.   
1  Copied, low-quality substitute of the Real Blu...   
2  Overpriced DLC with minimal unique content and...   

                                              reason text_rating  
0         Bugs mentioned but do not affect enjoyment           0  
1           Substantial plagiarism and poor quality.           0  
2  Charged money for a mod that adds




In [None]:
# Lift the column-width limit so long text cells print in full, then show the
# review text next to the model's labels for the first five rows.
pd.set_option("display.max_colwidth", None)
preview_cols = [TEXT_COL, "overall_sentiment", "aspect_sentiment", "main_problem", "reason", "text_rating"]
print(df_run.head(5).loc[:, preview_cols])

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        