In [1]:
import pandas as pd
import openai
from dotenv import load_dotenv
import os
import json
from tqdm import tqdm
from pathlib import Path

# Load variables from the local ".env" file so that later cells can read
# OPENAI_API_KEY via os.getenv without the key being hard-coded in the notebook.
load_dotenv(dotenv_path=".env")

True

In [2]:
# Read the API key from the environment and store it on the openai module.
openai.api_key = os.getenv("OPENAI_API_KEY")
# Report only whether a key was found; the key itself is never printed.
print("OPENAI API KEY LOADED:", bool(openai.api_key))

# Explicit client object; analyze_review uses this module-level `client`
# for its chat-completion requests.
client = openai.OpenAI(api_key=openai.api_key)
print("OpenAI CLIENT INITIALIZED:", bool(client))

OPENAI API KEY LOADED: True
OpenAI CLIENT INITIALIZED: True


In [3]:
def load_prompt(prompt_path):
    """Return the full text of the prompt template stored at `prompt_path`.

    Args:
        prompt_path: Location of the template file, as a string or a Path.

    Returns:
        The file contents decoded as UTF-8.
    """
    template_file = Path(prompt_path)
    return template_file.read_text(encoding="utf-8")


In [4]:
def analyze_review(review_text, star_rating, prompt_template):
    """Send one review to the chat model and return its parsed JSON answer.

    The `{review_text}` and `{star_rating}` placeholders in `prompt_template`
    are filled in, the prompt is sent as a single user message through the
    module-level `client`, and the reply is parsed with `json.loads`.

    Args:
        review_text: Review body. None, empty values and float NaN (how pandas
            represents a missing cell) are treated as an empty string.
        star_rating: Rating inserted into the prompt via `str()`.
        prompt_template: Template text containing the two placeholders.

    Returns:
        The decoded JSON value on success. On failure a dict is returned
        instead of raising:
        `{"error": "Invalid JSON returned by model", "raw_output": <text>}`
        when the reply is not valid JSON, or `{"error": <message>}` for any
        other problem (API error, empty reply, ...).
    """
    # A missing pandas cell is float NaN, which is truthy and not a str, so
    # `review_text or ""` alone would hand a float to str.replace.
    is_nan = isinstance(review_text, float) and review_text != review_text
    review_str = "" if (is_nan or not review_text) else str(review_text)

    # Fill the rating first: otherwise a review that happens to contain the
    # literal text "{star_rating}" would be rewritten by the second replace.
    prompt = (prompt_template
              .replace("{star_rating}", str(star_rating))
              .replace("{review_text}", review_str))

    content = None
    try:
        response = client.chat.completions.create(
            model="gpt-5-nano",
            messages=[{"role": "user", "content": prompt}]
        )

        raw = response.choices[0].message.content
        if raw is None:
            return {"error": "Model returned no content"}

        content = raw.strip()
        return json.loads(content)

    except json.JSONDecodeError:
        return {
            "error": "Invalid JSON returned by model",
            "raw_output": content
        }
    except Exception as e:
        # Best effort: report the failure to the caller rather than abort a
        # long batch run.
        return {"error": str(e)}


In [5]:
# Smoke test: run one hand-written review through analyze_review with the
# prompts/prompt2.txt template and display the parsed result.
sample_review = "The book was interesting to read but hard to download."
star_rating = 5
prompt_path = "prompts/prompt2.txt"
result = analyze_review(sample_review, star_rating, load_prompt(prompt_path))
result

{'overall_sentiment': 'Neutral',
 'aspect_sentiments': {'Product Quality': 'Positive',
  'Price & Value': 'N/A',
  'Usability': 'Negative',
  'Access & Compatibility': 'Negative',
  'Customer Support': 'N/A',
  'Content Variety & Features': 'N/A',
  'Other': 'N/A'},
 'main_problem': 'Download access issue',
 'reason': 'Difficulty downloading the ebook',
 'text_rating': 3}

In [80]:
def process_reviews(
    df,
    prompt_path,
    output_path,
    max_samples=None,
    save_every=5  # autosave after every 5 processed rows by default
):
    """Analyze each review in `df` and checkpoint the results to a CSV file.

    If `output_path` already exists, the run resumes: the number of rows in
    that file is taken as the count of reviews already processed, and only
    rows of `df` whose index label is greater than or equal to that count are
    analyzed. This works both when the caller passes the full dataset and when
    the caller passes just the remaining slice (e.g. `df[1999:]`).

    NOTE: resuming assumes the index labels of `df` are the row positions of
    the full dataset (a default RangeIndex or a slice of one). With any other
    index the resume offset is not meaningful.

    Args:
        df: Reviews to process. The "review_body" and "star_rating" columns
            are read; missing columns fall back to "" and 0.
        prompt_path: Path of the prompt template file.
        output_path: CSV file used for both checkpoints and the final result.
        max_samples: If truthy, only the first `max_samples` rows of `df` are
            considered.
        save_every: Write a checkpoint whenever the total number of processed
            rows is a multiple of this value. A final checkpoint is always
            written after the last row.

    Returns:
        A DataFrame holding the previously saved rows followed by the newly
        processed ones, with an extra "analysis_result" column.
    """
    prompt_template = load_prompt(prompt_path)
    out_columns = list(df.columns) + ["analysis_result"]

    # limit to first n rows if needed
    if max_samples:
        df = df.head(max_samples)

    # Resume if previous file exists
    if os.path.exists(output_path):
        print(f"Resuming from {output_path}...")
        processed_df = pd.read_csv(output_path)
        start_idx = len(processed_df)
        print(f"Found {start_idx} already processed rows, resuming...")
    else:
        processed_df = pd.DataFrame(columns=out_columns)
        start_idx = 0

    # Select the rows still to do by label instead of probing df.loc[i] over a
    # computed range, which raised KeyError whenever the range ran past the
    # labels actually present (e.g. resuming with the full frame).
    pending = df.loc[df.index >= start_idx]
    if pending.empty:
        return processed_df

    # New rows are collected in a list and turned into a frame only when a
    # checkpoint is written; growing a DataFrame one row at a time is slow.
    new_records = []

    def _combined():
        """Previously saved rows followed by the rows processed in this run."""
        new_df = pd.DataFrame(new_records, columns=out_columns)
        if processed_df.empty:
            return new_df
        return pd.concat([processed_df, new_df], ignore_index=True)

    n_pending = len(pending)
    progress = tqdm(pending.iterrows(), total=n_pending, desc="Processing reviews")
    for n_new, (_, row) in enumerate(progress, start=1):
        review_text = row.get("review_body", "")
        star_rating = row.get("star_rating", 0)

        try:
            result = analyze_review(review_text, star_rating, prompt_template)
        except Exception as e:
            # Best effort: one failing review must not abort the whole batch.
            result = {"error": str(e)}

        # The result is stored as the Python repr of the dict (not strict
        # JSON) to stay compatible with rows already written to the file.
        record = dict(zip(df.columns, row.values))
        record["analysis_result"] = str(result)
        new_records.append(record)

        # Save every batch or at the very end
        total_done = start_idx + n_new
        at_checkpoint = bool(save_every) and total_done % save_every == 0
        if at_checkpoint or n_new == n_pending:
            _combined().to_csv(output_path, index=False)
            print(f"Progress saved at {total_done} rows")

    return _combined()


In [81]:
# Load the review dataset. The preview below shows an "Unnamed: 0" column,
# which suggests the CSV was written together with its index — TODO confirm.
df = pd.read_csv('final_dataset.csv')
df.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,review_length,length_bin,month,category
0,US,45450659,R2X2VCD6881DZG,B0079W9WZ6,318707260,And the Moonbeams Kissed the Sea,Digital_Video_Download,5,31,33,False,False,Cerebral crime procedural featuring lots of li...,Note: Amazon first made Series 3&4 available v...,2012-04-08,855,extra_long,2012-04,videos
1,US,10366214,R3DXVCE0HEUVLC,B0054W4JQK,465996227,Before I Go To Sleep,Digital_Ebook_Purchase,4,0,1,False,True,Surprising End,"The start of the book was interesting, then ha...",2012-05-26,38,short,2012-05,ebooks
2,US,15355503,RQZMZFDSAGADS,B00822X7OY,810428534,Square Enix Ultimate Collection [Download],Digital_Video_Games,5,1,3,False,True,$7.49 for 5 games? Amazo.com is THE BEST!,this is the Biggest deal on amazon.com about g...,2012-05-19,61,medium,2012-05,games
3,US,15526727,RECG1YZ46MM0V,B005S4Y13K,70285996,TurboTax Deluxe Federal + E-file + State 2011 ...,Digital_Software,4,0,0,False,True,No issues with download/install,Just downloaded and installed and it took 5 mi...,2012-01-27,43,short,2012-01,software
4,US,38097415,R1SKCVKCY3F8CD,B0064TYRAU,771851250,Mass Effect 3,Digital_Video_Games,1,11,17,False,False,please do not support,"This game is, without a doubt, absolutely bril...",2012-03-17,154,medium,2012-03,games


In [83]:
prompt_path = "prompts/prompt3.txt"
output_dir = "analyzed_review_data"
output_path = os.path.join(output_dir, "prompt3_processed_reviews.csv")
os.makedirs(output_dir, exist_ok=True)

# Derive the resume offset from the saved output instead of hard-coding it.
# A fixed offset has to be edited by hand after every interrupted run, and it
# breaks on a fresh run where no output file exists yet.
already_done = len(pd.read_csv(output_path)) if os.path.exists(output_path) else 0

# Hand over only the rows that are still unprocessed; their index labels start
# at `already_done`, which is what process_reviews expects when resuming.
r = process_reviews(df[already_done:], prompt_path, output_path)
r

Resuming from analyzed_review_data/prompt3_processed_reviews.csv...
Found 1999 already processed rows, resuming...


Processing reviews: 100%|██████████| 1/1 [00:13<00:00, 13.56s/it]

Progress saved at 2000 rows





Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,review_length,length_bin,month,category,analysis_result
0,US,45450659,R2X2VCD6881DZG,B0079W9WZ6,318707260,And the Moonbeams Kissed the Sea,Digital_Video_Download,5,31,33,False,False,Cerebral crime procedural featuring lots of li...,Note: Amazon first made Series 3&4 available v...,2012-04-08,855,extra_long,2012-04,videos,"{'overall_sentiment': 'Positive', 'aspect_sent..."
1,US,10366214,R3DXVCE0HEUVLC,B0054W4JQK,465996227,Before I Go To Sleep,Digital_Ebook_Purchase,4,0,1,False,True,Surprising End,"The start of the book was interesting, then ha...",2012-05-26,38,short,2012-05,ebooks,"{'overall_sentiment': 'Positive', 'aspect_sent..."
2,US,15355503,RQZMZFDSAGADS,B00822X7OY,810428534,Square Enix Ultimate Collection [Download],Digital_Video_Games,5,1,3,False,True,$7.49 for 5 games? Amazo.com is THE BEST!,this is the Biggest deal on amazon.com about g...,2012-05-19,61,medium,2012-05,games,"{'overall_sentiment': 'Positive', 'aspect_sent..."
3,US,15526727,RECG1YZ46MM0V,B005S4Y13K,70285996,TurboTax Deluxe Federal + E-file + State 2011 ...,Digital_Software,4,0,0,False,True,No issues with download/install,Just downloaded and installed and it took 5 mi...,2012-01-27,43,short,2012-01,software,"{'overall_sentiment': 'Positive', 'aspect_sent..."
4,US,38097415,R1SKCVKCY3F8CD,B0064TYRAU,771851250,Mass Effect 3,Digital_Video_Games,1,11,17,False,False,please do not support,"This game is, without a doubt, absolutely bril...",2012-03-17,154,medium,2012-03,games,"{'overall_sentiment': 'Negative', 'aspect_sent..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,US,43498454,RUBQWETGCIWFV,B0068TJ7OC,740565112,H&R Block At Home 2011 Deluxe + State,Digital_Software,4,2,2,False,True,Great tax prep software,I switched to TurboTax last year and switched ...,2012-02-04,49,short,2012-02,software,"{'overall_sentiment': 'Neutral', 'aspect_senti..."
1996,US,46785098,R304SNPYUR2SE5,B007977HM6,151163064,LA Noire,Digital_Video_Games,2,0,0,False,True,"Good game, HORRIBLE DRM!!!",I was very excited to get this game on PC beca...,2012-05-08,201,long,2012-05,games,"{'overall_sentiment': 'Negative', 'aspect_sent..."
1997,US,24803828,R1YCE7ULKKTH4X,B0068TJ7OC,740565112,H&R Block At Home 2011 Deluxe + State,Digital_Software,1,0,0,False,True,"Good program, bad pricing structure",I used H&R Block at Home Deluxe + State for my...,2012-04-29,162,medium,2012-04,software,"{'overall_sentiment': 'Negative', 'aspect_sent..."
1998,US,50994027,R2M5ELQIUI5F4D,B006ULENFG,445862452,Crusader Kings II [Download],Digital_Video_Games,5,1,2,False,True,Top Notch Strategy Game,Crusader Kings II is a game fit for kings! Or...,2012-05-15,201,long,2012-05,games,"{'overall_sentiment': 'Positive', 'aspect_sent..."
