Preparing Set of Inliers

In [None]:
#1. imports
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from openai import OpenAI
import os
from collections import defaultdict
import random

# NOTE(review): this assignment unconditionally replaces any OPENAI_API_KEY
# already present in the process environment with the placeholder string.
# Substitute the key locally and never commit a real key to the notebook.
os.environ["OPENAI_API_KEY"] = "insertapikey"
# The client is created only after the variable above is set; presumably
# OpenAI() picks the key up from the environment — confirm against the SDK docs.
client = OpenAI()

# 2. Draw a uniform random sample of reviews in a single pass over the file
file_review = "yelp_academic_dataset_review.json"
reviews_list = []      # the reservoir: at most `sample_size` parsed review dicts
sample_size = 200
random.seed(42)  # for reproducibility

# The file holds one JSON object per line. It is decoded explicitly as UTF-8
# (the encoding JSON text is specified in) so the result does not depend on the
# platform's default encoding.
with open(file_review, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        obj = json.loads(line)
        obj['_line_number'] = i + 1  # 1-based position of the review in the file
        if len(reviews_list) < sample_size:
            # Fill the reservoir with the first `sample_size` reviews
            reviews_list.append(obj)
        else:
            # Reservoir sampling: replace with decreasing probability.
            # randint is inclusive on both ends, so j is uniform over the
            # i + 1 lines seen so far.
            j = random.randint(0, i)
            if j < sample_size:
                reviews_list[j] = obj

reviews = pd.DataFrame(reviews_list)
sample_reviews = reviews['text'].tolist()            # review texts, in reservoir order
original_stars = [int(s) for s in reviews['stars']]  # human star ratings as ints

#3. prompts
# System prompt asking for a single star rating.
prompt_discrete = " ".join([
    "You are a Yelp user. Given a Yelp review, predict the star rating (1 to 5).",
    "Only respond with a number (1 to 5).",
])

# System prompt asking for a five-way probability distribution over star ratings.
prompt_probs = (
    "You are a Yelp user. Given a Yelp review, estimate the probability the user gave each star rating from 1 to 5."
    + "\n"
    + " ".join([
        "Output five numbers between 0 and 1 that sum to 1, in order: P(1), P(2), P(3), P(4), P(5).",
        "Only output the numbers, separated by spaces. No labels.",
    ])
)

# Single prediction loop: one discrete and one probabilistic prediction per review
discrete_preds = []  # per review: predicted star rating (int), or np.nan on failure
prob_preds = []      # per review: list of 5 floats, or five np.nan on failure

total_reviews = len(sample_reviews)
print(f"\n--- Running predictions for {total_reviews} reviews ---")
for i, review in enumerate(sample_reviews):
    print(f"Review {i+1}/{total_reviews}")

    # Discrete prediction
    try:
        res_discrete = client.chat.completions.create(
            model="gpt-4.1-nano",
            messages=[
                {"role": "system", "content": prompt_discrete},
                {"role": "user", "content": review}
            ],
            temperature=0
        )
        pred = int(res_discrete.choices[0].message.content.strip())
    except Exception:
        # Best effort: an API failure or an unparsable reply is recorded as NaN.
        # `Exception` (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit stop the loop.
        pred = np.nan
    discrete_preds.append(pred)

    # Probability prediction
    try:
        res_probs = client.chat.completions.create(
            model="gpt-4.1",
            messages=[
                {"role": "system", "content": prompt_probs},
                {"role": "user", "content": review}
            ],
            temperature=0
        )
        probs = list(map(float, res_probs.choices[0].message.content.strip().split()))
        if len(probs) != 5:
            raise ValueError("Incorrect prob format")
    except Exception:
        # Same best-effort policy as above: keep the row, mark the vector as missing
        probs = [np.nan] * 5
    prob_preds.append(probs)

# Score how far each prediction is from the human rating
inconsistency_discrete = []
inconsistency_prob = []

for idx in range(sample_size):
    star = original_stars[idx]
    guess = discrete_preds[idx]
    dist = prob_preds[idx]

    # Discrete score: absolute rating error, NaN when the prediction failed
    inconsistency_discrete.append(np.nan if np.isnan(guess) else abs(guess - star))

    # Probabilistic score: -log(P(true label)), NaN when the vector is missing.
    # The 1e-12 offset keeps the log finite when the model assigns probability 0.
    if np.isnan(dist).any():
        inconsistency_prob.append(np.nan)
    else:
        inconsistency_prob.append(-np.log(dist[star - 1] + 1e-12))

# Gather every per-review result into one table (column order as listed)
output_columns = {
    "Review #": range(1, sample_size + 1),
    "True Rating": original_stars,
    "Discrete Prediction": discrete_preds,
    "Discrete Inconsistency (|pred - true|)": inconsistency_discrete,
    "Prob Vector": prob_preds,
    "Prob Inconsistency (-log(P(true)))": inconsistency_prob,
}
output_df = pd.DataFrame(output_columns)

# Show the first 10 rows as plain text, without the index column
print("\n--- Inconsistency Scores Side by Side (First 10 Reviews) ---")
preview_text = output_df.head(10).to_string(index=False)
print(preview_text)

# Discrete scores that could actually be computed (failed predictions removed)
valid_discrete_inconsistencies = list(
    filter(lambda score: not np.isnan(score), inconsistency_discrete)
)

# Histogram with one bar per integer error value
fig_discrete, ax_discrete = plt.subplots(figsize=(8, 4))
ax_discrete.hist(
    valid_discrete_inconsistencies,
    bins=range(0, 6),
    align='left',
    color='skyblue',
    edgecolor='black',
    rwidth=0.8,
)
ax_discrete.set_xticks(range(0, 6))
ax_discrete.set_xlabel('|Predicted - True| (Discrete Inconsistency)')
ax_discrete.set_ylabel('Number of Reviews')
ax_discrete.set_title('Distribution of Discrete Inconsistency Scores')
ax_discrete.grid(axis='y', linestyle='--', alpha=0.7)
fig_discrete.tight_layout()
plt.show()

# Probabilistic scores that could actually be computed
valid_prob_inconsistencies = list(
    filter(lambda score: not np.isnan(score), inconsistency_prob)
)

# Histogram of -log(P(true label)) in 20 equal-width bins
fig_prob, ax_prob = plt.subplots(figsize=(8, 4))
ax_prob.hist(valid_prob_inconsistencies, bins=20, color='salmon', edgecolor='black', alpha=0.8)
ax_prob.set_xlabel('-log(P(True Label)) (Probabilistic Inconsistency)')
ax_prob.set_ylabel('Number of Reviews')
ax_prob.set_title('Distribution of Probabilistic Inconsistency Scores')
ax_prob.grid(axis='y', linestyle='--', alpha=0.7)
fig_prob.tight_layout()
plt.show()

print("\n--- Reviews with Discrete Inconsistency ≥ 1 ---")

# List every review whose discrete prediction missed the human rating
for idx in range(sample_size):
    score = inconsistency_discrete[idx]
    if np.isnan(score) or score < 1:
        continue  # failed prediction or exact match: nothing to report
    header = (
        f"\nReview #{idx+1} | True: {original_stars[idx]} | "
        f"Predicted: {discrete_preds[idx]} | Inconsistency: {score}"
    )
    print(header)
    print("Text:")
    print(sample_reviews[idx])

        
# Table for ranking reviews by probabilistic inconsistency; rows whose score
# is NaN (failed predictions) are removed straight away
prob_df = pd.DataFrame({
    "Review #": range(1, sample_size + 1),
    "Inconsistency": inconsistency_prob,
    "Text": sample_reviews,
    "True Rating": original_stars,
    "Prob Vector": prob_preds
}).dropna(subset=["Inconsistency"])

# The 10% cut-off is taken relative to the requested sample size
top_k = int(sample_size * 0.10)
top_prob_outliers = prob_df.nlargest(top_k, "Inconsistency")

print("\n--- Top 10% Reviews with Highest Probabilistic Inconsistency ---")
for _, outlier in top_prob_outliers.iterrows():
    review_no = int(outlier['Review #'])
    print(f"\nReview #{review_no} | True: {outlier['True Rating']} | -log(P(true)): {outlier['Inconsistency']:.3f}")
    formatted_probs = [f"{p:.2f}" for p in outlier["Prob Vector"]]
    print("Probabilities:", formatted_probs)
    print("Text:")
    print(outlier["Text"])

Saving the Probability-Vector Inlier Dataset and Appending Overrated and Underrated Scores

In [None]:
# --- Inlier Dataset: Probabilistic Predictions ---
# 1-based "Review #" values that are left out of the inlier set.
prob_exclude_review_nums = [
    135,38,162,12,16,21,25,60,65,91,
    101,140,156,26,84,130,185
]

inlier_source_columns = [
    "Review #", "True Rating", "Prob Vector", "Prob Inconsistency (-log(P(true)))"
]
is_inlier = ~output_df["Review #"].isin(prob_exclude_review_nums)

# Select rows and columns in one .loc call and take an explicit copy, so the
# column assignments below write to an independent frame instead of a slice of
# output_df (which raises SettingWithCopyWarning in pandas).
df_prob_inliers = output_df.loc[is_inlier, inlier_source_columns].copy()

# Add review text ("Review #" is 1-based, sample_reviews is 0-based)
df_prob_inliers["Review Text"] = df_prob_inliers["Review #"].apply(lambda x: sample_reviews[x - 1])

# Compute expected ratings: sum over k of k * P(k)
expected_ratings = df_prob_inliers["Prob Vector"].apply(lambda probs: sum((i+1)*p for i, p in enumerate(probs)))

# Compare to true rating
df_prob_inliers["Expected Rating"] = expected_ratings

# Define overrated/underrated: the inconsistency score is attributed to exactly
# one direction. "Overrated" means the human rating is above the model's
# expected rating, "Underrated" means it is below; the other column gets 0.
df_prob_inliers["Overrated"] = df_prob_inliers.apply(
    lambda row: row["Prob Inconsistency (-log(P(true)))"] if row["Expected Rating"] < row["True Rating"] else 0,
    axis=1
)

df_prob_inliers["Underrated"] = df_prob_inliers.apply(
    lambda row: row["Prob Inconsistency (-log(P(true)))"] if row["Expected Rating"] > row["True Rating"] else 0,
    axis=1
)

# Reorder columns
df_prob_inliers = df_prob_inliers[[
    "Review #", "Review Text", "True Rating", "Prob Vector", "Expected Rating", "Prob Inconsistency (-log(P(true)))","Overrated","Underrated"
]]

print("\n--- Probabilistic Inlier Dataset (Preview) ---")
print(df_prob_inliers.head())

df_prob_inliers.to_csv("inliers_probabilistic.csv", index=False)
print("Inlier datasets saved.")

Running on 10,000 Reviews with Parallel Processing and Applying the Benjamini–Hochberg Procedure

In [None]:
#1. imports
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from openai import OpenAI
import os
from collections import defaultdict
import random
import ast
from scipy.stats import percentileofscore
from statsmodels.stats.multitest import multipletests
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
from threading import Lock
import traceback

# Configuration
MAX_WORKERS = 2  # Adjust based on your OpenAI rate limits
BATCH_SIZE = 50  # update_progress prints one progress line every BATCH_SIZE completed reviews

# NOTE(review): this assignment unconditionally replaces any OPENAI_API_KEY
# already set in the environment with the placeholder; never commit a real key.
os.environ["OPENAI_API_KEY"] = "insertapikey" 
# Created after the variable above is set; presumably OpenAI() reads the key
# from the environment — confirm against the SDK docs.
client = OpenAI()

# Thread-safe progress tracking
progress_lock = Lock()   # guards completed_requests
completed_requests = 0   # number of reviews finished so far; reset by process_reviews_parallel

def update_progress(total):
    """
    Record one finished review and occasionally report progress.

    Increments the module-level ``completed_requests`` counter while holding
    ``progress_lock``. A progress line is printed whenever the counter is a
    multiple of ``BATCH_SIZE`` or has reached ``total``.
    """
    global completed_requests
    progress_lock.acquire()
    try:
        completed_requests = completed_requests + 1
        done = completed_requests
        at_interval = done % BATCH_SIZE == 0
        if at_interval or done == total:
            percent = done / total * 100
            print(f"Completed {done}/{total} reviews ({percent:.1f}%)")
    finally:
        progress_lock.release()

def estimate_tokens(text):
    """
    Return a rough token estimate for ``text``.

    The estimate is 1.5 tokens per whitespace-separated word (truncated to an
    integer) plus a fixed allowance of 50.
    """
    word_count = len(text.split())
    scaled_words = int(word_count * 1.5)
    return scaled_words + 50

def get_probability_prediction(review_data, max_retries=6):
    """
    Ask the model for a star-rating probability vector for a single review.

    Args:
        review_data: tuple of ``(index, review_text)``.
        max_retries: maximum number of API attempts. Only errors that look like
            rate limiting are retried (with exponential backoff); any other
            error ends the attempts immediately.

    Returns:
        ``(index, probs)`` where ``probs`` is a list of five non-negative floats
        normalised to sum to 1, or five ``np.nan`` values when no valid
        prediction could be obtained.
    """
    index, review_text = review_data

    prompt_probs = (
        "You are a Yelp user. Given a Yelp review, estimate the probability the user gave each star rating from 1 to 5.\n"
        "Output five numbers between 0 and 1 that sum to 1, in order: P(1), P(2), P(3), P(4), P(5). "
        "Only output the numbers, separated by spaces. No labels."
    )

    for attempt in range(max_retries):
        try:
            res_probs = client.chat.completions.create(
                model="gpt-4.1",
                messages=[
                    {"role": "system", "content": prompt_probs},
                    {"role": "user", "content": review_text}
                ],
                temperature=0,
                max_tokens=50  # Limit tokens since we only need 5 numbers
            )

            content = res_probs.choices[0].message.content.strip()
            # Accept "0.1, 0.2, ..." as well as the requested space-separated form
            probs = [float(token) for token in content.replace(",", " ").split()]

            if len(probs) != 5:
                raise ValueError("Incorrect prob format")

            # float() also parses "nan", "inf" and negative numbers; none of
            # them is a usable probability, and a negative value would survive
            # the normalisation below.
            if not all(np.isfinite(p) and p >= 0 for p in probs):
                raise ValueError("Probabilities must be finite and non-negative")

            prob_sum = sum(probs)
            if prob_sum > 0:
                probs = [p / prob_sum for p in probs]
            else:
                raise ValueError("All probabilities are zero")

            # Delay based on token estimation to avoid TPM overflow
            total_tokens = estimate_tokens(review_text) + 50
            delay = total_tokens / 30000 * 60  # 30k TPM max => delay in seconds
            time.sleep(delay)

            return index, probs

        except Exception as e:
            # Rate limit case: retry with backoff
            error_msg = str(e)
            if 'rate limit' in error_msg.lower() or '429' in error_msg:
                if attempt == max_retries - 1:
                    # No attempt left: give up now instead of sleeping through
                    # a backoff that nothing follows.
                    print(f"Rate limit hit for review {index}. Giving up after {max_retries} attempts")
                    break
                wait = (2 ** attempt) + random.uniform(0, 2.0)
                print(f"Rate limit hit for review {index}. Retrying in {wait:.2f}s (attempt {attempt + 1})")
                time.sleep(wait)
            else:
                print(f"Error processing review {index}: {e}")
                break

    return index, [np.nan] * 5


def process_reviews_parallel(reviews_text, max_workers=MAX_WORKERS):
    """
    Process reviews in parallel using ThreadPoolExecutor.

    Args:
        reviews_text: sequence of review strings.
        max_workers: number of worker threads submitting API requests.

    Returns:
        A list with one entry per review, in input order. Each entry is the
        probability vector returned by ``get_probability_prediction``, or five
        ``np.nan`` values when the worker raised.
    """
    global completed_requests
    completed_requests = 0

    total = len(reviews_text)

    # Create list of (index, review_text) tuples
    review_data = list(enumerate(reviews_text))

    # Initialize results with one independent placeholder list per review.
    # ([[np.nan] * 5] * n would put the same inner list in every slot.)
    prob_preds = [[np.nan] * 5 for _ in range(total)]

    print(f"\n--- Running predictions for {total} reviews with {max_workers} workers ---")
    start_time = time.time()

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks
        future_to_index = {
            executor.submit(get_probability_prediction, data): data[0]
            for data in review_data
        }

        # Collect results as they complete (completion order, not input order)
        for future in as_completed(future_to_index):
            try:
                index, probs = future.result()
                prob_preds[index] = probs
            except Exception as e:
                index = future_to_index[future]
                print(f"Error with review {index}: {e}")
                prob_preds[index] = [np.nan] * 5
            update_progress(total)

    elapsed = time.time() - start_time
    print(f"\nCompleted all predictions in {elapsed:.2f} seconds")
    if total:
        # Skipped for empty input, where the average would divide by zero
        print(f"Average time per review: {elapsed / total:.3f} seconds")

    return prob_preds

# Load and sample data: uniform random sample drawn in a single pass over the file
print("Loading and sampling Yelp reviews...")
file_review = "yelp_academic_dataset_review.json"
reviews_list = []      # the reservoir: at most `sample_size` parsed review dicts
sample_size = 10000
random.seed(29)  # for reproducibility

# One JSON object per line, decoded explicitly as UTF-8 (the encoding JSON text
# is specified in) so the result does not depend on the platform default.
with open(file_review, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        obj = json.loads(line)
        obj['_line_number'] = i + 1  # 1-based position of the review in the file
        if len(reviews_list) < sample_size:
            # Fill the reservoir with the first `sample_size` reviews
            reviews_list.append(obj)
        else:
            # Reservoir sampling: replace with decreasing probability.
            # randint is inclusive on both ends, so j is uniform over the
            # i + 1 lines seen so far.
            j = random.randint(0, i)
            if j < sample_size:
                reviews_list[j] = obj

reviews = pd.DataFrame(reviews_list)
sample_reviews = reviews['text'].tolist()            # review texts, in reservoir order
original_stars = [int(s) for s in reviews['stars']]  # human star ratings as ints

print(f"Loaded {len(sample_reviews)} reviews")

# Obtain one probability vector per sampled review (threaded API calls)
prob_preds = process_reviews_parallel(sample_reviews, max_workers=MAX_WORKERS)

# Score each review with -log(P(true label)); NaN marks a failed prediction
print("\nComputing inconsistency scores...")
inconsistency_prob = []

for idx in range(sample_size):
    star = original_stars[idx]
    dist = prob_preds[idx]

    if np.isnan(dist).any():
        score = np.nan
    else:
        # The 1e-12 offset keeps the log finite when P(true label) is 0
        score = -np.log(dist[star - 1] + 1e-12)
    inconsistency_prob.append(score)

# One row per review, columns in the order listed
result_columns = {
    "Review #": range(1, sample_size + 1),
    "True Rating": original_stars,
    "Prob Vector": prob_preds,
    "Prob Inconsistency (-log(P(true)))": inconsistency_prob,
}
output_df = pd.DataFrame(result_columns)

# Plain-text preview of the first 10 rows, without the index column
print("\n--- Inconsistency Scores Side by Side (First 10 Reviews) ---")
preview_text = output_df.head(10).to_string(index=False)
print(preview_text)

# Split usable scores from failed predictions and report the success rate
valid_prob_inconsistencies = list(
    filter(lambda score: not np.isnan(score), inconsistency_prob)
)
total_scored = len(inconsistency_prob)
valid_count = len(valid_prob_inconsistencies)
nan_count = total_scored - valid_count

print("\nProcessing Statistics:")
print(f"Total reviews: {total_scored}")
print(f"Valid predictions: {valid_count}")
print(f"Failed predictions: {nan_count}")
print(f"Success rate: {valid_count/total_scored*100:.1f}%")

# Histogram of the usable scores in 30 equal-width bins
fig_scores, ax_scores = plt.subplots(figsize=(10, 6))
ax_scores.hist(valid_prob_inconsistencies, bins=30, color='salmon', edgecolor='black', alpha=0.8)
ax_scores.set_xlabel('-log(P(True Label)) (Probabilistic Inconsistency)')
ax_scores.set_ylabel('Number of Reviews')
ax_scores.set_title('Distribution of Probabilistic Inconsistency Scores')
ax_scores.grid(axis='y', linestyle='--', alpha=0.7)
fig_scores.tight_layout()
plt.show()

# Ranking table for probabilistic inconsistency; rows with a NaN score
# (failed predictions) are removed straight away
prob_df = pd.DataFrame({
    "Review #": range(1, sample_size + 1),
    "Inconsistency": inconsistency_prob,
    "Text": sample_reviews,
    "True Rating": original_stars,
    "Prob Vector": prob_preds
}).dropna(subset=["Inconsistency"])

# 10% of the rows that actually have a score, not of the requested sample size
top_k = int(len(prob_df) * 0.10)
top_prob_outliers = prob_df.nlargest(top_k, "Inconsistency")

print(f"\n--- Top 10% Reviews with Highest Probabilistic Inconsistency (Top {top_k}) ---")
# Only the five most inconsistent reviews are printed, with long texts truncated
for _, outlier in top_prob_outliers.head(5).iterrows():
    review_no = int(outlier['Review #'])
    print(f"\nReview #{review_no} | True: {outlier['True Rating']} | -log(P(true)): {outlier['Inconsistency']:.3f}")
    formatted_probs = [f"{p:.2f}" for p in outlier["Prob Vector"]]
    print("Probabilities:", formatted_probs)
    print("Text:")
    text = outlier["Text"]
    if len(text) > 200:
        print(text[:200] + "...")
    else:
        print(text)

# Load inlier dataset (if it exists) and run the directional outlier analysis.
# If the file is missing, the analysis is skipped and output_df keeps only the
# basic columns.
try:
    inliers_prob = pd.read_csv("inliers_probabilistic.csv")

    def _parse_prob_vector(raw):
        """Turn a stringified list such as "[0.1, 0.9, nan]" into a list of floats."""
        return [float(token) for token in str(raw).strip().strip("[]").replace(",", " ").split()]

    # Vectors of failed predictions are stored as "[nan, nan, ...]".
    # ast.literal_eval rejects the bare name `nan` with a ValueError, which the
    # FileNotFoundError handler below would not catch; float() parses it.
    inliers_prob["Prob Vector"] = inliers_prob["Prob Vector"].apply(_parse_prob_vector)

    # Separate into overrated and underrated inliers. Both comparisons are
    # inclusive, so a row with Expected == True belongs to both reference sets.
    overrated_inliers = inliers_prob[inliers_prob["Expected Rating"] <= inliers_prob["True Rating"]]
    underrated_inliers = inliers_prob[inliers_prob["Expected Rating"] >= inliers_prob["True Rating"]]

    overrated_scores_inlier = overrated_inliers["Overrated"].values
    underrated_scores_inlier = underrated_inliers["Underrated"].values

    print(f"\nLoaded {len(inliers_prob)} inlier reviews for comparison")
    
    # Compute expected rating and scores for 10000 reviews
    print("Computing directional inconsistency scores...")
    expected_ratings = []
    overrated_scores_10000 = []
    underrated_scores_10000 = []

    for i in range(sample_size):
        probs = prob_preds[i]
        true = original_stars[i]

        if np.isnan(probs).any():
            # Failed prediction: no expected rating and no directional score
            expected_ratings.append(np.nan)
            overrated_scores_10000.append(np.nan)
            underrated_scores_10000.append(np.nan)
            continue

        expected = sum((j + 1) * p for j, p in enumerate(probs))
        inconsistency = -np.log(probs[true - 1] + 1e-12)

        expected_ratings.append(expected)
        # The whole inconsistency score goes to exactly one direction
        if expected < true:
            overrated_scores_10000.append(inconsistency)
            underrated_scores_10000.append(0)
        elif expected > true:
            overrated_scores_10000.append(0)
            underrated_scores_10000.append(inconsistency)
        else:
            overrated_scores_10000.append(0)
            underrated_scores_10000.append(0)

    output_df["Expected Rating"] = expected_ratings
    output_df["Overrated Score"] = overrated_scores_10000
    output_df["Underrated Score"] = underrated_scores_10000

    # Compute directional p-values
    def get_pvals(scores, inlier_scores):
        """
        Empirical one-sided p-value of each score against the inlier scores.

        A positive score gets 1 - percentile/100 within ``inlier_scores``; a
        score of 0 (no inconsistency in this direction) gets 1.0; a NaN score
        (failed prediction) gets NaN so that it is not treated as a test.
        """
        pvals = []
        for score in scores:
            if np.isnan(score):
                pvals.append(np.nan)
            elif score > 0:
                pvals.append(1 - percentileofscore(inlier_scores, score, kind='mean') / 100)
            else:
                pvals.append(1.0)
        return pvals

    def apply_bh(pvals, alpha=0.2):
        """
        Benjamini-Hochberg correction over the non-NaN p-values only.

        Returns ``(rejected, adjusted)`` aligned with ``pvals``: NaN inputs are
        never rejected and keep a NaN adjusted p-value, so failed predictions
        do not inflate the number of hypotheses being corrected for.
        """
        pvals = np.asarray(pvals, dtype=float)
        tested = ~np.isnan(pvals)
        rejected = np.zeros(len(pvals), dtype=bool)
        adjusted = np.full(len(pvals), np.nan)
        if tested.any():
            rejected_tested, adjusted_tested, _, _ = multipletests(pvals[tested], alpha=alpha, method='fdr_bh')
            rejected[tested] = rejected_tested
            adjusted[tested] = adjusted_tested
        return rejected, adjusted

    print("Computing p-values...")
    # NOTE(review): a score above every inlier score yields p = 0 exactly,
    # which Benjamini-Hochberg always rejects; consider a finite-sample
    # (+1 / (n + 1)) correction if that is not intended.
    pvals_overrated = get_pvals(overrated_scores_10000, overrated_scores_inlier)
    pvals_underrated = get_pvals(underrated_scores_10000, underrated_scores_inlier)

    # Apply Benjamini-Hochberg (FDR)
    print("Applying FDR correction...")
    rejected_over, pvals_over_adj = apply_bh(pvals_overrated, alpha=0.2)
    rejected_under, pvals_under_adj = apply_bh(pvals_underrated, alpha=0.2)

    # Add results to DataFrame
    output_df["Overrated P-Value"] = pvals_overrated
    output_df["Overrated P-Adj"] = pvals_over_adj
    output_df["Is Overrated Outlier"] = rejected_over

    output_df["Underrated P-Value"] = pvals_underrated
    output_df["Underrated P-Adj"] = pvals_under_adj
    output_df["Is Underrated Outlier"] = rejected_under
    
    print(f"Found {sum(rejected_over)} overrated outliers and {sum(rejected_under)} underrated outliers")
    
    # Preview outliers
    print("\n--- OVERRATED OUTLIERS ---")
    overrated_outliers = output_df[output_df["Is Overrated Outlier"]]
    if len(overrated_outliers) > 0:
        print(overrated_outliers[["Review #", "True Rating", "Expected Rating", "Overrated Score", "Overrated P-Adj"]].head())
    else:
        print("No overrated outliers found")

    print("\n--- UNDERRATED OUTLIERS ---")
    underrated_outliers = output_df[output_df["Is Underrated Outlier"]]
    if len(underrated_outliers) > 0:
        print(underrated_outliers[["Review #", "True Rating", "Expected Rating", "Underrated Score", "Underrated P-Adj"]].head())
    else:
        print("No underrated outliers found")

except FileNotFoundError:
    print("\nWarning: 'inliers_probabilistic.csv' not found. Skipping directional analysis.")
    print("Only basic inconsistency analysis will be performed.")

# Attach the raw review text as a "Review Text" column
output_df = output_df.assign(**{"Review Text": sample_reviews})

# Write the full results table to CSV, without the index column
output_filename = "10000_reviews_with_directional_pvals.csv"
output_df.to_csv(output_filename, index=False)
print("\nSaved results to '{}'".format(output_filename))

Analysis and Visualization

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ks_2samp, spearmanr, percentileofscore
from statsmodels.stats.multitest import multipletests

# --- Load Data ---
df = pd.read_csv("10000_reviews_with_directional_pvals.csv")


def parse_prob_vector(raw):
    """Turn a stringified list such as "[0.1, 0.9, nan]" into a list of floats."""
    tokens = str(raw).strip().strip("[]").replace(",", " ").split()
    return [float(token) for token in tokens]


# The CSV stores each probability vector as the text of a Python list. It is
# parsed token by token rather than with eval(): eval executes whatever the
# file contains, and it raises NameError on the bare `nan` written for failed
# predictions, whereas float() parses "nan".
df["Prob Vector"] = df["Prob Vector"].apply(parse_prob_vector)

# Compute expected ratings if not in file: sum over k of k * P(k)
if "Expected Rating" not in df.columns:
    df["Expected Rating"] = df["Prob Vector"].apply(lambda probs: sum((i + 1) * p for i, p in enumerate(probs)))

# === Q1: ChatGPT vs Human Rating Distributions ===

# Long format: human ratings stacked on top of the model's expected ratings,
# with a label column naming where each value came from
stacked_ratings = pd.concat([df["True Rating"], df["Expected Rating"]], ignore_index=True)
source_labels = ["Human"] * len(df) + ["ChatGPT"] * len(df)
plot_df = pd.DataFrame({"Rating": stacked_ratings, "Source": source_labels})

# One bin centred on each star value; the two sources are drawn side by side
fig_q1, ax_q1 = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=plot_df,
    x="Rating",
    hue="Source",
    bins=np.arange(1, 7) - 0.5,
    multiple="dodge",
    shrink=0.9,
    palette={"Human": "skyblue", "ChatGPT": "salmon"},
    edgecolor="black",
    ax=ax_q1,
)

ax_q1.set_title("Distribution of Human vs ChatGPT Ratings")
ax_q1.set_xlabel("Rating")
ax_q1.set_ylabel("Count")
ax_q1.set_xticks(range(1, 6))
ax_q1.grid(True, linestyle="--", alpha=0.5)
fig_q1.tight_layout()
plt.show()

# Stats
# Failed predictions have a NaN expected rating. They are dropped before the
# two-sample KS test so that NaNs cannot distort or invalidate the statistic;
# the pandas mean/std below already skip NaN.
expected_valid = df["Expected Rating"].dropna()
ks_stat, ks_pval = ks_2samp(df["True Rating"], expected_valid)
print("\nQ1: ChatGPT vs Human Rating Stats")
print(f"Mean (Human): {df['True Rating'].mean():.2f}")
print(f"Mean (ChatGPT): {df['Expected Rating'].mean():.2f}")
print(f"Std (Human): {df['True Rating'].std():.2f}")
print(f"Std (ChatGPT): {df['Expected Rating'].std():.2f}")
print(f"KS Test Statistic: {ks_stat:.3f}, p-value: {ks_pval:.4f}")

# === Q2: Distribution of Inconsistency Scores ===

q2_score_col = "Prob Inconsistency (-log(P(true)))"

# Histogram of all available scores in 30 bins
fig_q2, ax_q2 = plt.subplots(figsize=(10, 6))
sns.histplot(df[q2_score_col].dropna(), bins=30, color='mediumpurple', edgecolor='black', ax=ax_q2)
ax_q2.set_title("Distribution of Probabilistic Inconsistency Scores")
ax_q2.set_xlabel("-log(P(True Label))")
ax_q2.set_ylabel("Number of Reviews")
ax_q2.grid(True, linestyle="--", alpha=0.6)
fig_q2.tight_layout()
plt.show()

print("\nQ2: Inconsistency Score Summary")
print(df[q2_score_col].describe())

# Reviews whose score exceeds 25, most inconsistent first
high_inconsistency_df = df[df[q2_score_col] > 25].sort_values(q2_score_col, ascending=False)

print(f"\nFound {len(high_inconsistency_df)} reviews with -log(P(true)) > 25\n")

# Print each extreme review in full
for _, extreme in high_inconsistency_df.iterrows():
    review_no = int(extreme['Review #'])
    print(f"--- Review #{review_no} ---")
    print(f"Human Rating: {extreme['True Rating']}")
    print(f"ChatGPT Expected Rating: {extreme['Expected Rating']:.2f}")
    print(f"-log(P(true)): {extreme[q2_score_col]:.4f}")
    print("\nReview Text:")
    print(extreme["Review Text"])
    print("\n" + "-" * 60 + "\n")

# === Q3: Inconsistency Rates by True Rating ===

# Per-star-rating mean and count of the inconsistency score
q3_score_col = "Prob Inconsistency (-log(P(true)))"
grouped = df.groupby("True Rating")[q3_score_col]
means = grouped.mean()
counts = grouped.count()

# One bar per human star rating
fig_q3, ax_q3 = plt.subplots(figsize=(8, 5))
sns.barplot(x=means.index, y=means.values, palette="YlOrRd", ax=ax_q3)
ax_q3.set_title("Avg. Inconsistency Score by Human Rating")
ax_q3.set_xlabel("True Rating")
ax_q3.set_ylabel("Avg. -log(P(True))")
ax_q3.grid(True, linestyle="--", alpha=0.6)
fig_q3.tight_layout()
plt.show()

print("\nQ3: Mean Inconsistency by Rating")
print(means)

# === Q4: Review Length vs Inconsistency ===

# Word count per review. A missing text (NaN after the CSV round-trip) is
# counted as 0 words instead of raising TypeError in len().
df["Review Length"] = df["Review Text"].fillna("").str.split().str.len()

plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x="Review Length", y="Prob Inconsistency (-log(P(true)))", alpha=0.4)
plt.title("Review Length vs Inconsistency Score")
plt.xlabel("Review Length (word count)")
plt.ylabel("-log(P(True Label))")
plt.grid(True, linestyle="--", alpha=0.5)
plt.tight_layout()
plt.show()

# Correlation, computed only on rows that have a score: a NaN score from a
# failed prediction would otherwise propagate into the coefficient.
length_vs_score = df[["Review Length", "Prob Inconsistency (-log(P(true)))"]].dropna()
corr, pval = spearmanr(
    length_vs_score["Review Length"],
    length_vs_score["Prob Inconsistency (-log(P(true)))"],
)
print("\nQ4: Length vs Inconsistency")
print(f"Spearman Correlation: {corr:.3f}, p-value: {pval:.4f}")

# === Q5: Directional Outliers Overview ===

# The outlier flags exist only when the directional analysis ran before the
# results file was written; without them this section is skipped rather than
# failing with a KeyError.
outlier_flag_columns = ["Is Overrated Outlier", "Is Underrated Outlier"]

if all(column in df.columns for column in outlier_flag_columns):
    # Count overrated and underrated
    over_count = df["Is Overrated Outlier"].sum()
    under_count = df["Is Underrated Outlier"].sum()

    plt.figure(figsize=(6, 4))
    sns.barplot(x=["Overrated", "Underrated"], y=[over_count, under_count], palette="coolwarm")
    plt.title("Count of Directional Outliers")
    plt.ylabel("Number of Outliers")
    plt.grid(True, axis='y', linestyle="--", alpha=0.5)
    plt.tight_layout()
    plt.show()

    print("\nQ5: Directional Outlier Summary")
    print(f"Overrated Outliers: {over_count}")
    print(f"Underrated Outliers: {under_count}")

    # Optional: Preview some examples
    print("\n--- Sample Overrated Outliers ---")
    print(df[df["Is Overrated Outlier"]][["True Rating", "Expected Rating", "Review Text"]].head(3))

    print("\n--- Sample Underrated Outliers ---")
    print(df[df["Is Underrated Outlier"]][["True Rating", "Expected Rating", "Review Text"]].head(3))
else:
    print("\nQ5: Directional outlier columns not found in the results file. Skipping.")