In [None]:
import pandas as pd
import numpy as np

# Location of the first-pass batch predictions produced by the inference run.
results_file = "outputs/batch/p2_batch_preds.csv"
df = pd.read_csv(results_file)

# Quick sanity check on what was loaded: row count and available columns.
print(f"Total articles loaded: {df.shape[0]}")
print(f"Columns: {list(df.columns)}")
# Preview of the first rows (rendered by the notebook only when this is the
# last expression of its cell).
df.head(5)

# Overview of the loaded predictions: size, label distribution, missing values.
pred_sentiment_col = df['pred_sentiment']

print("=== Current Results Summary ===")
print("Total articles:", len(df))

# How often each distinct pred_sentiment value occurs (NaN is not counted here).
print("\nSentiment distribution:")
print(pred_sentiment_col.value_counts())

# Rows for which no prediction was recorded at all.
print("\nMissing pred_sentiment:", pred_sentiment_col.isnull().sum())

# Identify failed articles.
# A row counts as failed when pred_sentiment holds one of the error labels
# below or is missing entirely.
failure_labels = ['PARSE_ERROR', 'TRUNCATED_RESPONSE', 'API_ERROR', 'API_BLOCKED']
failed_conditions = (
    df['pred_sentiment'].isin(failure_labels) |
    df['pred_sentiment'].isna()
)

# Independent copies so later edits to either subset never touch `df`.
failed_df = df[failed_conditions].copy()
successful_df = df[~failed_conditions].copy()

# An empty results file would otherwise raise ZeroDivisionError here.
success_rate = (len(successful_df) / len(df) * 100) if len(df) > 0 else 0.0

print(f"Failed articles: {len(failed_df)}")
print(f"Successful articles: {len(successful_df)}")
print(f"Success rate: {success_rate:.1f}%")

if len(failed_df) > 0:
    print("\nFailed article breakdown:")
    # dropna=False keeps missing predictions visible in the breakdown; the
    # default would omit them even though they are counted as failures above.
    print(failed_df['pred_sentiment'].value_counts(dropna=False))

# Export the failed articles so they can be fed back through inference.
if len(failed_df) == 0:
    print("No failed articles! All processing was successful.")
else:
    # Only the identifier and the article text are needed to re-run inference;
    # the failed prediction columns are left out.
    retry_columns = ['id', 'HEADING', 'ARTICLE CONTENT']
    failed_articles = failed_df.loc[:, retry_columns].copy()

    # Write the retry input next to the batch outputs.
    failed_csv_path = "outputs/batch/failed_articles_p2.csv"
    failed_articles.to_csv(failed_csv_path, index=False)

    print("Failed articles saved to:", failed_csv_path)
    print("Number of articles to retry:", len(failed_articles))

    # Short preview of what will be retried.
    print("\nSample failed articles:")
    print(failed_articles.loc[:, ['id', 'HEADING']].head())


In [None]:
import pandas as pd

# Inputs: the first-pass predictions and the predictions from the retry run.
original_file = "outputs/batch/p2_batch_preds.csv"
retry_file = "outputs/batch/retry_p2_preds.csv"

# Read both result sets into DataFrames.
original_df = pd.read_csv(original_file)
retry_df = pd.read_csv(retry_file)

# Report how many rows each file contributed.
for label, frame in (("Original", original_df), ("Retry", retry_df)):
    print(f"{label} results: {len(frame)} articles")

# Check retry success rate
print("=== Retry Results Summary ===")
print("Retry sentiment distribution:")
print(retry_df['pred_sentiment'].value_counts())

# A retry failed when it produced one of the error labels or no prediction at
# all. 'API_BLOCKED' and missing values are included so this check agrees with
# the failure definition used for the first-pass results and the final tally.
retry_failed = (
    retry_df['pred_sentiment'].isin(['PARSE_ERROR', 'TRUNCATED_RESPONSE', 'API_ERROR', 'API_BLOCKED']) |
    retry_df['pred_sentiment'].isna()
)
retry_success = ~retry_failed

# Guard the empty-file case so the reported rate is a defined 0.0.
retry_success_rate = (retry_success.sum() / len(retry_df) * 100) if len(retry_df) > 0 else 0.0
print(f"\nRetry success rate: {retry_success_rate:.1f}%")
print(f"Successfully recovered: {retry_success.sum()} articles")

# The merged table starts as a copy of the first-pass results, so
# `original_df` itself is left untouched.
merged_df = original_df.copy()

print("=== Standardizing Data Types ===")
# Bring pred_sentiment to a numeric dtype in both frames so values can be
# copied between them. errors='coerce' turns every non-numeric value (for
# example an error label) into NaN.
for frame in (merged_df, retry_df):
    frame['pred_sentiment'] = pd.to_numeric(frame['pred_sentiment'], errors='coerce')

# Show the resulting dtypes for a visual check that they match.
for label, frame in (("Original", merged_df), ("Retry", retry_df)):
    print(f"{label} sentiment dtype:", frame['pred_sentiment'].dtype)

print("=== Merging Results ===")

# Only retry rows that produced a numeric sentiment count as recovered. After
# the errors='coerce' conversion earlier in this cell a failed retry is NaN;
# copying it over would replace the first-pass diagnostics and still be
# reported as a "successful" update.
result_columns = ['pred_sentiment', 'pred_reasoning', 'pred_raw']
recovered = (
    retry_df
    .dropna(subset=['id', 'pred_sentiment'])
    # If an id was retried more than once, the last row wins.
    .drop_duplicates(subset='id', keep='last')
    .set_index('id')
)

# Rows of the merged table whose id has a recovered prediction.
update_mask = merged_df['id'].isin(recovered.index)
matched_ids = merged_df.loc[update_mask, 'id']

# One vectorised id lookup per column instead of scanning the whole table
# once for every retry row.
for column in result_columns:
    merged_df.loc[update_mask, column] = matched_ids.map(recovered[column])

# Number of distinct articles that received a recovered prediction.
updated_count = matched_ids.nunique()

print(f"Updated {updated_count} articles with successful retry results")

print("=== Final Results ===")

# Count remaining failures.
# pred_sentiment was converted with errors='coerce' earlier in this cell, so
# every error label is NaN by now; matching on the label strings alone would
# always report zero failures. Missing values are therefore counted as failed,
# and the label check is kept in case the column still holds raw labels.
still_failed = (
    merged_df['pred_sentiment'].isna() |
    merged_df['pred_sentiment'].isin(['PARSE_ERROR', 'TRUNCATED_RESPONSE', 'API_ERROR', 'API_BLOCKED'])
)
final_failed = int(still_failed.sum())

# Guard against an empty table so the rate is a defined 0.0.
total_articles = len(merged_df)
final_success_rate = ((total_articles - final_failed) / total_articles * 100) if total_articles > 0 else 0.0

print(f"Final success rate: {final_success_rate:.1f}%")
print(f"Total successful: {total_articles - final_failed}")
print(f"Remaining failed: {final_failed}")

# Distribution of the numeric sentiment values, ordered by value.
print("\nFinal sentiment distribution:")
print(merged_df['pred_sentiment'].value_counts().sort_index())


# Persist the merged predictions (first pass plus retry updates) without the
# DataFrame index column.
final_output = "outputs/batch/final_p2_preds.csv"
merged_df.to_csv(final_output, index=False)

print("Final results saved to:", final_output)