# AI Video Fraud Detection - Experiment Analysis

This notebook provides detailed statistical analysis of the video fraud detection experiment results.

## Contents
1. Data Loading and Overview
2. Performance Metrics Analysis
3. Confidence Score Analysis
4. Error Analysis
5. Precision-Recall Trade-off
6. Statistical Considerations
7. Conclusions

In [None]:
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Plot styling shared by every figure in this notebook.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Load the experiment metrics file.
# JSON is UTF-8 by specification; pass the encoding explicitly so the load does
# not depend on the platform's locale default (e.g. cp1252 on Windows).
metrics_path = Path('../results/metrics/experiment_metrics.json')
with open(metrics_path, encoding='utf-8') as f:
    data = json.load(f)

# Echo the experiment header so the reader can confirm which run was loaded.
print(f"Experiment: {data['experiment']['name']}")
print(f"Date: {data['experiment']['date']}")
print(f"Dataset size: {data['experiment']['dataset_size']} videos")

## 1. Data Overview

### Dataset Composition

In [None]:
# Summarise how the dataset splits between AI-generated and authentic videos.
dataset = data['dataset']

# Build the composition report first, then emit it in a single print call.
composition_report = [
    f"Total videos: {dataset['total_videos']}",
    f"AI-generated: {dataset['ai_generated']}",
    f"Authentic: {dataset['authentic']}",
    f"\nClass distribution: {dataset['ai_generated']/dataset['total_videos']*100:.1f}% AI / {dataset['authentic']/dataset['total_videos']*100:.1f}% Authentic",
]
print("\n".join(composition_report))

# One row per video: id, file name (left-aligned, padded to 15 chars), label.
print("\n--- Dataset Summary ---")
for video in dataset['videos']:
    print(f"Video {video['id']}: {video['file']:15s} -> {video['ground_truth']}")

## 2. Performance Metrics

### Classification Metrics

The standard binary classification metrics are:

$$\text{Accuracy} = \frac{TP + TN}{TP + TN + FP + FN}$$

$$\text{Precision} = \frac{TP}{TP + FP}$$

$$\text{Recall} = \frac{TP}{TP + FN}$$

$$F_1 = 2 \cdot \frac{\text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}}$$

In [None]:
# Extract confusion matrix values
cm = data['confusion_matrix']
tp = cm['true_positives']
tn = cm['true_negatives']
fp = cm['false_positives']
fn = cm['false_negatives']

# Recompute the metrics from the raw counts.
# Every ratio falls back to 0 when its denominator is 0, so an empty or
# degenerate confusion matrix reports zeros instead of raising
# ZeroDivisionError (accuracy is guarded the same way as the other metrics).
cm_total = tp + tn + fp + fn
accuracy = (tp + tn) / cm_total if cm_total > 0 else 0
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

print("=" * 40)
print("PERFORMANCE METRICS")
print("=" * 40)
print(f"Accuracy:  {accuracy:.1%} ({tp + tn}/{tp + tn + fp + fn})")
print(f"Precision: {precision:.1%} ({tp}/{tp + fp})")
print(f"Recall:    {recall:.1%} ({tp}/{tp + fn})")
print(f"F1 Score:  {f1:.3f}")
print("=" * 40)

In [None]:
# Confusion Matrix Visualization
fig, ax = plt.subplots(figsize=(8, 6))

# Rows are the actual class and columns the predicted class, with the positive
# (AI-generated) class first: [[TP, FN], [FP, TN]].
cm_array = np.array([[tp, fn], [fp, tn]])

sns.heatmap(cm_array, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Predicted AI', 'Predicted Authentic'],
            yticklabels=['Actual AI', 'Actual Authentic'],
            ax=ax, cbar_kws={'label': 'Count'})

ax.set_title('Confusion Matrix', fontsize=14, fontweight='bold')
ax.set_xlabel('Predicted Label', fontsize=12)
ax.set_ylabel('Actual Label', fontsize=12)

# Spell out the four counts below the axes (positions are in axes coordinates).
ax.text(0, -0.15, f'TP={tp}', transform=ax.transAxes, fontsize=10)
ax.text(0.25, -0.15, f'FN={fn}', transform=ax.transAxes, fontsize=10)
ax.text(0.5, -0.15, f'FP={fp}', transform=ax.transAxes, fontsize=10)
ax.text(0.75, -0.15, f'TN={tn}', transform=ax.transAxes, fontsize=10)

# Make sure the output directory exists; savefig raises FileNotFoundError
# otherwise, which breaks Restart & Run All on a fresh checkout.
figures_dir = Path('../results/figures')
figures_dir.mkdir(parents=True, exist_ok=True)

plt.tight_layout()
plt.savefig(figures_dir / 'confusion_matrix.png', dpi=150, bbox_inches='tight')
plt.show()

## 3. Confidence Score Analysis

Analyzing the relationship between model confidence and prediction correctness.

In [None]:
# Extract predictions
predictions = data['predictions']

# Split confidence scores by whether the prediction was correct.
correct_conf = [p['confidence'] for p in predictions if p['correct']]
incorrect_conf = [p['confidence'] for p in predictions if not p['correct']]

print("=" * 40)
print("CONFIDENCE ANALYSIS")
print("=" * 40)
print(f"Correct predictions ({len(correct_conf)}):")
if correct_conf:
    print(f"  Mean confidence: {np.mean(correct_conf):.1f}%")
    # np.std defaults to the population standard deviation (ddof=0).
    print(f"  Std deviation:   {np.std(correct_conf):.1f}%")
    print(f"  Range: {min(correct_conf)}% - {max(correct_conf)}%")
else:
    # min()/max() raise ValueError on an empty list, so report the gap instead.
    print("  (none)")
print()
print(f"Incorrect predictions ({len(incorrect_conf)}):")
if incorrect_conf:
    print(f"  Mean confidence: {np.mean(incorrect_conf):.1f}%")
else:
    # np.mean([]) would print nan and emit a RuntimeWarning.
    print("  (none)")
print("=" * 40)

# Key finding: only comparable when both groups are non-empty.
if correct_conf and incorrect_conf and np.mean(incorrect_conf) >= np.mean(correct_conf):
    print("\n⚠️ WARNING: Incorrect predictions have equal or higher confidence!")
    print("   This indicates confidence scores are not well-calibrated.")

In [None]:
# Confidence visualization: per-video bars (left) and a box plot split by
# correctness (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Per-video confidence
ax1 = axes[0]
video_ids = [p['video_id'] for p in predictions]
confidences = [p['confidence'] for p in predictions]
colors = ['green' if p['correct'] else 'red' for p in predictions]

bars = ax1.bar(video_ids, confidences, color=colors, edgecolor='black', alpha=0.8)
ax1.axhline(y=np.mean(confidences), color='blue', linestyle='--', label=f'Mean: {np.mean(confidences):.1f}%')
ax1.set_xlabel('Video ID', fontsize=12)
ax1.set_ylabel('Confidence (%)', fontsize=12)
ax1.set_title('Confidence by Video', fontsize=14, fontweight='bold')
ax1.set_ylim(0, 105)
ax1.legend()

# Mark each bar with a check (correct) or cross (incorrect) just above its top.
for vid, conf, pred in zip(video_ids, confidences, predictions):
    label = '✓' if pred['correct'] else '✗'
    ax1.text(vid, conf + 2, label, ha='center', fontsize=14)

# Confidence distribution by correctness
ax2 = axes[1]
data_to_plot = [correct_conf, incorrect_conf] if incorrect_conf else [correct_conf]
labels_plot = ['Correct', 'Incorrect'] if incorrect_conf else ['Correct']

# The `labels=` keyword of boxplot is deprecated since matplotlib 3.9 (renamed
# to `tick_labels`); setting the tick labels on the axes works on old and new
# versions alike. Boxes are drawn at x positions 1..N.
bp = ax2.boxplot(data_to_plot, patch_artist=True)
ax2.set_xticks(list(range(1, len(labels_plot) + 1)))
ax2.set_xticklabels(labels_plot)
bp['boxes'][0].set_facecolor('lightgreen')
if len(bp['boxes']) > 1:
    bp['boxes'][1].set_facecolor('lightcoral')

ax2.set_ylabel('Confidence (%)', fontsize=12)
ax2.set_title('Confidence Distribution', fontsize=14, fontweight='bold')

# Make sure the output directory exists; savefig raises FileNotFoundError
# otherwise, which breaks Restart & Run All on a fresh checkout.
figures_dir = Path('../results/figures')
figures_dir.mkdir(parents=True, exist_ok=True)

plt.tight_layout()
plt.savefig(figures_dir / 'confidence_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

## 4. Error Analysis

Detailed examination of classification errors.

In [None]:
# Error analysis: print the stored error summary, then the details of every
# misclassified prediction.
error_info = data['error_analysis']

print("=" * 50)
print("ERROR ANALYSIS")
print("=" * 50)
print(f"Total errors: {error_info['total_errors']}")
print(f"Error type: {error_info['error_type']}")
print(f"Error video: Video {error_info['error_video']}")
print(f"\nDescription:")
print(f"  {error_info['error_description']}")
print("=" * 50)

# Collect every misclassified prediction instead of only the first one;
# next() on an empty generator would raise StopIteration on an error-free run.
error_preds = [p for p in predictions if not p['correct']]

# Look the ground truth up from the dataset table rather than hard-coding it,
# so false negatives are reported correctly too.
# NOTE(review): assumes prediction['video_id'] corresponds to video['id'];
# ids are compared as strings and unmatched ids are reported as 'unknown'.
truth_by_id = {str(video['id']): video['ground_truth'] for video in dataset['videos']}

if not error_preds:
    print("\nNo misclassified predictions found.")

for error_pred in error_preds:
    ground_truth = error_pred.get(
        'ground_truth',
        truth_by_id.get(str(error_pred['video_id']), 'unknown'),
    )
    print(f"\nError Details:")
    print(f"  Video ID: {error_pred['video_id']}")
    print(f"  Prediction: {error_pred['prediction']}")
    print(f"  Confidence: {error_pred['confidence']}%")
    print(f"  Ground Truth: {ground_truth}")

## 5. Precision-Recall Trade-off

Comparing accuracy, precision, recall, and F1 side by side against the 75% target shows the trade-off between precision and recall in our detection system.

In [None]:
# Bar chart comparing the four headline metrics against the 75% target line.
fig, ax = plt.subplots(figsize=(10, 6))

metrics_names = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
metrics_values = [accuracy, precision, recall, f1]
colors = ['steelblue', 'coral', 'seagreen', 'mediumpurple']

bars = ax.bar(metrics_names, metrics_values, color=colors, edgecolor='black', alpha=0.8)

# Add value labels on bars.
# All four metrics lie in [0, 1], so a `val <= 1` test can never select the
# decimal format. Choose the format by metric instead: F1 as a 3-decimal score
# (matching the printed summaries), the rest as percentages.
for bar, name, val in zip(bars, metrics_names, metrics_values):
    height = bar.get_height()
    value_label = f'{val:.3f}' if name == 'F1 Score' else f'{val:.1%}'
    ax.text(bar.get_x() + bar.get_width()/2., height + 0.02,
            value_label,
            ha='center', va='bottom', fontsize=12, fontweight='bold')

ax.axhline(y=0.75, color='red', linestyle='--', alpha=0.7, label='75% threshold')
ax.set_ylabel('Score', fontsize=12)
ax.set_title('Classification Performance Metrics', fontsize=14, fontweight='bold')
ax.set_ylim(0, 1.15)  # headroom above 1.0 for the value labels
ax.legend()

# Make sure the output directory exists; savefig raises FileNotFoundError
# otherwise, which breaks Restart & Run All on a fresh checkout.
figures_dir = Path('../results/figures')
figures_dir.mkdir(parents=True, exist_ok=True)

plt.tight_layout()
plt.savefig(figures_dir / 'metrics_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Statistical Considerations

### Sample Size Limitations

With only $n=5$ samples, we must be cautious about statistical inference:

$$\text{Standard Error} = \sqrt{\frac{p(1-p)}{n}}$$

For our accuracy of 80%:
$$SE = \sqrt{\frac{0.8 \times 0.2}{5}} \approx 0.179$$

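This gives a 95% confidence interval (normal approximation) of approximately:
$$0.80 \pm 1.96 \times 0.179 = [0.45, 1.15] \;\rightarrow\; [0.45, 1.00]$$

The upper bound is truncated at 1.00 because accuracy cannot exceed 100%. With $n=5$ the normal approximation is unreliable, so treat this interval as a rough indication only.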

In [None]:
# Statistical calculations
# Derive the sample size from the confusion matrix (the same denominator used
# for accuracy) instead of hard-coding it, so the interval stays correct if
# the dataset grows.
n = tp + tn + fp + fn
p = accuracy

# Standard error of a proportion: sqrt(p * (1 - p) / n)
se = np.sqrt(p * (1 - p) / n)

# 95% CI (normal approximation - note: not ideal for small n)
z = 1.96
# Clip to [0, 1]: a proportion cannot fall outside that range.
ci_lower = max(0, p - z * se)
ci_upper = min(1, p + z * se)

print("=" * 50)
print("STATISTICAL ANALYSIS")
print("=" * 50)
print(f"Sample size: n = {n}")
print(f"Observed accuracy: p = {p:.1%}")
print(f"Standard error: SE = {se:.3f}")
print(f"95% Confidence Interval: [{ci_lower:.1%}, {ci_upper:.1%}]")
print("=" * 50)
print("\n⚠️ NOTE: Small sample size limits statistical power.")
print("   Results should be validated with larger datasets.")

## 7. Conclusions

### Key Findings

1. **Accuracy**: 80% overall accuracy, meeting the >75% target
2. **High Recall**: 100% recall indicates all AI-generated content was detected
3. **False Positive Risk**: One authentic video was incorrectly flagged as AI
4. **Confidence Calibration**: High confidence does not guarantee correctness

### Recommendations

1. Expand dataset for more robust statistical conclusions
2. Implement confidence calibration techniques
3. Add compression artifact awareness to the prompt
4. Consider ensemble methods combining multiple models

In [None]:
# Final summary: dataset size, headline metrics and error count in one block.
# The dataset size is read from the dataset record itself rather than from the
# single-letter `n` defined in the statistics cell.
print("\n" + "=" * 60)
print("EXPERIMENT SUMMARY")
print("=" * 60)
print(f"\n📊 Dataset: {dataset['total_videos']} videos ({dataset['ai_generated']} AI, {dataset['authentic']} authentic)")
print(f"\n📈 Results:")
print(f"   • Accuracy:  {accuracy:.1%}")
print(f"   • Precision: {precision:.1%}")
print(f"   • Recall:    {recall:.1%}")
print(f"   • F1 Score:  {f1:.3f}")
print(f"\n⚠️ Errors: {error_info['total_errors']} ({error_info['error_type']})")
print(f"\n✅ Conclusion: Model shows promise but needs larger validation")
print("=" * 60)