In [None]:
# ==================== SETUP DIRECTORIES ====================
# Purpose: pick the directory that analysis artifacts are written to and locate
# the fl_adult_results.json file produced by a previous run.
# Defines `output_dir` and `fl_results_path` for the cells below.
# Raises FileNotFoundError when the FL results file cannot be found anywhere.
print("\n" + "="*80)
print("SETUP - LOADING PREVIOUS RESULTS")
print("="*80)

# Root of the local (non-Kaggle) fallback. The default is the original
# hard-coded location; set AI_PRIVACY_BACKEND_DIR to run on another machine.
local_backend_dir = os.environ.get("AI_PRIVACY_BACKEND_DIR", r"c:\Users\almir\ai-privacy\backend")

# Output directory (works on any Kaggle account)
try:
    output_dir = "/kaggle/working/fl_adult_analysis"
    os.makedirs(output_dir, exist_ok=True)
    print(f"‚úì Running on Kaggle - Output: {output_dir}")
except OSError:
    # Only filesystem failures (missing/read-only /kaggle) trigger the local
    # fallback; any other error is a real bug and should surface.
    # NOTE(review): wherever "/kaggle/working" can be created, the branch above
    # succeeds even when not running on Kaggle.
    output_dir = os.path.join(local_backend_dir, "fl_adult_analysis")
    os.makedirs(output_dir, exist_ok=True)
    print(f"‚úì Running locally - Output: {output_dir}")

# Try to find FL results from previous run
fl_results_path = None

# Try Kaggle input datasets first. glob.glob returns an empty list when nothing
# matches, so no exception handling is needed; sorting makes the choice
# deterministic when several attached datasets contain the file.
fl_paths = sorted(glob.glob('/kaggle/input/*/fl_adult_results.json'))
if fl_paths:
    fl_results_path = fl_paths[0]
    print(f"‚úì Found FL results in Kaggle input: {fl_results_path}")

if not fl_results_path:
    # Try local backend folder
    local_path = os.path.join(local_backend_dir, "models_fl_adult", "fl_adult_results.json")
    if os.path.exists(local_path):
        fl_results_path = local_path
        print(f"‚úì Found FL results locally: {fl_results_path}")
    else:
        print("‚ùå ERROR: Could not find fl_adult_results.json")
        print("   Please add the FL results as a Kaggle dataset or ensure it exists locally")
        raise FileNotFoundError("fl_adult_results.json not found")

In [None]:
# ==================== LOAD RESULTS ====================
# Purpose: read the FL results JSON located by the setup cell and, when
# available, the baseline results JSON. Defines `fl_results` (always) and
# `baseline_results` (dict keyed "adult_<model>", or None when no baseline
# file could be loaded).
print("\n" + "="*80)
print("LOADING FL AND BASELINE RESULTS")
print("="*80)

# Load FL results
with open(fl_results_path, 'r') as f:
    fl_data = json.load(f)

fl_results = fl_data['federated_learning']
print(f"‚úì Loaded FL results: {len(fl_results)} configurations")

# Load baseline results
baseline_results = None
# Raw baseline JSON. It must start as None so the local fallback below runs
# only when the Kaggle attempt did not produce data; previously the fallback
# always ran and could reset a successfully loaded Kaggle baseline to None.
baseline_data = None

# Try Kaggle input first (sorted so the pick is deterministic)
baseline_paths = sorted(glob.glob('/kaggle/input/*/research_results.json'))
if baseline_paths:
    try:
        with open(baseline_paths[0], 'r') as f:
            baseline_data = json.load(f)
        print(f"‚úì Loaded baseline from Kaggle input")
    except (OSError, ValueError):
        # Unreadable or malformed file (json.JSONDecodeError is a ValueError):
        # stay best-effort and fall through to the local copy.
        baseline_data = None

if baseline_data is None:
    # Try local; AI_PRIVACY_BACKEND_DIR overrides the original hard-coded root
    try:
        baseline_path = os.path.join(
            os.environ.get("AI_PRIVACY_BACKEND_DIR", r"c:\Users\almir\ai-privacy\backend"),
            "models_research",
            "research_results.json",
        )
        with open(baseline_path, 'r') as f:
            baseline_data = json.load(f)
        print(f"‚úì Loaded baseline locally")
    except (OSError, ValueError):
        print("‚ö†Ô∏è  Baseline results not found - will skip baseline comparison")
        baseline_data = None

# Extract baseline for adult dataset
if baseline_data:
    adult_baseline = baseline_data['baseline_results']['adult']
    baseline_results = {}
    for model in ['LR', 'FNN']:
        key = f"adult_{model}"
        baseline_results[key] = {
            'accuracy': adult_baseline[model]['accuracy']['mean'],
            'all_accuracies': adult_baseline[model]['all_accuracies']
        }
    print(f"‚úì Extracted baseline for Adult dataset: {len(baseline_results)} models")

In [None]:
# ==================== STATISTICAL ANALYSIS ====================
# Builds `fl_comparison_df` with one row per entry of `fl_results`.
# With a baseline: FL vs baseline accuracy (in percent), the accuracy loss and
# a two-sample t-test on the per-run accuracies (`stats.ttest_ind`; `stats` is
# presumably scipy.stats - confirm against the imports cell).
# Without a baseline: a descriptive accuracy/F1 summary only.
section_rule = "=" * 80
print("\n" + section_rule)
print("STATISTICAL ANALYSIS - FL vs BASELINE")
print(section_rule)

if not baseline_results:
    print("\n‚ö†Ô∏è  Skipping baseline comparison - baseline results not available")

    # Create summary without baseline
    fl_summary = [
        {
            'Model': cfg['model'],
            'Aggregation': cfg['aggregation'],
            'FL_Accuracy': cfg['accuracy']['mean'] * 100,
            'FL_Std': cfg['accuracy']['std'] * 100,
            'FL_Min': cfg['accuracy']['min'] * 100,
            'FL_Max': cfg['accuracy']['max'] * 100,
            'F1_Score': cfg['f1']['mean'] * 100,
            'F1_Std': cfg['f1']['std'] * 100
        }
        for cfg in fl_results.values()
    ]
    fl_comparison_df = pd.DataFrame(fl_summary)
else:
    fl_comparison = []

    for cfg in fl_results.values():
        # Baseline entry for the same model as this FL configuration
        reference = baseline_results[f"adult_{cfg['model']}"]
        reference_mean = reference['accuracy']
        reference_runs = reference['all_accuracies']

        fl_mean = cfg['accuracy']['mean']
        fl_runs = cfg['all_accuracies']
        # Positive value means FL is less accurate than the baseline
        loss = reference_mean - fl_mean

        # T-test
        t_stat, p_value = stats.ttest_ind(reference_runs, fl_runs)
        verdict = 'Yes' if p_value < 0.05 else 'No'

        fl_comparison.append({
            'Model': cfg['model'],
            'Aggregation': cfg['aggregation'],
            'FL_Accuracy': fl_mean * 100,
            'FL_Std': cfg['accuracy']['std'] * 100,
            'Baseline': reference_mean * 100,
            'Accuracy_Loss': loss * 100,
            't_statistic': t_stat,
            'p_value': p_value,
            'Significant': verdict
        })

    fl_comparison_df = pd.DataFrame(fl_comparison)

# Both branches end by printing the full table as plain text
print("\n" + fl_comparison_df.to_string(index=False))

In [None]:
# ==================== SAVE ANALYSIS RESULTS ====================
# Purpose: persist the comparison/summary table as CSV and a combined JSON
# document (metadata, table records, raw FL results and, when present, the
# baseline reference) under `output_dir`.
print("\n" + "="*80)
print("SAVING ANALYSIS RESULTS")
print("="*80)


def _to_json_serializable(value):
    """`default` hook for json.dump: convert NumPy values to built-in types.

    NumPy scalars (floats, ints, bools) become Python scalars and arrays become
    lists. Anything else raises TypeError, as the json module expects from a
    `default` hook; returning the object unchanged would instead surface as a
    misleading "Circular reference detected" ValueError.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# Save comparison/summary CSV
csv_path = os.path.join(output_dir, 'fl_adult_statistical_analysis.csv')
fl_comparison_df.to_csv(csv_path, index=False)
print(f"‚úì Saved: fl_adult_statistical_analysis.csv")

# Save combined JSON
analysis_json = {
    'metadata': {
        'timestamp': datetime.now().isoformat(),
        'analysis_type': 'FL Adult Dataset - Statistical Analysis',
        'source': 'Continuation from fl_adult_crossvalidation.ipynb'
    },
    'statistical_comparison': fl_comparison_df.to_dict('records'),
    'fl_results': fl_results
}

# The baseline is optional, so the key is only present when it was loaded
if baseline_results:
    analysis_json['baseline_reference'] = baseline_results

json_path = os.path.join(output_dir, 'fl_adult_analysis_complete.json')
with open(json_path, 'w') as f:
    json.dump(analysis_json, f, indent=2, default=_to_json_serializable)
print(f"‚úì Saved: fl_adult_analysis_complete.json")

print("\n" + "="*80)
print("ANALYSIS RESULTS SAVED")
print("="*80)

In [None]:
# ==================== VISUALIZATIONS ====================
# Purpose: draw one bar chart per model (LR, FNN) with FL accuracy per
# aggregation method and FL_Std error bars, add the baseline as a dashed line
# when it is available, and save the figure as a PNG under `output_dir`.
print("\n" + "="*80)
print("GENERATING VISUALIZATIONS")
print("="*80)

# FL Comparison Plot
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
fig.suptitle('Federated Learning - Adult Dataset Analysis', fontsize=16, fontweight='bold')

for idx, model in enumerate(['LR', 'FNN']):
    ax = axes[idx]

    subset = fl_comparison_df[fl_comparison_df['Model'] == model]

    x = range(len(subset))

    # Bar plot with error bars
    ax.bar(x, subset['FL_Accuracy'], yerr=subset['FL_Std'], capsize=5, alpha=0.7, label='FL Accuracy')

    # Add baseline line if available. The emptiness check matters: when the FL
    # results contain no configuration for this model, `.iloc[0]` would raise
    # IndexError and abort the whole figure.
    if baseline_results and 'Baseline' in subset.columns and not subset.empty:
        # Every row of one model carries the same baseline value, so the first is enough
        baseline_line = subset['Baseline'].iloc[0]
        ax.axhline(y=baseline_line, color='red', linestyle='--', linewidth=2, label='Baseline')

    ax.set_xticks(x)
    ax.set_xticklabels(subset['Aggregation'], rotation=45, ha='right')
    ax.set_ylabel('Accuracy (%)', fontsize=12)
    ax.set_title(f'Adult - {model}', fontsize=14)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
viz_path = os.path.join(output_dir, 'fl_adult_comparison.png')
# Save before show(): the figure is written to disk first, then displayed
plt.savefig(viz_path, dpi=300, bbox_inches='tight')
print(f"‚úì Saved: fl_adult_comparison.png")
plt.show()

print("\n" + "="*80)
print("VISUALIZATIONS COMPLETE")
print("="*80)

In [None]:
# ==================== SUMMARY STATISTICS ====================
# Prints a per-model recap of FL accuracy (mean and std per aggregation method)
# and, when the table carries baseline columns, the mean accuracy loss per
# model plus the number of comparisons flagged as significant.
summary_rule = "=" * 80
print("\n" + summary_rule)
print("SUMMARY STATISTICS")
print(summary_rule)

print("\nüìä FL Performance Summary:")

# Per-model slices of the table, reused by the accuracy-loss lines below
lr_subset = fl_comparison_df[fl_comparison_df['Model'] == 'LR']
fnn_subset = fl_comparison_df[fl_comparison_df['Model'] == 'FNN']

model_sections = (
    ("\nLogistic Regression (LR):", lr_subset),
    ("\nFeedforward Neural Network (FNN):", fnn_subset),
)
for section_title, section_rows in model_sections:
    print(section_title)
    for _, row in section_rows.iterrows():
        print(f"  {row['Aggregation']:12s}: {row['FL_Accuracy']:.2f}% ¬± {row['FL_Std']:.2f}%")

# The loss/significance columns exist only when the baseline comparison ran
has_baseline_columns = 'Accuracy_Loss' in fl_comparison_df.columns
if baseline_results and has_baseline_columns:
    print("\nüìâ Accuracy Loss vs Baseline:")
    lr_mean_loss = lr_subset['Accuracy_Loss'].mean()
    fnn_mean_loss = fnn_subset['Accuracy_Loss'].mean()
    print(f"  LR  - Mean Loss: {lr_mean_loss:.2f}%")
    print(f"  FNN - Mean Loss: {fnn_mean_loss:.2f}%")

    is_significant = fl_comparison_df['Significant'] == 'Yes'
    significant_count = int(is_significant.sum())
    total_count = fl_comparison_df.shape[0]
    print(f"\nüìà Statistical Significance:")
    print(f"  {significant_count}/{total_count} comparisons show significant difference (p < 0.05)")

print("\n" + summary_rule)
print("‚úÖ ANALYSIS COMPLETE - All results saved to:", output_dir)
print(summary_rule)

## Summary

This notebook continues the FL Adult analysis by:

1. **Loading Results**: Imports FL results from previous training run (works on any Kaggle account)
2. **Statistical Analysis**: Compares FL against baseline with t-tests (if baseline available)
3. **Visualizations**: Generates comparison charts showing mean ± std
4. **Export**: Saves analysis to `/kaggle/working/fl_adult_analysis/` (portable across Kaggle accounts)

**Input Requirements**:
- FL results: Add `fl_adult_results.json` as a Kaggle dataset input
- Baseline results (optional): Add `research_results.json` as a Kaggle dataset input

**Outputs**:
- `fl_adult_statistical_analysis.csv`: Comparison table
- `fl_adult_analysis_complete.json`: Complete analysis results
- `fl_adult_comparison.png`: Visualization