In [3]:
import os
import joblib
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from sklearn.metrics import confusion_matrix, classification_report

# Root of the OCC project tree; every artefact path below lives under it.
_OCC_ROOT = '/home/ctai42@tntech.edu/OCC'

# Training-set size labels, in the order they are processed and reported.
_TRAIN_SIZES = ('1k', '2k', '3k', '4k', '5k', '10k', '15k', '20k')


def _config_for_size(size):
    """Return the artefact paths for one training-set size label.

    All sizes share the same path layout and differ only in the size token,
    so the entries are generated from one template instead of being typed
    out by hand (which invites copy/paste mismatches between model, scaler
    and data files).

    Keys:
        model_path: joblib file loaded as the Stage 2 model.
        scalar_path: joblib file loaded as the feature scaler (the key
            spelling is kept as-is because the consumers look it up by
            this exact name).
        train_data_path: CSV of training rows used to derive the Q1 threshold.
        output_csv: destination of the per-sample results CSV.
    """
    return {
        # NOTE(review): "nu0.01_gamma1.0" presumably encodes the model's
        # hyper-parameters; only the file name is visible here.
        'model_path': f'{_OCC_ROOT}/Chao_new_pretrained_mc_15_{size}/oc_svm_nu0.01_gamma1.0.joblib',
        'scalar_path': f'{_OCC_ROOT}/Pre_Trained_Model/scaler/scaler_mc15_{size}.joblib',
        'train_data_path': f'{_OCC_ROOT}/Model_training_and_inference_code/new_occ_training_data_{size}_mc_15.csv',
        'output_csv': f'{_OCC_ROOT}/New_vary_train_size_results_mc15/mc_15_results_Q1_filter_{size}.csv',
    }


# Mapping of size label -> artefact paths, in insertion order of _TRAIN_SIZES.
CONFIGS = {size: _config_for_size(size) for size in _TRAIN_SIZES}

# Single test set shared by every training-size configuration.
TEST_DATA_PATH = f"{_OCC_ROOT}/Model_training_and_inference_code/new_occ_testing_data_mc_15.csv"

def run_inference_for_size(size_key, config, test_data):
    """Run the two-stage filter + model inference for one training-set size.

    Stage 1 labels a sample "Normal" without consulting the model when
    exactly one of the ten ``top1_class_<j>_count`` columns is positive AND
    its ``top1_mean`` is at least the first quartile (Q1) of ``top1_mean``
    over the training rows whose ``noise_level`` is 0.0. Every other sample
    goes to Stage 2, where a model prediction of ``-1`` becomes "Flagged" and
    anything else becomes "Normal". Ground truth is "Flagged" for
    ``noise_level > 0`` and "Normal" otherwise.

    Args:
        size_key: Label of the configuration (e.g. ``'1k'``); only echoed
            into the returned summary.
        config: Dict with ``model_path``, ``scalar_path``,
            ``train_data_path`` and ``output_csv`` entries.
        test_data: DataFrame with ``image_num``, ``actual_class``,
            ``noise_level``, ``top1_mean`` and ``top1_class_0_count`` ..
            ``top1_class_9_count`` columns. All columns other than the first
            three are passed to the scaler/model as features.

    Returns:
        Dict with the threshold, Stage 1 / Stage 2 counts, the Stage 2
        accuracy (``overall_accuracy``) and ``noise_level_metrics``, a
        per-noise-level dict of sample counts, Stage 2 accuracy and flagged
        percentage. ``noise_level_metrics`` is populated for every noise
        level even when no sample reaches Stage 2, so downstream tables stay
        rectangular.

    Side effects:
        Writes one row per test sample to ``config['output_csv']``, creating
        the parent directory if it does not exist.
    """
    # Load model, scaler and the training data used to derive the threshold.
    model = joblib.load(config['model_path'])
    scalar = joblib.load(config['scalar_path'])
    train_data = pd.read_csv(config['train_data_path'])

    # Split identifying columns from the feature matrix.
    image_nums = test_data["image_num"]
    actual_classes = test_data["actual_class"]
    noise_levels = test_data["noise_level"]
    test_features = test_data.drop(columns=["image_num", "actual_class", "noise_level"])
    scaled_test_features = scalar.transform(test_features)

    # Q1 of top1_mean over clean (noise-free) training rows.
    # NOTE: if there are no clean rows the quantile is NaN, every ">="
    # comparison below is False, and Stage 1 filters nothing.
    train_clean_data = train_data[train_data['noise_level'] == 0.0]['top1_mean']
    threshold_single = train_clean_data.quantile(0.25)

    # ---- Stage 1: vectorised filtering (one pass instead of a per-row loop).
    n_samples = len(test_data)
    count_columns = [f'top1_class_{j}_count' for j in range(10)]
    classes_present = (test_features[count_columns] > 0).sum(axis=1).to_numpy()
    is_single_class = classes_present == 1
    is_high_conf = (test_features['top1_mean'] >= threshold_single).to_numpy()

    stage1_mask = is_single_class & is_high_conf
    stage2_mask = ~stage1_mask
    unfiltered_indices = np.flatnonzero(stage2_mask)

    all_predictions = np.empty(n_samples, dtype=object)
    all_predictions[stage1_mask] = "Normal"
    stage_used = np.where(stage1_mask, "Stage1", "Stage2").astype(object)
    filter_reason = np.where(
        stage1_mask,
        "SingleClassHighConfidence",
        np.where(is_single_class, "SingleClassLowConfidence", "MultiClass"),
    ).astype(object)

    single_class_filtered = int(stage1_mask.sum())
    multi_class_count = int((~is_single_class).sum())
    total_stage2 = int(unfiltered_indices.size)

    # ---- Stage 2: model prediction on everything Stage 1 did not accept.
    if total_stage2:
        occ_predictions = model.predict(scaled_test_features[unfiltered_indices])
        all_predictions[unfiltered_indices] = np.where(
            occ_predictions == -1, "Flagged", "Normal"
        )

    # Ground truth: any positive noise level should have been flagged.
    noise_values = noise_levels.to_numpy()
    true_labels = np.where(noise_values > 0, "Flagged", "Normal").astype(object)

    # Every slot of all_predictions is filled by now, so whole-array
    # comparisons are safe; Stage 2 subsets are selected with masks below.
    is_correct = all_predictions == true_labels
    is_flagged = all_predictions == "Flagged"
    unfiltered_accuracy = np.mean(is_correct[stage2_mask]) if total_stage2 else 0.0

    # ---- Per-noise-level metrics (always populated, even with empty Stage 2).
    noise_level_metrics = {}
    for noise_level in sorted(test_data["noise_level"].unique()):
        level_mask = noise_values == noise_level
        level_stage2_mask = level_mask & stage2_mask

        total_noise_samples = int(level_mask.sum())
        stage2_samples = int(level_stage2_mask.sum())
        if stage2_samples > 0:
            accuracy = is_correct[level_stage2_mask].mean()
            num_flagged = int(is_flagged[level_stage2_mask].sum())
        else:
            accuracy = 0
            num_flagged = 0

        filtered_samples = total_noise_samples - stage2_samples

        noise_level_metrics[noise_level] = {
            "total_samples": total_noise_samples,
            "filtered": filtered_samples,
            # Guarded: a NaN noise level matches no row and would divide by 0.
            "filtered_percentage": (
                (filtered_samples / total_noise_samples) * 100
                if total_noise_samples else 0.0
            ),
            "stage2_samples": stage2_samples,
            "accuracy": accuracy,
            "flagged": num_flagged,
            "flagged_percentage": (num_flagged / stage2_samples * 100) if stage2_samples > 0 else 0
        }

    # ---- Persist per-sample results.
    results = pd.DataFrame({
        "image_num": image_nums,
        "actual_class": actual_classes,
        "noise_level": noise_levels,
        "prediction": all_predictions,
        "true_label": true_labels,
        "decision_stage": stage_used,
        "filter_reason": filter_reason
    })
    output_dir = os.path.dirname(config['output_csv'])
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    results.to_csv(config['output_csv'], index=False)

    summary = {
        'size': size_key,
        'threshold': threshold_single,
        'total_filtered': single_class_filtered,
        'single_class_high_conf': single_class_filtered,
        'multi_class': multi_class_count,
        'single_class_low_conf': total_stage2 - multi_class_count,
        'noise_level_metrics': noise_level_metrics,
        # Accuracy over Stage 2 samples only (Stage 1 decisions are excluded).
        'overall_accuracy': unfiltered_accuracy,
        'total_stage2': total_stage2
    }

    return summary

def create_summary_tables(all_summaries,
                          base_path="/home/ctai42@tntech.edu/OCC/New_vary_train_size_results_mc15"):
    """Write a three-section ``structured_summary.csv`` for all training sizes.

    Sections:
        1. Q1 threshold and Stage 1 filtering count / percentage per size.
        2. Stage 2 sample count, accuracy and % flagged per noise level.
        3. Stage 2 accuracy and Stage 2 sample count per size.

    Args:
        all_summaries: List of summary dicts. Each must provide ``size``,
            ``threshold``, ``total_filtered``, ``total_stage2``,
            ``overall_accuracy`` and ``noise_level_metrics`` (a dict keyed by
            noise level whose values carry ``stage2_samples``, ``accuracy``
            and ``flagged_percentage``).
        base_path: Directory that receives ``structured_summary.csv``. It is
            created if missing. Defaults to the original results directory.

    Returns:
        None. The file is written and its path is printed.
    """
    rows = []

    rows.append(['Section 1: Threshold and Initial Filtering'])
    rows.append(['Size', 'Q1_Thres.', 'Filtered', 'Filtered %'])
    for summary in all_summaries:
        # Every sample is either filtered in Stage 1 or sent to Stage 2, so
        # the two counts add up to the real test-set size (previously a
        # hard-coded 1000 was used as the denominator).
        total_samples = summary['total_filtered'] + summary['total_stage2']
        filtered_pct = (
            (summary['total_filtered'] / total_samples) * 100 if total_samples else 0.0
        )
        rows.append([
            summary['size'],
            f"{summary['threshold']:.4f}",
            summary['total_filtered'],
            f"{filtered_pct:.1f}"
        ])

    rows.append([])
    rows.append([])

    rows.append(['Section 2: Performance by Noise Level (Stage 2)'])

    # Column set comes from the data so headers and values always line up;
    # fall back to the historical fixed list when no metrics are available.
    observed_levels = set()
    for summary in all_summaries:
        observed_levels.update(summary['noise_level_metrics'])
    noise_levels = sorted(observed_levels) if observed_levels else [0.0, 0.1, 0.25, 0.5, 0.75]

    # float() keeps the header text stable regardless of the key's numeric type.
    noise_headers = ['Size', 'Metric'] + [f"NL {float(level)}" for level in noise_levels]
    rows.append(noise_headers)

    for summary in all_summaries:
        metrics = summary['noise_level_metrics']

        # A level missing from this summary yields an empty cell so that the
        # remaining values stay under their own header.
        samples_row = [summary['size'], 'Samples']
        accuracy_row = ['', 'Accuracy']
        flagged_row = ['', '% Flagged']
        for noise_level in noise_levels:
            level_metrics = metrics.get(noise_level)
            if level_metrics is None:
                samples_row.append('')
                accuracy_row.append('')
                flagged_row.append('')
                continue
            samples_row.append(level_metrics['stage2_samples'])
            accuracy_row.append(f"{level_metrics['accuracy']:.4f}")
            flagged_row.append(f"{level_metrics['flagged_percentage']:.2f}")

        rows.append(samples_row)
        rows.append(accuracy_row)
        rows.append(flagged_row)
        rows.append([])

    rows.append([])

    rows.append(['Section 3: Overall Performance'])
    rows.append(['Size', 'Accuracy', 'S2 Samples'])
    for summary in all_summaries:
        rows.append([
            summary['size'],
            f"{summary['overall_accuracy']:.4f}",
            summary['total_stage2']
        ])

    os.makedirs(base_path, exist_ok=True)
    output_file = f'{base_path}/structured_summary.csv'
    # Rows have different lengths; DataFrame pads the short ones with blanks.
    pd.DataFrame(rows).to_csv(output_file, index=False, header=False)
    print(f"\nStructured summary saved to: {output_file}")
    
def main():
    """Evaluate every configured training size on the shared test set.

    Reads the test CSV once, runs inference for each entry of ``CONFIGS`` in
    its defined order, and finally writes the combined summary tables.
    """
    shared_test_frame = pd.read_csv(TEST_DATA_PATH)

    collected = []
    for label in CONFIGS:
        print(f"\nProcessing {label} dataset...")
        per_size_summary = run_inference_for_size(label, CONFIGS[label], shared_test_frame)
        collected.append(per_size_summary)

    create_summary_tables(collected)

# Run the full pipeline only when this file is executed directly, so that
# importing it does not trigger model loading or file writes.
if __name__ == "__main__":
    main()


Processing 1k dataset...

Processing 2k dataset...

Processing 3k dataset...

Processing 4k dataset...

Processing 5k dataset...

Processing 10k dataset...

Processing 15k dataset...

Processing 20k dataset...

Structured summary saved to: /home/ctai42@tntech.edu/OCC/New_vary_train_size_results_mc15/structured_summary.csv
