In [8]:
# Imports
import tensorflow as tf
from tensorflow.keras import layers, models
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from tqdm import tqdm
import time

In [9]:
# Fetch MNIST through the Keras dataset helper; yields (train, test) pairs of
# image arrays and integer label arrays.
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Scale pixel intensities from [0, 255] down to [0, 1].
x_train = x_train / 255.0
x_test = x_test / 255.0

# Append a single channel axis so every image is 28x28x1, the input shape the
# LeNet model below is built with; -1 lets NumPy infer the number of images.
x_train = x_train.reshape(-1, 28, 28, 1)
x_test = x_test.reshape(-1, 28, 28, 1)

In [10]:
# Number of stochastic forward passes run per image; this can be modified.
NUM_MC_SAMPLES = 15

# The evaluation images are a fixed slice of x_train (see create_test_data),
# kept identical across runs for fair comparison.
TEST_START_IDX = 2000
TEST_END_IDX = 3000
# Derived from the slice bounds so the count can never drift from the range.
NUM_TEST_IMAGES = TEST_END_IDX - TEST_START_IDX

# Training data configs: each entry lists half-open (start, end) index ranges
# into x_train and the total number of images those ranges contain.
TRAIN_CONFIGS = {
    '1k': {'NUM_IMAGES': 1000, 'INDICES': [(0, 1000)]},
    '2k': {'NUM_IMAGES': 2000, 'INDICES': [(0, 2000)]},
    '3k': {'NUM_IMAGES': 3000, 'INDICES': [(0, 2000), (3000, 4000)]},
    '4k': {'NUM_IMAGES': 4000, 'INDICES': [(0, 2000), (3000, 5000)]},
    '5k': {'NUM_IMAGES': 5000, 'INDICES': [(0, 2000), (3000, 6000)]},
    '10k': {'NUM_IMAGES': 10000, 'INDICES': [(0, 2000), (3000, 11000)]},
    '15k': {'NUM_IMAGES': 15000, 'INDICES': [(0, 2000), (3000, 16000)]},
    '20k': {'NUM_IMAGES': 20000, 'INDICES': [(0, 2000), (3000, 21000)]}
}

# Sanity checks on the table above. Generator expressions are used (rather
# than for-loops) so no loop variables leak into the notebook namespace.
assert all(
    sum(end - start for start, end in cfg['INDICES']) == cfg['NUM_IMAGES']
    for cfg in TRAIN_CONFIGS.values()
), "NUM_IMAGES must equal the total length of the INDICES ranges"
assert all(
    end <= TEST_START_IDX or start >= TEST_END_IDX
    for cfg in TRAIN_CONFIGS.values()
    for start, end in cfg['INDICES']
), "training index ranges must not overlap the held-out test slice"

# Dropout rates and noise levels
dropout_rates = [0.5]
noise_levels = [0.0, 0.1, 0.25, 0.5, 0.75]

In [11]:
# Model and helper functions
def create_lenet_with_dropout(dropout_rate=0.5):
    """Assemble an (uncompiled) LeNet-style Keras classifier with dropout.

    The network takes 28x28x1 inputs and stacks two tanh convolution /
    average-pooling stages, then two tanh dense layers (120 and 84 units),
    each followed by a Dropout layer, and a 10-way softmax output.

    Args:
        dropout_rate: Rate passed to both Dropout layers.

    Returns:
        The constructed `Sequential` model.
    """
    layer_stack = [
        # Feature extractor: 'same' padding keeps the first conv output 28x28.
        layers.Conv2D(6, kernel_size=(5, 5), activation='tanh', input_shape=(28, 28, 1), padding='same'),
        layers.AveragePooling2D(pool_size=(2, 2)),
        layers.Conv2D(16, kernel_size=(5, 5), activation='tanh'),
        layers.AveragePooling2D(pool_size=(2, 2)),
        # Classifier head with dropout after each hidden dense layer.
        layers.Flatten(),
        layers.Dense(120, activation='tanh'),
        layers.Dropout(dropout_rate),
        layers.Dense(84, activation='tanh'),
        layers.Dropout(dropout_rate),
        layers.Dense(10, activation='softmax'),
    ]
    return models.Sequential(layer_stack)

def add_noise(image, noise_level=0.1):
    """Return `image` plus zero-mean Gaussian noise, clipped to [0, 1].

    Args:
        image: NumPy array of pixel values; it is not modified.
        noise_level: Standard deviation of the Gaussian noise.

    Returns:
        A new array with the same shape as `image`.
    """
    # One draw from NumPy's global RNG, shaped like the input.
    perturbation = np.random.normal(0.0, noise_level, image.shape)
    return np.clip(image + perturbation, 0.0, 1.0)

In [12]:
# Processing Functions
def process_mc_results(result_row, all_top5_classes, all_top5_probs):
    """Summarise the top-5 predictions of several MC passes into `result_row`.

    For each rank 1..5 this writes four keys into `result_row` (in place):
    `Unique_Top{r}` (number of distinct classes seen at that rank),
    `Top{r}_Probs` (one "Class c: [p, ...]" line per distinct class, joined
    with newlines), `Top{r}_Mean` and `Top{r}_Std` (mean / standard deviation
    of the probabilities at that rank).

    Args:
        result_row: Dict that receives the summary entries.
        all_top5_classes: One sequence of 5 class ids per MC pass.
        all_top5_probs: One sequence of 5 probabilities per MC pass, aligned
            with `all_top5_classes`.

    Returns:
        None; the dict is updated in place.
    """
    class_matrix = np.array(all_top5_classes)
    prob_matrix = np.array(all_top5_probs)

    for position in range(1, 6):
        column = position - 1
        classes_at_rank = class_matrix[:, column]
        probs_at_rank = prob_matrix[:, column]

        # np.unique returns the distinct class ids in ascending order.
        distinct = np.unique(classes_at_rank)
        grouped_text = '\n'.join(
            f"Class {cls}: {probs_at_rank[classes_at_rank == cls].tolist()}"
            for cls in distinct
        )

        result_row[f'Unique_Top{position}'] = len(distinct)
        result_row[f'Top{position}_Probs'] = grouped_text
        result_row[f'Top{position}_Mean'] = probs_at_rank.mean()
        result_row[f'Top{position}_Std'] = probs_at_rank.std()

def create_test_data():
    """Generate the noisy MC-dropout evaluation table and save it as CSV.

    For every rate in `dropout_rates` the saved model
    `lenet_dropout_{rate*100}.h5` is loaded. Each of the `NUM_TEST_IMAGES`
    images starting at `x_train[TEST_START_IDX]` is assigned one noise level
    (levels are balanced, then shuffled with NumPy's global RNG), and the
    model is called `NUM_MC_SAMPLES` times on freshly noised copies of the
    image with `training=True` (presumably so the Dropout layers stay active).
    The top-5 classes/probabilities of every pass are summarised by
    `process_mc_results`.

    Side effects:
        * writes `new_test_mc_{NUM_MC_SAMPLES}.csv`
        * appends a timing row to `dataset_generation_times.csv` (a header is
          written first if the file is empty)
        * prints the noise-level distribution and timing information

    Returns:
        pandas.DataFrame with one row per (dropout rate, image).
    """
    start_time = time.time()  # Start timing (time analysis to record how long it takes to process MC samples)
    testing_results = []

    for dropout_rate in dropout_rates:
        model_name = f'lenet_dropout_{int(dropout_rate*100)}.h5'
        model = tf.keras.models.load_model(model_name)

        # Create balanced noise level assignments. When NUM_TEST_IMAGES is not
        # a multiple of the number of levels, the first `leftover` levels get
        # one extra image each, so the list always has exactly
        # NUM_TEST_IMAGES entries (plain floor division left it too short and
        # the lookup in the loop below raised IndexError).
        images_per_noise, leftover = divmod(NUM_TEST_IMAGES, len(noise_levels))
        noise_assignments = []
        for level_pos, noise_level in enumerate(noise_levels):
            share = images_per_noise + (1 if level_pos < leftover else 0)
            noise_assignments.extend([noise_level] * share)
        np.random.shuffle(noise_assignments)

        # Process test images
        for image_index in tqdm(range(NUM_TEST_IMAGES), desc=f"Processing test data DR={dropout_rate}"):
            actual_index = TEST_START_IDX + image_index
            image = x_train[actual_index].reshape(1, 28, 28, 1)
            actual_class = y_train[actual_index]
            assigned_noise = noise_assignments[image_index]

            result_row = {
                'Image_Num': actual_index,
                'Actual_Class': actual_class,
                'Dropout_Rate': dropout_rate,
                'Noise_Level': assigned_noise
            }

            all_top5_classes = []
            all_top5_probs = []

            for _ in range(NUM_MC_SAMPLES):
                # A new noise realisation is drawn for every MC pass.
                noisy_image = add_noise(image, noise_level=assigned_noise)
                pred = model(noisy_image, training=True).numpy()[0]
                # Indices of the five largest probabilities, highest first.
                top5_indices = np.argsort(pred)[-5:][::-1]
                top5_probs = pred[top5_indices]
                all_top5_classes.append(top5_indices)
                all_top5_probs.append(top5_probs)

            process_mc_results(result_row, all_top5_classes, all_top5_probs)
            testing_results.append(result_row.copy())

    # Create DataFrame
    test_df = pd.DataFrame(testing_results)

    # Define column order
    column_order = [
        'Image_Num', 'Actual_Class', 'Dropout_Rate', 'Noise_Level',
        'Unique_Top1', 'Unique_Top2', 'Unique_Top3', 'Unique_Top4', 'Unique_Top5',
        'Top1_Probs', 'Top2_Probs', 'Top3_Probs', 'Top4_Probs', 'Top5_Probs',
        'Top1_Mean', 'Top1_Std', 'Top2_Mean', 'Top2_Std',
        'Top3_Mean', 'Top3_Std', 'Top4_Mean', 'Top4_Std',
        'Top5_Mean', 'Top5_Std'
    ]

    test_df = test_df[column_order]
    test_df.to_csv(f'new_test_mc_{NUM_MC_SAMPLES}.csv', index=False)

    end_time = time.time()  # End timing
    elapsed_time = end_time - start_time

    print("\nTest data noise level distribution:")
    print(test_df['Noise_Level'].value_counts())
    # NOTE: the per-image average divides by NUM_TEST_IMAGES, i.e. it is the
    # time per image summed over all dropout rates.
    print(f"\nTest dataset generation with MC {NUM_MC_SAMPLES} took {elapsed_time:.2f} seconds")
    print(f"Average time per image: {elapsed_time/NUM_TEST_IMAGES:.3f} seconds")

    # Save timing info. Opening in append mode creates the file if needed, so
    # the size check below is safe on the first run.
    with open('dataset_generation_times.csv', 'a') as f:
        if os.path.getsize('dataset_generation_times.csv') == 0:
            # Write header if file is empty
            f.write("type,mc_samples,total_images,total_time,avg_time_per_image\n")
        f.write(f"test,{NUM_MC_SAMPLES},{NUM_TEST_IMAGES},{elapsed_time:.2f},{elapsed_time/NUM_TEST_IMAGES:.3f}\n")

    return test_df

def create_train_data(size_key):
    """Generate the noise-free MC-dropout table for one training-set size.

    Looks up `TRAIN_CONFIGS[size_key]`, and for every rate in `dropout_rates`
    loads `lenet_dropout_{rate*100}.h5` and calls it `NUM_MC_SAMPLES` times
    (with `training=True`) on each `x_train` image inside the configured index
    ranges. Per-image top-5 statistics come from `process_mc_results`.

    Side effects:
        * writes `new_train_{size_key}_mc_{NUM_MC_SAMPLES}.csv`
        * appends a timing row to `dataset_generation_times.csv` (preceded by
          a header if the file is empty)
        * prints timing information

    Args:
        size_key: Key into `TRAIN_CONFIGS`, e.g. '2k'.

    Returns:
        pandas.DataFrame with one row per (dropout rate, image).
    """
    t_begin = time.time()
    rows = []
    cfg = TRAIN_CONFIGS[size_key]

    for dropout_rate in dropout_rates:
        net = tf.keras.models.load_model(f'lenet_dropout_{int(dropout_rate*100)}.h5')

        # Walk every configured (start, end) slice of x_train.
        for start_idx, end_idx in cfg['INDICES']:
            progress = tqdm(range(start_idx, end_idx),
                            desc=f"Processing train data {start_idx}-{end_idx} DR={dropout_rate}")
            for actual_index in progress:
                image = x_train[actual_index].reshape(1, 28, 28, 1)

                row = {
                    'Image_Num': actual_index,
                    'Actual_Class': y_train[actual_index],
                    'Dropout_Rate': dropout_rate,
                    'Noise_Level': 0.0  # No noise for training data
                }

                sampled_classes = []
                sampled_probs = []
                for _ in range(NUM_MC_SAMPLES):
                    pred = net(image, training=True).numpy()[0]
                    # Five most probable classes, most probable first.
                    best_first = np.argsort(pred)[::-1][:5]
                    sampled_classes.append(best_first)
                    sampled_probs.append(pred[best_first])

                process_mc_results(row, sampled_classes, sampled_probs)
                rows.append(dict(row))

    # Fixed column layout: metadata, unique counts, grouped probability
    # strings, then mean/std pairs per rank.
    ranks = range(1, 6)
    columns = ['Image_Num', 'Actual_Class', 'Dropout_Rate', 'Noise_Level']
    columns += [f'Unique_Top{r}' for r in ranks]
    columns += [f'Top{r}_Probs' for r in ranks]
    for r in ranks:
        columns += [f'Top{r}_Mean', f'Top{r}_Std']

    train_df = pd.DataFrame(rows)[columns]
    train_df.to_csv(f'new_train_{size_key}_mc_{NUM_MC_SAMPLES}.csv', index=False)

    elapsed = time.time() - t_begin
    per_image = elapsed / cfg['NUM_IMAGES']

    print(f"\nTraining dataset generation for {size_key} with MC {NUM_MC_SAMPLES} took {elapsed:.2f} seconds")
    print(f"Average time per image: {per_image:.3f} seconds")

    # Append the timing record; emit the header only when the log is empty.
    timing_log = 'dataset_generation_times.csv'
    with open(timing_log, 'a') as log:
        if os.path.getsize(timing_log) == 0:
            log.write("type,mc_samples,total_images,total_time,avg_time_per_image\n")
        log.write(f"train_{size_key},{NUM_MC_SAMPLES},{cfg['NUM_IMAGES']},{elapsed:.2f},{per_image:.3f}\n")

    return train_df

In [13]:
# Process OCC data
def _count_classes_per_rank(probs_text):
    """Turn a 'Class c: [p, ...]' multi-line string into per-class counts.

    Returns a list of 10 ints where entry `c` is the number of probabilities
    listed for class `c` (0 for classes that do not appear).
    """
    import ast  # local import: only needed for safe literal parsing

    counts = [0] * 10
    for entry in probs_text.split('\n'):
        label, _, payload = entry.partition(':')
        class_num = int(label.split()[-1])
        # literal_eval only accepts Python literals, so text read back from a
        # CSV can never execute code (the previous eval() call could).
        counts[class_num] = len(ast.literal_eval(payload.strip()))
    return counts


def process_occ_data(df):
    """Convert MC-analysis rows into flat records with per-class counts.

    For every row of `df` the `Top1_Probs` and `Top2_Probs` strings (lines of
    the form "Class c: [p, ...]") are parsed to count how many MC passes put
    each class 0-9 at rank 1 and rank 2.

    Args:
        df: DataFrame with columns `Image_Num`, `Actual_Class`,
            `Noise_Level`, `Top1_Probs`, `Top2_Probs`, `Top1_Mean`,
            `Top1_Std`, `Top2_Mean` and `Top2_Std`.

    Returns:
        List of dicts, one per row, holding the copied metadata/statistics
        plus `top1_class_{i}_count` and `top2_class_{i}_count` for i in 0..9.
        An empty DataFrame yields an empty list.

    Raises:
        ValueError / SyntaxError: if a probability list is not a plain
            Python literal.
    """
    results = []

    for _, row in df.iterrows():
        top1_counts = _count_classes_per_rank(row['Top1_Probs'])
        top2_counts = _count_classes_per_rank(row['Top2_Probs'])

        # Create result dictionary
        result = {
            'image_num': row['Image_Num'],
            'actual_class': row['Actual_Class'],
            'noise_level': row['Noise_Level'],
            'top1_mean': row['Top1_Mean'],
            'top1_std': row['Top1_Std'],
            'top2_mean': row['Top2_Mean'],
            'top2_std': row['Top2_Std']
        }

        # Add class counts
        for i in range(10):
            result[f'top1_class_{i}_count'] = int(top1_counts[i])
            result[f'top2_class_{i}_count'] = int(top2_counts[i])

        results.append(result)

    return results

def create_occ_dataset(size_key):
    """Build the OCC train/test tables from previously saved MC results.

    Reads `new_train_{size_key}_mc_{NUM_MC_SAMPLES}.csv` and
    `new_test_mc_{NUM_MC_SAMPLES}.csv`, converts both with
    `process_occ_data`, prints their noise-level distributions, and writes
    `new_occ_training_data_{size_key}_mc_{NUM_MC_SAMPLES}.csv` and
    `new_occ_testing_data_mc_{NUM_MC_SAMPLES}.csv`.

    Args:
        size_key: Training-set size label used in the file names, e.g. '2k'.

    Returns:
        Tuple `(train_occ_df, test_occ_df)` of the two DataFrames written.
    """
    # Load both MC result files before doing any processing.
    mc_train = pd.read_csv(f'new_train_{size_key}_mc_{NUM_MC_SAMPLES}.csv')
    mc_test = pd.read_csv(f'new_test_mc_{NUM_MC_SAMPLES}.csv')

    train_records = process_occ_data(mc_train)
    test_records = process_occ_data(mc_test)

    # Shared column layout: metadata and statistics first, then all top-1
    # class counts, then all top-2 class counts.
    ordered_cols = ['image_num', 'actual_class', 'noise_level',
                    'top1_mean', 'top1_std', 'top2_mean', 'top2_std']
    ordered_cols += [f'top1_class_{i}_count' for i in range(10)]
    ordered_cols += [f'top2_class_{i}_count' for i in range(10)]

    train_occ_df = pd.DataFrame(train_records)[ordered_cols]
    test_occ_df = pd.DataFrame(test_records)[ordered_cols]

    # Print the noise-level make-up of both tables as a quick sanity check.
    print(f"\nTraining data ({size_key}) noise level distribution:")
    print(train_occ_df['noise_level'].value_counts())
    print("\nTesting data noise level distribution:")
    print(test_occ_df['noise_level'].value_counts())

    # Persist both tables.
    train_occ_df.to_csv(f'new_occ_training_data_{size_key}_mc_{NUM_MC_SAMPLES}.csv', index=False)
    test_occ_df.to_csv(f'new_occ_testing_data_mc_{NUM_MC_SAMPLES}.csv', index=False)

    return train_occ_df, test_occ_df

In [14]:
# Driver cell: generate the shared test table once, then build the MC table
# and OCC datasets for each selected training-set size.
if __name__ == "__main__":
    
    # Writes new_test_mc_{NUM_MC_SAMPLES}.csv, which create_occ_dataset reads
    # back on every iteration below.
    create_test_data()
    
    # Create training data for each size and corresponding OCC datasets
    # (only a subset of the keys defined in TRAIN_CONFIGS is processed here;
    # '1k', '3k' and '5k' are skipped).
    for size in ['2k', '4k', '10k', '15k', '20k']:  
        print(f"\nProcessing {size} dataset...")
        # The returned frame is not used here; create_occ_dataset reloads the
        # CSV that create_train_data just wrote.
        train_df = create_train_data(size)
        create_occ_dataset(size)




Processing 2k dataset...


Processing train data 0-2000 DR=0.5: 100%|██████████████████████████████████████████| 2000/2000 [02:04<00:00, 16.08it/s]



Training dataset generation for 2k with MC 15 took 124.50 seconds
Average time per image: 0.062 seconds





Training data (2k) noise level distribution:
noise_level
0.0    2000
Name: count, dtype: int64

Testing data noise level distribution:
noise_level
0.25    200
0.10    200
0.00    200
0.50    200
0.75    200
Name: count, dtype: int64

Processing 4k dataset...


Processing train data 0-2000 DR=0.5: 100%|██████████████████████████████████████████| 2000/2000 [02:04<00:00, 16.07it/s]
Processing train data 3000-5000 DR=0.5: 100%|███████████████████████████████████████| 2000/2000 [02:04<00:00, 16.06it/s]



Training dataset generation for 4k with MC 15 took 249.10 seconds
Average time per image: 0.062 seconds





Training data (4k) noise level distribution:
noise_level
0.0    4000
Name: count, dtype: int64

Testing data noise level distribution:
noise_level
0.25    200
0.10    200
0.00    200
0.50    200
0.75    200
Name: count, dtype: int64

Processing 10k dataset...


Processing train data 0-2000 DR=0.5: 100%|██████████████████████████████████████████| 2000/2000 [02:04<00:00, 16.08it/s]
Processing train data 3000-11000 DR=0.5: 100%|██████████████████████████████████████| 8000/8000 [08:18<00:00, 16.06it/s]



Training dataset generation for 10k with MC 15 took 622.65 seconds
Average time per image: 0.062 seconds





Training data (10k) noise level distribution:
noise_level
0.0    10000
Name: count, dtype: int64

Testing data noise level distribution:
noise_level
0.25    200
0.10    200
0.00    200
0.50    200
0.75    200
Name: count, dtype: int64

Processing 15k dataset...


Processing train data 0-2000 DR=0.5: 100%|██████████████████████████████████████████| 2000/2000 [02:04<00:00, 16.09it/s]
Processing train data 3000-16000 DR=0.5: 100%|████████████████████████████████████| 13000/13000 [13:25<00:00, 16.14it/s]



Training dataset generation for 15k with MC 15 took 930.12 seconds
Average time per image: 0.062 seconds





Training data (15k) noise level distribution:
noise_level
0.0    15000
Name: count, dtype: int64

Testing data noise level distribution:
noise_level
0.25    200
0.10    200
0.00    200
0.50    200
0.75    200
Name: count, dtype: int64

Processing 20k dataset...


Processing train data 0-2000 DR=0.5: 100%|██████████████████████████████████████████| 2000/2000 [02:03<00:00, 16.21it/s]
Processing train data 3000-21000 DR=0.5: 100%|████████████████████████████████████| 18000/18000 [18:35<00:00, 16.13it/s]



Training dataset generation for 20k with MC 15 took 1239.82 seconds
Average time per image: 0.062 seconds

Training data (20k) noise level distribution:
noise_level
0.0    20000
Name: count, dtype: int64

Testing data noise level distribution:
noise_level
0.25    200
0.10    200
0.00    200
0.50    200
0.75    200
Name: count, dtype: int64
