In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
import matplotlib.pyplot as plt
import os
import cv2

In [3]:
def load_memory_dumps(dump_dir, target_size=(64, 64)):
    """
    Load memory dump images from directory structure and resize to fixed dimensions.
    
    Args:
        dump_dir: Directory containing class subdirectories
        target_size: Tuple of (height, width) for resizing images
    """
    if not os.path.exists(dump_dir):
        raise FileNotFoundError(f"Memory dump directory not found: {dump_dir}")
    
    # Get all subdirectories (classes)
    class_dirs = [d for d in os.listdir(dump_dir) if os.path.isdir(os.path.join(dump_dir, d))]
    
    if not class_dirs:
        raise ValueError(f"No class directories found in {dump_dir}")
    
    # Create label mapping
    label_to_idx = {label: idx for idx, label in enumerate(sorted(class_dirs))}
    print(f"Found classes: {label_to_idx}")
    
    # Lists to store images and labels
    images = []
    labels = []
    
    # Process each class directory
    for class_dir in class_dirs:
        class_path = os.path.join(dump_dir, class_dir)
        label = label_to_idx[class_dir]
        
        # Get all image files in the directory
        image_files = [f for f in os.listdir(class_path) if f.endswith(('.png'))]
        
        print(f"Processing {len(image_files)} images from class {class_dir}")
        
        for img_file in image_files:
            img_path = os.path.join(class_path, img_file)
            try:
                # Read raw binary data
                data = np.fromfile(img_path, dtype=np.uint8)
                
                # Convert to square image
                width = int(np.sqrt(len(data)))
                if width * width == len(data):
                    img = data.reshape(width, width)
                else:
                    # Pad to nearest square
                    square_size = int(np.ceil(np.sqrt(len(data))))
                    img = np.zeros((square_size, square_size), dtype=np.uint8)
                    img.flat[:len(data)] = data
                
                # Resize to target size using cv2
                img_resized = cv2.resize(img, target_size, interpolation=cv2.INTER_AREA)
                
                # Flatten the resized image
                img_flat = img_resized.ravel()
                
                images.append(img_flat)
                labels.append(label)
                
            except Exception as e:
                print(f"Error processing {img_path}: {str(e)}")
                continue
    
    if not images:
        raise ValueError("No valid images found in any class directory")
    
    # Convert to numpy arrays
    X = np.array(images)
    y = np.array(labels)
    
    print(f"Loaded {len(X)} total images across {len(class_dirs)} classes")
    print(f"Feature vector shape: {X.shape}")
    
    return X, y, label_to_idx

In [5]:
def apply_pca(X, variance_threshold=0.9):
    """
    Apply PCA while preserving specified variance threshold.
    
    Args:
        X: Input features (n_samples, n_features)
        variance_threshold: Minimum variance to preserve (default: 0.9 for 90%).
    
    Returns:
        tuple: (transformed data, fitted PCA object, scaler object)
    """
    # Standardize the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Initialize PCA without specifying number of components
    pca = PCA()
    pca.fit(X_scaled)
    
    # Calculate cumulative variance ratio
    cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
    
    # Find number of components that preserve desired variance
    n_components = np.argmax(cumulative_variance_ratio >= variance_threshold) + 1
    
    # Fit PCA with selected number of components
    pca = PCA(n_components=n_components)
    X_transformed = pca.fit_transform(X_scaled)
    
    return X_transformed, pca, scaler

In [7]:
def plot_variance_explained(pca, output_path):
    """
    Plot cumulative variance explained by principal components.
    """
    plt.figure(figsize=(10, 6))
    
    # Cumulative variance plot
    cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
    plt.plot(range(1, len(cumulative_variance_ratio) + 1), 
            cumulative_variance_ratio, 
            'bo-', label='Cumulative Explained Variance')
    
    # 90% threshold line
    plt.axhline(y=0.9, color='r', linestyle='--', label='90% Variance Threshold')
    
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance Ratio')
    plt.title('PCA Explained Variance')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    
    plt.savefig(output_path)
    plt.close()

In [9]:
def main():
    # Specify paths
    dump_dir = r"C:\Users\bacaup\AEF\TRAINING"  # Replace with your memory dumps directory
    output_dir = "pca_results"
    os.makedirs(output_dir, exist_ok=True)
    
    # Load memory dumps with fixed size
    print("Loading memory dumps...")
    X, y, label_mapping = load_memory_dumps(dump_dir, target_size=(64, 64))
    
    # Apply PCA
    print("\nApplying PCA...")
    X_transformed, pca, scaler = apply_pca(X, variance_threshold=0.9)
    
    print(f"\nOriginal feature dimension: {X.shape[1]}")
    print(f"Reduced feature dimension: {X_transformed.shape[1]}")
    print(f"Variance preserved: {sum(pca.explained_variance_ratio_):.4f}")
    
    # Plot variance explained
    plot_path = os.path.join(output_dir, "pca_variance_explained.png")
    plot_variance_explained(pca, plot_path)
    print(f"\nVariance plot saved to: {plot_path}")
    
    # Decode the labels back to the original class names
    label_encoder = LabelEncoder()
    label_encoder.fit(list(label_mapping.keys()))
    decoded_labels = label_encoder.inverse_transform(y)
    
    # Save transformed data with decoded labels
    output_data = pd.DataFrame(X_transformed)
    output_data.columns = [f'PC_{i+1}' for i in range(X_transformed.shape[1])]
    output_data['label'] = decoded_labels  # Add decoded labels back
    
    output_path = os.path.join(output_dir, "pca_transformed_data.csv")
    output_data.to_csv(output_path, index=False)
    print(f"Transformed data saved to: {output_path}")
    
    # Save PCA components and explained variance
    components_df = pd.DataFrame(
        pca.components_,
        columns=[f'feature_{i}' for i in range(X.shape[1])],
        index=[f'PC_{i+1}' for i in range(X_transformed.shape[1])]
    )
    components_path = os.path.join(output_dir, "pca_components.csv")
    components_df.to_csv(components_path)
    
    # Save explained variance ratios
    variance_df = pd.DataFrame({
        'component': [f'PC_{i+1}' for i in range(len(pca.explained_variance_ratio_))],
        'explained_variance_ratio': pca.explained_variance_ratio_,
        'cumulative_variance_ratio': np.cumsum(pca.explained_variance_ratio_)
    })
    variance_path = os.path.join(output_dir, "explained_variance.csv")
    variance_df.to_csv(variance_path, index=False)
    
    print(f"\nComponent details saved to: {components_path}")
    print(f"Variance details saved to: {variance_path}")

In [11]:
if __name__ == "__main__":
    main()

Loading memory dumps...
Found classes: {'.ipynb_checkpoints': 0, 'Adposhel': 1, 'Allaple': 2, 'Amonetize': 3, 'AutoRun': 4, 'BrowseFox': 5, 'Dinwod': 6, 'InstallCore': 7, 'MultiPlug': 8, 'Other': 9, 'VBA': 10, 'Vilsel': 11}
Processing 0 images from class .ipynb_checkpoints
Processing 364 images from class Adposhel
Processing 349 images from class Allaple
Processing 349 images from class Amonetize
Processing 158 images from class AutoRun
Processing 152 images from class BrowseFox
Processing 98 images from class Dinwod
Processing 376 images from class InstallCore
Processing 390 images from class MultiPlug
Processing 487 images from class Other
Processing 399 images from class VBA
Processing 311 images from class Vilsel
Loaded 3433 total images across 12 classes
Feature vector shape: (3433, 4096)

Applying PCA...

Original feature dimension: 4096
Reduced feature dimension: 1793
Variance preserved: 0.8973

Variance plot saved to: pca_results\pca_variance_explained.png
Transformed data save