In [5]:
! pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading joblib-1.4.2-py3-none-any.whl (301 kB)
Downloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.6.1 threadpoolctl-3.5.0


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.decomposition import PCA
import os

# Set plot style
plt.style.use('ggplot')
sns.set(font_scale=1.2)

def load_and_process_data(benign_file, malware_file):
    """Load and process benign and malware sample data"""
    print(f"Loading and processing data from {benign_file} and {malware_file}...")
    
    benign_df = pd.read_csv(benign_file)
    malware_df = pd.read_csv(malware_file)
    
    # Create label column, 0 for benign, 1 for malware
    benign_df['is_malware'] = 0
    malware_df['is_malware'] = 1
    
    # Combine datasets
    combined_df = pd.concat([benign_df, malware_df])
    
    # Select features (exclude non-numerical and zero columns)
    exclude_cols = ['file_name', 'CPU', 'label', 'family', 'is_malware']
    feature_cols = [col for col in combined_df.columns if col not in exclude_cols]
    
    # Remove zero-value features
    non_zero_features = []
    for col in feature_cols:
        if combined_df[col].sum() > 0:
            non_zero_features.append(col)
    
    # Print information
    print(f"Total samples: {len(combined_df)}")
    print(f"Benign samples: {len(benign_df)}")
    print(f"Malware samples: {len(malware_df)}")
    print(f"CPU types: {', '.join(combined_df['CPU'].unique())}")
    print(f"Number of non-zero features: {len(non_zero_features)}")
    
    return combined_df, non_zero_features

def evaluate_classification_performance(df, feature_cols):
    """Evaluate how well the features can classify malware"""
    # Prepare data
    X = df[feature_cols].fillna(0)
    y = df['is_malware']
    
    # Scale features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.3, random_state=42, stratify=y
    )
    
    # Train Random Forest
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)
    
    # Evaluate on test set
    y_pred = rf.predict(X_test)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Cross-validation
    cv_scores = cross_val_score(rf, X_scaled, y, cv=5, scoring='f1')
    
    # Print results
    print("\nClassification Performance:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"5-Fold CV F1: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")
    
    # Plot confusion matrix
    os.makedirs("./plots", exist_ok=True)
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Benign', 'Malware'],
                yticklabels=['Benign', 'Malware'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.tight_layout()
    plt.savefig('./plots/confusion_matrix.png', dpi=300)
    plt.close()
    
    # Get feature importance
    feature_importance = pd.DataFrame({
        'Feature': feature_cols,
        'Importance': rf.feature_importances_
    }).sort_values('Importance', ascending=False)
    
    # Plot top 15 important features
    plt.figure(figsize=(12, 8))
    top_features = feature_importance.head(15)
    sns.barplot(x='Importance', y='Feature', data=top_features)
    plt.title('Top 15 Features for Malware Classification')
    plt.tight_layout()
    plt.savefig('./plots/top_features.png', dpi=300)
    plt.close()
    
    return feature_importance, rf

def evaluate_by_cpu(df, feature_cols):
    """Evaluate classification performance for each CPU type"""
    print("\nClassification Performance by CPU Type:")
    
    results = {}
    cpu_types = df['CPU'].unique()
    
    for cpu in cpu_types:
        # Skip null values
        if pd.isna(cpu):
            continue
            
        # Get subset for this CPU
        cpu_df = df[df['CPU'] == cpu]
        
        # Check if we have enough samples
        benign_count = len(cpu_df[cpu_df['is_malware'] == 0])
        malware_count = len(cpu_df[cpu_df['is_malware'] == 1])
        
        if benign_count < 10 or malware_count < 10:
            print(f"CPU {cpu}: Insufficient samples (Benign: {benign_count}, Malware: {malware_count})")
            continue
            
        # Prepare data
        X = cpu_df[feature_cols].fillna(0)
        y = cpu_df['is_malware']
        
        # Scale features
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        
        # Cross-validation
        rf = RandomForestClassifier(n_estimators=100, random_state=42)
        cv_scores = cross_val_score(rf, X_scaled, y, cv=5, scoring='f1')
        
        print(f"CPU {cpu}: F1 Score = {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")
        print(f"  - Benign samples: {benign_count}")
        print(f"  - Malware samples: {malware_count}")
        
        results[cpu] = {
            'f1_mean': cv_scores.mean(),
            'f1_std': cv_scores.std(),
            'benign_count': benign_count,
            'malware_count': malware_count
        }
        
    return results

def visualize_pca_explained_variance(df, feature_cols):
    """Visualize PCA explained variance to see how many features are needed"""
    # Prepare data
    X = df[feature_cols].fillna(0)
    
    # Scale features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # PCA with all components
    pca = PCA()
    pca.fit(X_scaled)
    
    # Calculate cumulative explained variance
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
    
    # Plot
    plt.figure(figsize=(10, 6))
    plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o')
    plt.axhline(y=0.8, color='r', linestyle='--', label='80% Explained Variance')
    plt.axhline(y=0.9, color='g', linestyle='--', label='90% Explained Variance')
    
    # Find number of components for thresholds
    n_components_80 = np.argmax(cumulative_variance >= 0.8) + 1
    n_components_90 = np.argmax(cumulative_variance >= 0.9) + 1
    
    plt.axvline(x=n_components_80, color='r', linestyle=':', 
                label=f'{n_components_80} components for 80%')
    plt.axvline(x=n_components_90, color='g', linestyle=':', 
                label=f'{n_components_90} components for 90%')
    
    plt.grid(True)
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance')
    plt.title('PCA Explained Variance')
    plt.legend()
    plt.tight_layout()
    plt.savefig('./plots/pca_explained_variance.png', dpi=300)
    plt.close()
    
    print(f"\nPCA Analysis:")
    print(f"Number of components needed for 80% variance: {n_components_80}")
    print(f"Number of components needed for 90% variance: {n_components_90}")
    print(f"Total number of features: {len(feature_cols)}")
    
    return n_components_80, n_components_90

def main():
    # File paths
    benign_file = '/home/tommy/cross-architecture/Experiment2/csv/20250228_cleanedtrain_benign_file_features.csv'
    malware_file = '/home/tommy/cross-architecture/Experiment2/csv/20250228_cleanedtrain_malware_file_features.csv'
    
    # Load and process data
    df, feature_cols = load_and_process_data(benign_file, malware_file)
    
    # Evaluate classification performance
    feature_importance, model = evaluate_classification_performance(df, feature_cols)
    
    # Save top features
    os.makedirs("./results", exist_ok=True)
    feature_importance.to_csv('./results/feature_importance.csv', index=False)
    
    # Evaluate by CPU
    cpu_results = evaluate_by_cpu(df, feature_cols)
    
    # PCA analysis
    visualize_pca_explained_variance(df, feature_cols)
    
    print("\nConclusion:")
    print("Based on the classification metrics and feature importance analysis,")
    top_features = ", ".join(feature_importance.head(5)['Feature'].tolist())
    print(f"the most important features for malware detection are: {top_features}")
    print("See the full analysis results in the 'results' and 'plots' directories.")

if __name__ == "__main__":
    main()

Loading and processing data from /home/tommy/cross-architecture/Experiment2/csv/20250228_cleanedtrain_benign_file_features.csv and /home/tommy/cross-architecture/Experiment2/csv/20250228_cleanedtrain_malware_file_features.csv...
Total samples: 5286
Benign samples: 2298
Malware samples: 2988
CPU types: ARM, MIPS R3000, Intel 80386, Advanced Micro Devices X86-64
Number of non-zero features: 33

Classification Performance:
Accuracy: 0.9710
Precision: 0.9896
Recall: 0.9588
F1 Score: 0.9740
5-Fold CV F1: 0.9776 (±0.0034)

Classification Performance by CPU Type:
CPU ARM: F1 Score = 0.9761 (±0.0065)
  - Benign samples: 627
  - Malware samples: 779
CPU MIPS R3000: F1 Score = 0.9718 (±0.0101)
  - Benign samples: 672
  - Malware samples: 740
CPU Intel 80386: F1 Score = 0.9943 (±0.0057)
  - Benign samples: 795
  - Malware samples: 791
CPU Advanced Micro Devices X86-64: F1 Score = 0.9756 (±0.0040)
  - Benign samples: 204
  - Malware samples: 678

PCA Analysis:
Number of components needed for 80% v

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import os

# Set plotting style
plt.style.use('ggplot')
sns.set(font_scale=1.2)

def load_data(benign_file, malware_file):
    """Load benign and malware data"""
    benign_df = pd.read_csv(benign_file)
    malware_df = pd.read_csv(malware_file)
    
    # Add labels
    benign_df['is_malware'] = 0
    malware_df['is_malware'] = 1
    
    return benign_df, malware_df

def analyze_cpu_distribution(benign_df, malware_df):
    """Analyze sample distribution by CPU architecture"""
    # Count samples by CPU
    benign_counts = benign_df['CPU'].value_counts()
    malware_counts = malware_df['CPU'].value_counts()
    
    # Create a DataFrame for plotting
    cpu_counts = pd.DataFrame({
        'Benign': benign_counts,
        'Malware': malware_counts
    }).fillna(0)
    
    # Print distribution
    print("Sample Distribution by CPU Architecture:")
    print(cpu_counts)
    print()
    
    # Plot distribution
    os.makedirs('./plots', exist_ok=True)
    plt.figure(figsize=(10, 6))
    cpu_counts.plot(kind='bar')
    plt.title('Sample Distribution by CPU Architecture')
    plt.ylabel('Number of Samples')
    plt.xlabel('CPU Architecture')
    plt.tight_layout()
    plt.savefig('./plots/cpu_distribution.png', dpi=300)
    plt.close()
    
    return cpu_counts

def get_top_features_by_cpu(benign_df, malware_df, cpu_type):
    """Get top discriminative features for a specific CPU architecture"""
    # Filter by CPU
    benign_subset = benign_df[benign_df['CPU'] == cpu_type]
    malware_subset = malware_df[malware_df['CPU'] == cpu_type]
    
    # Skip if not enough samples
    if len(benign_subset) < 5 or len(malware_subset) < 5:
        print(f"Not enough samples for {cpu_type}")
        return None
    
    # Select features (exclude non-numeric columns)
    exclude_cols = ['file_name', 'CPU', 'label', 'family', 'is_malware']
    feature_cols = [col for col in benign_subset.columns if col not in exclude_cols]
    
    # Filter out zero-value features
    non_zero_features = []
    combined = pd.concat([benign_subset, malware_subset])
    for col in feature_cols:
        if combined[col].sum() > 0:
            non_zero_features.append(col)
    
    # Calculate statistics for each feature
    feature_stats = []
    for feature in non_zero_features:
        benign_mean = benign_subset[feature].mean()
        malware_mean = malware_subset[feature].mean()
        
        # Calculate absolute difference and percent difference
        abs_diff = abs(benign_mean - malware_mean)
        if malware_mean != 0:
            pct_diff = abs_diff / malware_mean * 100
        else:
            pct_diff = float('inf') if abs_diff > 0 else 0
            
        # Calculate effect size (Cohen's d)
        pooled_std = np.sqrt((benign_subset[feature].std()**2 + malware_subset[feature].std()**2) / 2)
        effect_size = abs_diff / pooled_std if pooled_std > 0 else 0
        
        feature_stats.append({
            'Feature': feature,
            'Benign Mean': benign_mean,
            'Malware Mean': malware_mean,
            'Absolute Difference': abs_diff,
            'Percent Difference': pct_diff,
            'Effect Size': effect_size
        })
    
    # Convert to DataFrame and sort by effect size
    stats_df = pd.DataFrame(feature_stats).sort_values('Effect Size', ascending=False)
    
    return stats_df

def train_classifier_by_cpu(benign_df, malware_df, cpu_type):
    """Train a Random Forest classifier for a specific CPU and get feature importance"""
    # Filter by CPU
    benign_subset = benign_df[benign_df['CPU'] == cpu_type]
    malware_subset = malware_df[malware_df['CPU'] == cpu_type]
    
    # Skip if not enough samples
    if len(benign_subset) < 5 or len(malware_subset) < 5:
        return None
    
    # Combine datasets
    combined = pd.concat([benign_subset, malware_subset])
    
    # Select features
    exclude_cols = ['file_name', 'CPU', 'label', 'family', 'is_malware']
    feature_cols = [col for col in combined.columns if col not in exclude_cols and combined[col].sum() > 0]
    
    # Prepare data
    X = combined[feature_cols].fillna(0)
    y = combined['is_malware']
    
    # Scale features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Train Random Forest
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_scaled, y)
    
    # Get feature importance
    importance = pd.DataFrame({
        'Feature': feature_cols,
        'Importance': rf.feature_importances_
    }).sort_values('Importance', ascending=False)
    
    return importance

def plot_top_features_comparison(benign_df, malware_df, cpu_types, top_n=10):
    """Plot top feature means comparison for each CPU architecture"""
    os.makedirs('./plots', exist_ok=True)
    os.makedirs('./results', exist_ok=True)
    
    for cpu_type in cpu_types:
        # Get top features
        stats_df = get_top_features_by_cpu(benign_df, malware_df, cpu_type)
        
        if stats_df is None or len(stats_df) == 0:
            continue
            
        # Get top features by effect size
        top_features = stats_df.head(top_n)
        
        # Save results
        stats_df.to_csv(f'./results/feature_stats_{cpu_type}.csv', index=False)
        
        # Create data for plotting
        plot_data = []
        for _, row in top_features.iterrows():
            plot_data.append({
                'Feature': row['Feature'],
                'Type': 'Benign',
                'Value': row['Benign Mean']
            })
            plot_data.append({
                'Feature': row['Feature'],
                'Type': 'Malware',
                'Value': row['Malware Mean']
            })
        
        plot_df = pd.DataFrame(plot_data)
        
        # Plot
        plt.figure(figsize=(12, 8))
        chart = sns.barplot(x='Value', y='Feature', hue='Type', data=plot_df)
        plt.title(f'Top {top_n} Discriminative Features for {cpu_type}')
        plt.tight_layout()
        plt.savefig(f'./plots/top_features_{cpu_type}.png', dpi=300)
        plt.close()
        
        # Plot feature importance from Random Forest
        importance = train_classifier_by_cpu(benign_df, malware_df, cpu_type)
        
        if importance is not None and len(importance) > 0:
            importance.to_csv(f'./results/feature_importance_{cpu_type}.csv', index=False)
            
            plt.figure(figsize=(12, 8))
            sns.barplot(x='Importance', y='Feature', data=importance.head(top_n))
            plt.title(f'Top {top_n} Important Features for {cpu_type} Classification')
            plt.tight_layout()
            plt.savefig(f'./plots/feature_importance_{cpu_type}.png', dpi=300)
            plt.close()

def compare_features_across_architectures(benign_df, malware_df, cpu_types, top_n=5):
    """Compare the top features across different CPU architectures"""
    # Get top features for each architecture
    all_top_features = {}
    
    for cpu_type in cpu_types:
        importance = train_classifier_by_cpu(benign_df, malware_df, cpu_type)
        if importance is not None and len(importance) > 0:
            all_top_features[cpu_type] = importance.head(top_n)['Feature'].tolist()
    
    # Find common features
    all_features = []
    for cpu, features in all_top_features.items():
        all_features.extend(features)
    
    unique_features = sorted(list(set(all_features)))
    
    # Create comparison matrix
    comparison = pd.DataFrame(0, index=unique_features, columns=cpu_types)
    
    for cpu, features in all_top_features.items():
        for i, feature in enumerate(features):
            # Use reverse rank as value (top feature gets highest score)
            comparison.loc[feature, cpu] = top_n - i
    
    # Plot heatmap
    plt.figure(figsize=(10, len(unique_features) * 0.4 + 2))
    sns.heatmap(comparison, cmap='YlGnBu', linewidths=0.5, annot=True, fmt='.0f')
    plt.title('Feature Importance Across CPU Architectures')
    plt.tight_layout()
    plt.savefig('./plots/feature_comparison_across_cpus.png', dpi=300)
    plt.close()
    
    # Save comparison
    comparison.to_csv('./results/feature_comparison_across_cpus.csv')
    
    return comparison

def main():
    # File paths
    benign_file = '/home/tommy/cross-architecture/Experiment2/csv/20250228_cleanedtrain_benign_file_features.csv'
    malware_file = '/home/tommy/cross-architecture/Experiment2/csv/20250228_cleanedtrain_malware_file_features.csv'
    
    # Load data
    benign_df, malware_df = load_data(benign_file, malware_file)
    
    # Get CPU types
    cpu_types = sorted(list(set(benign_df['CPU'].unique()) | set(malware_df['CPU'].unique())))
    cpu_types = [cpu for cpu in cpu_types if not pd.isna(cpu)]
    
    # Analyze CPU distribution
    analyze_cpu_distribution(benign_df, malware_df)
    
    # Plot top features for each CPU
    plot_top_features_comparison(benign_df, malware_df, cpu_types)
    
    # Compare features across architectures
    compare_features_across_architectures(benign_df, malware_df, cpu_types)
    
    # Print conclusions for each architecture
    print("\nKey Characteristics by CPU Architecture:")
    for cpu_type in cpu_types:
        print(f"\n{cpu_type} Architecture:")
        
        stats_df = get_top_features_by_cpu(benign_df, malware_df, cpu_type)
        if stats_df is None or len(stats_df) == 0:
            print(f"  Insufficient samples for analysis")
            continue
            
        # Get top 5 features with largest effect size
        top_features = stats_df.head(5)
        
        print(f"  Top Discriminative Features:")
        for _, row in top_features.iterrows():
            direction = "Higher in Benign" if row['Benign Mean'] > row['Malware Mean'] else "Higher in Malware"
            print(f"  - {row['Feature']}: {direction}, Effect Size: {row['Effect Size']:.2f}")
    
    print("\nCommon Patterns Across Architectures:")
    # Look for features that appear in top 5 for multiple architectures
    importance_comparison = compare_features_across_architectures(benign_df, malware_df, cpu_types)
    
    # Features that appear in at least 2 architectures
    common_features = importance_comparison[(importance_comparison > 0).sum(axis=1) >= 2]
    
    if len(common_features) > 0:
        for feature in common_features.index:
            architectures = [cpu for cpu in cpu_types if importance_comparison.loc[feature, cpu] > 0]
            print(f"  - {feature}: Important in {', '.join(architectures)}")
    else:
        print("  No common important features found across architectures")

if __name__ == "__main__":
    main()

Sample Distribution by CPU Architecture:
                               Benign  Malware
CPU                                           
ARM                               627      779
Advanced Micro Devices X86-64     204      678
Intel 80386                       795      791
MIPS R3000                        672      740


Key Characteristics by CPU Architecture:

ARM Architecture:
  Top Discriminative Features:
  - Avg_CFG_EDGE_TO_NODE_RATIO: Higher in Malware, Effect Size: 1.69
  - Avg_CFG_EDGES: Higher in Malware, Effect Size: 1.16
  - Avg_BRANCHING: Higher in Malware, Effect Size: 1.12
  - Avg_BOOLEAN: Higher in Malware, Effect Size: 1.11
  - Avg_INTEGER_COMPARISON: Higher in Malware, Effect Size: 1.09

Advanced Micro Devices X86-64 Architecture:
  Top Discriminative Features:
  - Avg_CFG_EDGE_TO_NODE_RATIO: Higher in Malware, Effect Size: 1.45
  - Avg_CFG_NODES: Higher in Malware, Effect Size: 0.97
  - Avg_FLOATING_POINT_COMPARE: Higher in Malware, Effect Size: 0.95
  - Avg_CFG_ED

<Figure size 1000x600 with 0 Axes>

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import os

def load_data(benign_file, malware_file):
    """Load benign and malware data"""
    print("Loading data...")
    benign_df = pd.read_csv(benign_file)
    malware_df = pd.read_csv(malware_file)
    
    # Add labels
    benign_df['is_malware'] = 0
    malware_df['is_malware'] = 1
    
    return benign_df, malware_df

def get_features(df):
    """Get feature columns, excluding non-feature columns and zero columns"""
    exclude_cols = ['file_name', 'CPU', 'label', 'family', 'is_malware']
    feature_cols = [col for col in df.columns if col not in exclude_cols]
    
    # Filter out zero columns
    non_zero_features = []
    for col in feature_cols:
        if df[col].sum() > 0:
            non_zero_features.append(col)
    
    return non_zero_features

def cross_architecture_evaluation(benign_df, malware_df):
    """Perform cross-architecture experiments: train on one CPU, test on another"""
    # Get all CPU types
    all_cpus = sorted(list(set(benign_df['CPU'].unique()) | set(malware_df['CPU'].unique())))
    all_cpus = [cpu for cpu in all_cpus if not pd.isna(cpu)]
    
    # Combine datasets
    combined_df = pd.concat([benign_df, malware_df])
    
    # Get features
    feature_cols = get_features(combined_df)
    print(f"Using {len(feature_cols)} non-zero features")
    
    # Initialize results matrix
    metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
    results = {}
    for metric in metrics:
        results[metric] = pd.DataFrame(np.zeros((len(all_cpus), len(all_cpus))), 
                                       index=all_cpus, columns=all_cpus)
    
    # For each CPU pair, train on one and test on another
    for train_cpu in all_cpus:
        print(f"Training on {train_cpu}...")
        
        # Get training data for this CPU
        train_benign = benign_df[benign_df['CPU'] == train_cpu]
        train_malware = malware_df[malware_df['CPU'] == train_cpu]
        
        # Skip if not enough samples
        if len(train_benign) < 10 or len(train_malware) < 10:
            print(f"  Skipping {train_cpu} (insufficient samples)")
            continue
        
        # Combine training data
        train_data = pd.concat([train_benign, train_malware])
        
        # Prepare training data
        X_train = train_data[feature_cols].fillna(0)
        y_train = train_data['is_malware']
        
        # Scale features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        
        # Train Random Forest
        rf = RandomForestClassifier(n_estimators=100, random_state=42)
        rf.fit(X_train_scaled, y_train)
        
        # Test on each CPU
        for test_cpu in all_cpus:
            print(f"  Testing on {test_cpu}...")
            
            # Get test data for this CPU
            test_benign = benign_df[benign_df['CPU'] == test_cpu]
            test_malware = malware_df[malware_df['CPU'] == test_cpu]
            
            # Skip if not enough samples
            if len(test_benign) < 10 or len(test_malware) < 10:
                print(f"    Skipping {test_cpu} (insufficient samples)")
                continue
            
            # Combine test data
            test_data = pd.concat([test_benign, test_malware])
            
            # Prepare test data
            X_test = test_data[feature_cols].fillna(0)
            y_test = test_data['is_malware']
            
            # Scale test data using training scaler
            X_test_scaled = scaler.transform(X_test)
            
            # Make predictions
            y_pred = rf.predict(X_test_scaled)
            
            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            
            # Store results
            results['Accuracy'].loc[train_cpu, test_cpu] = accuracy
            results['Precision'].loc[train_cpu, test_cpu] = precision
            results['Recall'].loc[train_cpu, test_cpu] = recall
            results['F1 Score'].loc[train_cpu, test_cpu] = f1
            
            print(f"    Accuracy: {accuracy:.4f}, F1: {f1:.4f}")
    
    return results, all_cpus

def plot_results(results, all_cpus, output_dir='./results'):
    """Plot result matrices"""
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    
    # Plot each metric
    for metric, matrix in results.items():
        plt.figure(figsize=(10, 8))
        sns.heatmap(matrix, annot=True, fmt='.4f', cmap='YlGnBu', vmin=0, vmax=1)
        plt.title(f'Cross-Architecture {metric}')
        plt.xlabel('Test CPU')
        plt.ylabel('Train CPU')
        plt.tight_layout()
        plt.savefig(f'{output_dir}/{metric.lower().replace(" ", "_")}_matrix.png', dpi=300)
        plt.close()
    
    # Save matrices to CSV
    for metric, matrix in results.items():
        matrix.to_csv(f'{output_dir}/{metric.lower().replace(" ", "_")}_matrix.csv')

def main():
    # File paths
    benign_file = '/home/tommy/cross-architecture/Experiment2/csv/20250228_cleanedtrain_benign_file_features.csv'
    malware_file = '/home/tommy/cross-architecture/Experiment2/csv/20250228_cleanedtrain_malware_file_features.csv'
    
    # Load data
    benign_df, malware_df = load_data(benign_file, malware_file)
    
    # Get CPU distribution
    benign_counts = benign_df['CPU'].value_counts()
    malware_counts = malware_df['CPU'].value_counts()
    
    print("Sample distribution by CPU architecture:")
    print(pd.DataFrame({
        'Benign': benign_counts,
        'Malware': malware_counts
    }).fillna(0))
    
    # Run cross-architecture evaluation
    results, all_cpus = cross_architecture_evaluation(benign_df, malware_df)
    
    # Plot results
    plot_results(results, all_cpus)
    
    print("Cross-architecture experiment completed. Results saved to the 'results' directory.")

if __name__ == "__main__":
    main()

Loading data...
Sample distribution by CPU architecture:
                               Benign  Malware
CPU                                           
ARM                               627      779
Advanced Micro Devices X86-64     204      678
Intel 80386                       795      791
MIPS R3000                        672      740
Using 33 non-zero features
Training on ARM...
  Testing on ARM...
    Accuracy: 1.0000, F1: 1.0000
  Testing on Advanced Micro Devices X86-64...
    Accuracy: 0.5771, F1: 0.6205
  Testing on Intel 80386...
    Accuracy: 0.8348, F1: 0.8015
  Testing on MIPS R3000...
    Accuracy: 0.6629, F1: 0.5441
Training on Advanced Micro Devices X86-64...
  Testing on ARM...
    Accuracy: 0.8634, F1: 0.8889
  Testing on Advanced Micro Devices X86-64...
    Accuracy: 1.0000, F1: 1.0000
  Testing on Intel 80386...
    Accuracy: 0.9817, F1: 0.9817
  Testing on MIPS R3000...
    Accuracy: 0.8088, F1: 0.8391
Training on Intel 80386...
  Testing on ARM...
    Accuracy: 0.8

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
import os

def load_data(train_benign_file, train_malware_file, test_benign_file, test_malware_file):
    """Load training and testing data"""
    print("Loading data...")
    
    # Load training data
    train_benign_df = pd.read_csv(train_benign_file)
    train_malware_df = pd.read_csv(train_malware_file)
    
    # Load testing data
    test_benign_df = pd.read_csv(test_benign_file)
    test_malware_df = pd.read_csv(test_malware_file)
    
    # Add labels
    train_benign_df['is_malware'] = 0
    train_malware_df['is_malware'] = 1
    test_benign_df['is_malware'] = 0
    test_malware_df['is_malware'] = 1
    
    return train_benign_df, train_malware_df, test_benign_df, test_malware_df

def get_features(df):
    """Get feature columns, excluding non-feature columns and zero columns"""
    exclude_cols = ['file_name', 'CPU', 'label', 'family', 'is_malware']
    feature_cols = [col for col in df.columns if col not in exclude_cols]
    
    # Filter out zero columns
    non_zero_features = []
    for col in feature_cols:
        if df[col].sum() > 0:
            non_zero_features.append(col)
    
    return non_zero_features

def evaluate_on_powerpc(train_cpu, train_benign_df, train_malware_df, 
                        test_benign_df, test_malware_df, feature_cols):
    """Train on specified architecture and test on PowerPC"""
    print(f"Training on {train_cpu}, testing on PowerPC...")
    
    # Get training data
    if train_cpu == 'all':
        # Use all training data
        train_benign = train_benign_df
        train_malware = train_malware_df
    else:
        # Filter by CPU
        train_benign = train_benign_df[train_benign_df['CPU'] == train_cpu]
        train_malware = train_malware_df[train_malware_df['CPU'] == train_cpu]
    
    # Skip if not enough training samples
    if len(train_benign) < 10 or len(train_malware) < 10:
        print(f"  Skipping {train_cpu} (insufficient training samples)")
        return None
    
    # Get testing data for PowerPC
    test_benign = test_benign_df[test_benign_df['CPU'] == 'PowerPC']
    test_malware = test_malware_df[test_malware_df['CPU'] == 'PowerPC']
    
    # Skip if not enough testing samples
    if len(test_benign) < 5 or len(test_malware) < 5:
        print(f"  Skipping PowerPC (insufficient testing samples)")
        return None
    
    # Print data sizes
    print(f"  Training data: {len(train_benign)} benign, {len(train_malware)} malware")
    print(f"  Testing data: {len(test_benign)} benign, {len(test_malware)} malware")
    
    # Combine training data
    train_data = pd.concat([train_benign, train_malware])
    
    # Combine testing data
    test_data = pd.concat([test_benign, test_malware])
    
    # Prepare training data
    X_train = train_data[feature_cols].fillna(0)
    y_train = train_data['is_malware']
    
    # Prepare testing data
    X_test = test_data[feature_cols].fillna(0)
    y_test = test_data['is_malware']
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Train Random Forest
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_train_scaled, y_train)
    
    # Make predictions
    y_pred = rf.predict(X_test_scaled)
    
    # Calculate F1 score
    f1 = f1_score(y_test, y_pred)
    
    print(f"  F1 Score: {f1:.4f}")
    
    return f1

def plot_f1_comparison(results, output_dir='./results'):
    """Plot and save F1 score comparison"""
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    
    # Get training CPUs and F1 scores
    train_cpus = list(results.keys())
    f1_scores = list(results.values())
    
    # Create DataFrame
    df = pd.DataFrame({
        'Architecture': train_cpus,
        'F1 Score': f1_scores
    })
    
    # Save to CSV
    df.to_csv(f'{output_dir}/powerpc_f1_comparison.csv', index=False)
    
    # Sort by F1 score
    df = df.sort_values('F1 Score', ascending=False)
    
    # Plot
    plt.figure(figsize=(12, 7))
    
    # Bar plot
    bars = plt.bar(df['Architecture'], df['F1 Score'], color='steelblue')
    
    # Add values on top of bars
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                 f'{height:.4f}', ha='center', va='bottom', fontsize=12)
    
    plt.title('F1 Scores on PowerPC by Training Architecture', fontsize=16)
    plt.xlabel('Training Architecture', fontsize=14)
    plt.ylabel('F1 Score', fontsize=14)
    plt.ylim(0, 1.1)  # Set y-axis limit with some margin for the text
    plt.grid(True, axis='y', linestyle='--', alpha=0.7)
    plt.xticks(rotation=45)
    plt.tight_layout()
    
    # Save the plot
    plt.savefig(f'{output_dir}/powerpc_f1_comparison.png', dpi=300)
    plt.close()

def main():
    # File paths
    train_benign_file = '/home/tommy/cross-architecture/Experiment2/csv/20250228_cleanedtrain_benign_file_features.csv'
    train_malware_file = '/home/tommy/cross-architecture/Experiment2/csv/20250228_cleanedtrain_malware_file_features.csv'
    test_benign_file = '/home/tommy/cross-architecture/Experiment2/csv/20250228_cleanedtest_benign_file_features.csv'
    test_malware_file = '/home/tommy/cross-architecture/Experiment2/csv/20250228_cleanedtest_malware_file_features.csv'
    
    # Load data
    train_benign_df, train_malware_df, test_benign_df, test_malware_df = load_data(
        train_benign_file, train_malware_file, test_benign_file, test_malware_file
    )
    
    # Print CPU distributions
    print("\nTraining data CPU distribution:")
    train_cpu_dist = pd.DataFrame({
        'Benign': train_benign_df['CPU'].value_counts(),
        'Malware': train_malware_df['CPU'].value_counts()
    }).fillna(0)
    print(train_cpu_dist)
    
    print("\nTesting data CPU distribution:")
    test_cpu_dist = pd.DataFrame({
        'Benign': test_benign_df['CPU'].value_counts(),
        'Malware': test_malware_df['CPU'].value_counts()
    }).fillna(0)
    print(test_cpu_dist)
    
    # Get features from all data
    all_data = pd.concat([train_benign_df, train_malware_df, test_benign_df, test_malware_df])
    feature_cols = get_features(all_data)
    print(f"\nUsing {len(feature_cols)} non-zero features")
    
    # Define CPU architectures for training
    # Get all available architectures and add 'all'
    available_cpus = sorted(list(set(train_benign_df['CPU'].unique()) | set(train_malware_df['CPU'].unique())))
    available_cpus = [cpu for cpu in available_cpus if not pd.isna(cpu)]
    train_cpus = available_cpus + ['all']
    
    # Evaluate each architecture on PowerPC
    results = {}
    for train_cpu in train_cpus:
        f1 = evaluate_on_powerpc(
            train_cpu,
            train_benign_df, train_malware_df,
            test_benign_df, test_malware_df,
            feature_cols
        )
        if f1 is not None:
            results[train_cpu] = f1
    
    # Plot and save results
    if results:
        plot_f1_comparison(results)
    else:
        print("No results to plot.")
    
    print("\nExperiment completed. Results saved to the 'results' directory.")

if __name__ == "__main__":
    main()

Loading data...

Training data CPU distribution:
                               Benign  Malware
CPU                                           
ARM                               627      779
Advanced Micro Devices X86-64     204      678
Intel 80386                       795      791
MIPS R3000                        672      740

Testing data CPU distribution:
         Benign  Malware
CPU                     
PowerPC     794      794

Using 33 non-zero features
Training on ARM, testing on PowerPC...
  Training data: 627 benign, 779 malware
  Testing data: 794 benign, 794 malware
  F1 Score: 0.5849
Training on Advanced Micro Devices X86-64, testing on PowerPC...
  Training data: 204 benign, 678 malware
  Testing data: 794 benign, 794 malware
  F1 Score: 0.7401
Training on Intel 80386, testing on PowerPC...
  Training data: 795 benign, 791 malware
  Testing data: 794 benign, 794 malware
  F1 Score: 0.7460
Training on MIPS R3000, testing on PowerPC...
  Training data: 672 benign, 740 malw

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import KFold, StratifiedKFold
import os
from datetime import datetime
import json
from scipy import stats

def load_data(train_benign_file, train_malware_file, test_benign_file, test_malware_file):
    """Load training and testing data"""
    print("Loading data...")
    
    # Load training data
    train_benign_df = pd.read_csv(train_benign_file)
    train_malware_df = pd.read_csv(train_malware_file)
    
    # Load testing data
    test_benign_df = pd.read_csv(test_benign_file)
    test_malware_df = pd.read_csv(test_malware_file)
    
    # Add labels
    train_benign_df['is_malware'] = 0
    train_malware_df['is_malware'] = 1
    test_benign_df['is_malware'] = 0
    test_malware_df['is_malware'] = 1
    
    return train_benign_df, train_malware_df, test_benign_df, test_malware_df

def get_features(df):
    """Get feature columns, excluding non-feature columns and zero columns"""
    exclude_cols = ['file_name', 'CPU', 'label', 'family', 'is_malware']
    feature_cols = [col for col in df.columns if col not in exclude_cols]
    
    # Filter out zero columns
    non_zero_features = []
    for col in feature_cols:
        if df[col].sum() > 0:
            non_zero_features.append(col)
    
    return non_zero_features

def calculate_metrics(y_true, y_pred, y_scores=None):
    """Calculate multiple performance metrics"""
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred)
    }
    
    # Add ROC AUC if probabilities are available
    if y_scores is not None:
        metrics['roc_auc'] = roc_auc_score(y_true, y_scores)
    
    # Add confusion matrix
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    metrics['true_negatives'] = int(tn)
    metrics['false_positives'] = int(fp)
    metrics['false_negatives'] = int(fn)
    metrics['true_positives'] = int(tp)
    
    return metrics

def evaluate_on_powerpc(train_cpu, train_benign_df, train_malware_df, 
                        test_benign_df, test_malware_df, feature_cols, 
                        random_seeds=[42, 123, 456, 789, 101], cv_folds=5):
    """Train on specified architecture and test on PowerPC with multiple runs and CV"""
    print(f"Training on {train_cpu}, testing on PowerPC...")
    
    # Get training data
    if train_cpu == 'all':
        # Use all training data
        train_benign = train_benign_df
        train_malware = train_malware_df
    else:
        # Filter by CPU
        train_benign = train_benign_df[train_benign_df['CPU'] == train_cpu]
        train_malware = train_malware_df[train_malware_df['CPU'] == train_cpu]
    
    # Skip if not enough training samples
    if len(train_benign) < 10 or len(train_malware) < 10:
        print(f"  Skipping {train_cpu} (insufficient training samples)")
        return None
    
    # Get testing data for PowerPC
    test_benign = test_benign_df[test_benign_df['CPU'] == 'PowerPC']
    test_malware = test_malware_df[test_malware_df['CPU'] == 'PowerPC']
    
    # Skip if not enough testing samples
    if len(test_benign) < 5 or len(test_malware) < 5:
        print(f"  Skipping PowerPC (insufficient testing samples)")
        return None
    
    # Print data sizes
    print(f"  Training data: {len(train_benign)} benign, {len(train_malware)} malware")
    print(f"  Testing data: {len(test_benign)} benign, {len(test_malware)} malware")
    
    # Combine training data
    train_data = pd.concat([train_benign, train_malware])
    
    # Combine testing data
    test_data = pd.concat([test_benign, test_malware])
    
    # Prepare training data
    X_train = train_data[feature_cols].fillna(0)
    y_train = train_data['is_malware']
    
    # Prepare testing data
    X_test = test_data[feature_cols].fillna(0)
    y_test = test_data['is_malware']
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Results for multiple runs
    run_results = []
    
    # Run with multiple random seeds
    for seed in random_seeds:
        print(f"  Running with seed {seed}")
        
        # Train Random Forest
        rf = RandomForestClassifier(n_estimators=100, random_state=seed)
        rf.fit(X_train_scaled, y_train)
        
        # Make predictions
        y_pred = rf.predict(X_test_scaled)
        y_scores = rf.predict_proba(X_test_scaled)[:, 1]  # Probability for class 1
        
        # Calculate metrics
        metrics = calculate_metrics(y_test, y_pred, y_scores)
        metrics['seed'] = seed
        
        # Add to results
        run_results.append(metrics)
    
    # Cross-validation on training data
    print(f"  Performing {cv_folds}-fold cross-validation")
    cv_results = []
    
    # Use stratified k-fold to maintain class distribution
    skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_scaled, y_train)):
        # Split data
        X_train_fold = X_train_scaled[train_idx]
        y_train_fold = y_train.iloc[train_idx]
        X_val_fold = X_train_scaled[val_idx]
        y_val_fold = y_train.iloc[val_idx]
        
        # Train model
        rf_cv = RandomForestClassifier(n_estimators=100, random_state=42)
        rf_cv.fit(X_train_fold, y_train_fold)
        
        # Validate
        y_val_pred = rf_cv.predict(X_val_fold)
        y_val_scores = rf_cv.predict_proba(X_val_fold)[:, 1]
        
        # Calculate metrics
        fold_metrics = calculate_metrics(y_val_fold, y_val_pred, y_val_scores)
        fold_metrics['fold'] = fold + 1
        
        # Add to results
        cv_results.append(fold_metrics)
    
    # Calculate statistics across runs
    f1_scores = [result['f1'] for result in run_results]
    mean_f1 = np.mean(f1_scores)
    std_f1 = np.std(f1_scores)
    cv_f1_scores = [result['f1'] for result in cv_results]
    mean_cv_f1 = np.mean(cv_f1_scores)
    std_cv_f1 = np.std(cv_f1_scores)
    
    # Confidence interval (95%)
    ci_95 = stats.t.interval(0.95, len(f1_scores)-1, loc=mean_f1, scale=std_f1/np.sqrt(len(f1_scores)))
    
    print(f"  Mean F1 Score: {mean_f1:.4f} ± {std_f1:.4f}")
    print(f"  95% Confidence Interval: [{ci_95[0]:.4f}, {ci_95[1]:.4f}]")
    print(f"  Cross-Validation F1: {mean_cv_f1:.4f} ± {std_cv_f1:.4f}")
    
    # Compile results
    results = {
        'train_cpu': train_cpu,
        'train_samples': len(train_data),
        'test_samples': len(test_data),
        'mean_f1': mean_f1,
        'std_f1': std_f1,
        'ci_95_low': ci_95[0],
        'ci_95_high': ci_95[1],
        'mean_cv_f1': mean_cv_f1,
        'std_cv_f1': std_cv_f1,
        'run_results': run_results,
        'cv_results': cv_results
    }
    
    return results

def plot_f1_comparison(all_results, output_dir='./results'):
    """Plot and save F1 score comparison with error bars and custom colors"""
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    # Extract data for plotting
    train_cpus = [r['train_cpu'] for r in all_results]
    mean_f1s = [r['mean_f1'] for r in all_results]
    std_f1s = [r['std_f1'] for r in all_results]
    cv_f1s = [r['mean_cv_f1'] for r in all_results]
    
    # Create DataFrame
    df = pd.DataFrame({
        'Architecture': train_cpus,
        'Mean F1 Score': mean_f1s,
        'Std F1 Score': std_f1s,
        'CV F1 Score': cv_f1s
    })
    
    # Save to CSV
    df.to_csv(f'{output_dir}/{timestamp}_powerpc_f1_comparison.csv', index=False)
    
    # Save detailed results to JSON
    with open(f'{output_dir}/{timestamp}_detailed_results.json', 'w') as f:
        # Convert NumPy values to Python types for JSON serialization
        results_for_json = []
        for result in all_results:
            result_copy = result.copy()
            for key, value in result_copy.items():
                if isinstance(value, np.number):
                    result_copy[key] = float(value)
            results_for_json.append(result_copy)
        
        json.dump(results_for_json, f, indent=2)
    
    # Sort by mean F1 score
    df = df.sort_values('Mean F1 Score', ascending=False)
    
    # Define colors matching the provided example image
    # Deep purple, navy blue, teal, medium green, chartreuse
    custom_colors = ['#2D004A', '#304887', '#3D7D7C', '#4CAA66', '#B9CF45']
    
    # Make sure we have enough colors
    while len(custom_colors) < len(df):
        custom_colors.extend(custom_colors)
    
    # --------- Main F1 Score Chart ---------
    # Set the style for the plot
    sns.set_style('whitegrid')
    plt.figure(figsize=(14, 8))
    
    # Bar plot with error bars
    bars = plt.bar(df['Architecture'], df['Mean F1 Score'], 
                  yerr=df['Std F1 Score'], capsize=5,
                  color=custom_colors[:len(df)])
    
    # Add values on top of bars
    for i, bar in enumerate(bars):
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                 f'{height:.4f}±{df.iloc[i]["Std F1 Score"]:.4f}', 
                 ha='center', va='bottom', fontsize=10, fontweight='bold')
    
    # Add CV F1 scores as line plot
    plt.plot(df['Architecture'], df['CV F1 Score'], 'ro-', linewidth=2, 
             markersize=8, label='Cross-Validation F1')
    
    plt.title('F1 Scores on PowerPC by Training Architecture', fontsize=16)
    plt.xlabel('Training Architecture', fontsize=14)
    plt.ylabel('F1 Score', fontsize=14)
    plt.ylim(0, 1.1)  # Set y-axis limit with some margin for the text
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.xticks(rotation=45)
    plt.legend()
    plt.tight_layout()
    
    # Save the plot - both PNG and SVG formats
    plt.savefig(f'{output_dir}/{timestamp}_powerpc_f1_comparison.png', dpi=300)
    plt.savefig(f'{output_dir}/{timestamp}_powerpc_f1_comparison.svg', format='svg')
    plt.close()
    
    # --------- Simple Colorful Chart (no error bars) ---------
    # Create a simplified colorful chart as requested
    sns.set_style('whitegrid')
    plt.figure(figsize=(12, 7))
    
    # Create the bar chart with custom colors
    simple_bars = plt.bar(
        df['Architecture'], 
        df['Mean F1 Score'], 
        color=custom_colors[:len(df)]
    )
    
    # Add the values on top of each bar
    for bar in simple_bars:
        height = bar.get_height()
        plt.text(
            bar.get_x() + bar.get_width()/2., 
            height + 0.01,
            f'{height:.4f}', 
            ha='center', 
            va='bottom', 
            fontsize=12,
            fontweight='bold'
        )
    
    # Set the labels and title
    plt.title('F1 Scores on PowerPC by Training Architecture', fontsize=16)
    plt.xlabel('Training Architecture', fontsize=14)
    plt.ylabel('F1 Score', fontsize=14)
    
    # Set y-axis limits
    plt.ylim(0, 1.1)
    
    # Add a light grid for better readability
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.xticks(rotation=45)
    
    # Save both PNG and SVG formats
    plt.tight_layout()
    plt.savefig(f'{output_dir}/{timestamp}_simple_f1_chart.png', dpi=300)
    plt.savefig(f'{output_dir}/{timestamp}_simple_f1_chart.svg', format='svg')
    plt.close()
    
    # Create statistical significance heatmap
    if len(all_results) > 1:
        plot_significance_matrix(all_results, output_dir, timestamp)

def plot_significance_matrix(all_results, output_dir, timestamp):
    """Plot a heatmap of statistical significance between pairs of architectures"""
    n_archs = len(all_results)
    architectures = [r['train_cpu'] for r in all_results]
    
    # Initialize p-value matrix
    p_values = np.zeros((n_archs, n_archs))
    
    # Calculate p-values for all pairs
    for i in range(n_archs):
        for j in range(n_archs):
            if i == j:
                p_values[i, j] = 1.0  # Same architecture
            else:
                # Get F1 scores for both architectures
                f1_scores_i = [r['f1'] for r in all_results[i]['run_results']]
                f1_scores_j = [r['f1'] for r in all_results[j]['run_results']]
                
                # Perform t-test
                _, p_value = stats.ttest_ind(f1_scores_i, f1_scores_j)
                p_values[i, j] = p_value
    
    # Create significance mask (p < 0.05)
    sig_mask = p_values < 0.05
    
    # Set the style
    sns.set_style('white')
    plt.figure(figsize=(12, 10))
    
    # Use -log10(p) for better visualization
    log_p_values = -np.log10(p_values)
    np.fill_diagonal(log_p_values, 0)  # Set diagonal to 0
    
    # Create heatmap with improved styling
    sns.heatmap(log_p_values, annot=np.round(p_values, 3), 
                xticklabels=architectures, yticklabels=architectures,
                cmap='viridis', mask=p_values >= 0.05,
                linewidths=0.5, linecolor='white',
                cbar_kws={'label': '-log10(p-value)'})
    
    # Add a red outline to statistically significant cells (p < 0.05)
    for i in range(n_archs):
        for j in range(n_archs):
            if p_values[i, j] < 0.05 and i != j:
                plt.gca().add_patch(plt.Rectangle((j, i), 1, 1, fill=False, edgecolor='red', lw=2))
    
    plt.title('Statistical Significance Between Architectures', fontsize=16)
    plt.tight_layout()
    
    # Save the plot - both PNG and SVG formats
    plt.savefig(f'{output_dir}/{timestamp}_statistical_significance.png', dpi=300)
    plt.savefig(f'{output_dir}/{timestamp}_statistical_significance.svg', format='svg')
    plt.close()

def main():
    # File paths
    train_benign_file = '/home/tommy/cross-architecture/Experiment2/csv/20250228_cleanedtrain_benign_file_features.csv'
    train_malware_file = '/home/tommy/cross-architecture/Experiment2/csv/20250228_cleanedtrain_malware_file_features.csv'
    test_benign_file = '/home/tommy/cross-architecture/Experiment2/csv/20250228_cleanedtest_benign_file_features.csv'
    test_malware_file = '/home/tommy/cross-architecture/Experiment2/csv/20250228_cleanedtest_malware_file_features.csv'
    
    # Load data
    train_benign_df, train_malware_df, test_benign_df, test_malware_df = load_data(
        train_benign_file, train_malware_file, test_benign_file, test_malware_file
    )
    
    # Print CPU distributions
    print("\nTraining data CPU distribution:")
    train_cpu_dist = pd.DataFrame({
        'Benign': train_benign_df['CPU'].value_counts(),
        'Malware': train_malware_df['CPU'].value_counts()
    }).fillna(0)
    print(train_cpu_dist)
    
    print("\nTesting data CPU distribution:")
    test_cpu_dist = pd.DataFrame({
        'Benign': test_benign_df['CPU'].value_counts(),
        'Malware': test_malware_df['CPU'].value_counts()
    }).fillna(0)
    print(test_cpu_dist)
    
    # Get features from all data
    all_data = pd.concat([train_benign_df, train_malware_df, test_benign_df, test_malware_df])
    feature_cols = get_features(all_data)
    print(f"\nUsing {len(feature_cols)} non-zero features")
    
    # Define CPU architectures for training
    # Get all available architectures and add 'all'
    available_cpus = sorted(list(set(train_benign_df['CPU'].unique()) | set(train_malware_df['CPU'].unique())))
    available_cpus = [cpu for cpu in available_cpus if not pd.isna(cpu)]
    train_cpus = available_cpus + ['all']
    
    # Experiment configuration
    random_seeds = [42, 123, 456, 789, 101]  # Multiple seeds for reproducibility
    cv_folds = 5  # Number of cross-validation folds
    
    # Evaluate each architecture on PowerPC
    all_results = []
    for train_cpu in train_cpus:
        results = evaluate_on_powerpc(
            train_cpu,
            train_benign_df, train_malware_df,
            test_benign_df, test_malware_df,
            feature_cols,
            random_seeds=random_seeds,
            cv_folds=cv_folds
        )
        if results is not None:
            all_results.append(results)
    
    # Plot and save results
    if all_results:
        plot_f1_comparison(all_results)
    else:
        print("No results to plot.")
    
    print("\nExperiment completed. Results saved to the 'results' directory.")

if __name__ == "__main__":
    main()

Loading data...

Training data CPU distribution:
                               Benign  Malware
CPU                                           
ARM                               627      779
Advanced Micro Devices X86-64     204      678
Intel 80386                       795      791
MIPS R3000                        672      740

Testing data CPU distribution:
         Benign  Malware
CPU                     
PowerPC     794      794

Using 33 non-zero features
Training on ARM, testing on PowerPC...
  Training data: 627 benign, 779 malware
  Testing data: 794 benign, 794 malware
  Running with seed 42


  Running with seed 123
  Running with seed 456
  Running with seed 789
  Running with seed 101
  Performing 5-fold cross-validation
  Mean F1 Score: 0.5682 ± 0.0108
  95% Confidence Interval: [0.5548, 0.5816]
  Cross-Validation F1: 0.9806 ± 0.0042
Training on Advanced Micro Devices X86-64, testing on PowerPC...
  Training data: 204 benign, 678 malware
  Testing data: 794 benign, 794 malware
  Running with seed 42
  Running with seed 123
  Running with seed 456
  Running with seed 789
  Running with seed 101
  Performing 5-fold cross-validation
  Mean F1 Score: 0.7456 ± 0.0050
  95% Confidence Interval: [0.7393, 0.7518]
  Cross-Validation F1: 0.9734 ± 0.0064
Training on Intel 80386, testing on PowerPC...
  Training data: 795 benign, 791 malware
  Testing data: 794 benign, 794 malware
  Running with seed 42
  Running with seed 123
  Running with seed 456
  Running with seed 789
  Running with seed 101
  Performing 5-fold cross-validation
  Mean F1 Score: 0.7474 ± 0.0062
  95% Confidence

NameError: name 'bright_colors' is not defined

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

def create_f1_matrix_heatmap(output_file='cross_architecture_f1_matrix.svg'):
    """
    Create a heatmap of the F1 scores for cross-architecture experiments.
    
    This function creates a heatmap similar to Image 1, showing F1 scores when
    training on one architecture and testing on another.
    """
    # Define the CPU architectures (in the desired order)
    cpus = ['ARM', 'Advanced Micro Devices X86-64', 'Intel 80386', 'MIPS R3000']
    
    # Create a DataFrame to store the F1 scores
    f1_scores = pd.DataFrame(
        [
            [1.0000, 0.6205, 0.8015, 0.5441],
            [0.8889, 1.0000, 0.9817, 0.8391],
            [0.8388, 0.9442, 1.0000, 0.7473],
            [0.8660, 0.9244, 0.9674, 1.0000]
        ],
        index=cpus,
        columns=cpus
    )
    
    # Create the figure with appropriate size
    plt.figure(figsize=(10, 8))
    
    # Create the heatmap
    ax = sns.heatmap(
        f1_scores, 
        annot=True, 
        fmt='.4f', 
        cmap='YlGnBu_r',  # Reversed YlGnBu colormap to match the example
        vmin=0, 
        vmax=1,
        square=True,
        linewidths=1,
        cbar_kws={'label': ''}
    )
    
    # Set the labels and title
    plt.title('Cross-Architecture F1 Score', fontsize=16, pad=20)
    plt.xlabel('Test CPU', fontsize=14, labelpad=10)
    plt.ylabel('Train CPU', fontsize=14, labelpad=10)
    
    # Adjust layout and save as SVG
    plt.tight_layout()
    os.makedirs('./output', exist_ok=True)
    plt.savefig(f'./output/{output_file}', format='svg')
    print(f"Saved heatmap to ./output/{output_file}")
    
    plt.close()

def create_powerpc_comparison(output_file='powerpc_f1_comparison.svg'):
    """
    Create a bar chart of F1 scores when training on different architectures
    and testing on PowerPC.
    
    This function creates a bar chart similar to Image 2, showing F1 scores
    for each training architecture when testing on PowerPC.
    """
    # Define the data from the image
    architectures = ['all', 'MIPS R3000', 'Intel 80386', 'Advanced Micro Devices X86-64', 'ARM']
    f1_scores = [0.9535, 0.9361, 0.7460, 0.7401, 0.5849]
    
    # Create a DataFrame
    data = pd.DataFrame({
        'Architecture': architectures,
        'F1 Score': f1_scores
    })
    
    # Set the style for the plot
    sns.set_style('whitegrid')
    
    # Create the figure
    plt.figure(figsize=(12, 7))
    
    # Create the bar chart
    bars = plt.bar(data['Architecture'], data['F1 Score'], color='steelblue')
    
    # Add the values on top of each bar
    for bar in bars:
        height = bar.get_height()
        plt.text(
            bar.get_x() + bar.get_width()/2., 
            height + 0.01,
            f'{height:.4f}', 
            ha='center', 
            va='bottom', 
            fontsize=12
        )
    
    # Set the labels and title
    plt.title('F1 Scores on PowerPC by Training Architecture', fontsize=16)
    plt.xlabel('Training Architecture', fontsize=14)
    plt.ylabel('F1 Score', fontsize=14)
    
    # Set y-axis limits
    plt.ylim(0, 1.1)
    
    # Create output directory and save as SVG
    os.makedirs('./output', exist_ok=True)
    plt.tight_layout()
    plt.savefig(f'./output/{output_file}', format='svg')
    print(f"Saved bar chart to ./output/{output_file}")
    
    plt.close()

def main():
    """Generate both visualizations."""
    # Create output directory
    os.makedirs('./output', exist_ok=True)
    
    # Generate the cross-architecture F1 matrix heatmap
    create_f1_matrix_heatmap()
    
    # Generate the PowerPC comparison bar chart
    create_powerpc_comparison()
    
    print("All visualizations completed successfully!")
    print("SVG files saved to the ./output directory")

if __name__ == "__main__":
    main()

Saved heatmap to ./output/cross_architecture_f1_matrix.svg
Saved bar chart to ./output/powerpc_f1_comparison.svg
All visualizations completed successfully!
SVG files saved to the ./output directory


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

def create_powerpc_comparison(output_file='powerpc_f1_comparison.svg'):
    """
    Create a colorful bar chart of F1 scores when training on different architectures
    and testing on PowerPC.
    """
    # Define the data with simplified labels
    architectures = ['all', 'MIPS R3000', 'Intel 80386', 'X86-64', 'ARM']
    f1_scores = [0.9535, 0.9361, 0.7460, 0.7401, 0.5849]
    
    # Create a DataFrame
    data = pd.DataFrame({
        'Architecture': architectures,
        'F1 Score': f1_scores
    })
    
    # Sort by F1 score to make visualization clearer
    data = data.sort_values('F1 Score', ascending=False)
    
    # Set the style for the plot
    sns.set_style('whitegrid')
    plt.figure(figsize=(12, 7))
    
    # Generate colors using a colormap
    colormap = plt.cm.viridis
    colors = [colormap(i) for i in np.linspace(0, 0.9, len(data))]
    
    # Create the bar chart with different colors for each bar
    bars = plt.bar(
        data['Architecture'], 
        data['F1 Score'], 
        color=colors
    )
    
    # Add the values on top of each bar
    for bar in bars:
        height = bar.get_height()
        plt.text(
            bar.get_x() + bar.get_width()/2., 
            height + 0.01,
            f'{height:.4f}', 
            ha='center', 
            va='bottom', 
            fontsize=12
        )
    
    # Set the labels and title
    plt.title('F1 Scores on PowerPC by Training Architecture', fontsize=16)
    plt.xlabel('Training Architecture', fontsize=14)
    plt.ylabel('F1 Score', fontsize=14)
    
    # Set y-axis limits
    plt.ylim(0, 1.1)
    
    # Create output directory and save as SVG
    os.makedirs('./output', exist_ok=True)
    plt.tight_layout()
    plt.savefig(f'./output/{output_file}', format='svg')
    print(f"Saved colorful bar chart to ./output/{output_file}")
    
    plt.close()

def create_powerpc_comparison_brightcolors(output_file='powerpc_f1_comparison_bright.svg'):
    """
    Create an alternative colorful bar chart with bright colors.
    """
    # Define the data with simplified labels
    architectures = ['all', 'MIPS R3000', 'Intel 80386', 'X86-64', 'ARM']
    f1_scores = [0.9535, 0.9361, 0.7460, 0.7401, 0.5849]
    
    # Create a DataFrame
    data = pd.DataFrame({
        'Architecture': architectures,
        'F1 Score': f1_scores
    })
    
    # Sort by F1 score to make visualization clearer
    data = data.sort_values('F1 Score', ascending=False)
    
    # Define bright colors - one for each bar
    bright_colors = ['#FF9500', '#00B4D8', '#FF5C8D', '#4CAF50', '#9C27B0']
    
    # Set the style for the plot
    sns.set_style('whitegrid')
    plt.figure(figsize=(12, 7))
    
    # Create the bar chart with vibrant colors
    bars = plt.bar(
        data['Architecture'], 
        data['F1 Score'], 
        color=bright_colors[:len(data)]
    )
    
    # Add the values on top of each bar
    for bar in bars:
        height = bar.get_height()
        plt.text(
            bar.get_x() + bar.get_width()/2., 
            height + 0.01,
            f'{height:.4f}', 
            ha='center', 
            va='bottom', 
            fontsize=12,
            fontweight='bold'
        )
    
    # Set the labels and title
    plt.title('F1 Scores on PowerPC by Training Architecture', fontsize=16)
    plt.xlabel('Training Architecture', fontsize=14)
    plt.ylabel('F1 Score', fontsize=14)
    
    # Set y-axis limits
    plt.ylim(0, 1.1)
    
    # Add a light grid for better readability
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    
    # Create output directory and save as SVG
    os.makedirs('./output', exist_ok=True)
    plt.tight_layout()
    plt.savefig(f'./output/{output_file}', format='svg')
    print(f"Saved bright colorful bar chart to ./output/{output_file}")
    
    plt.close()

if __name__ == "__main__":
    # Generate both color versions
    create_powerpc_comparison()
    create_powerpc_comparison_brightcolors()
    print("Bar chart generation completed successfully!")

ValueError: Per-column arrays must each be 1-dimensional

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import f1_score, classification_report
import os

def load_data(train_malware_file, test_malware_file):
    """Load training and testing malware data"""
    print("Loading data...")
    
    # Load training data (only malware)
    train_malware_df = pd.read_csv(train_malware_file)
    
    # Load testing data (only malware)
    test_malware_df = pd.read_csv(test_malware_file)
    
    return train_malware_df, test_malware_df

def get_features(df):
    """Get feature columns, excluding non-feature columns and zero columns"""
    exclude_cols = ['file_name', 'CPU', 'label', 'family', 'is_malware']
    feature_cols = [col for col in df.columns if col not in exclude_cols]
    
    # Filter out zero columns
    non_zero_features = []
    for col in feature_cols:
        if df[col].sum() > 0:
            non_zero_features.append(col)
    
    return non_zero_features

def evaluate_family_classification(train_cpu, train_malware_df, test_malware_df, feature_cols):
    """Train on specified architecture and test on PowerPC for family classification"""
    print(f"Training on {train_cpu}, testing on PowerPC for family classification...")
    
    # Get training data
    if train_cpu == 'all':
        # Use all training data
        train_malware = train_malware_df
    else:
        # Filter by CPU
        train_malware = train_malware_df[train_malware_df['CPU'] == train_cpu]
    
    # Skip if not enough training samples
    if len(train_malware) < 10:
        print(f"  Skipping {train_cpu} (insufficient training samples)")
        return None
    
    # Get testing data for PowerPC
    test_malware = test_malware_df[test_malware_df['CPU'] == 'PowerPC']
    
    # Skip if not enough testing samples
    if len(test_malware) < 5:
        print(f"  Skipping PowerPC (insufficient testing samples)")
        return None
    
    # Print data sizes and family distributions
    print(f"  Training data: {len(train_malware)} malware samples")
    print(f"  Testing data: {len(test_malware)} malware samples")
    
    print("\n  Training family distribution:")
    print(train_malware['family'].value_counts())
    
    print("\n  Testing family distribution:")
    print(test_malware['family'].value_counts())
    
    # Encode family labels
    label_encoder = LabelEncoder()
    train_malware['family_encoded'] = label_encoder.fit_transform(train_malware['family'])
    # Apply the same transformation to test data
    test_malware['family_encoded'] = label_encoder.transform(test_malware['family'])
    
    # Prepare training data
    X_train = train_malware[feature_cols].fillna(0)
    y_train = train_malware['family_encoded']
    
    # Prepare testing data
    X_test = test_malware[feature_cols].fillna(0)
    y_test = test_malware['family_encoded']
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Train Random Forest
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_train_scaled, y_train)
    
    # Make predictions
    y_pred = rf.predict(X_test_scaled)
    
    # Calculate F1 score (weighted for multi-class)
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    print(f"\n  Weighted F1 Score: {f1:.4f}")
    
    # Get more detailed performance metrics
    report = classification_report(y_test, y_pred, 
                                  target_names=label_encoder.classes_,
                                  output_dict=True)
    
    # Create a DataFrame from the report for easier manipulation
    report_df = pd.DataFrame(report).transpose()
    
    # Print detailed report
    print("\n  Classification Report:")
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
    
    return f1, report_df, label_encoder.classes_

def plot_f1_comparison(results, output_dir='./results'):
    """Plot and save F1 score comparison"""
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    
    # Get training CPUs and F1 scores
    train_cpus = list(results.keys())
    f1_scores = [results[cpu][0] for cpu in train_cpus]  # Extract F1 scores
    
    # Create DataFrame
    df = pd.DataFrame({
        'Architecture': train_cpus,
        'F1 Score': f1_scores
    })
    
    # Save to CSV
    df.to_csv(f'{output_dir}/powerpc_family_f1_comparison.csv', index=False)
    
    # Sort by F1 score
    df = df.sort_values('F1 Score', ascending=False)
    
    # Plot
    plt.figure(figsize=(12, 7))
    
    # Bar plot
    bars = plt.bar(df['Architecture'], df['F1 Score'], color='steelblue')
    
    # Add values on top of bars
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                 f'{height:.4f}', ha='center', va='bottom', fontsize=12)
    
    plt.title('Family Classification F1 Scores on PowerPC by Training Architecture', fontsize=16)
    plt.xlabel('Training Architecture', fontsize=14)
    plt.ylabel('Weighted F1 Score', fontsize=14)
    plt.ylim(0, 1.1)  # Set y-axis limit with some margin for the text
    plt.grid(True, axis='y', linestyle='--', alpha=0.7)
    plt.xticks(rotation=45)
    plt.tight_layout()
    
    # Save the plot
    plt.savefig(f'{output_dir}/powerpc_family_f1_comparison.png', dpi=300)
    print(f"F1 score comparison plot saved to {output_dir}/powerpc_family_f1_comparison.png")
    plt.close()
    
    # Also save per-family performance for the best model
    best_arch = df.iloc[0]['Architecture']
    best_report = results[best_arch][1]
    best_families = results[best_arch][2]
    
    # Save detailed report to CSV
    best_report.to_csv(f'{output_dir}/best_family_classification_report.csv')
    
    # Plot per-family F1 scores for the best model
    plot_family_performance(best_report, best_families, best_arch, output_dir)

def plot_family_performance(report_df, families, architecture, output_dir):
    """Plot per-family performance metrics for the best model"""
    # Filter out the summary rows
    family_report = report_df.loc[families]
    
    # Get F1 scores for each family
    family_f1 = family_report['f1-score'].sort_values(ascending=False)
    
    # Plot
    plt.figure(figsize=(14, 8))
    
    # Bar plot
    bars = plt.bar(family_f1.index, family_f1.values, color='lightseagreen')
    
    # Add values on top of bars
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                 f'{height:.3f}', ha='center', va='bottom', fontsize=10, rotation=90)
    
    plt.title(f'Per-Family F1 Scores (Trained on {architecture})', fontsize=16)
    plt.xlabel('Malware Family', fontsize=14)
    plt.ylabel('F1 Score', fontsize=14)
    plt.ylim(0, 1.1)  # Set y-axis limit with some margin for the text
    plt.grid(True, axis='y', linestyle='--', alpha=0.7)
    plt.xticks(rotation=90)
    plt.tight_layout()
    
    # Save the plot
    plt.savefig(f'{output_dir}/per_family_f1_scores.png', dpi=300)
    print(f"Per-family F1 scores plot saved to {output_dir}/per_family_f1_scores.png")
    plt.close()

def main():
    # File paths
    train_malware_file = '/home/tommy/cross-architecture/Experiment2/csv/20250228_cleanedtrain_malware_file_features.csv'
    test_malware_file = '/home/tommy/cross-architecture/Experiment2/csv/20250228_cleanedtest_malware_file_features.csv'
    
    # Load data (only malware)
    train_malware_df, test_malware_df = load_data(train_malware_file, test_malware_file)
    
    # Print CPU distributions
    print("\nTraining data CPU distribution:")
    train_cpu_dist = pd.DataFrame({
        'Malware': train_malware_df['CPU'].value_counts()
    }).fillna(0)
    print(train_cpu_dist)
    
    print("\nTesting data CPU distribution:")
    test_cpu_dist = pd.DataFrame({
        'Malware': test_malware_df['CPU'].value_counts()
    }).fillna(0)
    print(test_cpu_dist)
    
    # Print family distributions
    print("\nTraining data family distribution:")
    print(train_malware_df['family'].value_counts())
    
    print("\nTesting data family distribution:")
    print(test_malware_df['family'].value_counts())
    
    # Get features from all data
    all_data = pd.concat([train_malware_df, test_malware_df])
    feature_cols = get_features(all_data)
    print(f"\nUsing {len(feature_cols)} non-zero features")
    
    # Define CPU architectures for training
    # Get all available architectures and add 'all'
    available_cpus = sorted(list(set(train_malware_df['CPU'].unique())))
    available_cpus = [cpu for cpu in available_cpus if not pd.isna(cpu)]
    train_cpus = available_cpus + ['all']
    
    # Evaluate each architecture on PowerPC
    results = {}
    for train_cpu in train_cpus:
        result = evaluate_family_classification(
            train_cpu,
            train_malware_df,
            test_malware_df,
            feature_cols
        )
        if result is not None:
            results[train_cpu] = result
    
    # Plot and save results
    if results:
        plot_f1_comparison(results)
    else:
        print("No results to plot.")
    
    print("\nExperiment completed. Results saved to the 'results' directory.")

if __name__ == "__main__":
    main()