# In [33]:
import os
import pandas as pd
import random
from sklearn.model_selection import train_test_split

# Dataset locations -- edit these to match the local directory layout.
_gp_root = r"C:\Users\youssef khaled\Desktop\GP"
# Malimg dataset root; expected to hold the train/, val/ and test/ folders.
malimg_base = _gp_root + r"\malimg_dataset"
# Folder holding the benign images.
benign_path = _gp_root + r"\benign_data\benign_imgs"




# In [42]:
# Build one DataFrame per split from the pre-split Malimg dataset.
# Each row holds an image path ('img_code') and the name of the class folder
# it was found in ('target').
split_frames = {}
for split in ['train', 'val', 'test']:
    split_path = os.path.join(malimg_base, split)

    # Collect the per-class frames and concatenate once per split; calling
    # pd.concat inside the loop re-copies all accumulated rows every time.
    class_frames = []

    # os.listdir returns entries in arbitrary order, so sort to keep the row
    # order identical from run to run.
    for class_name in sorted(os.listdir(split_path)):
        class_dir = os.path.join(split_path, class_name)

        # Skip stray files sitting next to the class folders (for example
        # Thumbs.db); calling os.listdir on them would raise.
        if not os.path.isdir(class_dir):
            continue

        # Get all image paths for this class
        images = [os.path.join(class_dir, img)
                  for img in sorted(os.listdir(class_dir))]
        if not images:
            continue  # an empty class folder contributes no rows

        class_frames.append(
            pd.DataFrame({'img_code': images, 'target': class_name}))

    if class_frames:
        split_frames[split] = pd.concat(class_frames, ignore_index=True)
    else:
        # pd.concat raises on an empty list; keep an empty frame that still
        # carries the expected columns.
        split_frames[split] = pd.DataFrame(columns=['img_code', 'target'])

df_train = split_frames['train']
df_val = split_frames['val']
df_test = split_frames['test']

# Process BENIGN class (split into 8:1:1 and add to DataFrames)
# os.listdir returns entries in arbitrary order, so sort the paths first:
# only with a fixed input order does random_state=42 below yield the same
# split on every run. train_test_split shuffles by default, so no separate
# (unseeded) shuffle is needed to avoid order bias.
benign_images = sorted(
    os.path.join(benign_path, img) for img in os.listdir(benign_path))

# First, split into train (80%) and temp (20%)
benign_train, benign_temp = train_test_split(
    benign_images, train_size=0.8, random_state=42)

# Then, split temp in half to get val (10%) and test (10%) of the total
benign_val, benign_test = train_test_split(
    benign_temp, train_size=0.5, random_state=42)

# Add Benign rows to each split DataFrame
df_train = pd.concat([df_train, pd.DataFrame(
    {'img_code': benign_train, 'target': 'Benign'})], ignore_index=True)
df_val = pd.concat([df_val, pd.DataFrame(
    {'img_code': benign_val, 'target': 'Benign'})], ignore_index=True)
df_test = pd.concat([df_test, pd.DataFrame(
    {'img_code': benign_test, 'target': 'Benign'})], ignore_index=True)

# Write each split to its own CSV file (without the DataFrame index).
output_dir = "../CSVs"

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

for split_name, split_df in (
        ("train", df_train), ("val", df_val), ("test", df_test)):
    split_df.to_csv(
        os.path.join(output_dir, split_name + "_combined_malimg.csv"),
        index=False)