# Dataset Creation for Steganography

## Import Required Libraries

In [2]:
import os
import shutil
import glob
import random
import string
from PIL import Image
import numpy as np
from tqdm import tqdm
import pandas as pd

## Copies cover images from a source directory to a destination directory

In [9]:
def copy_cover_images(dataset_folder, source_dir='dataset_8/cover/'):
    """Copy cover images into ``<dataset_folder>/cover``.

    Files already present in the destination (matched by base name) are left
    untouched, so the function can be re-run without re-copying everything.

    Args:
        dataset_folder: Root folder of the dataset being built. The images
            are placed in its ``cover`` sub-folder, which is created if it
            does not exist yet.
        source_dir: Folder holding the original cover images. Defaults to
            ``'dataset_8/cover/'``, the location that used to be hard-coded.
    """
    destination_dir = os.path.join(dataset_folder, 'cover')
    os.makedirs(destination_dir, exist_ok=True)

    for source_path in glob.glob(os.path.join(source_dir, '*')):
        # glob('*') also matches sub-directories, which shutil.copy cannot copy.
        if not os.path.isfile(source_path):
            continue

        target_path = os.path.join(destination_dir, os.path.basename(source_path))
        if os.path.exists(target_path):
            continue

        shutil.copy(source_path, target_path)

## Generates metadata for the cover images, storing details such as image name, label (stego/cover), start index, and message length

In [10]:
def make_metadata_cover(dataset_path):
    """Write ``metadata_cover.txt`` describing every file in ``<dataset_path>/cover``.

    The file is a CSV with the header
    ``image_name,is_stego,start_index,message_length_bits``. Each cover image
    gets one row whose last three fields are all ``0`` (not stego, no start
    index, no message).

    Args:
        dataset_path: Root folder of the dataset; must contain a ``cover``
            sub-folder. The metadata file is written directly inside it.
    """
    import csv  # local import keeps this cell self-contained

    metadata_file_path = os.path.join(dataset_path, 'metadata_cover.txt')
    # Sort so the row order is reproducible; glob order is arbitrary.
    cover_files = sorted(glob.glob(os.path.join(dataset_path, 'cover', '*')))

    # UTF-8 matches the default encoding pandas.read_csv uses to read it back.
    with open(metadata_file_path, 'w', encoding='utf-8') as metadata_file:
        # csv.writer quotes names containing commas or quotes, so such file
        # names no longer corrupt the row. The '\n' terminator keeps the
        # output identical to the previous hand-written lines.
        writer = csv.writer(metadata_file, lineterminator='\n')
        writer.writerow(['image_name', 'is_stego', 'start_index', 'message_length_bits'])

        for cover_file in cover_files:
            writer.writerow([os.path.basename(cover_file), 0, 0, 0])

## Encodes a binary message into an image using the LSB method (random start index, sequential encoding)

In [None]:
def string_to_binary(message):
    """Return ``message`` as a string of '0'/'1' characters.

    Each character is rendered as the binary form of its code point,
    zero-padded to at least 8 digits, and the groups are concatenated in
    order. Code points above 255 produce more than 8 digits.
    """
    bit_groups = []
    for character in message:
        bit_groups.append(f'{ord(character):08b}')
    return ''.join(bit_groups)

def random_message(length):
    """Return a random string of ``length`` characters.

    Characters are drawn independently, one ``random.choice`` call each, from
    ASCII letters, digits, punctuation and the space character.
    """
    alphabet = ''.join((string.ascii_letters, string.digits, string.punctuation, ' '))
    picked = []
    for _ in range(length):
        picked.append(random.choice(alphabet))
    return ''.join(picked)

def encode_lsb_row_wise(image, binary_message, bits=1):
    """Embed a bit string into the low bits of consecutive image values.

    The image is flattened in row-major order. Starting at a randomly chosen
    flat index, each successive value has its ``bits`` least-significant bits
    replaced by the next ``bits`` characters of ``binary_message``. The input
    array is not modified.

    Args:
        image: NumPy array of pixel data. The caller in this notebook passes
            a ``uint8`` RGB array of shape (height, width, 3); other shapes
            such as 2-D grayscale are also accepted.
        binary_message: String of '0'/'1' characters to embed. If it does not
            fit between the start index and the end of the image it is
            silently truncated. A final group shorter than ``bits`` is
            right-padded with '0'.
        bits: Number of low bits overwritten per value, from 1 to 8. With 8
            the whole value is replaced.

    Returns:
        Tuple ``(encoded_image, random_start)``: a modified copy with the
        same shape as ``image``, and the flat index of the first value that
        carries payload.

    Raises:
        ValueError: If ``bits`` is outside 1..8, or ``binary_message``
            contains characters other than '0' and '1'.
    """
    if not 1 <= bits <= 8:
        raise ValueError(f'bits must be between 1 and 8, got {bits!r}')

    encoded_image = image.copy()
    # copy() returns a C-contiguous array, so this reshape is a view and
    # writes through it land in encoded_image.
    flat_image = encoded_image.reshape(-1)
    total_values = flat_image.size
    bin_len = len(binary_message)

    # Leave room for the payload (plus a small margin) after the start index
    # whenever the image is large enough.
    random_start = random.randint(0, max(0, total_values - (bin_len // bits) - 10))

    # Drop whatever does not fit between the start index and the end.
    max_bits = (total_values - random_start) * bits
    payload = binary_message[:max_bits]

    if payload:
        # Right-pad the last group so every value receives exactly `bits` bits.
        remainder = len(payload) % bits
        if remainder:
            payload += '0' * (bits - remainder)

        chunk_values = np.array(
            [int(payload[i:i + bits], 2) for i in range(0, len(payload), bits)],
            dtype=np.uint8,
        )

        # Keep the high (8 - bits) bits of each value, then write the payload.
        keep_mask = np.uint8((0xFF << bits) & 0xFF)
        segment = flat_image[random_start:random_start + chunk_values.size]
        segment &= keep_mask
        segment |= chunk_values

    return flat_image.reshape(encoded_image.shape), random_start

## Generates the stego images and their metadata, storing details such as image name, label (stego/cover), start index, and message length

In [12]:
def create_stego_metadata(dataset_dir, bits):
    """Create one stego image per cover image and write ``metadata_stego.txt``.

    For every file in ``<dataset_dir>/cover`` a random message of 2000-3000
    characters is generated, converted to bits and embedded with
    ``encode_lsb_row_wise``. The result is saved under the same file name in
    ``<dataset_dir>/stego``. A CSV row ``image_name,1,start_index,
    message_length_bits`` is appended for each image.

    Args:
        dataset_dir: Root folder of the dataset; must contain ``cover``.
            ``stego`` is created if missing.
        bits: Number of low bits overwritten per channel value, passed
            through to ``encode_lsb_row_wise``.
    """
    stego_dir = os.path.join(dataset_dir, 'stego')
    os.makedirs(stego_dir, exist_ok=True)
    # Sort so the processing and row order are reproducible.
    cover_files = sorted(glob.glob(os.path.join(dataset_dir, 'cover', '*')))
    metadata_file_path = os.path.join(dataset_dir, 'metadata_stego.txt')

    with open(metadata_file_path, 'w') as metadata_file:
        metadata_file.write('image_name,is_stego,start_index,message_length_bits\n')

        for cover_file in tqdm(cover_files, desc='Processing cover images'):
            image_name = os.path.basename(cover_file)

            # Close the file handle as soon as the pixels are in memory;
            # otherwise thousands of handles stay open until garbage collection.
            with Image.open(cover_file) as cover_image:
                cover_image_array = np.array(cover_image.convert('RGB'))

            message_length = random.randint(2000, 3000)
            binary_message = string_to_binary(random_message(message_length))

            encoded_image, random_start = encode_lsb_row_wise(
                cover_image_array, binary_message, bits=bits
            )

            # The encoder truncates a payload that does not fit after the
            # start index, so record the number of bits actually embedded.
            capacity_bits = (cover_image_array.size - random_start) * bits
            embedded_bits = min(len(binary_message), capacity_bits)

            # NOTE(review): save() infers the format from the file extension.
            # If the covers use a lossy format (e.g. JPEG) the embedded low
            # bits may not survive re-encoding; confirm the covers are lossless.
            stego_image = Image.fromarray(encoded_image.astype('uint8'))
            stego_image.save(os.path.join(stego_dir, image_name))
            metadata_file.write(f'{image_name},1,{random_start},{embedded_bits}\n')

## Combines the cover and stego metadata into a single file

In [13]:
def combine_metadata(dataset_path):
    """Merge the cover and stego metadata files into ``metadata.txt``.

    Reads ``metadata_cover.txt`` and ``metadata_stego.txt`` from
    ``dataset_path`` as CSV and stacks them, cover rows first, with a fresh
    index. The result is written to ``metadata.txt`` in the same folder
    without the index column.
    """
    frames = [
        pd.read_csv(os.path.join(dataset_path, part_name))
        for part_name in ('metadata_cover.txt', 'metadata_stego.txt')
    ]
    merged = pd.concat(frames, ignore_index=True)

    output_path = os.path.join(dataset_path, 'metadata.txt')
    merged.to_csv(output_path, index=False)

## Function for creating the dataset

In [14]:
def create_dataset(dataset_folder, bits):
    """Build a complete cover/stego dataset under ``dataset_folder``.

    Runs the pipeline in order: copy the cover images, write the cover
    metadata, generate the stego images with their metadata, then merge both
    metadata files. Each step reads what the previous one wrote, so the order
    matters.

    Args:
        dataset_folder: Root folder of the dataset to create.
        bits: Number of low bits overwritten per value when embedding;
            forwarded to ``create_stego_metadata``.
    """
    copy_cover_images(dataset_folder)
    make_metadata_cover(dataset_folder)
    create_stego_metadata(dataset_folder, bits=bits)
    combine_metadata(dataset_folder)

## Create Dataset with 8 Bits

In [None]:
# dataset_8/cover is the folder copy_cover_images() copies from, so the copy
# step is skipped here and the remaining pipeline steps are run directly.
make_metadata_cover('dataset_8')
create_stego_metadata('dataset_8', bits=8)
combine_metadata('dataset_8')

## Create Dataset with 7 Bits

In [None]:
# Build dataset_7/: copy the covers, then overwrite the 7 low bits of each payload-carrying value.
create_dataset('dataset_7', bits=7)

## Create Dataset with 6 Bits

In [9]:
# Build dataset_6/: copy the covers, then overwrite the 6 low bits of each payload-carrying value.
create_dataset('dataset_6', bits=6)

Processing cover images: 100%|██████████| 16338/16338 [23:34<00:00, 11.55it/s]


## Create Dataset with 5 Bits

In [10]:
# Build dataset_5/: copy the covers, then overwrite the 5 low bits of each payload-carrying value.
create_dataset('dataset_5', bits=5)

Processing cover images: 100%|██████████| 16338/16338 [19:17<00:00, 14.12it/s]


## Create Dataset with 4 Bits

In [11]:
# Build dataset_4/: copy the covers, then overwrite the 4 low bits of each payload-carrying value.
create_dataset('dataset_4', bits=4)

Processing cover images: 100%|██████████| 16338/16338 [19:40<00:00, 13.84it/s]


## Create Dataset with 3 Bits

In [12]:
# Build dataset_3/: copy the covers, then overwrite the 3 low bits of each payload-carrying value.
create_dataset('dataset_3', bits=3)

Processing cover images: 100%|██████████| 16338/16338 [41:04<00:00,  6.63it/s]


## Create Dataset with 2 Bits

In [13]:
# Build dataset_2/: copy the covers, then overwrite the 2 low bits of each payload-carrying value.
create_dataset('dataset_2', bits=2)

Processing cover images: 100%|██████████| 16338/16338 [40:04<00:00,  6.79it/s]


## Create Dataset with 1 Bit

In [15]:
# Build dataset_1/: copy the covers, then overwrite only the lowest bit of each payload-carrying value.
create_dataset('dataset_1', bits=1)

Processing cover images: 100%|██████████| 16338/16338 [30:34<00:00,  8.91it/s]
