# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import os 
import random

# Combine the train and test datasets

In [None]:
# Module-level placeholder. merge_csv_files binds its own local `data`
# (there is no `global` statement), so this list is not modified by it.
data = []
def merge_csv_files(file1, file2, output_file):
    """Stack two CSV files and save the result as an Excel workbook.

    The rows of ``file1`` come first, followed by the rows of ``file2``, with
    a fresh 0..n-1 index. The first column of the stacked frame is moved to
    the last position before the frame is written to ``output_file`` without
    an index column.
    """
    # Load both inputs in argument order so file1's rows stay on top
    frames = [pd.read_csv(path) for path in (file1, file2)]
    stacked = pd.concat(frames, ignore_index=True)

    # Rotate the column order by one: first column goes to the end
    names = stacked.columns.tolist()
    rotated = stacked[names[1:] + [names[0]]]

    # Persist as Excel, dropping the row index
    rotated.to_excel(output_file, index=False)

# Stack the test CSV on top of the train CSV and write one combined workbook
merge_csv_files(
    file1="./fashion-mnist_test.csv",
    file2="./fashion-mnist_train.csv",
    output_file="./fashion-mnist.xlsx",
)

# Generate client datasets (full-non-iid)

In [None]:
# Load the combined dataset from the Excel workbook
data = pd.read_excel("./fashion-mnist.xlsx")

# Define the number of classes and class labels (0 .. num_classes-1)
num_classes = 10
class_labels = np.arange(num_classes)

# Collects the per-class frames kept by create_full_non_iid_dataset
new_dataset = []

# Create a directory to store the Excel files for each node.
# exist_ok=True makes this a no-op when the directory already exists and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs('./client-datasets/full-non-iid', exist_ok=True)

# Function to split the dataset into nodes and save to Excel
def create_full_non_iid_dataset(class_label):
    """Export the first half of one class's rows as a client workbook.

    Selects the rows of the module-level ``data`` whose ``'label'`` column
    equals ``class_label``, keeps the first ``len // 2`` of them, writes them
    to ``./client-datasets/full-non-iid/client-<class_label+1>.xlsx`` and then
    appends the same frame to the module-level ``new_dataset`` list.
    """
    rows_for_class = data.loc[data['label'] == class_label]

    # Integer division: with an odd row count the middle row is not kept
    half_count = len(rows_for_class) // 2
    kept_rows = rows_for_class.iloc[:half_count]

    # Client files are numbered from 1, labels from 0
    kept_rows.to_excel(
        f'./client-datasets/full-non-iid/client-{class_label+1}.xlsx',
        index=False,
    )

    # Remember the kept subset so the caller can rebuild a combined dataset
    new_dataset.append(kept_rows)

# Write one single-class workbook per node; every call also appends the rows
# it kept to new_dataset
for class_label in class_labels:
    create_full_non_iid_dataset(class_label)

# Stack the kept per-class frames into one dataframe with a fresh index
new_dataset_combined = pd.concat(objs=new_dataset, ignore_index=True)

# Persist the reduced dataset as a single Excel workbook
new_dataset_combined.to_excel('./fashion-mnist-new.xlsx', index=False)


# Generate client datasets (99%-non-iid)

In [None]:
# Load the dataset from the reduced Excel workbook
data = pd.read_excel("./fashion-mnist-new.xlsx")

# Define the number of classes and class labels (0 .. num_classes-1)
num_classes = 10
class_labels = np.arange(num_classes)

# Create a directory to store the Excel files for each node.
# exist_ok=True makes this a no-op when the directory already exists and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs('./client-datasets/99-non-iid', exist_ok=True)

# Function to split the dataset into nodes and save to Excel
def create_99_non_iid_datasets():
    """Write one 99%-non-iid Excel workbook per node.

    Reads the module-level ``data``, ``class_labels`` and ``num_classes``.
    The rows are shuffled with a fixed seed; for every label, the first 99%
    of that label's rows (in shuffled order) go to the node of the same
    label. The rows left over from all labels are pooled, shuffled and dealt
    round-robin across the nodes. Each node's rows are saved to
    ``./client-datasets/99-non-iid/client-<label+1>.xlsx``.

    Rows whose label is not in ``class_labels`` are not written anywhere.
    """
    # Shuffle the data (seeded, so the per-class 99% cut is reproducible)
    data_shuffled = data.sample(frac=1, random_state=42)

    # Each node collects a list of frames that is concatenated once at the
    # end; DataFrame.append no longer exists in pandas >= 2.0 and growing a
    # frame row by row is quadratic anyway.
    node_parts = {}
    remaining_data_indices = []

    # Assign 99% of each class to its own node and remember the index labels
    # of the rows that were NOT assigned, so the leftover pool is exactly the
    # complement of what has been handed out (no duplicates, no lost rows).
    for label in class_labels:
        class_data = data_shuffled[data_shuffled['label'] == label]
        cutoff = int(0.99 * len(class_data))
        node_parts[label] = [class_data.iloc[:cutoff, :]]
        remaining_data_indices.extend(class_data.index[cutoff:].tolist())

    # Distribute the remaining ~1% evenly among all nodes.
    # NOTE: random.shuffle draws from the global RNG, which this function
    # does not seed, so the leftover assignment varies between runs.
    random.shuffle(remaining_data_indices)
    for position in range(num_classes):
        # Every num_classes-th leftover row, starting at this node's offset
        share = remaining_data_indices[position::num_classes]
        if share:
            node_parts[class_labels[position]].append(data_shuffled.loc[share])

    # Save data to Excel for each node
    for label, parts in node_parts.items():
        node_df = pd.concat(parts)
        node_df.to_excel(f'./client-datasets/99-non-iid/client-{label+1}.xlsx', index=False)

# Run the 99%-non-iid split; this writes one Excel file per node under
# ./client-datasets/99-non-iid/
create_99_non_iid_datasets()