In [13]:
import os

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler

In [54]:
# Load Iris. `features` holds the min-max scaled measurements, while `df`
# keeps the unscaled measurements together with the class label.
iris_data = load_iris()
labels = iris_data.target
features = MinMaxScaler().fit_transform(iris_data.data)
df = pd.DataFrame(iris_data.data, columns=iris_data.feature_names)
df["class"] = pd.Series(iris_data.target)

# Outer cross-validation: k shuffled, stratified folds with a fixed seed.
random_seed = 123
k = 5
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=random_seed)

# Separate the (train, test) pairs produced by the splitter into two sequences.
train_folds, test_folds = zip(*skf.split(features, labels))

# Arrange the indices as tables with one column per fold.
train_index_total = np.array(train_folds).reshape((k, -1)).T
test_index_total = np.array(test_folds).reshape((k, -1)).T

In [55]:
# Persist the outer-fold index tables (one column per fold) as Excel sheets
# without header row or index column.
os.makedirs("data", exist_ok=True)  # to_excel does not create a missing target directory

# Convert the numpy arrays to DataFrames
train_index_df = pd.DataFrame(train_index_total)
test_index_df = pd.DataFrame(test_index_total)

# Save the DataFrames to Excel files
train_index_df.to_excel("data/train_index_total.xlsx", index=False, header=False)
test_index_df.to_excel("data/test_index_total.xlsx", index=False, header=False)

In [16]:
# Build the "_new" nested index tables: one pair of Excel files per outer fold.
# For each outer fold, the first inner stratified fold divides the subset into
# a larger part that is always kept and a smaller held-out part that is added
# back one class at a time, giving one output column per class.
os.makedirs("data", exist_ok=True)  # to_excel does not create a missing target directory

k_2 = 5
skf_2 = StratifiedKFold(n_splits=k_2, shuffle=True, random_state=random_seed)
class_ids = np.unique(labels)

# Outer folds are the columns of the index tables; numbering is 1-based to
# match the file names. Looping with `split` leaves the global `k` untouched.
for split in range(1, train_index_total.shape[1] + 1):
    train_index_2 = train_index_total[:, split - 1]
    test_index_2 = test_index_total[:, split - 1]

    train_features = features[train_index_2]
    train_labels = labels[train_index_2]
    test_features = features[test_index_2]
    test_labels = labels[test_index_2]

    # Only the first inner fold is used. split() yields positions *within* the
    # subset passed in, not row numbers of the full dataset.
    train_keep_pos, train_exclude_pos = next(skf_2.split(train_features, train_labels))
    test_keep_pos, test_exclude_pos = next(skf_2.split(test_features, test_labels))

    # Translate positions to dataset row indices through the outer index
    # vectors. Looking rows up by their feature values instead is wrong
    # whenever two samples share identical measurements, because both then
    # resolve to the first matching row.
    train_index_2_keep_total = train_index_2[train_keep_pos]
    train_index_2_exclude_total = train_index_2[train_exclude_pos]
    test_index_2_keep_total = test_index_2[test_keep_pos]
    test_index_2_exclude_total = test_index_2[test_exclude_pos]

    # One column per class: the kept indices plus the held-out indices of that
    # class, sorted. Classes are selected by label rather than by position, and
    # column_stack requires every class to contribute the same number of rows.
    train_index_2_combined = np.column_stack([
        np.sort(np.concatenate((
            train_index_2_keep_total,
            train_index_2_exclude_total[labels[train_index_2_exclude_total] == class_id],
        )))
        for class_id in class_ids
    ])
    test_index_2_combined = np.column_stack([
        np.sort(np.concatenate((
            test_index_2_keep_total,
            test_index_2_exclude_total[labels[test_index_2_exclude_total] == class_id],
        )))
        for class_id in class_ids
    ])

    train_index_2_combined_df = pd.DataFrame(train_index_2_combined)
    test_index_2_combined_df = pd.DataFrame(test_index_2_combined)

    train_index_2_combined_df.to_excel(f"data/train_index_split{split}_new.xlsx", index=False, header=False)
    test_index_2_combined_df.to_excel(f"data/test_index_split{split}_new.xlsx", index=False, header=False)

In [53]:
# Build the nested index tables: one pair of Excel files per outer fold.
# For each outer fold, the first inner stratified fold divides the subset into
# a smaller held-out part that is always kept and a larger part that is added
# back one class at a time, giving one output column per class.
os.makedirs("data", exist_ok=True)  # to_excel does not create a missing target directory

k_2 = 5
skf_2 = StratifiedKFold(n_splits=k_2, shuffle=True, random_state=random_seed)
class_ids = np.unique(labels)

# Outer folds are the columns of the index tables; numbering is 1-based to
# match the file names. Looping with `split` leaves the global `k` untouched.
for split in range(1, train_index_total.shape[1] + 1):
    train_index_2 = train_index_total[:, split - 1]
    test_index_2 = test_index_total[:, split - 1]

    train_features = features[train_index_2]
    train_labels = labels[train_index_2]
    test_features = features[test_index_2]
    test_labels = labels[test_index_2]

    # Only the first inner fold is used, with the roles swapped relative to the
    # splitter's output: its training part is "exclude", its test part is
    # "keep". split() yields positions *within* the subset passed in, not row
    # numbers of the full dataset.
    train_exclude_pos, train_keep_pos = next(skf_2.split(train_features, train_labels))
    test_exclude_pos, test_keep_pos = next(skf_2.split(test_features, test_labels))

    # Translate positions to dataset row indices through the outer index
    # vectors. Looking rows up by their feature values instead is wrong
    # whenever two samples share identical measurements, because both then
    # resolve to the first matching row.
    train_index_2_keep_total = train_index_2[train_keep_pos]
    train_index_2_exclude_total = train_index_2[train_exclude_pos]
    test_index_2_keep_total = test_index_2[test_keep_pos]
    test_index_2_exclude_total = test_index_2[test_exclude_pos]

    # One column per class: the kept indices plus the excluded indices of that
    # class, sorted. Classes are selected by label rather than by position, and
    # column_stack requires every class to contribute the same number of rows.
    train_index_2_combined = np.column_stack([
        np.sort(np.concatenate((
            train_index_2_keep_total,
            train_index_2_exclude_total[labels[train_index_2_exclude_total] == class_id],
        )))
        for class_id in class_ids
    ])
    test_index_2_combined = np.column_stack([
        np.sort(np.concatenate((
            test_index_2_keep_total,
            test_index_2_exclude_total[labels[test_index_2_exclude_total] == class_id],
        )))
        for class_id in class_ids
    ])

    train_index_2_combined_df = pd.DataFrame(train_index_2_combined)
    test_index_2_combined_df = pd.DataFrame(test_index_2_combined)

    train_index_2_combined_df.to_excel(f"data/train_index_split{split}.xlsx", index=False, header=False)
    test_index_2_combined_df.to_excel(f"data/test_index_split{split}.xlsx", index=False, header=False)