In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold

In [None]:
df_C = pd.read_csv("./data/paired_classification/shuffled_data.csv")

# filter out long sequences: keep only rows whose combined h_sequence +
# l_sequence length is at most `max_combined_len` characters.
max_combined_len = 315

# Column-wise length computation instead of a row-wise DataFrame.apply; like the
# original, `len` still raises on a non-string (e.g. missing) sequence value.
combined_len = df_C["h_sequence"].map(len) + df_C["l_sequence"].map(len)

# Select with a boolean mask and rebuild a contiguous 0..n-1 index. Dropping
# rows in place would leave gaps in the index, so positional row numbers (such
# as those produced by scikit-learn splitters) would no longer line up with
# the index labels.
df_C = df_C.loc[combined_len <= max_combined_len].reset_index(drop=True)
print(len(df_C))

In [None]:
# random seed shared by the fold assignment and the per-fold row shuffling
seed = 42
# number of cross-validation folds
k = 5

In [None]:
# Shuffled, stratified k-fold CV: the splitter yields one (train, test) pair of
# row-position arrays per fold; the fixed seed makes the folds reproducible.
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
X = df_C.drop("label", axis=1)
y = df_C.loc[:, "label"].astype("int64")

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(f"Fold {i}:")

    # select data by each CV fold
    # skf.split returns *positional* row numbers (0..n-1), not index labels, so
    # the rows must be taken with .iloc. Label-based .loc only matches when the
    # index is exactly 0..n-1; after rows have been dropped from df_C it would
    # raise KeyError or pick the wrong rows.
    train = df_C.iloc[train_index].sample(frac=1, random_state=seed)
    test = df_C.iloc[test_index].sample(frac=1, random_state=seed)

    # per-fold class balance, as a sanity check on the stratification
    print(train["label"].value_counts())
    print(test["label"].value_counts(), "\n")

    # reset index so the shuffled rows are numbered 0..n-1 again
    train = train.reset_index(drop=True)
    test = test.reset_index(drop=True)

    # save as csvs (one train/test pair per fold, index column omitted)
    train.to_csv(
        f"./data/paired_classification/train-test_splits/native-0_shuffled-1_train{i}.csv",
        index=False,
    )
    test.to_csv(
        f"./data/paired_classification/train-test_splits/native-0_shuffled-1_test{i}.csv",
        index=False,
    )