# Data Pre-processing for Native vs Shuffled Pair Classification

Shuffle the light chains for half of the given dataset of paired antibody (Ab) sequences and return a class-balanced dataset of native and shuffled pairs. Class balancing takes place within each donor, which allows train/test splitting by donor.

The data are then split for classifier-head training using 5-fold cross-validation (CV).

In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold

# random seed: intended for the random selection of sequences to shuffle; the
# cells below also pass it as `random_state` to StratifiedKFold and to the
# per-fold row shuffling
seed = 42

# number of folds for the stratified k-fold cross-validation
k = 5

## dataset download

In [2]:
%%bash

# download the native vs shuffled pair datasets unless BOTH files already exist
# (with `&&` the download was skipped whenever only one of the two was missing)
if [ ! -e "../data/C_native-0_shuffled-1.csv" ] || [ ! -e "../data/D_native-0_shuffled-1.csv" ]; then
    # make sure the extraction target exists before calling tar
    mkdir -p ../data
    # --fail: stop on an HTTP error instead of saving the error page as the archive
    curl --fail -o 'pair_classification.tar.gz' -L 'https://zenodo.org/records/13973760/files/pair_classification.tar.gz?download=1' \
        && tar xzvf 'pair_classification.tar.gz' -C ../data
    rm -f 'pair_classification.tar.gz'
fi

In [3]:
# for reference: function used to shuffle pairs (Zenodo dataset downloads are already shuffled)
# NOTE(review): this version fixes several defects (see the docstring), so for a given
# seed its output is not expected to be row-identical to the downloaded files.
def pair_class_dataset(hd_data, seed=42):
    """
    Build a class-balanced dataset of native (label 0) and shuffled (label 1) pairs.

    For each donor, half of the paired sequences are sampled and kept as native
    pairs. The light chains of the remaining sequences are permuted among
    themselves to create shuffled pairs. Class balancing takes place within
    each donor to allow for train/test splitting by donor.

    Fixes relative to the earlier implementation:
      * the shuffled set is now built from the rows that were NOT sampled as
        native pairs (previously the first half of the rows was dropped
        instead, so native and shuffled sets overlapped);
      * shuffled rows that coincide with a native pair anywhere in the input
        are removed instead of only being reported;
      * classes are balanced by downsampling whichever class is larger, so a
        donor with an odd number of rows no longer raises in `sample`;
      * the caller's dataframe is no longer modified in place.

    Parameters
    ----------
    hd_data : pandas.DataFrame
        Needs the columns "sequence_id", "text" and "donor". "text" holds the
        heavy and light chain joined by "<cls><cls>".
    seed : int
        random_state used for every sampling/shuffling step.

    Returns
    -------
    pandas.DataFrame
        Columns "name", "h_sequence", "l_sequence", "donor", "label", where
        "name" is "<heavy sequence id>|<light sequence id>".

    Raises
    ------
    ValueError
        If a "text" entry does not split into exactly two chains.
    """
    out_cols = ["name", "h_sequence", "l_sequence", "donor", "label"]
    pair_cols = ["name", "h_sequence_id", "l_sequence_id", "h_sequence", "l_sequence", "donor", "label"]

    # split every row into heavy / light chain without touching the caller's frame
    records = []
    for seq_id, text, donor in zip(hd_data["sequence_id"], hd_data["text"], hd_data["donor"]):
        h, l = text.split("<cls><cls>")
        records.append({"name": seq_id, "h_sequence": h, "l_sequence": l, "donor": donor})
    chains = pd.DataFrame(records, columns=["name", "h_sequence", "l_sequence", "donor"])

    # every native (heavy, light) combination across ALL donors; used to reject
    # shuffled pairs that happen to reproduce a native pairing
    native_keys = set(zip(chains["h_sequence"], chains["l_sequence"]))

    # 50-50 split of native and shuffled pairs
    def shuffle_lc(df, seed=seed):
        df = df.reset_index(drop=True)

        # native pairs
        native_pairs = df.sample(frac=0.5, random_state=seed)

        # remaining data for making shuffled pairs; this must use the sampled
        # index labels, i.e. happen BEFORE the index of native_pairs is reset
        shuffled = df.drop(index=native_pairs.index).reset_index(drop=True)

        native_pairs = native_pairs.reset_index(drop=True)
        native_pairs = native_pairs.assign(
            h_sequence_id=native_pairs["name"],
            l_sequence_id=native_pairs["name"],
            label=0,
        )

        shuffled = shuffled.assign(h_sequence_id=shuffled["name"], label=1)
        print(len(shuffled))

        # shuffle light chains (ids and sequences move together)
        shuffled = shuffled.rename(columns={"l_sequence": "old_lc"})
        permuted = (
            shuffled[["name", "old_lc"]]
            .sample(frac=1.0, random_state=seed)
            .reset_index(drop=True)
        )
        shuffled["l_sequence_id"] = permuted["name"]
        shuffled["l_sequence"] = permuted["old_lc"]

        # only take ones where the light chain pairing got changed
        shuffled = shuffled[shuffled["l_sequence"] != shuffled["old_lc"]].reset_index(drop=True)
        print(len(shuffled))

        # no native pairs may appear in the shuffled dataset (e.g. a heavy chain
        # has multiple productive light chain pairings and one of those ended up
        # in the shuffled dataset)
        is_native = pd.Series(
            [pair in native_keys for pair in zip(shuffled["h_sequence"], shuffled["l_sequence"])],
            index=shuffled.index,
            dtype=bool,
        )
        if not is_native.any():
            print("no native pairs in shuffled dataset.")
        else:
            print(f"removed {int(is_native.sum())} native pairs from shuffled dataset.")
            shuffled = shuffled[~is_native].reset_index(drop=True)

        # match class sizes by downsampling to the smaller class
        n_per_class = min(len(native_pairs), len(shuffled))
        native_pairs = native_pairs.sample(n=n_per_class, random_state=seed)
        if len(shuffled) > n_per_class:
            shuffled = shuffled.sample(n=n_per_class, random_state=seed)

        pair_data = pd.concat([native_pairs[pair_cols], shuffled[pair_cols]]).reset_index(drop=True)

        # concat names for easier parsing later on
        pair_data["name"] = pair_data["h_sequence_id"] + "|" + pair_data["l_sequence_id"]
        return pair_data.drop(columns=["h_sequence_id", "l_sequence_id"])

    # apply chain shuffling to each donor separately (to allow for class-balanced
    # training/test sets each with independent donors)
    per_donor = []
    for donor in chains["donor"].unique():
        print(donor)
        shuffled_df = shuffle_lc(chains[chains["donor"] == donor])
        print(shuffled_df["label"].value_counts(), "\n")
        if not shuffled_df.empty:
            per_donor.append(shuffled_df)

    # concatenate once at the end; this also keeps "label" as an integer column
    if per_donor:
        classifier_df = pd.concat(per_donor, ignore_index=True)[out_cols]
    else:
        classifier_df = pd.DataFrame(columns=out_cols)

    print("--- \noverall class sizes:")
    print(classifier_df["label"].value_counts())

    return classifier_df

## 5-fold CV splits for classifier training

### Dataset C

In [4]:
# load dataset C and discard pairs whose combined heavy + light length exceeds 315
df_C = pd.read_csv("../data/C_native-0_shuffled-1.csv")
pair_lengths_C = pd.Series(
    [len(h) + len(l) for h, l in zip(df_C["h_sequence"], df_C["l_sequence"])],
    index=df_C.index,
)
too_long_C = pair_lengths_C.index[pair_lengths_C > 315]
df_C.drop(index=too_long_C, inplace=True)
print(len(df_C))

64874


In [5]:
# stratified, shuffled k-fold CV over dataset C; writes one train/test CSV pair per fold
import os

# to_csv does not create missing directories
os.makedirs("./train-test_splits", exist_ok=True)

skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
X = df_C.drop("label", axis=1)
y = df_C.loc[:, "label"].astype("int64")

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(f"Fold {i}:")

    # skf.split yields POSITIONAL indices, while df_C keeps its original index
    # labels (with gaps wherever long sequences were dropped), so rows have to
    # be selected with .iloc rather than .loc
    train = df_C.iloc[train_index].sample(frac=1, random_state=seed)
    test = df_C.iloc[test_index].sample(frac=1, random_state=seed)

    print(train["label"].value_counts())
    print(test["label"].value_counts(), "\n")

    # reset index
    train = train.reset_index(drop=True)
    test = test.reset_index(drop=True)

    # save as csvs
    train.to_csv(f'./train-test_splits/C_native-0_shuffled-1_train{i}.csv', index=False)
    test.to_csv(f'./train-test_splits/C_native-0_shuffled-1_test{i}.csv', index=False)

Fold 0:
1    25950
0    25949
Name: label, dtype: int64
0    6488
1    6487
Name: label, dtype: int64 

Fold 1:
1    25950
0    25949
Name: label, dtype: int64
0    6488
1    6487
Name: label, dtype: int64 

Fold 2:
0    25950
1    25949
Name: label, dtype: int64
1    6488
0    6487
Name: label, dtype: int64 

Fold 3:
0    25950
1    25949
Name: label, dtype: int64
1    6488
0    6487
Name: label, dtype: int64 

Fold 4:
1    25950
0    25950
Name: label, dtype: int64
0    6487
1    6487
Name: label, dtype: int64 



### Dataset D

In [6]:
# load dataset D and discard pairs whose combined heavy + light length exceeds 315
df_D = pd.read_csv("../data/D_native-0_shuffled-1.csv")
pair_lengths_D = pd.Series(
    [len(h) + len(l) for h, l in zip(df_D["h_sequence"], df_D["l_sequence"])],
    index=df_D.index,
)
too_long_D = pair_lengths_D.index[pair_lengths_D > 315]
df_D.drop(index=too_long_D, inplace=True)
print(len(df_D))

146668


In [7]:
# stratified, shuffled k-fold CV over dataset D; writes one train/test CSV pair per fold
import os

# to_csv does not create missing directories
os.makedirs("./train-test_splits", exist_ok=True)

skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
X = df_D.drop("label", axis=1)
y = df_D.loc[:, "label"].astype("int64")

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(f"Fold {i}:")

    # skf.split yields POSITIONAL indices, while df_D keeps its original index
    # labels (with gaps wherever long sequences were dropped), so rows have to
    # be selected with .iloc rather than .loc
    train = df_D.iloc[train_index].sample(frac=1, random_state=seed)
    test = df_D.iloc[test_index].sample(frac=1, random_state=seed)

    print(train["label"].value_counts())
    print(test["label"].value_counts(), "\n")

    # reset index
    train = train.reset_index(drop=True)
    test = test.reset_index(drop=True)

    # save as csvs
    train.to_csv(f'./train-test_splits/D_native-0_shuffled-1_train{i}.csv', index=False)
    test.to_csv(f'./train-test_splits/D_native-0_shuffled-1_test{i}.csv', index=False)

Fold 0:
0    58667
1    58667
Name: label, dtype: int64
1    14667
0    14667
Name: label, dtype: int64 

Fold 1:
0    58667
1    58667
Name: label, dtype: int64
1    14667
0    14667
Name: label, dtype: int64 

Fold 2:
0    58667
1    58667
Name: label, dtype: int64
1    14667
0    14667
Name: label, dtype: int64 

Fold 3:
1    58668
0    58667
Name: label, dtype: int64
0    14667
1    14666
Name: label, dtype: int64 

Fold 4:
0    58668
1    58667
Name: label, dtype: int64
1    14667
0    14666
Name: label, dtype: int64 

