In [1]:
from sklearn.model_selection import StratifiedKFold, train_test_split
import pandas as pd
import os
import numpy as np

from ccarl.ccarl import CCARLClassifier, _calculate_binders, _log_rfu_values
from ccarl.glycan_parsers.cfg_array_versions import get_likely_cfg_array_version
import ccarl

In [2]:
# Location of the raw CFG glycan-array CSV exports.
# NOTE(review): this constant is not referenced anywhere in the visible cells, and
# generate_test_training_splits reads from './Data/...' by default rather than
# '../Data/...' — confirm which location is the intended one.
DATA_DIR = "../Data/CFG_Data_Files/CFG_CSV_Data_Files/"

# CFG array data files to split into cross-validation folds, one entry per
# lectin / protein sample.
csv_files = [
    "ABA_14361_100ug_v5.0_DATA.csv",
    "ConA_13799-10ug_V5.0_DATA.csv",
    "PNA_14030_10ug_v5.0_DATA.csv",
    "RCAI_10ug_14110_v5.0_DATA.csv",
    "PHA-E-10ug_13853_V5.0_DATA.csv",
    "PHA-L-10ug_13856_V5.0_DATA.csv",
    "LCA_10ug_13934_v5.0_DATA.csv",
    "SNA_10ug_13631_v5.0_DATA.csv",
    "MAL-I_10ug_13883_v5.0_DATA.csv",
    "MAL_II_10ug_13886_v5.0_DATA.csv",
    "GSL-I-B4_10ug_13920_v5.0_DATA.csv",
    # Not a Vector Labs sample (none could be found in the data); only EY is available.
    "jacalin-1ug_14301_v5.0_DATA.csv",
    "WGA_14057_1ug_v5.0_DATA.csv",
    "UEAI_100ug_13806_v5.0_DATA.csv",
    "SBA_14042_10ug_v5.0_DATA.csv",
    "DBA_100ug_13897_v5.0_DATA.csv",
    "PSA_14040_10ug_v5.0_DATA.csv",
    "HA_PuertoRico_8_34_13829_v5_DATA.csv",
    "H3N8-HA_16686_v5.1_DATA.csv",
    "Human-DC-Sign-tetramer_15320_v5.0_DATA.csv",
]

In [3]:
# TODO: Run frequent subtree mining on the full list of glycans.

def generate_test_training_splits(csv_file, random_seed=None,
                                  data_dir='./Data/CFG_Data_Files/CFG_CSV_Data_Files/',
                                  cv_data_dir='./Data/CV_Folds/',
                                  thresholds=(1.5, 3.5), n_splits=5):
    """Write stratified train/test CSV folds for one CFG glycan-array data file.

    The file is read from ``data_dir``; its ``RFU`` and ``Structure`` columns are
    used. Rows are labelled with ``_calculate_binders``; rows labelled ``1`` are
    dropped and label ``2`` is recoded to ``1`` (presumably leaving a binary
    non-binder/binder target — confirm against ``_calculate_binders``). The
    remaining rows are split with a shuffled ``StratifiedKFold`` and, for each
    fold ``k`` (1-based), two files are written:

        <cv_data_dir>/fold_<k>/training_set_<csv_file>
        <cv_data_dir>/fold_<k>/test_set_<csv_file>

    each with the columns ``glycan``, ``binding`` and ``rfu``.

    Parameters
    ----------
    csv_file : str
        File name (not a full path) of the array data CSV inside ``data_dir``.
    random_seed : int or None
        Seed for NumPy's global random generator. ``None`` leaves the global
        state untouched; any integer, including ``0``, is applied.
    data_dir : str
        Directory containing ``csv_file``.
    cv_data_dir : str
        Directory under which the ``fold_<k>`` sub-directories are created.
    thresholds : tuple
        Passed through unchanged to ``_calculate_binders``.
    n_splits : int
        Number of cross-validation folds.

    Returns
    -------
    None
        The function is used for its side effect of writing the fold files.
    """
    # `is not None` rather than truthiness, so that a seed of 0 is honoured.
    if random_seed is not None:
        np.random.seed(random_seed)

    csv_data = pd.read_csv(os.path.join(data_dir, csv_file))
    rfu_data = csv_data.RFU
    structures = list(csv_data.Structure)
    log_rfu = _log_rfu_values(rfu_data)
    # np.asarray is a no-op for an ndarray and guarantees positional (not
    # label-based) indexing below should a pandas object ever be returned.
    binders = np.asarray(_calculate_binders(log_rfu, thresholds=thresholds))

    # Re-seed at the same point as before. StratifiedKFold below is created
    # without `random_state`, so its shuffle draws from this global generator.
    if random_seed is not None:
        np.random.seed(random_seed)

    # Match the CSV structures to the closest CFG array version and take the
    # glycan structures from that version. Only the glycan list is used here.
    glycan_list, _array_version, _mismatches, _sum_lev = get_likely_cfg_array_version(
        structures, distance_threshold=2)

    # Filter glycans, RFU values and labels with the same positional mask so
    # each row keeps its own RFU value. (A glycan -> RFU dict would hand every
    # repeated glycan the RFU of its last occurrence.)
    keep = binders != 1
    filtered_glycan_list = [glycan for glycan, kept in zip(glycan_list, keep) if kept]
    filtered_rfu = [rfu for rfu, kept in zip(rfu_data, keep) if kept]
    binding_class = binders[keep]  # boolean indexing yields a copy of `binders`
    binding_class[binding_class == 2] = 1

    # Generate test and training datasets.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True)
    splits = skf.split(filtered_glycan_list, binding_class)
    for fold, (train, test) in enumerate(splits, start=1):
        fold_dir = os.path.join(cv_data_dir, f"fold_{fold}")
        # to_csv does not create missing directories.
        os.makedirs(fold_dir, exist_ok=True)
        for prefix, indices in (("training_set", train), ("test_set", test)):
            fold_df = pd.DataFrame({
                'glycan': [filtered_glycan_list[i] for i in indices],
                'binding': binding_class[indices],
                'rfu': [filtered_rfu[i] for i in indices],
            })
            fold_df.to_csv(os.path.join(fold_dir, f"{prefix}_{csv_file}"), sep=',', index=False)
    return

In [4]:
# Write the cross-validation folds for every listed data file, using the same
# fixed seed for each one.
for csv_file in csv_files:
    generate_test_training_splits(csv_file=csv_file, random_seed=42)