# Start

In [1]:
% Input data and results directories, given relative to the current working directory.
datadir = '../data';
wkdir = '../results';

% Project helper; presumably creates wkdir if it is missing, with the second
% argument (false) controlling whether to cd into it -- TODO confirm.
mk_cd_dir(wkdir, false);
%imatlab_export_fig('print-png')

% loocv inputs

% Master switch: the LOOCV section only runs when this is true
prepare_loocv = true

% If true, LOOCV is restricted to a demo subset of compounds selected by the settings below
demo_loocv = true

demo_loocv_number_or_list = 'number' % 'number' or 'list'; any other value raises an error in the LOOCV section

% Used when demo_loocv_number_or_list is 'number': LOOCV runs on the first N compounds of the unique compound list (at least 1)
demo_loocv_number_cmpds = 2

% Used when demo_loocv_number_or_list is 'list': LOOCV runs on the compounds of this list that are found in the unique compound list
demo_loocv_list_cmpds = {'BRD-K04804440','BRD-K01507359','BRD-K87202646','BRD-K59853741', 'BRD-K27302037'} % Ciprofloxacin, Rifampin, Isoniazid, Q203, Thioacetazone

% Prefix (relative to wkdir) of each leave-one-out working directory; the left-out compound ID is appended to it
results_subdir_prefix = 'loocv_pcls/leave_out_cmpd_'

loocv_save_out = false % save tabular file for each treatment's PCL similarity score; NOTE(review): not referenced in the visible cells

loocv_corr_filename = 'sGR_for_pcls_pearson_corr' % LOOCV only considers KABX dsCGI profiles, for faster runtime and lower storage; located as <name>_n*.gctx under wkdir

loocv_col_meta_all_filename = 'col_meta_kabx.txt' % LOOCV only considers KABX dsCGI profiles, for faster runtime and lower storage

% Table read with rtable in the LOOCV section; its kabx_cmpd column provides the compounds to leave out
unique_kabx_cmpds_tbl_path = '../results/kabx_pert_ids_tbl_for_loocv.txt'

% general inputs

% Cluster definitions (gmt), read from the previous spectral clustering output directory
clusters_gmt_filename = 'clusters_spectral_clust.gmt'

% Basename of the correlation gctx; the file is located as <corr_filename>_n*.gctx under wkdir
corr_filename = 'sGR_kabx_gsk_brd4310_pearson_corr'

% Column metadata of all treatments, under wkdir
col_meta_all_filename = 'col_meta.txt'

% Column metadata of the KABX treatments; read from wkdir, or from each leave-one-out directory during LOOCV
col_meta_kabx_filename = 'col_meta_kabx_for_pcls.txt'

save_out = false % save tabular file for each treatment's PCL similarity score; NOTE(review): not referenced in the visible cells

% Passed to pcl_similarity_scoring; presumably clusters smaller than this are dropped -- TODO confirm
min_clust_size = 2

% If true, the cluster/annotation pairs that match only through a multi-target ('|') annotation are printed
print_multi_target = false

% Passed to pcl_similarity_scoring; its effect is not visible in this notebook
stringify_cids = false

% previous Spectral Clustering inputs

% Threshold for the average pairwise rank of correlation across KABX below which
% treatments are connected as mutual nearest-neighbors
thrsh_rank = 20

% true: per-MOA threshold round(log(size of MOA) * thrsh_rank); false: identical threshold for every MOA
dynamic_thrsh_per_moa = false

% Eigengap heuristic used to estimate the number of K clusters: k_num_zero,
% k_num_zero_plus_one, k_med_gap_den, k_gap_den (see create_laplacian_matrix.m for additional information)
k_type = 'k_med_gap_den'

% Input (clusters_*) and output (pcls_*) directory names encode the threshold settings.
% The "xlogsize" variant marks the dynamic threshold scaled by log(MOA size),
% i.e. the number of treatments/dsCGI profiles in the MOA.
if ~dynamic_thrsh_per_moa
    prev_outdir_name = sprintf('clusters_spectral_clustering_thrsh_rank_le%d_%s', thrsh_rank, k_type)
    outdir_name = sprintf('pcls_spectral_clustering_thrsh_rank_le%d_%s', thrsh_rank, k_type)
else
    prev_outdir_name = sprintf('clusters_spectral_clustering_thrsh_rank_le%dxlogsize_%s', thrsh_rank, k_type)
    outdir_name = sprintf('pcls_spectral_clustering_thrsh_rank_le%dxlogsize_%s', thrsh_rank, k_type)
end


prepare_loocv =

  logical

   1


demo_loocv =

  logical

   1


demo_loocv_number_or_list =

    'number'


demo_loocv_number_cmpds =

     2


demo_loocv_list_cmpds =

  1x5 cell array

  Columns 1 through 3

    {'BRD-K04804440'}    {'BRD-K01507359'}    {'BRD-K87202646'}

  Columns 4 through 5

    {'BRD-K59853741'}    {'BRD-K27302037'}


results_subdir_prefix =

    'loocv_pcls/leave_out_cmpd_'


loocv_save_out =

  logical

   0


loocv_corr_filename =

    'sGR_for_pcls_pearson_corr'


loocv_col_meta_all_filename =

    'col_meta_kabx.txt'


unique_kabx_cmpds_tbl_path =

    '../results/kabx_pert_ids_tbl_for_loocv.txt'


clusters_gmt_filename =

    'clusters_spectral_clust.gmt'


corr_filename =

    'sGR_kabx_gsk_brd4310_pearson_corr'


col_meta_all_filename =

    'col_meta.txt'


col_meta_kabx_filename =

    'col_meta_kabx_for_pcls.txt'


save_out =

  logical

   0


min_clust_size =

     2


print_multi_target =

  logical

   0


stringify_cids =

  logical

   0


th

# Run PCL similarity scoring 

In [2]:
% Resolve the inputs and output directory of the PCL similarity scoring step under wkdir.
clusters_path = fullfile(wkdir, prev_outdir_name, clusters_gmt_filename)

% The correlation gctx filename ends with a dimension suffix (_n<rows>x<cols>),
% so it is located by wildcard rather than by exact name.
g = glob(fullfile(wkdir, [corr_filename,'_n*.gctx']));
assert(~isempty(g), 'No correlation gctx file matching %s_n*.gctx found in %s', corr_filename, wkdir);
if numel(g) > 1
    % Several matrices match; the first one returned by glob is used, as before.
    warning('%d files match %s_n*.gctx in %s; using the first one', numel(g), corr_filename, wkdir);
end
c_path = g{1}

% Left empty and passed through to pcl_similarity_scoring; presumably means
% "no precomputed rank matrix" -- TODO confirm against pcl_similarity_scoring.
c_rank_path = []
col_meta_path = fullfile(wkdir, col_meta_all_filename)
col_meta_kabx_path = fullfile(wkdir, col_meta_kabx_filename)
outdir = fullfile(wkdir, outdir_name)


clusters_path =

    '../results/clusters_spectral_clustering_thrsh_rank_le20_k_med_gap_den/clusters_spectral_clust.gmt'


c_path =

    '../results/sGR_kabx_gsk_brd4310_pearson_corr_n10819x10819.gctx'


c_rank_path =

     []


col_meta_path =

    '../results/col_meta.txt'


col_meta_kabx_path =

    '../results/col_meta_kabx.txt'


outdir =

    '../results/pcls_spectral_clustering_thrsh_rank_le20_k_med_gap_den'



In [3]:
% Fail fast, naming the missing input, before running the scoring step.
% isfile only accepts regular files, whereas exist(path) > 0 also passed for folders.
assert(isfile(clusters_path), 'Clusters gmt file not found: %s', clusters_path)

assert(isfile(c_path), 'Correlation gctx file not found: %s', c_path)

assert(isfile(col_meta_path), 'Column metadata file not found: %s', col_meta_path)

assert(isfile(col_meta_kabx_path), 'KABX column metadata file not found: %s', col_meta_kabx_path)

## Troubleshoot

In [19]:
% Section to check clusters for their most similar KABX dsCGI profile and define them as PCLs if the profiles are in-MOA (share same MOA as PCL)
% and exclude them as uninterpretable clusters for MOA prediction otherwise
disp('Beginning check on clusters for their most similar KABX dsCGI profile and define them as PCLs if the profiles are in-MOA (share same MOA as PCL) and exclude them as uninterpretable clusters for MOA prediction otherwise');

% Parse gctx with the cluster similarity score (rows: clusters, columns: treatments)
g = glob(fullfile(outdir,'cluster_median_corr_n*.gctx'))
assert(~isempty(g), 'No cluster_median_corr_n*.gctx file found in %s', outdir);
ss_all = parse_gctx(g{1}); % cluster similarity score of all treatments

% Column (treatment) metadata as a table, one row per cid
col_meta_ss_all = cell2table([ss_all.cid,ss_all.cdesc],'VariableNames',['cid';ss_all.chd]);
col_meta_ss_all.target_description = any2str(col_meta_ss_all.target_description);
disp('Head of col_meta_ss_all tbl:');
disp(headt(col_meta_ss_all));
disp('Unique KABX MOA annotations including multi-target:');
disp(unique(col_meta_ss_all.target_description));

% Row (cluster) metadata as a table, one row per rid
row_meta_ss_all = cell2table([ss_all.rid,ss_all.rdesc],'VariableNames',['rid';ss_all.rhd]);
row_meta_ss_all.cluster_desc = any2str(row_meta_ss_all.cluster_desc);
disp('Head of row_meta_ss_all tbl:');
disp(headt(row_meta_ss_all));
disp('Unique PCL MOA annotations:');
disp(unique(row_meta_ss_all.cluster_desc));

% Row (a) and column (b) subscripts of every matrix element in linear
% (column-major) order, so that the long table below lines up with ss_all.mat(:)
[a,b] = ind2sub(size(ss_all.mat),1:numel(ss_all.mat));

% Long-format table: one row per (treatment, cluster) pair
ss_all_tbl = [col_meta_ss_all(b,:),row_meta_ss_all(a,:)];
disp('Size of ss_all tbl:');
disp(size(ss_all_tbl));
ss_all_tbl.cluster_similarity_score = ss_all.mat(:);

% Binary matrix of the same shape: 1 where the treatment annotation agrees with the cluster MOA
ss_all_same_moa = ss_all;
ss_all_same_moa.mat = zeros(size(ss_all.mat));
target_desc = ss_all_same_moa.cdesc(:, ss_all_same_moa.cdict('target_description'));

% Set print_multi_target if not provided in the input
if isempty(print_multi_target)
    print_multi_target = false;
end

% Loop-invariant pieces, computed once instead of once per cluster
is_multi_target = contains(target_desc, '|'); % treatments with a '|'-delimited multi-target annotation
cluster_desc_col = ss_all_same_moa.rdict('cluster_desc');

for ii = 1:numel(ss_all_same_moa.rid)
    cluster_desc = ss_all_same_moa.rdesc(ii,cluster_desc_col);

    % Agreement is either an exact annotation match, or the cluster MOA appearing
    % inside a multi-target annotation.
    % NOTE(review): contains() is a substring test, so a cluster MOA whose name is a
    % substring of a different MOA within a multi-target annotation would also
    % match -- confirm that MOA names are never nested.
    is_exact_match = ismember(target_desc,cluster_desc);
    is_multi_match = is_multi_target & contains(target_desc,cluster_desc);

    ss_all_same_moa.mat(ii,:) = is_exact_match | is_multi_match;

    % Annotations that agree only through the multi-target rule
    check_matches = target_desc(~is_exact_match & is_multi_match);

    if print_multi_target && ~isempty(check_matches)
        cluster_desc
        check_matches
    end
end

disp('Sum of treatment-PCL pairs with matching MOA (binary label of 1):');
disp(sum(ss_all_same_moa.mat,'all'));

ss_all_tbl.cluster_and_moa_agree = ss_all_same_moa.mat(:);


g =

  1x1 cell array

    {'../results/pcls_spectral_clustering_thrsh_rank_le20_k_med_gap_den/cluster_median_corr_n10819x1947.gctx'}

Reading ../results/pcls_spectral_clustering_thrsh_rank_le20_k_med_gap_den/cluster_median_corr_n10819x1947.gctx [1947x10819]
Done [1.34 s].
Head of col_meta_ss_all tbl:
    idx               field                                                                        value                                                          
    ___    ___________________________    ________________________________________________________________________________________________________________________

     1     {'cid'                    }    {'kabx2:BRD-A05468928-003-01-4:0.097500uM'                                                                             }
     2     {'broad_id'               }    {'BRD-A05468928-003-01-4'                                                                                              }
     3     {'canonical_smiles'       }    {

In [20]:
% Load KABX column metadata.
% col_meta_kabx_path may be a file path (char or string) or an already loaded table.
% ischar replaces the not-recommended isstr; previously a non-char input silently
% left col_meta_kabx undefined.
if ischar(col_meta_kabx_path) || isstring(col_meta_kabx_path)
    col_meta_kabx = rtable(char(col_meta_kabx_path));
elseif istable(col_meta_kabx_path)
    col_meta_kabx = col_meta_kabx_path;
end
assert(exist('col_meta_kabx','var') == 1, 'col_meta_kabx could not be loaded: col_meta_kabx_path must be a file path or a table');

% Select the (treatment, cluster) rows whose treatment is listed in the KABX metadata
cidx = ismember(ss_all_tbl.cid, col_meta_kabx.cid);
disp('Number of rows of ss_all_tbl from KABX treatments:');
disp(sum(cidx));

ss_kabx_tbl = ss_all_tbl(cidx,:);

% Summary counts for the full table
disp('Size of ss_all tbl:');
disp(size(ss_all_tbl));

fprintf('Number of pert_ids in ss_all_tbl: %d\n', numel(unique(ss_all_tbl.pert_id)));
fprintf('Number of broad_ids in ss_all_tbl: %d\n', numel(unique(ss_all_tbl.broad_id)));
fprintf('Number of proj_broad_ids in ss_all_tbl: %d\n', numel(unique(ss_all_tbl.proj_broad_id)));
fprintf('Number of treatments (cids) in ss_all_tbl: %d\n', numel(unique(ss_all_tbl.cid)));
fprintf('Number of treatment-PCL pairs with matching MOA (binary label of 1) in ss_all_tbl: %d\n', sum(ss_all_tbl.cluster_and_moa_agree));

% Summary counts for the KABX subset
disp('Size of ss_kabx tbl:');
disp(size(ss_kabx_tbl));

fprintf('Number of pert_ids in ss_kabx_tbl: %d\n', numel(unique(ss_kabx_tbl.pert_id)));
fprintf('Number of broad_ids in ss_kabx_tbl: %d\n', numel(unique(ss_kabx_tbl.broad_id)));
fprintf('Number of proj_broad_ids in ss_kabx_tbl: %d\n', numel(unique(ss_kabx_tbl.proj_broad_id)));
fprintf('Number of treatments (cids) in ss_kabx_tbl: %d\n', numel(unique(ss_kabx_tbl.cid)));
fprintf('Number of treatment-PCL pairs with matching MOA (binary label of 1) in ss_kabx_tbl: %d\n', sum(ss_kabx_tbl.cluster_and_moa_agree));


cluster_list = unique(ss_kabx_tbl.rid);
fprintf('Number of clusters: %d\n', length(cluster_list));

moa_cluster_list = unique(ss_kabx_tbl.cluster_desc);
fprintf('Number of MOAs represented in clusters: %d\n', length(moa_cluster_list));

% Filter out uninterpretable clusters (non MOA-separable) to find PCL clusters whose most similar treatments are in-MOA rather than out-of-MOA
disp('Using ss_kabx_tbl to find most similar KABX treatment to each cluster');

% Maximum cluster_similarity_score per cluster (rid). Because the method is passed
% as a function handle, groupsummary names the result column
% fun1_cluster_similarity_score; that name is relied upon below and downstream.
maxSimilarityScore = groupsummary(ss_kabx_tbl, 'rid', @max, 'cluster_similarity_score');

size(maxSimilarityScore)

head(maxSimilarityScore)

disp('Size of ss_kabx tbl:');
disp(size(ss_kabx_tbl));

% Attach the per-cluster maximum (and GroupCount) to every row of that cluster
ss_kabx_tbl = innerjoin(ss_kabx_tbl, maxSimilarityScore, 'Keys', 'rid');

disp('Size of ss_kabx tbl:');
disp(size(ss_kabx_tbl));

disp('Head of ss_kabx tbl:');
disp(headt(ss_kabx_tbl));

% Get the rows with the maximum cluster_similarity_score (ties keep several rows per cluster)
maxRows = ss_kabx_tbl(ss_kabx_tbl.cluster_similarity_score >= ss_kabx_tbl.fun1_cluster_similarity_score,:);

size(maxRows)

headt(maxRows)

% Find the rows where cluster_and_moa_agree = 1, i.e. the top-scoring treatment is in-MOA
pclListRows = maxRows(maxRows.cluster_and_moa_agree == 1,:);

% Get the rids of these rows
pcl_list = unique(pclListRows.rid);

fprintf('Length of PCL cluster list: %d\n', length(pcl_list));

moa_pcl_list = unique(pclListRows.cluster_desc);
fprintf('Number of MOAs represented in PCL cluster list: %d\n', length(moa_pcl_list));

% Remove clusters whose most similar KABX dsCGI profile is of a different MOA reflecting signal not specific to one particular MOA
% while clusters whose most similar KABX dsCGI profile is in-MOA are defined as PCL clusters and used for making MOA predictions
disp('Removing clusters whose most similar KABX dsCGI profile is of a different MOA reflecting signal not specific to one particular MOA');
disp('Clusters whose most similar KABX dsCGI profile is in-MOA are defined as PCL clusters and will be used for making MOA predictions');

num_clusters_ini = length(cluster_list);
num_pcl_clusters = length(pcl_list);
num_uninterpretable_clusters = num_clusters_ini - num_pcl_clusters;
fprintf('The initial number of clusters is %d\n', num_clusters_ini);
fprintf('Removing %d uninterpretable clusters with most similar KABX dsCGI profile out of MOA\n', num_uninterpretable_clusters);
fprintf('The remaining number of PCL clusters is %d\n', num_pcl_clusters);

% Re-read the cluster definitions and keep only the entries whose head is a PCL rid
pcls = parse_gmt(clusters_path);
pcls_head = struct2table(pcls).head;
length(pcls_head)
pcls(~ismember(pcls_head, pcl_list)) = []; % remove uninterpretable clusters
pcls_tbl = struct2table(pcls);
disp('Head and size of PCLs tbl');
disp(headt(pcls_tbl));
disp(size(pcls_tbl));

% Write the retained PCL definitions next to the similarity outputs
mkgmt(fullfile(outdir, 'pcls.gmt'), pcls)

Reading ../results/col_meta_kabx.txt

Number of rows of ss_all_tbl from KABX treatments:
    18354369

Size of ss_all tbl:
    21064593          31

Number of pert_ids in ss_all_tbl: 611
Number of broad_ids in ss_all_tbl: 679
Number of proj_broad_ids in ss_all_tbl: 1141
Number of treatments (cids) in ss_all_tbl: 10819
Number of treatment-PCL pairs with matching MOA (binary label of 1) in ss_all_tbl: 1006223
Size of ss_kabx tbl:
    18354369          31

Number of pert_ids in ss_kabx_tbl: 437
Number of broad_ids in ss_kabx_tbl: 505
Number of proj_broad_ids in ss_kabx_tbl: 967
Number of treatments (cids) in ss_kabx_tbl: 9427
Number of treatment-PCL pairs with matching MOA (binary label of 1) in ss_kabx_tbl: 1006223
Number of clusters: 1947
Number of MOAs represented in clusters: 71
Using ss_kabx_tbl to find most similar KABX treatment to each cluster

ans =

        1947           3


ans =

  8x3 table

               rid               GroupCount    fun1_cluster_similarity_score
    ___

In [21]:
% Report the table size before restricting it to PCL clusters
disp('Size of ss_all tbl:');
disp(size(ss_all_tbl));

% Attach each cluster's maximum KABX similarity score, then drop every row
% that belongs to a cluster outside pcl_list
ss_all_tbl = innerjoin(ss_all_tbl, maxSimilarityScore, 'Keys', 'rid');
ss_all_tbl(~ismember(ss_all_tbl.rid, pcl_list),:) = [];

disp('Size of ss_all tbl:');
disp(size(ss_all_tbl));

% Drop helper columns when they are present
ss_all_tbl(:, ismember(ss_all_tbl.Properties.VariableNames, {'pcl_desc','GroupCount'})) = [];

% Store the groupsummary output under a descriptive name (appended as the last column)
if ismember('fun1_cluster_similarity_score', ss_all_tbl.Properties.VariableNames)
    ss_all_tbl.cluster_max_kabx_similarity_score = ss_all_tbl.fun1_cluster_similarity_score;
    ss_all_tbl.fun1_cluster_similarity_score = [];
end

% From here on clusters are called PCLs: substitute "cluster" with "pcl" in every column name
col_names = ss_all_tbl.Properties.VariableNames;
new_col_names = strrep(col_names, 'cluster', 'pcl');
ss_all_tbl.Properties.VariableNames = new_col_names;

% The row id of the similarity matrix doubles as the PCL identifier
ss_all_tbl.pcl_id = ss_all_tbl.rid;

disp('Size and head of ss_all tbl:');
disp(headt(ss_all_tbl));
disp(size(ss_all_tbl));

Size of ss_all tbl:
    21064593          31

Size of ss_all tbl:
    12333660          33

Head of ss_all tbl:
    idx                  field                                                                                                   value                                                                                  
    ___    _________________________________    ________________________________________________________________________________________________________________________________________________________________________

     1     {'cid'                          }    {'kabx2:BRD-A05468928-003-01-4:0.097500uM'                                                                                                                             }
     2     {'broad_id'                     }    {'BRD-A05468928-003-01-4'                                                                                                                                              }
     3     {'canoni

In [22]:
% Make gctx
% Integer group codes for every treatment (cid) and every PCL (pcl_id)
ss_all_tbl.cid_idx = grp2idx(ss_all_tbl.cid);
ss_all_tbl.pcl_id_idx = grp2idx(ss_all_tbl.pcl_id);

% One representative table row per PCL and per treatment, in group-code order,
% so that position k of rid/cid corresponds to group code k
[~,ridx] = unique(ss_all_tbl.pcl_id_idx);
[~,cidx] = unique(ss_all_tbl.cid_idx);
rid = ss_all_tbl.pcl_id(ridx);
cid = ss_all_tbl.cid(cidx);

% Split the table columns into column (treatment) and row (PCL) annotations
col_names = ss_all_tbl.Properties.VariableNames;

% Treatment annotations: names containing none of 'pcl', 'rid' or 'cid_idx'
non_pcl_col_names = col_names(~contains(col_names, {'pcl','rid','cid_idx'}));

% PCL annotations: names containing 'pcl', minus the matrix values and the index helper
pcl_col_names = col_names(contains(col_names, 'pcl'));
pcl_col_names = pcl_col_names(~contains(pcl_col_names, {'pcl_similarity_score','pcl_and_moa_agree','pcl_id_idx'}));

col_meta = ss_all_tbl(cidx, non_pcl_col_names);

disp('Size and head of final col_meta for gct files:');
disp(size(col_meta));
disp(headt(col_meta));

row_meta = ss_all_tbl(ridx, pcl_col_names);

disp('Size and head of final row_meta for gct files:');
disp(size(row_meta));
disp(headt(row_meta));

Size and head of final col_meta for gct files:
       10819          21

    idx               field                                                                        value                                                          
    ___    ___________________________    ________________________________________________________________________________________________________________________

     1     {'cid'                    }    {'kabx2:BRD-A05468928-003-01-4:0.097500uM'                                                                             }
     2     {'broad_id'               }    {'BRD-A05468928-003-01-4'                                                                                              }
     3     {'canonical_smiles'       }    {'CC1CN(CCN1)c1cc2n(cc(C(O)=O)c(=O)c2cc1F)-c1ccc(F)cc1F'                                                               }
     4     {'pert_class'             }    {'fluoroquinolone'                                                   

In [23]:
% Create an empty (NaN) PCL x treatment matrix and the linear index of every
% table row inside it, based on the group codes pcl_id_idx / cid_idx
mat = nan(numel(rid),numel(cid));
ind = sub2ind(size(mat),ss_all_tbl.pcl_id_idx,ss_all_tbl.cid_idx);

% Template dataset carrying the row/column annotations.
% NOTE(review): rid is passed transposed (rid') while cid is not; kept as in the
% original -- confirm that mkgctstruct accepts either orientation.
ds = mkgctstruct(mat,'rid',rid','cid',cid);
ds = annotate_ds(ds,table2struct(col_meta),'dim','column','keyfield','cid');
ds = annotate_ds(ds,table2struct(row_meta),'dim','row','keyfield','pcl_id');

% Create gctx files with various data: one file per table column listed here
fields = {'pcl_similarity_score','pcl_and_moa_agree'};

for ii = 1:numel(fields)
    disp(fields{ii})
    ds_tmp = ds;
    ds_tmp.mat(ind) = ss_all_tbl.(fields{ii});

    % Verify one matrix cell against the table. The second row is used when it
    % exists, otherwise the first, so a single-PCL result no longer errors.
    if ~isempty(ds_tmp.mat)
        chk_row = min(2, numel(ds_tmp.rid));
        rid1 = ds_tmp.rid(chk_row);
        cid1 = ds_tmp.cid(1);
        corr1 = ds_tmp.mat(chk_row,1);
        idx = ismember(ss_all_tbl.cid,cid1)&ismember(ss_all_tbl.pcl_id,rid1);
        assert(any(idx),'Rows or columns of the gct structure do not match');
        % Compare every matching table row; NaN is treated as equal to NaN, and the
        % condition stays scalar even if the (cid, pcl_id) pair occurs more than once.
        tbl_vals = ss_all_tbl.(fields{ii})(idx);
        assert(all(tbl_vals == corr1 | (isnan(tbl_vals) & isnan(corr1))),'Value in the matrix do not match the one in the table');
    end

    mkgctx(fullfile(outdir,[fields{ii},'.gctx']), ds_tmp)
end

% Save the output table
%if ~isempty(project_id)
%    wtable(ss_all_tbl,fullfile(outdir,[project_id, '_correlation_to_selected_pcls.txt']));
%else
%    wtable(ss_all_tbl,fullfile(outdir,'correlation_to_selected_pcls.txt'));
%end

pcl_similarity_score
Saving HDF5 dataset to: ../results/pcls_spectral_clustering_thrsh_rank_le20_k_med_gap_den/pcl_similarity_score_n10819x1140.gctx...
/0/DATA/0 exists, deleting
Disabling compression.
Setting chunk size to: 1000x262
done [45.95s].

ans =

    '../results/pcls_spectral_clustering_thrsh_rank_le20_k_med_gap_den/pcl_similarity_score_n10819x1140.gctx'

pcl_and_moa_agree
Saving HDF5 dataset to: ../results/pcls_spectral_clustering_thrsh_rank_le20_k_med_gap_den/pcl_and_moa_agree_n10819x1140.gctx...
/0/DATA/0 exists, deleting
Disabling compression.
Setting chunk size to: 1000x262
done [23.76s].

ans =

    '../results/pcls_spectral_clustering_thrsh_rank_le20_k_med_gap_den/pcl_and_moa_agree_n10819x1140.gctx'



## Troubleshoot end

In [None]:
% Run the PCL similarity scoring with the paths and options currently in the workspace
pcl_similarity_scoring( ...
    clusters_path, c_path, c_rank_path, ...
    col_meta_path, col_meta_kabx_path, ...
    outdir, min_clust_size, print_multi_target, stringify_cids)

The initial number of clusters is 1947
Removing clusters with size less than 2
The remaining number of clusters is 1947
Reading ../results/col_meta.txt

Reading ../results/sGR_kabx_gsk_brd4310_pearson_corr_n10819x10819.gctx [10819x10819]
Done [4.81 s].

ans =

  0x1 empty cell array


ans =

       10819


ans =

        7229

Saving gctx files
Saving HDF5 dataset to: ../results/pcls_spectral_clustering_thrsh_rank_le20_k_med_gap_den/ds_corr_n7229x10819.gctx...
/0/DATA/0 exists, deleting
Disabling compression.
Setting chunk size to: 1000x262
done [10.63s].

ans =

    '../results/pcls_spectral_clustering_thrsh_rank_le20_k_med_gap_den/ds_corr_n7229x10819.gctx'

Sorting indices by broad_id

ans =

  logical

   0


ans =

  logical

   1


ans =

       10819


num_clusters =

        1947


out_tbl =

  0x0 empty cell array

1/1947
100/1947
200/1947
300/1947
400/1947
500/1947
600/1947
700/1947
800/1947
900/1947
1000/1947
1100/1947
1200/1947
1300/1947
1400/1947
1500/1947
1600/1947
1700/19

# LOOCV section

In [2]:
% Leave-one-out cross-validation (LOOCV): for every selected KABX compound, run
% pcl_similarity_scoring on the clusters stored in that compound's leave-one-out
% directory. NOTE: this section overwrites the workspace variables clusters_path,
% c_path, c_rank_path, col_meta_path, col_meta_kabx_path and outdir.
if prepare_loocv

    unique_kabx_cmpds_tbl = rtable(unique_kabx_cmpds_tbl_path);

    size(unique_kabx_cmpds_tbl)
    headt(unique_kabx_cmpds_tbl)

    unique_kabx_cmpds_list = unique(unique_kabx_cmpds_tbl.kabx_cmpd);

    number_of_cmpds_loocv = length(unique_kabx_cmpds_list)

    % Default: full LOOCV over every compound. Previously the index was only set
    % in demo mode, so demo_loocv = false failed with an undefined variable.
    index_cmpds_loocv = 1:number_of_cmpds_loocv;

    if demo_loocv
        if strcmp(demo_loocv_number_or_list, 'number')
            % First N compounds: at least 1, at most the number of available compounds
            number_of_cmpds_loocv = min(max(1, demo_loocv_number_cmpds), length(unique_kabx_cmpds_list))

            index_cmpds_loocv = 1:number_of_cmpds_loocv

        elseif strcmp(demo_loocv_number_or_list, 'list')
            % Requested compounds that are absent from the table cannot be left out
            missing_cmpds = setdiff(demo_loocv_list_cmpds, unique_kabx_cmpds_list);
            if ~isempty(missing_cmpds)
                warning('Requested LOOCV compounds not found and skipped: %s', strjoin(reshape(missing_cmpds, 1, []), ', '));
            end

            % Forced to a row vector so that the for loop visits one index per
            % iteration regardless of the orientation of the compound list
            index_cmpds_loocv = reshape(find(ismember(unique_kabx_cmpds_list, demo_loocv_list_cmpds)), 1, [])

            % Number of compounds actually iterated (requested compounds that were found)
            number_of_cmpds_loocv = numel(index_cmpds_loocv)
        else
            error('Invalid input for demo_loocv_number_or_list: number or list')
        end
    end

    % Inputs shared by every iteration, resolved once
    g = glob(fullfile(wkdir, [loocv_corr_filename,'_n*.gctx'])); % calculating PCL similarity scores for all KABX dsCGI profiles and excluding test compounds
    assert(~isempty(g), 'No correlation gctx file matching %s_n*.gctx found in %s', loocv_corr_filename, wkdir);
    c_path = g{1}
    c_rank_path = []
    col_meta_path = fullfile(wkdir, loocv_col_meta_all_filename) % col_meta is for all of KABX only excluding test compounds

    assert(isfile(c_path), 'Correlation gctx file not found: %s', c_path)

    assert(isfile(col_meta_path), 'Column metadata file not found: %s', col_meta_path)

    for i = index_cmpds_loocv

        % Print a status message whenever the compound index is a multiple of 50
        if mod(i, 50) == 0
            fprintf('Currently at iteration %d\n', i);
        end

        leave_out_cmpd = unique_kabx_cmpds_list(i);

        % Per-compound working directory: <wkdir>/<results_subdir_prefix><compound id>
        loo_wkdir = fullfile(wkdir, [results_subdir_prefix, leave_out_cmpd{1}]);

        mk_cd_dir(loo_wkdir, false);

        loo_outdir = fullfile(loo_wkdir, outdir_name)

        mk_cd_dir(loo_outdir, false);

        % step specific commands

        clusters_path = fullfile(loo_wkdir, prev_outdir_name, clusters_gmt_filename)
        col_meta_kabx_path = fullfile(loo_wkdir, col_meta_kabx_filename) % col_meta_kabx (in this iteration of LOOCV with a KABX compound treated as unknown) is the col_meta_kabx_for_pcls file in its subdirectory
        outdir = loo_outdir

        assert(isfile(clusters_path), 'Clusters gmt file not found: %s', clusters_path)

        assert(isfile(col_meta_kabx_path), 'KABX column metadata file not found: %s', col_meta_kabx_path)

        pcl_similarity_scoring(clusters_path,c_path,c_rank_path,col_meta_path,col_meta_kabx_path,outdir,min_clust_size,print_multi_target,stringify_cids)

    end

end

Reading ../results/kabx_pert_ids_tbl_for_loocv.txt


ans =

   437     2


ans =

  2x3 table

    idx          field                value      
    ___    _________________    _________________

     1     {'kabx_cmpd_idx'}    {[            1]}
     2     {'kabx_cmpd'    }    {'BRD-A02179977'}


ans =

   437


number_of_cmpds_loocv =

   437


number_of_cmpds_loocv =

     2


index_cmpds_loocv =

     1     2


loo_outdir =

    '../results/loocv_pcls/leave_out_cmpd_BRD-A02179977/pcls_spectral_clustering_thrsh_rank_le20_k_med_gap_den'


clusters_path =

    '../results/loocv_pcls/leave_out_cmpd_BRD-A02179977/clusters_spectral_clustering_thrsh_rank_le20_k_med_gap_den/clusters_spectral_clust.gmt'


c_path =

    '../results/sGR_for_pcls_pearson_corr_n9427x9427.gctx'


c_rank_path =

     []


col_meta_path =

    '../results/col_meta_kabx.txt'


col_meta_kabx_path =

    '../results/loocv_pcls/leave_out_cmpd_BRD-A02179977/col_meta_kabx_for_pcls.txt'


outdir =

    '../results/loocv_p