This notebook creates the chromosomal splits used for training CNN models.

In [6]:
import json

import numpy
import pandas

# Library for solving the multi-way number partitioning problem
# Used to decide which chromosomes to group
import prtpy

In [2]:
# Read the tab-separated MPRA table; the first column of the file becomes the row index
mpra_table_path = '../preprocess_data/Zb_5UTR_MPRA.tsv'
data_full = pandas.read_csv(mpra_table_path, sep='\t', index_col=0)
data_full

Unnamed: 0,chr,strand,external_gene_name,utr_length,insert_length,n_uORFs,GC_content,mxfold,index,index_base,...,diff_log2_TPM_input_6-2hpf,diff_log2_TPM_input_10-2hpf,MRL_2hpf,log2_MRL_2hpf,MRL_4hpf,log2_MRL_4hpf,MRL_6hpf,log2_MRL_6hpf,MRL_10hpf,log2_MRL_10hpf
ENSDARG00000000001_ENSDART00000000004_19058_slc35a5_20318,chr9,-,slc35a5,103,103,2,52.427184,23.9,20318.0,20318,...,-0.012463,-0.558270,5.026628,2.329591,6.713248,2.747011,7.002960,2.807965,6.480393,2.696081
ENSDARG00000000018_ENSDART00000181044_14421_nrf1_72681,chr4,-,nrf1,134,134,0,61.940299,35.3,72681.0,72681,...,-0.055230,-0.243785,5.450300,2.446336,6.196041,2.631347,8.359783,3.063465,4.441558,2.151066
ENSDARG00000000019_ENSDART00000124452_14118_ube2h_27446,chr4,+,ube2h,178,178,1,46.629213,30.1,27446.0,27446,...,-0.058991,-0.058277,5.911159,2.563441,10.441205,3.384216,7.626433,2.931009,6.260806,2.646348
ENSDARG00000000068_ENSDART00000000069_2438_slc9a3r1a_113092,chr12,+,slc9a3r1a,152,152,0,46.052632,26.1,113092.0,113092,...,0.102668,0.830415,14.368484,3.844836,12.294140,3.619899,11.405933,3.511713,9.108052,3.187143
ENSDARG00000000069_ENSDART00000000070_12170_dap_20320,chr24,-,dap,153,153,1,47.058824,31.8,20320.0,20320,...,-0.081402,0.238507,7.103448,2.828519,6.990700,2.805437,8.646954,3.112192,4.903643,2.293854
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSDARG00000025554_ENSDART00000103273_1746_wdr83os_27060,chr11,+,wdr83os,126,126,1,37.301587,22.8,27060.0,27060,...,0.390930,0.082304,8.956233,3.162892,13.598507,3.765376,8.083156,3.014919,20.048542,4.325425
ENSDARG00000103318_ENSDART00000161570_7325_mrpl3_86762,chr19,+,mrpl3,111,111,2,34.234234,12.5,86762.0,86762,...,0.514384,-0.326691,11.157909,3.479995,11.045525,3.465390,8.266639,3.047301,12.727341,3.669859
ENSDARG00000036698_ENSDART00000053300_7697_znf865_21263.6,chr19,-,znf865,1305,197,4,31.979695,25.9,21263.6,21263,...,-0.001837,0.051171,13.771432,3.783607,19.339298,4.273464,17.089539,4.095042,7.074025,2.822531
ENSDARG00000056892_ENSDART00000148517_5556_mpp6a_23746.2,chr16,-,mpp6a,311,161,1,39.130435,37.6,23746.2,23746,...,-0.215189,0.088363,13.484335,3.753213,14.144042,3.822123,19.909238,4.315366,16.071784,4.006458


In [4]:
# Count number of sequences (rows) per chromosome.
# groupby(...).size() counts every row in each group. The previous
# count().iloc[:, 0] only counted non-null values of whichever column
# happened to come first, so it would silently undercount if that column
# ever contained missing values.
chr_counts = data_full.groupby('chr').size()

# Every row with a chromosome label must be accounted for exactly once
assert chr_counts.sum() == data_full['chr'].notna().sum(), \
    "per-chromosome counts do not add up to the number of labelled rows"

display(chr_counts)
print(chr_counts.sum())

chr
chr1      849
chr10     646
chr11     639
chr12     677
chr13     745
chr14     700
chr15     578
chr16     876
chr17     645
chr18     590
chr19     848
chr2      851
chr20     772
chr21     634
chr22     540
chr23     708
chr24     511
chr25     601
chr3      970
chr4      509
chr5     1036
chr6      785
chr7      951
chr8      706
chr9      654
Name: strand, dtype: int64

18021


In [7]:
# Group the chromosomes into 10 bins holding roughly the same number of
# sequences each, using prtpy's greedy number-partitioning algorithm.
chr_counts_dict = chr_counts.to_dict()

chr_partitions = prtpy.partition(
    algorithm=prtpy.partitioning.greedy,
    numbins=10,
    items=chr_counts_dict,
)

# Total number of sequences that ended up in each bin
partition_totals = [
    numpy.sum([chr_counts_dict[chrom] for chrom in chrom_group])
    for chrom_group in chr_partitions
]
for chrom_group, partition_total in zip(chr_partitions, partition_totals):
    print(f"{chrom_group}: {partition_total} sequences total.")

['chr5', 'chr25']: 1637 sequences total.
['chr3', 'chr21']: 1604 sequences total.
['chr7', 'chr11']: 1590 sequences total.
['chr16', 'chr17']: 1521 sequences total.
['chr2', 'chr10', 'chr24']: 2008 sequences total.
['chr1', 'chr9', 'chr4']: 2012 sequences total.
['chr19', 'chr12']: 1525 sequences total.
['chr6', 'chr14', 'chr22']: 2025 sequences total.
['chr20', 'chr8', 'chr15']: 2056 sequences total.
['chr13', 'chr23', 'chr18']: 2043 sequences total.


In [8]:
# Build one train/val/test split per partition.
# Split i uses partition i as the test set and partition i+1 (wrapping
# around after the last one) as the validation set, so every partition
# serves exactly once as test and once as validation. The chromosomes of
# all remaining partitions are pooled into the training set.
n_partitions = len(chr_partitions)
splits_info = []

for test_idx, test_chrs in enumerate(chr_partitions):
    val_chrs = chr_partitions[(test_idx + 1) % n_partitions]

    # Flatten every partition that is neither held out for test nor for validation
    train_chrs = [
        chrom
        for chrom_group in chr_partitions
        if chrom_group != test_chrs and chrom_group != val_chrs
        for chrom in chrom_group
    ]

    splits_info.append({'train': train_chrs, 'val': val_chrs, 'test': test_chrs})

splits_info

[{'train': ['chr7',
   'chr11',
   'chr16',
   'chr17',
   'chr2',
   'chr10',
   'chr24',
   'chr1',
   'chr9',
   'chr4',
   'chr19',
   'chr12',
   'chr6',
   'chr14',
   'chr22',
   'chr20',
   'chr8',
   'chr15',
   'chr13',
   'chr23',
   'chr18'],
  'val': ['chr3', 'chr21'],
  'test': ['chr5', 'chr25']},
 {'train': ['chr5',
   'chr25',
   'chr16',
   'chr17',
   'chr2',
   'chr10',
   'chr24',
   'chr1',
   'chr9',
   'chr4',
   'chr19',
   'chr12',
   'chr6',
   'chr14',
   'chr22',
   'chr20',
   'chr8',
   'chr15',
   'chr13',
   'chr23',
   'chr18'],
  'val': ['chr7', 'chr11'],
  'test': ['chr3', 'chr21']},
 {'train': ['chr5',
   'chr25',
   'chr3',
   'chr21',
   'chr2',
   'chr10',
   'chr24',
   'chr1',
   'chr9',
   'chr4',
   'chr19',
   'chr12',
   'chr6',
   'chr14',
   'chr22',
   'chr20',
   'chr8',
   'chr15',
   'chr13',
   'chr23',
   'chr18'],
  'val': ['chr16', 'chr17'],
  'test': ['chr7', 'chr11']},
 {'train': ['chr5',
   'chr25',
   'chr3',
   'chr21',
   'ch

In [9]:
# Verification

# 1) No chromosome may appear in more than one of train / val / test, and
#    together the three sets must cover every chromosome present in the data.
n_chromosomes = len(chr_counts_dict)
for split_idx, split_info in enumerate(splits_info):
    train_set = set(split_info['train'])
    val_set = set(split_info['val'])
    test_set = set(split_info['test'])

    assert train_set.isdisjoint(val_set), \
        f"Split {split_idx}: train and val share chromosomes"
    assert train_set.isdisjoint(test_set), \
        f"Split {split_idx}: train and test share chromosomes"
    assert val_set.isdisjoint(test_set), \
        f"Split {split_idx}: val and test share chromosomes"
    assert len(train_set | val_set | test_set) == n_chromosomes, \
        f"Split {split_idx}: splits do not cover all {n_chromosomes} chromosomes"

# 2) No UTRs in common between train, val, and test.
#    Use index_base as the UTR identifier.
#    NOTE: the check must use .all(), not .any(). With .any() the assertion
#    passes as soon as a single ID is unshared, so overlaps go undetected.
#    NOTE(review): if one of these now fails, some index_base value occurs
#    on chromosomes assigned to different sets - inspect the data.
for split_idx, split_info in enumerate(splits_info):
    train_chrs = split_info['train']
    val_chrs = split_info['val']
    test_chrs = split_info['test']

    # index_base values of the rows falling in each set
    ids_train = data_full.loc[data_full['chr'].isin(train_chrs), 'index_base']
    ids_val = data_full.loc[data_full['chr'].isin(val_chrs), 'index_base']
    ids_test = data_full.loc[data_full['chr'].isin(test_chrs), 'index_base']

    assert (~ids_train.isin(ids_val)).all(), \
        f"Split {split_idx}: train and val share index_base values"
    assert (~ids_train.isin(ids_test)).all(), \
        f"Split {split_idx}: train and test share index_base values"
    assert (~ids_val.isin(ids_test)).all(), \
        f"Split {split_idx}: val and test share index_base values"

# 3) Report the number of sequences in each set (train / val / test)
for split_idx, split_info in enumerate(splits_info):
    nseqs_train = numpy.sum([chr_counts_dict[c] for c in split_info['train']])
    nseqs_val = numpy.sum([chr_counts_dict[c] for c in split_info['val']])
    nseqs_test = numpy.sum([chr_counts_dict[c] for c in split_info['test']])
    print(f"Split {split_idx}: {nseqs_train} / {nseqs_val} / {nseqs_test} sequences")

Split 0: 14780 / 1604 / 1637 sequences
Split 1: 14827 / 1590 / 1604 sequences
Split 2: 14910 / 1521 / 1590 sequences
Split 3: 14492 / 2008 / 1521 sequences
Split 4: 14001 / 2012 / 2008 sequences
Split 5: 14484 / 1525 / 2012 sequences
Split 6: 14471 / 2025 / 1525 sequences
Split 7: 13940 / 2056 / 2025 sequences
Split 8: 13922 / 2043 / 2056 sequences
Split 9: 14341 / 1637 / 2043 sequences


In [10]:
# Write the list of train/val/test chromosome assignments to disk as JSON
splits_path = 'chr_splits.json'
with open(splits_path, 'w') as splits_file:
    splits_file.write(json.dumps(splits_info))