In [1]:
import os
import re
import numpy as np
import pandas as pd
from collections import defaultdict
from itertools import islice

In [2]:
# Load the parsed-VCF repeat-region table (tab-separated text file).
metadata = pd.read_csv(
    '/nfs/turbo/dcmb-class/bioinf593/groups/group_05/output/trgt/repeatregion_10_parsedvcf.txt',
    sep='\t',
)

In [3]:
# Preview the loaded table (the notebook display truncates the middle rows).
metadata

Unnamed: 0,chrom,pos,trid,end_pos,motif,len_motif,struc,MC,confidence_interval,flanking_reads,inrepeat_reads,spanning_reads,sample_name,len_repeat_region
0,chr1,44836,chr1_44835_44867,44867,AAAT,4,(AAAT)n,89,8-8/9-9,"(1, 5), (2, 4), (3, 2), (4, 1), (6, 4), (7, 1)...",(),"(8, 24), (9, 15)",HG01891,32
1,chr1,167135,chr1_167134_167150,167150,TTTA,4,(TTTA)n,44,4-4/4-4,"(1, 2), (2, 1), (3, 3), (4, 4)",(),"(4, 28)",HG01891,16
2,chr1,370633,chr1_370632_370648,370648,TTTC,4,(TTTC)n,44,4-4/4-4,"(1, 5), (2, 6), (3, 1), (4, 3)",(),"(4, 38)",HG01891,16
3,chr1,371009,chr1_371008_371033,371033,TTTTG,5,(TTTTG)n,55,4-4/5-5,"(1, 3), (2, 4), (3, 2), (4, 3), (5, 2), (8, 1)",(),"(4, 6), (5, 45)",HG01891,25
4,chr1,371275,chr1_371274_371292,371292,TG,2,(TG)n,98,9-9/10-10,"(2, 1), (3, 1), (4, 3), (6, 3), (7, 3), (8, 2)...",(),"(8, 3), (9, 36), (10, 21)",HG01891,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6519475,chrX,155781112,chrX_155781111_155781127,155781127,AG,2,(AG)n,88,8-8/8-8,"(1, 1), (2, 3), (3, 1), (5, 1), (6, 1), (7, 1)",(),"(8, 25)",HG01071,16
6519476,chrX,155786790,chrX_155786789_155786809,155786809,TTTA,4,(TTTA)n,55,5-5/5-5,"(2, 2), (4, 2), (5, 2), (6, 1)",(),"(5, 22), (10, 1)",HG01071,20
6519477,chrX,155805374,chrX_155805373_155805397,155805397,AAC,3,(AAC)n,88,8-8/8-8,"(1, 5), (2, 1), (3, 2), (4, 1), (5, 1), (7, 1)...",(),"(2, 1), (5, 1), (8, 39), (10, 1)",HG01071,24
6519478,chrX,155846315,chrX_155846314_155846329,155846329,ATTTT,5,(ATTTT)n,44,4-4/4-4,"(1, 2), (2, 2), (3, 4), (4, 5)",(),"(4, 44)",HG01071,15


In [4]:
# List the distinct sample IDs present in the table.
metadata['sample_name'].unique()

array(['HG01891', 'HG03492', 'HG02630', 'HG02257', 'NA19240', 'HG03098',
       'HG01243', 'HG01358', 'NA20129', 'HG01175', 'HG01106', 'HG01258',
       'HG01361', 'HG00621', 'HG03516', 'HG01952', 'HG00735', 'NA18906',
       'HG00438', 'HG03453', 'HG00733', 'HG00741', 'HG02080', 'HG01928',
       'HG03579', 'HG03540', 'HG02055', 'HG01109', 'HG03486', 'HG02572',
       'HG02622', 'HG02723', 'HG01978', 'HG00673', 'HG02886', 'HG02145',
       'HG02717', 'HG02818', 'HG02148', 'HG01071'], dtype=object)

In [5]:
# Samples used for the rest of this notebook.
select_samples = [
    'HG01891',
    'HG03492',
    'HG02630',
    'HG02257',
    'NA19240',
    'HG03098',
    'HG01243',
]

## Get train/test split chromosomes

In [6]:
# Count rows per chromosome, restricted to the selected samples.
in_selected_samples = metadata['sample_name'].isin(select_samples)
chrom_counts = metadata.loc[in_selected_samples, 'chrom'].value_counts()

In [7]:
# Total number of rows (sample/locus records) for the selected samples.
chrom_counts.sum()

1140909

In [8]:
# Target size of the training set: 80% of the total row count.
0.8 * chrom_counts.sum()

912727.2000000001

In [9]:
# Per-chromosome row counts; value_counts() returns them in descending order.
chrom_counts

chrom
chr2     94934
chr1     92631
chr3     78554
chr4     75866
chr5     71197
chr6     69517
chr7     63875
chrX     58030
chr8     57827
chr12    54978
chr11    51450
chr9     47236
chr13    40999
chr14    35658
chr17    34741
chr10    33943
chr16    33894
chr18    32340
chr15    31535
chr19    27811
chr20    24906
chr21    14973
chr22    14014
Name: count, dtype: int64

In [10]:
# Greedy chromosome-level train/test split: walk the chromosomes in the order
# given by chrom_counts (largest first, since value_counts sorts descending)
# and assign each one to train until the running total reaches the target
# share of rows; every remaining chromosome goes to test.
TRAIN_FRACTION = 0.8
# Derived from the data instead of a hand-copied constant, so the cutoff stays
# correct if the selected samples or the input table change.
train_target = TRAIN_FRACTION * chrom_counts.sum()

# Named running_total (not `sum`) so the built-in sum() is not shadowed for
# the rest of the notebook session.
running_total = 0
train_chroms = []
test_chroms = []
for chrom, n_rows in chrom_counts.items():
    if running_total < train_target:
        # The chromosome that crosses the target is still placed in train,
        # so the train share can end up slightly above TRAIN_FRACTION.
        running_total += n_rows
        train_chroms.append(chrom)
    else:
        test_chroms.append(chrom)

In [11]:
# Chromosomes assigned to the training split.
train_chroms

['chr2',
 'chr1',
 'chr3',
 'chr4',
 'chr5',
 'chr6',
 'chr7',
 'chrX',
 'chr8',
 'chr12',
 'chr11',
 'chr9',
 'chr13',
 'chr14',
 'chr17']

In [12]:
# Chromosomes assigned to the test split.
test_chroms

['chr10', 'chr16', 'chr18', 'chr15', 'chr19', 'chr20', 'chr21', 'chr22']

## Filter the metadata table to loci that made it through OHE processing, and label each row as train/test in a new column

In [55]:
# Index the OHE .npy files found on disk for the selected samples.
#   trids     - one '<chrom>_<start>_<end>' ID per file (kept as a list, so an
#               ID appears once for every sample that has a file for it)
#   filenames - filenames[sample][chrom] -> set of file names; a set keeps the
#               per-row membership test cheap (it is only used with `in`)
# File names are expected to look like '<sample>_<chrom>_<start>_<end>.npy'.
ohe_dir = '/nfs/turbo/dcmb-class/bioinf593/groups/group_05/STRonvoli/data/ohe'
trids = []
filenames = defaultdict(lambda: defaultdict(set))
for sample in select_samples:
    ohe_path = os.path.join(ohe_dir, sample)
    for file in os.listdir(ohe_path):
        # Skip anything that is not an encoded array; stray or temporary
        # files would otherwise break the parsing below or add bogus IDs.
        if not file.endswith('.npy'):
            continue
        file_sample, chrom, start, end = re.split('[_.]', file)[:4]
        trids.append(f'{chrom}_{start}_{end}')
        # Keyed by the sample parsed from the file name, not the directory name.
        filenames[file_sample][chrom].add(file)

In [14]:
# Number of collected trid entries (one is appended per OHE file).
len(trids)

1140880

In [39]:
# Keep rows whose trid has an OHE file (in any of the selected samples) and
# whose sample is one of the selected samples.
has_ohe_trid = metadata['trid'].isin(trids)
from_selected_sample = metadata['sample_name'].isin(select_samples)
filtered_metadata = metadata.loc[has_ohe_trid & from_selected_sample]

In [40]:
# Preview the filtered table.
filtered_metadata

Unnamed: 0,chrom,pos,trid,end_pos,motif,len_motif,struc,MC,confidence_interval,flanking_reads,inrepeat_reads,spanning_reads,sample_name,len_repeat_region
0,chr1,44836,chr1_44835_44867,44867,AAAT,4,(AAAT)n,89,8-8/9-9,"(1, 5), (2, 4), (3, 2), (4, 1), (6, 4), (7, 1)...",(),"(8, 24), (9, 15)",HG01891,32
1,chr1,167135,chr1_167134_167150,167150,TTTA,4,(TTTA)n,44,4-4/4-4,"(1, 2), (2, 1), (3, 3), (4, 4)",(),"(4, 28)",HG01891,16
2,chr1,370633,chr1_370632_370648,370648,TTTC,4,(TTTC)n,44,4-4/4-4,"(1, 5), (2, 6), (3, 1), (4, 3)",(),"(4, 38)",HG01891,16
3,chr1,371009,chr1_371008_371033,371033,TTTTG,5,(TTTTG)n,55,4-4/5-5,"(1, 3), (2, 4), (3, 2), (4, 3), (5, 2), (8, 1)",(),"(4, 6), (5, 45)",HG01891,25
4,chr1,371275,chr1_371274_371292,371292,TG,2,(TG)n,98,9-9/10-10,"(2, 1), (3, 1), (4, 3), (6, 3), (7, 3), (8, 2)...",(),"(8, 3), (9, 36), (10, 21)",HG01891,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1140904,chrX,155781112,chrX_155781111_155781127,155781127,AG,2,(AG)n,88,8-8/8-8,"(4, 3), (6, 1), (8, 1)",(),"(8, 32)",HG01243,16
1140905,chrX,155786790,chrX_155786789_155786809,155786809,TTTA,4,(TTTA)n,55,5-5/5-5,"(3, 1), (4, 3)",(),"(5, 23)",HG01243,20
1140906,chrX,155805374,chrX_155805373_155805397,155805397,AAC,3,(AAC)n,87,7-7/8-8,"(1, 3), (2, 1), (5, 1), (6, 3), (7, 1)",(),"(7, 16), (8, 17)",HG01243,24
1140907,chrX,155846315,chrX_155846314_155846329,155846329,ATTTT,5,(ATTTT)n,44,4-4/4-4,"(1, 1), (2, 4), (3, 2), (4, 3)",(),"(4, 22)",HG01243,15


In [68]:
# Find rows whose own sample lacks the expected OHE file. The trid filter
# applied earlier only checks that *some* selected sample has a file for the
# locus, so every (sample, trid) pair is verified individually here.
missing_files = []
missing_indices = []
# itertuples(name=None) yields plain (index, trid, sample_name) tuples, which
# can be unpacked into named variables instead of indexed by position.
for idx, trid, sample in filtered_metadata[['trid', 'sample_name']].itertuples(name=None):
    filename = f'{sample}_{trid}.npy'
    # Named `chrom` rather than `chr`, which would shadow the built-in chr().
    chrom = trid.split('_')[0]
    # .get() avoids inserting empty entries into the defaultdict on a miss.
    if filename not in filenames.get(sample, {}).get(chrom, ()):
        missing_files.append(filename)
        missing_indices.append(idx)

In [69]:
# Expected OHE file names that were not found on disk.
missing_files

['HG03492_chr6_31356477_31356492.npy',
 'HG03492_chr7_8803792_8803837.npy',
 'HG03492_chr9_113018278_113018298.npy',
 'HG03492_chrX_89785621_89785661.npy',
 'HG02257_chrX_52740629_52740647.npy',
 'NA19240_chr5_71157098_71157126.npy',
 'HG03098_chr9_42226505_42226515.npy',
 'HG03098_chr9_42273338_42273362.npy',
 'HG03098_chr9_42277908_42277928.npy',
 'HG03098_chr9_42290064_42290084.npy',
 'HG03098_chr9_42421130_42421140.npy',
 'HG03098_chr9_42423455_42423473.npy',
 'HG03098_chr9_42438921_42438941.npy',
 'HG03098_chr9_42484937_42484953.npy',
 'HG03098_chr9_62037601_62037625.npy',
 'HG03098_chrX_18776209_18776245.npy',
 'HG03098_chrX_90971849_90971865.npy',
 'HG03098_chrX_112753270_112753288.npy',
 'HG01243_chrX_52740629_52740647.npy',
 'HG01243_chrX_68650481_68650514.npy',
 'HG01243_chrX_112753270_112753288.npy',
 'HG01243_chrX_118845429_118845485.npy']

In [71]:
# Remove the rows whose OHE file was not found on disk.
final_filtered_metadata = filtered_metadata.drop(labels=missing_indices, axis='index')

In [72]:
# Inspect the table after dropping the rows with missing OHE files.
final_filtered_metadata

Unnamed: 0,chrom,pos,trid,end_pos,motif,len_motif,struc,MC,confidence_interval,flanking_reads,inrepeat_reads,spanning_reads,sample_name,len_repeat_region
0,chr1,44836,chr1_44835_44867,44867,AAAT,4,(AAAT)n,89,8-8/9-9,"(1, 5), (2, 4), (3, 2), (4, 1), (6, 4), (7, 1)...",(),"(8, 24), (9, 15)",HG01891,32
1,chr1,167135,chr1_167134_167150,167150,TTTA,4,(TTTA)n,44,4-4/4-4,"(1, 2), (2, 1), (3, 3), (4, 4)",(),"(4, 28)",HG01891,16
2,chr1,370633,chr1_370632_370648,370648,TTTC,4,(TTTC)n,44,4-4/4-4,"(1, 5), (2, 6), (3, 1), (4, 3)",(),"(4, 38)",HG01891,16
3,chr1,371009,chr1_371008_371033,371033,TTTTG,5,(TTTTG)n,55,4-4/5-5,"(1, 3), (2, 4), (3, 2), (4, 3), (5, 2), (8, 1)",(),"(4, 6), (5, 45)",HG01891,25
4,chr1,371275,chr1_371274_371292,371292,TG,2,(TG)n,98,9-9/10-10,"(2, 1), (3, 1), (4, 3), (6, 3), (7, 3), (8, 2)...",(),"(8, 3), (9, 36), (10, 21)",HG01891,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1140904,chrX,155781112,chrX_155781111_155781127,155781127,AG,2,(AG)n,88,8-8/8-8,"(4, 3), (6, 1), (8, 1)",(),"(8, 32)",HG01243,16
1140905,chrX,155786790,chrX_155786789_155786809,155786809,TTTA,4,(TTTA)n,55,5-5/5-5,"(3, 1), (4, 3)",(),"(5, 23)",HG01243,20
1140906,chrX,155805374,chrX_155805373_155805397,155805397,AAC,3,(AAC)n,87,7-7/8-8,"(1, 3), (2, 1), (5, 1), (6, 3), (7, 1)",(),"(7, 16), (8, 17)",HG01243,24
1140907,chrX,155846315,chrX_155846314_155846329,155846329,ATTTT,5,(ATTTT)n,44,4-4/4-4,"(1, 1), (2, 4), (3, 2), (4, 3)",(),"(4, 22)",HG01243,15


In [81]:
# Label every row by chromosome: 0 = train (chrom in train_chroms), 1 = test.
# The labels are stored in the 'split' column; previously the array was only
# displayed and never assigned, so the column could not be reproduced on a
# fresh run. assign() returns a new frame, so re-running the cell is safe.
final_filtered_metadata = final_filtered_metadata.assign(
    split=np.select([final_filtered_metadata['chrom'].isin(train_chroms)], [0], 1)
)
# Display the labels, as the cell did before.
final_filtered_metadata['split'].to_numpy()

array([0, 0, 0, ..., 0, 0, 0])

In [89]:
# Inspect the table, which now carries the 0/1 'split' column.
final_filtered_metadata

Unnamed: 0,chrom,pos,trid,end_pos,motif,len_motif,struc,MC,confidence_interval,flanking_reads,inrepeat_reads,spanning_reads,sample_name,len_repeat_region,split
0,chr1,44836,chr1_44835_44867,44867,AAAT,4,(AAAT)n,89,8-8/9-9,"(1, 5), (2, 4), (3, 2), (4, 1), (6, 4), (7, 1)...",(),"(8, 24), (9, 15)",HG01891,32,0
1,chr1,167135,chr1_167134_167150,167150,TTTA,4,(TTTA)n,44,4-4/4-4,"(1, 2), (2, 1), (3, 3), (4, 4)",(),"(4, 28)",HG01891,16,0
2,chr1,370633,chr1_370632_370648,370648,TTTC,4,(TTTC)n,44,4-4/4-4,"(1, 5), (2, 6), (3, 1), (4, 3)",(),"(4, 38)",HG01891,16,0
3,chr1,371009,chr1_371008_371033,371033,TTTTG,5,(TTTTG)n,55,4-4/5-5,"(1, 3), (2, 4), (3, 2), (4, 3), (5, 2), (8, 1)",(),"(4, 6), (5, 45)",HG01891,25,0
4,chr1,371275,chr1_371274_371292,371292,TG,2,(TG)n,98,9-9/10-10,"(2, 1), (3, 1), (4, 3), (6, 3), (7, 3), (8, 2)...",(),"(8, 3), (9, 36), (10, 21)",HG01891,18,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1140904,chrX,155781112,chrX_155781111_155781127,155781127,AG,2,(AG)n,88,8-8/8-8,"(4, 3), (6, 1), (8, 1)",(),"(8, 32)",HG01243,16,0
1140905,chrX,155786790,chrX_155786789_155786809,155786809,TTTA,4,(TTTA)n,55,5-5/5-5,"(3, 1), (4, 3)",(),"(5, 23)",HG01243,20,0
1140906,chrX,155805374,chrX_155805373_155805397,155805397,AAC,3,(AAC)n,87,7-7/8-8,"(1, 3), (2, 1), (5, 1), (6, 3), (7, 1)",(),"(7, 16), (8, 17)",HG01243,24,0
1140907,chrX,155846315,chrX_155846314_155846329,155846329,ATTTT,5,(ATTTT)n,44,4-4/4-4,"(1, 1), (2, 4), (3, 2), (4, 3)",(),"(4, 22)",HG01243,15,0


In [94]:
# Save the filtered, split-labelled metadata as a TSV without the index.
final_filtered_metadata.to_csv(
    '/nfs/turbo/dcmb-class/bioinf593/groups/group_05/STRonvoli/data/metadata.tsv',
    sep='\t',
    index=False,
)

In [2]:
# Reload the saved metadata TSV from disk.
final_filtered_metadata = pd.read_csv(
    '/nfs/turbo/dcmb-class/bioinf593/groups/group_05/STRonvoli/data/metadata.tsv',
    sep='\t',
)

In [11]:
# Drop rows whose MC value contains a literal period.
# regex=False matches '.' as plain text. The earlier pattern was a backslash
# followed by a period inside a non-raw string, which is an invalid escape
# sequence (SyntaxWarning on Python 3.12+).
final_filtered_metadata_no_periods = final_filtered_metadata[
    ~final_filtered_metadata['MC'].str.contains('.', regex=False)
]

In [13]:
# Inspect the table after removing rows whose MC value contains a period.
final_filtered_metadata_no_periods

Unnamed: 0,chrom,pos,trid,end_pos,motif,len_motif,struc,MC,confidence_interval,flanking_reads,inrepeat_reads,spanning_reads,sample_name,len_repeat_region,split
0,chr1,44836,chr1_44835_44867,44867,AAAT,4,(AAAT)n,89,8-8/9-9,"(1, 5), (2, 4), (3, 2), (4, 1), (6, 4), (7, 1)...",(),"(8, 24), (9, 15)",HG01891,32,0
1,chr1,167135,chr1_167134_167150,167150,TTTA,4,(TTTA)n,44,4-4/4-4,"(1, 2), (2, 1), (3, 3), (4, 4)",(),"(4, 28)",HG01891,16,0
2,chr1,370633,chr1_370632_370648,370648,TTTC,4,(TTTC)n,44,4-4/4-4,"(1, 5), (2, 6), (3, 1), (4, 3)",(),"(4, 38)",HG01891,16,0
3,chr1,371009,chr1_371008_371033,371033,TTTTG,5,(TTTTG)n,55,4-4/5-5,"(1, 3), (2, 4), (3, 2), (4, 3), (5, 2), (8, 1)",(),"(4, 6), (5, 45)",HG01891,25,0
4,chr1,371275,chr1_371274_371292,371292,TG,2,(TG)n,98,9-9/10-10,"(2, 1), (3, 1), (4, 3), (6, 3), (7, 3), (8, 2)...",(),"(8, 3), (9, 36), (10, 21)",HG01891,18,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1140875,chrX,155781112,chrX_155781111_155781127,155781127,AG,2,(AG)n,88,8-8/8-8,"(4, 3), (6, 1), (8, 1)",(),"(8, 32)",HG01243,16,0
1140876,chrX,155786790,chrX_155786789_155786809,155786809,TTTA,4,(TTTA)n,55,5-5/5-5,"(3, 1), (4, 3)",(),"(5, 23)",HG01243,20,0
1140877,chrX,155805374,chrX_155805373_155805397,155805397,AAC,3,(AAC)n,87,7-7/8-8,"(1, 3), (2, 1), (5, 1), (6, 3), (7, 1)",(),"(7, 16), (8, 17)",HG01243,24,0
1140878,chrX,155846315,chrX_155846314_155846329,155846329,ATTTT,5,(ATTTT)n,44,4-4/4-4,"(1, 1), (2, 4), (3, 2), (4, 3)",(),"(4, 22)",HG01243,15,0


In [12]:
# FINAL METADATA: write the period-free table to metadata.tsv, replacing the
# version saved earlier in this notebook (same path).
final_filtered_metadata_no_periods.to_csv(
    '/nfs/turbo/dcmb-class/bioinf593/groups/group_05/STRonvoli/data/metadata.tsv',
    sep='\t',
    index=False,
)