In [4]:
# Generate gene index.
import os
import pandas as pd
from tqdm import tqdm

def generate_gene_index(gene_dir, index_dir):
    chr_dirs = [f"chr{i+1}" for i in range(0, 24)]
    chrs_path = [os.path.join(gene_dir, a) for a in chr_dirs]
    
    index_path = os.path.join(index_dir, "gene_index.csv")
    os.makedirs(os.path.dirname(index_path), exist_ok=True)
    if os.path.exists(index_path):
        os.remove(index_path)
    index = open(index_path, "x")
    index.write("chr,gene,size\n")
    for chr_dir in tqdm(chrs_path, total=len(chrs_path), desc="Creating index"):
        file_names = [a for a in os.listdir(chr_dir) if os.path.isfile(os.path.join(chr_dir, a))]
        for fname in file_names:
            # Count gene length.
            fpath = os.path.join(chr_dir, fname)
            df = pd.read_csv(fpath)
            len_sequence = 0
            for i, r in df.iterrows():                
                len_sequence += len(r["sequence"])

            index.write(f"{os.path.basename(chr_dir)},{fname},{len_sequence}\n")
    index.close()

gene_dir = os.path.join("data", "genome", "seqlab.positive.strand")
index_dir = os.path.join("data", "genome", "seqlab.positive.strand")
generate_gene_index(gene_dir, index_dir)

Creating index: 100%|██████████| 24/24 [06:53<00:00, 17.21s/it]


In [None]:
"""
Merge genes.
"""
import os
from data_preparation import merge_csv
for c in range(4, 24):
    chr = f"chr{c+1}"
    src_dir = os.path.join("workspace", "seq2seq-stride.384", chr)
    dest_file = os.path.join("workspace", "seq2seq-stride.384", chr, "bundle.csv")
    src_files = [os.path.join(src_dir, fname) for fname in os.listdir(src_dir)] 
    status = merge_csv(src_files, dest_file)
    if not status:
        print(f"Something wrong with merging files in directory {src_dir}.")
        raise Exception("something wrong.")


In [None]:
"""
Make sample from bundle.csv.
"""
from utils.utils import create_n_sample
import os

n_sample = 100
for c in range(24):
    bundle_csv = os.path.join("workspace", "seq2seq-stride.384", f"chr{c+1}", "bundle.csv")
    bundle_sample_csv = os.path.join("workspace", "seq2seq-stride.384", f"chr{c+1}", "bundle.sample.csv")
    create_n_sample(bundle_csv, n_sample, bundle_sample_csv)

In [None]:
"""
Make train, validation, and test bundle for each chr.
"""
from utils.utils import split_and_store_csv
import os

fractions = [0.7, 0.2, 0.1]
file_types = ["train", "validation", "test"]
dest_dir = [os.path.join("workspace", "seq2seq-stride.384", f"chr{c+1}") for c in range(24)]
src_files = [os.path.join("workspace", "seq2seq-stride.384", f"chr{c+1}", "bundle.csv") for c in range(24)]
dest_train_files = [os.path.join("workspace", "seq2seq-stride.384", f"chr{c+1}", "bundle.train.csv") for c in range(24)]
dest_valid_files = [os.path.join("workspace", "seq2seq-stride.384", f"chr{c+1}", "bundle.validation.csv") for c in range(24)]
dest_test_files = [os.path.join("workspace", "seq2seq-stride.384", f"chr{c+1}", "bundle.test.csv") for c in range(24)]

for src, dest_train, dest_valid, dest_test in zip(src_files, dest_train_files, dest_valid_files, dest_test_files):
    split_and_store_csv(src, fractions, [
        dest_train, dest_valid, dest_test
    ])

In [None]:
"""
Merge bundle.sample.csv from every chr.
"""
from data_preparation import merge_csv
import os
chr_bundle_sample_csvs = [os.path.join("workspace", "seq2seq-stride.384", f"chr{c+1}", "bundle.sample.csv") for c in range(24)]
merged_bundle_sample = os.path.join("workspace", "seq2seq-stride.384", "bundle.sample.csv")
merge_csv(chr_bundle_sample_csvs, merged_bundle_sample)

In [None]:
"""
Merge bundle from every chr.
"""
from data_preparation import merge_csv
import os
chr_bundle_csvs = [os.path.join("workspace", "seq2seq-stride.384", f"chr{c+1}", "bundle.csv") for c in range(24)]
merged_bundle = os.path.join("workspace", "seq2seq-stride.384", "bundle.csv")
merge_csv(chr_bundle_csvs, merged_bundle)

In [None]:
"""
Merge bundle train, validation, and test from every chr.
"""
from data_preparation import merge_csv
import os

for t in ["train", "validation", "test"]:
    chr_bundle_csvs = [os.path.join("workspace", "seq2seq-stride.384", f"chr{c+1}", f"bundle.{t}.csv") for c in range(24)]
    merged_bundle = os.path.join("workspace", "seq2seq-stride.384", f"bundle.{t}.csv")
    merge_csv(chr_bundle_csvs, merged_bundle)

In [None]:
"""
Merge bundle training, validation, and test from every chr.
"""
from data_preparation import merge_csv
import os
for t in ["train", "validation", "test"]:
    chr_bundle_csvs = [os.path.join("workspace", "seq2seq-stride.384", f"chr{c+1}", f"bundle.{t}.csv") for c in range(24)]
    merged_bundle = os.path.join("workspace", "seq2seq-stride.384", f"bundle.{t}.csv")
    merge_csv(chr_bundle_csvs, merged_bundle)

In [None]:
from data_preparation import kmer
import pandas as pd
import os

_cols = ['sequence', 'label']
_stride = 256
_length = 512
for c in ['chr{}'.format(i+1) for i in range(24)]:
    genes_dir_by_chr = os.path.join('data', 'genome', 'sequential-labelling-positive-strand', c)
    genes_expansion_dir_by_chr = os.path.join('workspace', f"sequential-labelling-stride.{_stride}", c)
    if not os.path.exists(genes_expansion_dir_by_chr):
        os.makedirs(genes_expansion_dir_by_chr, exist_ok=True)
    for gene_file in os.listdir(genes_dir_by_chr):
        gene_file_path = os.path.join(genes_dir_by_chr, gene_file)
        gene_expansion_file_path = os.path.join(genes_expansion_dir_by_chr, f"{gene_file.split('.')[0]}.expanded.csv")
        if os.path.exists(gene_expansion_file_path):
            os.remove(gene_expansion_file_path)
        target_file = open(gene_expansion_file_path, 'x')
        target_file.write(f"sequence,label\n")
        df = pd.read_csv(gene_file_path)
        print(f"Working on {c} {gene_file_path}                                 ", end='\r')
        for _, row in df.iterrows():
            seq_chunks = kmer(row['sequence'].strip(), _length, _stride)
            label_chunks = kmer(row['label'].strip(), _length, _stride)
            for seq, label in zip(seq_chunks, label_chunks):
                target_file.write(f"{seq},{label}\n")
        target_file.close()


In [None]:
"""
Filter index based on gene name.
"""
import os
import pandas as pd
_chr_indices = [
    'NC_000001.11.csv',
    'NC_000002.12.csv',
    'NC_000003.12.csv',
    'NC_000004.12.csv',
    'NC_000005.10.csv',
    'NC_000006.12.csv',
    'NC_000007.14.csv',
    'NC_000008.11.csv',
    'NC_000009.12.csv',
    'NC_000010.11.csv',
    'NC_000011.10.csv',
    'NC_000012.12.csv',
    'NC_000013.11.csv',
    'NC_000014.9.csv',
    'NC_000015.10.csv',
    'NC_000016.10.csv',
    'NC_000017.11.csv',
    'NC_000018.10.csv',
    'NC_000019.10.csv',
    'NC_000020.11.csv',
    'NC_000021.9.csv',
    'NC_000022.11.csv',
    'NC_000023.11.csv',
    'NC_000024.10.csv']
_chr_dir = ["chr{}".format(i+1) for i in range(len(_chr_indices))]

for chr, chr_number in zip(_chr_indices, _chr_dir):
    path = os.path.join('data', 'genome', 'grch38', 'csvs_strand', chr)
    df = pd.read_csv(path)
    genes = list(df['gene'].unique())
    genes = [a for a in genes if not pd.isnull(a)]
    for g in genes:
        ndf = df[df['gene'] == g]
        path = os.path.join('data', 'genome', 'grch38', 'genes', chr_number,"{}.csv".format(g))
        if not os.path.exists(os.path.dirname(path)):
            os.mkdir(os.path.dirname(path))
        ndf.to_csv(path, index=False)
        print("Success: {}                                                      ".format(path), end="\r")

In [8]:
"""
Generate sequence from fasta based on gene indices.
"""
import os
from Bio import SeqIO
import pandas as pd
from tqdm import tqdm
from data_dir import chr_fasta_mapname

strand = '+'
chrs = ['chr{}'.format(i+1) for i in range(24)] # Folder name.
# chrs = ['chr{}'.format(i+1) for i in range(2)] # Folder name.

for c in chrs:
    chr_genes_indices_dir = os.path.join("data", "genome", "grch38", "genes", c)
    chr_genes_sequence_csv_dir = os.path.join("data", "genome", "seqlab.positive.strand",c)
    chr_fasta = os.path.join("data", "chr", chr_fasta_mapname[c])

    if not os.path.exists(chr_fasta):
        raise FileNotFoundError("Fasta {} not found.".format(chr_fasta))

    """
    Read whole chromosome here and return its complete sequence.
    Yes, it's long.
    """
    records = SeqIO.parse(chr_fasta, "fasta")
    chr_records = next(records)
    chr_sequence = str(chr_records.seq)
    genome_sequence = chr_sequence # Use genome sequence reader here.

    _columns = ['sequence', 'label']
    for fname in os.listdir(chr_genes_indices_dir):
        print("Working on chr {} gene {}                                    ".format(c, fname), end='\r')
        fpath = os.path.join(chr_genes_indices_dir, fname)
        if os.path.isfile(fpath):
            """
            Gene index found. Read the index and cross-reference with genome sequence.
            """
            index_df = pd.read_csv(fpath)
            gene_region = index_df[index_df['region'] == "gene"]
            if strand != None:
                gene_region = gene_region[gene_region['strand'] == strand]

            if len(gene_region) > 0:
                gene_df = pd.DataFrame(columns=_columns)
                for i, g in gene_region.iterrows():
                    gene_start_index = int(g['start_index'])
                    gene_end_index = int(g['end_index'])
                    gene_sequence = genome_sequence[gene_start_index:gene_end_index + 1]
                    # print("Gene sequence: {}".format(gene_sequence))
                    # If gene sequence isn't None then this gene is available in chromosome sequence.
                    if gene_sequence != None:
                        gene_sequential_labelling = ['i' for a in gene_sequence]
                        exons = index_df[index_df['region'] == "exon"]
                        #if strand != None:
                        #    exons = index_df[index_df['strand'] == strand]
                        for j, r in exons.iterrows():
                            start_index = int(r['start_index'])
                            end_index = int(r['end_index'])
                            if (start_index >= gene_start_index and end_index <= gene_end_index):
                                start_index = int(r['start_index']) - gene_start_index
                                end_index = int(r['end_index']) - gene_start_index
                                for k in range(start_index, end_index + 1):
                                    gene_sequential_labelling[k] = 'E'
                        gene_sequential_labelling = ''.join(gene_sequential_labelling)
                        #endfor
                        gene_df = pd.concat([gene_df, pd.DataFrame([[gene_sequence, gene_sequential_labelling]], columns=_columns)])

                target_path = os.path.join(chr_genes_sequence_csv_dir, fname)
                # data\genome\sequential-labelling\chr1
                # Only write if dataframe is not empty.
                if gene_df.shape[0] > 0:
                    if not os.path.exists(chr_genes_sequence_csv_dir):
                        os.makedirs(chr_genes_sequence_csv_dir, exist_ok=True)
                    if gene_df.shape[0] > 0:    
                        gene_df.to_csv(target_path, index=False)
                    

Working on chr chr24 gene ZNF92P1Y.csv                                                     

In [1]:
"""Chunk each gene into 512 characters, for each chromosome."""
import pandas as pd
from data_preparation import kmer
import os

chrs = [f"chr{i + 1}" for i in range(24)] # Test one chromosome.
chr_paths = [os.path.join("data", "genome", "seqlab.positive.strand", f"{chr}") for chr in chrs]
dest_paths = [os.path.join("data", "genome", "seqlab.strand-positive.stride-512", f"{chr}") for chr in chrs]
for cp, dp in zip(chr_paths, dest_paths):
    genes = os.listdir(cp)
    genes = [g for g in genes if "expanded" not in g.split('.')]
    for gene in genes:
        gene_path = os.path.join(cp, gene)
        gene_df = pd.read_csv(gene_path)
        print(f"Processing {os.path.basename(cp)} {gene}                    ", end="\r")
        for i, r in gene_df.iterrows():
            sequence = r["sequence"]
            label = r["label"]
            seq_chunks = kmer(sequence, 512, 512)
            label_chunks = kmer(label, 512, 512)
            dest_path = os.path.join(dp, f"{gene.split('.')[0]}.csv")
            if os.path.exists(dest_path):
                os.remove(dest_path)
            if not os.path.exists(os.path.dirname(dest_path)):
                os.makedirs(os.path.dirname(dest_path), exist_ok=True)
            dest = open(dest_path, "x")
            dest.write("sequence,label\n")
            for c, l in zip(seq_chunks, label_chunks):
                dest.write(f"{c},{l}\n")
            dest.close()


Processing chr24 ZFY.csv                                       

In [None]:
"""Move file around"""

import os
for i in range(24):
    k = i + 1
    src_path = os.path.join("data", "genome", "seqlab.positive.strand", f"chr{k}")
    dest_path = os.path.join("data", "genome", "seqlab.positive.strand", f"chr{k}.expanded")
    if not os.path.exists(dest_path):
        os.makedirs(dest_path, exist_ok=True)
    files = os.listdir(src_path)
    files = [f for f in files if "expanded" in f.split('.')]
    for f in files:
        oldpath = os.path.join(src_path, f)
        newpath = os.path.join(dest_path, f)
        if not os.path.exists(newpath):
            os.rename(oldpath, newpath)
        else:
            print(f"Skip {oldpath}                                                    ", end="\r")

In [None]:
"""
Rename files in expanded folder
filename.expanded.csv => filename.csv
"""
src_paths = [os.path.join("data", "genome", "seqlab.positive.strand", f"chr{i + 1}.expanded") for i in range(24)]
for srcdir in src_paths:
    files = os.listdir(srcdir)
    for f in files:
        oldpath = os.path.join(srcdir, f)
        newpath = os.path.join(srcdir, f"{f.split('.')[0]}.csv")
        os.rename(oldpath, newpath)

In [4]:
"""Generate kmer version from sequence."""
import os
import pandas as pd
from data_preparation import str_kmer

# src_paths = [os.path.join("data", "genome", "seqlab.positive.strand", f"chr{i + 1}") for i in range(24)]
src_paths = [os.path.join("data", "genome", "seqlab.strand-positive.stride-512", f"chr{i + 1}") for i in range(24)]
# dest_paths = [os.path.join("data", "genome", "seqlab.strand-positive.kmer", f"chr{i + 1}") for i in range(24)]
dest_paths = [os.path.join("data", "genome", "seqlab.strand-positive.kmer.stride-512", f"chr{i + 1}") for i in range(24)]
for srcdir, destdir in zip(src_paths, dest_paths):
    files = os.listdir(srcdir)
    for f in files:
        src = os.path.join(srcdir, f)
        dest = os.path.join(destdir, f)
        if not os.path.exists(destdir):
            os.makedirs(destdir, exist_ok=True)
        if os.path.exists(dest):
            os.remove(dest)
        dest = open(dest, "x")
        dest.write("sequence,label\n")
        df = pd.read_csv(src)
        for i, r in df.iterrows():
            sequence = r["sequence"]
            label = r["label"]
            dest.write(f"{str_kmer(sequence, 3)},{str_kmer(label, 3)}\n")
        dest.close()
            

In [1]:
"""Chunk each gene into 510 token. Since sequence is already in token form, ``kmer`` function cannot be used, had to create another script."""
import pandas as pd
from data_preparation import kmer
import os

def chunk_kmer_sequence(chunk: str, size: int, stride: int) -> str:
    arr = chunk.split(' ')
    arr = kmer(arr, size, window_size=stride)
    return arr

chrs = [f"chr{i + 1}" for i in range(24)] # Test one chromosome.
chr_paths = [os.path.join("data", "genome", "seqlab.strand-positive.kmer", f"{chr}") for chr in chrs]
dest_paths = [os.path.join("data", "genome", "seqlab.strand-positive.kmer.stride-205", f"{chr}") for chr in chrs]
for cp, dp in zip(chr_paths, dest_paths):
    genes = os.listdir(cp)
    genes = [g for g in genes if "expanded" not in g.split('.')]
    for gene in genes:
        gene_path = os.path.join(cp, gene)
        gene_df = pd.read_csv(gene_path)
        print(f"Processing {os.path.basename(cp)} {gene}                    ", end="\r")
        for i, r in gene_df.iterrows():
            sequence = r["sequence"]
            label = r["label"]
            seq_chunks = chunk_kmer_sequence(sequence, 510, 205)
            label_chunks = chunk_kmer_sequence(label, 510, 205)    
            dest_path = os.path.join(dp, f"{gene.split('.')[0]}.csv")
            if os.path.exists(dest_path):
                os.remove(dest_path)
            if not os.path.exists(os.path.dirname(dest_path)):
                os.makedirs(os.path.dirname(dest_path), exist_ok=True)
            dest = open(dest_path, "x")
            dest.write("sequence,label\n")
            for c, l in zip(seq_chunks, label_chunks):
                dest.write(f"{' '.join(c)},{' '.join(l)}\n")
            dest.close()

Processing chr24 ZFY.csv                                       

In [15]:
"""
Split gene_index for training, validation, and testing.
"""
import os
index_dir = os.path.join("index")
gene_index = os.path.join(index_dir, f"gene_index.csv")
train_index = os.path.join(index_dir, "gene_train_index.csv")
val_index = os.path.join(index_dir, "gene_validation_index.csv")
test_index = os.path.join(index_dir, "gene_test_index.csv")

import pandas as pd
df = pd.read_csv(gene_index)
train_df = df.sample(frac=0.8)
test_df = df.drop(train_df.index)
val_df = test_df.sample(frac=0.5)
test_df = test_df.drop(val_df.index)
train_df.to_csv(train_index, index=False)
val_df.to_csv(val_index, index=False)
test_df.to_csv(test_index, index=False)

In [9]:
# Enrich index with gene length.
import os
import pandas as pd
from tqdm import tqdm

index_dir = os.path.join("index")
gene_index = os.path.join(index_dir, "gene_index.csv")
gene_train_index = os.path.join(index_dir, "gene_train_index.csv")
gene_validation_index = os.path.join(index_dir, "gene_validation_index.csv")
gene_test_index = os.path.join(index_dir, "gene_test_index.csv")
gene_dir = os.path.join("data", "gene_dir")

for index in [gene_index, gene_train_index, gene_validation_index, gene_test_index]:
    df = pd.read_csv(index)
    len_sequences = []
    for i, r in tqdm(df.iterrows(), total=df.shape[0], desc="Counting sequence length"):
        chr_name = r["chr"]
        gene_filename = r["gene"]
        gene_filepath = os.path.join(gene_dir, chr_name, gene_filename)
        gene_df = pd.read_csv(gene_filepath)
        length = 0
        if gene_df.shape[0] > 1:
            print(chr_name, gene_filename)
        for i, j in gene_df.iterrows():
            length += len(j["sequence"])
        len_sequences.append(length)
    df["length"] = len_sequences
    df.to_csv(index)


Counting sequence length:   0%|          | 21/19235 [00:00<01:32, 207.83it/s]

chr1 AADACL3.csv
chr1 AADACL4.csv
chr1 ABCD3.csv
chr1 ACADM.csv
chr1 ACBD3-AS1.csv
chr1 ACKR1.csv
chr1 ACOT11.csv
chr1 ACTL8.csv
chr1 ACTN2.csv
chr1 ACTRT2.csv
chr1 ADAM15.csv
chr1 ADAMTSL4.csv
chr1 ADGRL2.csv
chr1 ADORA1.csv
chr1 ADPRS.csv
chr1 AGBL4-AS1.csv
chr1 AGL.csv
chr1 AGO1.csv
chr1 AGO3.csv
chr1 AGO4.csv
chr1 AGRN.csv
chr1 AGTRAP.csv
chr1 AHCYL1.csv
chr1 AJAP1.csv
chr1 AK4.csv
chr1 AK5.csv
chr1 AKIRIN1.csv
chr1 AKR1A1.csv
chr1 ALG14-AS1.csv
chr1 ALG6.csv
chr1 ALPL.csv
chr1 AMPD2.csv
chr1 AMY1A.csv
chr1 AMY1C.csv
chr1 AMY2A.csv
chr1 AMY2B.csv


Counting sequence length:   0%|          | 66/19235 [00:00<01:27, 220.14it/s]

chr1 ANGPTL3.csv
chr1 ANGPTL7.csv
chr1 ANKRD65-AS1.csv
chr1 ANXA9.csv
chr1 AP4B1-AS1.csv
chr1 APCS.csv
chr1 AQP10.csv
chr1 ARF1.csv
chr1 ARHGAP29-AS1.csv
chr1 ARHGEF10L.csv
chr1 ARHGEF16.csv
chr1 ARHGEF2-AS1.csv
chr1 ARHGEF2-AS2.csv
chr1 ARID1A.csv
chr1 ARMH1.csv
chr1 ARTN.csv
chr1 ARV1.csv
chr1 ASH1L-AS1.csv
chr1 ATAD3A.csv
chr1 ATAD3B.csv
chr1 ATAD3C.csv
chr1 ATF3.csv
chr1 ATF6.csv
chr1 ATG4C.csv
chr1 ATP1A1.csv
chr1 ATP1A2.csv
chr1 ATP1A4.csv
chr1 ATP1B1.csv
chr1 ATP2B4.csv
chr1 ATP5IF1.csv
chr1 ATP5PB.csv
chr1 ATP6V0B.csv
chr1 ATP8B2.csv
chr1 ATXN7L2.csv
chr1 AXDND1.csv
chr1 AZIN2.csv
chr1 B3GALT6.csv
chr1 B4GALT2.csv
chr1 BCAN.csv
chr1 BCAR3-AS1.csv
chr1 BCL10-AS1.csv
chr1 BCL9.csv
chr1 BECN2.csv
chr1 BGLAP.csv
chr1 BLZF1.csv
chr1 BMP8A.csv


Counting sequence length:   1%|          | 116/19235 [00:00<01:23, 229.71it/s]

chr1 BMP8B-AS1.csv
chr1 BNIPL.csv
chr1 BOLA1.csv
chr1 BRDT.csv
chr1 BRINP2.csv
chr1 BRINP3-DT.csv
chr1 BROX.csv
chr1 BSND.csv
chr1 BTBD19.csv
chr1 BTBD8.csv
chr1 BTF3L4.csv
chr1 BTG2.csv
chr1 C1orf100.csv
chr1 C1orf105.csv
chr1 C1orf112.csv
chr1 C1orf115.csv
chr1 C1orf122.csv
chr1 C1orf146.csv
chr1 C1orf162.csv
chr1 C1orf167.csv
chr1 C1orf185.csv
chr1 C1orf21.csv
chr1 C1orf220.csv
chr1 C1orf226.csv
chr1 C1orf50.csv
chr1 C1orf53.csv
chr1 C1orf54.csv
chr1 C1orf56.csv
chr1 C1orf68.csv
chr1 C1orf94.csv
chr1 C1QA.csv
chr1 C1QB.csv
chr1 C1QC.csv
chr1 C2CD4D-AS1.csv
chr1 C4BPA.csv
chr1 C4BPB.csv
chr1 C8A.csv
chr1 CA14.csv
chr1 CA6.csv
chr1 CACHD1.csv
chr1 CACNA1E.csv


Counting sequence length:   1%|          | 139/19235 [00:00<01:40, 189.54it/s]

chr1 CACYBP.csv
chr1 CADM3.csv
chr1 CALML6.csv
chr1 CAMK1G.csv
chr1 CAMSAP2.csv
chr1 CAMTA1.csv
chr1 CAP1.csv
chr1 CAPN2.csv
chr1 CAPN9.csv
chr1 CAPZA1.csv
chr1 CASQ1.csv
chr1 CATSPER4.csv
chr1 CATSPERE.csv
chr1 CCDC18.csv
chr1 CCDC185.csv
chr1 CCDC24.csv
chr1 CCDC27.csv
chr1 CCDC28B.csv
chr1 CCDC30.csv
chr1 CCN1.csv
chr1 CD101.csv
chr1 CD160.csv
chr1 CD1A.csv
chr1 CD1C.csv
chr1 CD1D.csv
chr1 CD1E.csv
chr1 CD2.csv
chr1 CD46.csv
chr1 CD52.csv
chr1 CD53.csv
chr1 CD55.csv
chr1 CDA.csv
chr1 CDC14A.csv
chr1 CDC20.csv
chr1 CDC42.csv
chr1 CDC7.csv
chr1 CDC73.csv

Counting sequence length:   1%|          | 198/19235 [00:00<01:20, 237.49it/s]


chr1 CDCA8.csv
chr1 CDK18.csv
chr1 CDKN2C.csv
chr1 CELA2A.csv
chr1 CELA2B.csv
chr1 CELA3A.csv
chr1 CELA3B.csv
chr1 CELSR2.csv
chr1 CENPF.csv
chr1 CENPS-CORT.csv
chr1 CENPS.csv
chr1 CEP350.csv
chr1 CEP85.csv
chr1 CEPT1.csv
chr1 CFAP107.csv
chr1 CFAP57.csv
chr1 CFH.csv
chr1 CFHR1.csv
chr1 CFHR2.csv
chr1 CFHR3.csv
chr1 CFHR4.csv
chr1 CFHR5.csv
chr1 CGN.csv
chr1 CH17-125A10.2.csv
chr1 CHD1L.csv
chr1 CHI3L2.csv
chr1 CHIA.csv
chr1 CHRM3.csv
chr1 CHRNB2.csv
chr1 CHTOP.csv
chr1 CIART.csv
chr1 CKS1B.csv
chr1 CLCA1.csv
chr1 CLCA2.csv
chr1 CLCA4.csv
chr1 CLCN6.csv
chr1 CLCNKA.csv
chr1 CLCNKB.csv
chr1 CLIC4.csv
chr1 CMPK1.csv
chr1 CNIH3.csv
chr1 CNIH4.csv
chr1 CNKSR1.csv
chr1 CNN3-DT.csv
chr1 CNST.csv
chr1 CNTN2.csv
chr1 COA6.csv
chr1 COG2.csv
chr1 COQ8A.csv
chr1 CORT.csv
chr1 COX20.csv
chr1 CPT2.csv
chr1 CPTP.csv
chr1 CR1.csv


Counting sequence length:   1%|▏         | 252/19235 [00:01<01:15, 252.48it/s]

chr1 CR1L.csv
chr1 CR2.csv
chr1 CRB1.csv
chr1 CRCT1.csv
chr1 CREB3L4.csv
chr1 CROCC.csv
chr1 CSF1.csv
chr1 CSMD2-AS1.csv
chr1 CSRP1-AS1.csv
chr1 CTH.csv
chr1 CTPS1.csv
chr1 CTRC.csv
chr1 CTTNBP2NL.csv
chr1 CTXND2.csv
chr1 CYB561D1.csv
chr1 CYP4A22.csv
chr1 CYP4B1.csv
chr1 CYP4X1.csv
chr1 CYP4Z1.csv
chr1 DAB1-AS1.csv
chr1 DAP3.csv
chr1 DARS2.csv
chr1 DCAF6.csv
chr1 DCAF8-DT.csv
chr1 DCDC2B.csv
chr1 DCLRE1B.csv
chr1 DCST1.csv
chr1 DDI2.csv
chr1 DDR2.csv
chr1 DDX20.csv
chr1 DDX59-AS1.csv
chr1 DEGS1.csv
chr1 DEPDC1-AS1.csv
chr1 DESI2.csv
chr1 DFFB.csv
chr1 DHCR24-DT.csv
chr1 DHDDS.csv
chr1 DHX9.csv
chr1 DIO1.csv
chr1 DISC1-IT1.csv
chr1 DISC1.csv
chr1 DISP1.csv
chr1 DISP3.csv
chr1 DMAP1.csv
chr1 DMBX1.csv


Counting sequence length:   1%|▏         | 278/19235 [00:01<01:26, 218.86it/s]

chr1 DMRTB1.csv
chr1 DNAH14.csv
chr1 DNAI3.csv
chr1 DNAJB4.csv
chr1 DNAJC16.csv
chr1 DNAJC6.csv
chr1 DNALI1.csv
chr1 DNASE2B.csv
chr1 DNM3-IT1.csv
chr1 DNM3.csv
chr1 DPH2.csv
chr1 DPYD-AS1.csv
chr1 DPYD-AS2.csv
chr1 DR1.csv
chr1 DRAXIN.csv
chr1 DTL.csv
chr1 DUSP12.csv
chr1 DUSP23.csv
chr1 DYNLT5.csv
chr1 DYRK3.csv
chr1 ECE1-AS1.csv
chr1 ECM1.csv
chr1 EDARADD.csv
chr1 EFCAB14-AS1.csv
chr1 EFCAB2.csv
chr1 EFCAB7.csv
chr1 EFHD2.csv
chr1 EFNA1.csv
chr1 EFNA3.csv
chr1 EFNA4.csv
chr1 EIF3I.csv
chr1 ELAPOR1.csv
chr1 ELAVL4.csv
chr1 ELF3.csv
chr1 ELOA.csv
chr1 EMC1-AS1.csv
chr1 ENO1-AS1.csv
chr1 EPB41.csv
chr1 EPHA2-AS1.csv
chr1 EPHA8.csv
chr1 EPHB2.csv
chr1 EPHX1.csv
chr1 EPHX4.csv
chr1 EPS15-AS1.csv


Counting sequence length:   2%|▏         | 342/19235 [00:01<01:11, 262.98it/s]

chr1 ERICH3-AS1.csv
chr1 ERLNC1.csv
chr1 ERMAP.csv
chr1 ESPN.csv
chr1 EXO1.csv
chr1 EXO5.csv
chr1 EXOSC10-AS1.csv
chr1 EXTL1.csv
chr1 FAAH.csv
chr1 FAF1-AS1.csv
chr1 FALEC.csv
chr1 FAM102B.csv
chr1 FAM110D.csv
chr1 FAM163A.csv
chr1 FAM167B.csv
chr1 FAM177B.csv
chr1 FAM183A.csv
chr1 FAM20B.csv
chr1 FAM43B.csv
chr1 FAM71A.csv
chr1 FAM72D.csv
chr1 FAM76A.csv
chr1 FAM78B-AS1.csv
chr1 FAM87B.csv
chr1 FASLG.csv
chr1 FBLIM1.csv
chr1 FBXO28.csv
chr1 FBXO44.csv
chr1 FBXO6.csv
chr1 FCER1A.csv
chr1 FCER1G.csv
chr1 FCGR1A.csv
chr1 FCGR2A.csv
chr1 FCGR2B.csv
chr1 FCRL6.csv
chr1 FCRLA.csv
chr1 FCRLB.csv
chr1 FDPS.csv
chr1 FGGY.csv
chr1 FHAD1.csv
chr1 FLAD1.csv
chr1 FLG-AS1.csv
chr1 FLJ39095.csv
chr1 FLVCR1.csv
chr1 FMN2.csv
chr1 FMO1.csv
chr1 FMO2.csv
chr1 FMO3.csv
chr1 FMO4.csv
chr1 FNBP1L.csv
chr1 FNDC7.csv
chr1 FOXD2.csv
chr1 FOXD3.csv
chr1 FOXE3.csv
chr1 FOXO6.csv
chr1 FPGT-TNNI3K.csv


Counting sequence length:   2%|▏         | 400/19235 [00:01<01:10, 266.26it/s]

chr1 FPGT.csv
chr1 G0S2.csv
chr1 GABPB2.csv
chr1 GABRD.csv
chr1 GADD45A.csv
chr1 GALNT2.csv
chr1 GAS5-AS1.csv
chr1 GBP6.csv
chr1 GCSAML.csv
chr1 GGPS1.csv
chr1 GIPC2.csv
chr1 GJA4.csv
chr1 GJA8.csv
chr1 GJB3.csv
chr1 GJB4.csv
chr1 GJB5.csv
chr1 GJC2.csv
chr1 GMEB1.csv
chr1 GNAI3.csv
chr1 GNB1-DT.csv
chr1 GNG12-AS1.csv
chr1 GNPAT.csv
chr1 GORAB.csv
chr1 GPR137B.csv
chr1 GPR25.csv
chr1 GPR3.csv
chr1 GPR37L1.csv
chr1 GPR52.csv
chr1 GPR61.csv
chr1 GPR88.csv
chr1 GPR89A.csv
chr1 GPR89B.csv
chr1 GPSM2.csv
chr1 GPX7.csv
chr1 GRHL3.csv
chr1 GS1-204I12.4.csv
chr1 GSTM1.csv
chr1 GSTM2.csv
chr1 GSTM4.csv
chr1 GSTM5.csv
chr1 GUCA2B.csv
chr1 GUK1.csv
chr1 H2AC19.csv
chr1 H2AC20.csv
chr1 H2BU1.csv
chr1 H3-3A.csv
chr1 H3C15.csv
chr1 H4C14.csv
chr1 H6PD.csv
chr1 HAO2-IT1.csv
chr1 HAO2.csv
chr1 HAPLN2.csv
chr1 HAX1.csv
chr1 HCN3.csv
chr1 HCRTR1.csv
chr1 HDAC1.csv
chr1 HES3.csv


Counting sequence length:   2%|▏         | 459/19235 [00:01<01:10, 265.30it/s]

chr1 HHAT.csv
chr1 HHLA3.csv
chr1 HIPK1.csv
chr1 HLX.csv
chr1 HMCN1.csv
chr1 HMGB4.csv
chr1 HMGN2.csv
chr1 HNRNPCL3.csv
chr1 HOOK1.csv
chr1 HPCA.csv
chr1 HPDL.csv
chr1 HS2ST1.csv
chr1 HSD11B1.csv
chr1 HSD17B7.csv
chr1 HSD3B1.csv
chr1 HSD3B2.csv
chr1 HSPA6.csv
chr1 HTR6.csv
chr1 IARS2.csv
chr1 IBA57.csv
chr1 ICMT-DT.csv
chr1 IER5.csv
chr1 IFI16.csv
chr1 IFI44.csv
chr1 IFI44L.csv
chr1 IGFN1.csv
chr1 IGSF21.csv
chr1 IKBKE.csv
chr1 IL12RB2.csv
chr1 IL19.csv
chr1 IL20.csv
chr1 IL23R.csv
chr1 IL24.csv
chr1 IL6R.csv
chr1 INAVA.csv
chr1 INKA2-AS1.csv
chr1 INTS3.csv
chr1 IPO13.csv
chr1 IPO9.csv
chr1 IQCC.csv
chr1 ISG15.csv
chr1 IVL.csv
chr1 KAZN.csv
chr1 KCNAB2.csv


Counting sequence length:   3%|▎         | 486/19235 [00:02<01:12, 257.29it/s]

chr1 KCNC4.csv
chr1 KCND3-AS1.csv
chr1 KCNJ9.csv
chr1 KCNK1.csv
chr1 KCNK2.csv
chr1 KCNQ4.csv
chr1 KCTD3.csv
chr1 KDM1A.csv
chr1 KDM4A.csv
chr1 KHDRBS1.csv
chr1 KIAA1522.csv
chr1 KIAA1614.csv
chr1 KIF1B.csv
chr1 KIF26B.csv
chr1 KIF2C.csv
chr1 KIRREL1-IT1.csv
chr1 KIRREL1.csv
chr1 KLF17.csv
chr1 KLHDC7A.csv
chr1 KLHDC9.csv
chr1 KLHL17.csv
chr1 KLHL20.csv
chr1 KMO.csv
chr1 KPNA6.csv
chr1 KPRP.csv
chr1 L1TD1.csv
chr1 LAMC1.csv
chr1 LAMC2.csv
chr1 LAMTOR2.csv
chr1 LAMTOR5-AS1.csv
chr1 LAX1.csv
chr1 LCE1A.csv
chr1 LCE1B.csv
chr1 LCE1D.csv
chr1 LCE1E.csv
chr1 LCE1F.csv
chr1 LCE2A.csv
chr1 LCE2B.csv
chr1 LCE2C.csv
chr1 LCE2D.csv
chr1 LCE3B.csv
chr1 LCE3C.csv
chr1 LCE4A.csv
chr1 LCE5A.csv
chr1 LCE6A.csv
chr1 LCK.csv
chr1 LDLRAD2.csv
chr1 LDLRAP1.csv
chr1 LELP1.csv
chr1 LEMD1-AS1.csv
chr1 LEMD1-DT.csv


Counting sequence length:   3%|▎         | 544/19235 [00:02<01:08, 271.69it/s]

chr1 LENEP.csv
chr1 LEPR.csv
chr1 LEPROT.csv
chr1 LEXM.csv
chr1 LGALS8.csv
chr1 LGR6.csv
chr1 LHX4.csv
chr1 LHX8.csv
chr1 LHX9.csv
chr1 LIN28A.csv
chr1 LINC00184.csv
chr1 LINC00210.csv
chr1 LINC00260.csv
chr1 LINC00272.csv
chr1 LINC00302.csv
chr1 LINC00339.csv
chr1 LINC00467.csv
chr1 LINC00538.csv
chr1 LINC00623.csv
chr1 LINC00626.csv
chr1 LINC00853.csv
chr1 LINC00869.csv
chr1 LINC01031.csv
chr1 LINC01036.csv
chr1 LINC01037.csv
chr1 LINC01128.csv
chr1 LINC01132.csv
chr1 LINC01133.csv
chr1 LINC01134.csv
chr1 LINC01135.csv
chr1 LINC01140.csv
chr1 LINC01144.csv
chr1 LINC01221.csv
chr1 LINC01226.csv
chr1 LINC01307.csv
chr1 LINC01341.csv
chr1 LINC01342.csv
chr1 LINC01346.csv
chr1 LINC01352.csv
chr1 LINC01353.csv
chr1 LINC01357.csv
chr1 LINC01358.csv
chr1 LINC01360.csv
chr1 LINC01362.csv
chr1 LINC01397.csv
chr1 LINC01409.csv
chr1 LINC01525.csv
chr1 LINC01633.csv
chr1 LINC01641.csv
chr1 LINC01645.csv
chr1 LINC01646.csv
chr1 LINC01647.csv
chr1 LINC01649.csv
chr1 LINC01653.csv
chr1 LINC01654.cs

Counting sequence length:   3%|▎         | 611/19235 [00:02<01:03, 294.69it/s]

chr1 LINC01657.csv
chr1 LINC01661.csv
chr1 LINC01672.csv
chr1 LINC01681.csv
chr1 LINC01685.csv
chr1 LINC01686.csv
chr1 LINC01688.csv
chr1 LINC01693.csv
chr1 LINC01696.csv
chr1 LINC01698.csv
chr1 LINC01699.csv
chr1 LINC01701.csv
chr1 LINC01702.csv
chr1 LINC01703.csv
chr1 LINC01707.csv
chr1 LINC01709.csv
chr1 LINC01712.csv
chr1 LINC01714.csv
chr1 LINC01717.csv
chr1 LINC01719.csv
chr1 LINC01720.csv
chr1 LINC01732.csv
chr1 LINC01736.csv
chr1 LINC01738.csv
chr1 LINC01740.csv
chr1 LINC01741.csv
chr1 LINC01745.csv
chr1 LINC01750.csv
chr1 LINC01755.csv
chr1 LINC01757.csv
chr1 LINC01758.csv
chr1 LINC01761.csv
chr1 LINC01763.csv
chr1 LINC01765.csv
chr1 LINC01767.csv
chr1 LINC01768.csv
chr1 LINC01772.csv
chr1 LINC01774.csv
chr1 LINC01776.csv
chr1 LINC01777.csv
chr1 LINC01778.csv
chr1 LINC01779.csv
chr1 LINC01780.csv
chr1 LINC01781.csv
chr1 LINC01786.csv
chr1 LINC01788.csv
chr1 LINC02238.csv
chr1 LINC02474.csv
chr1 LINC02591.csv
chr1 LINC02596.csv
chr1 LINC02607.csv
chr1 LINC02766.csv
chr1 LINC027

Counting sequence length:   4%|▎         | 675/19235 [00:02<01:01, 301.02it/s]

chr1 LINC02781.csv
chr1 LINC02783.csv
chr1 LINC02785.csv
chr1 LINC02788.csv
chr1 LINC02789.csv
chr1 LINC02790.csv
chr1 LINC02792.csv
chr1 LINC02793.csv
chr1 LINC02794.csv
chr1 LINC02796.csv
chr1 LINC02797.csv
chr1 LINC02798.csv
chr1 LINC02800.csv
chr1 LINC02801.csv
chr1 LINC02804.csv
chr1 LINC02805.csv
chr1 LINC02806.csv
chr1 LINC02810.csv
chr1 LINC02813.csv
chr1 LINC02816.csv
chr1 LINC02819.csv
chr1 LINC02868.csv
chr1 LINC02869.csv
chr1 LIX1L-AS1.csv
chr1 LMNA.csv
chr1 LMO4.csv
chr1 LMX1A-AS1.csv
chr1 LMX1A-AS2.csv
chr1 LNCTAM34A.csv
chr1 LOC100288175.csv
chr1 LOC100507634.csv
chr1 LOC100996583.csv
chr1 LOC100996740.csv
chr1 LOC100996756.csv
chr1 LOC101926907.csv
chr1 LOC101926944.csv
chr1 LOC101927164.csv
chr1 LOC101927247.csv
chr1 LOC101927342.csv
chr1 LOC101927452.csv
chr1 LOC101927604.csv
chr1 LOC101927711.csv
chr1 LOC101927787.csv
chr1 LOC101927871.csv
chr1 LOC101927975.csv
chr1 LOC101928009.csv
chr1 LOC101928034.csv
chr1 LOC101928059.csv
chr1 LOC101928120.csv
chr1 LOC101928163.c

Counting sequence length:   4%|▍         | 745/19235 [00:02<00:58, 318.39it/s]

chr1 LOC101929626.csv
chr1 LOC101929750.csv
chr1 LOC101929788.csv
chr1 LOC101929805.csv
chr1 LOC102606465.csv
chr1 LOC102723321.csv
chr1 LOC102723348.csv
chr1 LOC102723480.csv
chr1 LOC102723529.csv
chr1 LOC102723760.csv
chr1 LOC102723834.csv
chr1 LOC102724319.csv
chr1 LOC102724382.csv
chr1 LOC102724429.csv
chr1 LOC102724572.csv
chr1 LOC102724830.csv
chr1 LOC102724856.csv
chr1 LOC102724857.csv
chr1 LOC103021295.csv
chr1 LOC105371175.csv
chr1 LOC105371206.csv
chr1 LOC105371209.csv
chr1 LOC105371214.csv
chr1 LOC105371215.csv
chr1 LOC105371217.csv
chr1 LOC105371223.csv
chr1 LOC105371227.csv
chr1 LOC105371233.csv
chr1 LOC105371254.csv
chr1 LOC105371288.csv
chr1 LOC105371406.csv
chr1 LOC105371433.csv
chr1 LOC105371441.csv
chr1 LOC105371446.csv
chr1 LOC105371458.csv
chr1 LOC105371459.csv
chr1 LOC105371460.csv
chr1 LOC105371468.csv
chr1 LOC105371473.csv
chr1 LOC105371600.csv
chr1 LOC105371601.csv
chr1 LOC105371606.csv
chr1 LOC105371608.csv
chr1 LOC105371610.csv
chr1 LOC105371614.csv
chr1 LOC10

Counting sequence length:   4%|▍         | 777/19235 [00:02<00:58, 314.28it/s]

chr1 LOC105371689.csv
chr1 LOC105371692.csv
chr1 LOC105371694.csv
chr1 LOC105371701.csv
chr1 LOC105371702.csv
chr1 LOC105371729.csv
chr1 LOC105372873.csv
chr1 LOC105372880.csv
chr1 LOC105372884.csv
chr1 LOC105372887.csv
chr1 LOC105372888.csv
chr1 LOC105372889.csv
chr1 LOC105372895.csv
chr1 LOC105372901.csv
chr1 LOC105372912.csv
chr1 LOC105372915.csv
chr1 LOC105372916.csv
chr1 LOC105372923.csv
chr1 LOC105372928.csv
chr1 LOC105372930.csv
chr1 LOC105372932.csv
chr1 LOC105372945.csv
chr1 LOC105373034.csv
chr1 LOC105373061.csv
chr1 LOC105373124.csv
chr1 LOC105373143.csv
chr1 LOC105373164.csv
chr1 LOC105373167.csv
chr1 LOC105373171.csv
chr1 LOC105373172.csv
chr1 LOC105373196.csv
chr1 LOC105373201.csv
chr1 LOC105373206.csv
chr1 LOC105373207.csv
chr1 LOC105373209.csv
chr1 LOC105373211.csv
chr1 LOC105373215.csv
chr1 LOC105373218.csv
chr1 LOC105373220.csv
chr1 LOC105373222.csv
chr1 LOC105373230.csv
chr1 LOC105373231.csv
chr1 LOC105373235.csv
chr1 LOC105373260.csv
chr1 LOC105373269.csv
chr1 LOC10

Counting sequence length:   4%|▍         | 854/19235 [00:03<00:52, 348.45it/s]

chr1 LOC105376825.csv
chr1 LOC105376828.csv
chr1 LOC105376829.csv
chr1 LOC105376830.csv
chr1 LOC105376845.csv
chr1 LOC105376850.csv
chr1 LOC105376863.csv
chr1 LOC105376864.csv
chr1 LOC105376876.csv
chr1 LOC105376885.csv
chr1 LOC105376888.csv
chr1 LOC105376892.csv
chr1 LOC105376910.csv
chr1 LOC105378586.csv
chr1 LOC105378593.csv
chr1 LOC105378598.csv
chr1 LOC105378599.csv
chr1 LOC105378600.csv
chr1 LOC105378601.csv
chr1 LOC105378602.csv
chr1 LOC105378607.csv
chr1 LOC105378608.csv
chr1 LOC105378612.csv
chr1 LOC105378618.csv
chr1 LOC105378625.csv
chr1 LOC105378631.csv
chr1 LOC105378638.csv
chr1 LOC105378639.csv
chr1 LOC105378640.csv
chr1 LOC105378646.csv
chr1 LOC105378649.csv
chr1 LOC105378651.csv
chr1 LOC105378654.csv
chr1 LOC105378658.csv
chr1 LOC105378662.csv
chr1 LOC105378663.csv
chr1 LOC105378665.csv
chr1 LOC105378667.csv
chr1 LOC105378673.csv
chr1 LOC105378675.csv
chr1 LOC105378678.csv
chr1 LOC105378693.csv
chr1 LOC105378699.csv
chr1 LOC105378706.csv
chr1 LOC105378708.csv
chr1 LOC10

Counting sequence length:   5%|▍         | 923/19235 [00:03<00:54, 335.95it/s]

chr1 LOC105378790.csv
chr1 LOC105378793.csv
chr1 LOC105378797.csv
chr1 LOC105378805.csv
chr1 LOC105378808.csv
chr1 LOC105378810.csv
chr1 LOC105378814.csv
chr1 LOC105378816.csv
chr1 LOC105378819.csv
chr1 LOC105378821.csv
chr1 LOC105378823.csv
chr1 LOC105378826.csv
chr1 LOC105378833.csv
chr1 LOC105378834.csv
chr1 LOC105378838.csv
chr1 LOC105378839.csv
chr1 LOC105378841.csv
chr1 LOC105378842.csv
chr1 LOC105378847.csv
chr1 LOC105378848.csv
chr1 LOC105378849.csv
chr1 LOC105378860.csv
chr1 LOC105378861.csv
chr1 LOC105378863.csv
chr1 LOC105378866.csv
chr1 LOC105378867.csv
chr1 LOC105378879.csv
chr1 LOC105378883.csv
chr1 LOC105378886.csv
chr1 LOC105378887.csv
chr1 LOC105378890.csv
chr1 LOC105378891.csv
chr1 LOC105378898.csv
chr1 LOC105378906.csv
chr1 LOC105378914.csv
chr1 LOC105378924.csv
chr1 LOC105378925.csv
chr1 LOC105378927.csv
chr1 LOC105378930.csv
chr1 LOC105378936.csv
chr1 LOC105378938.csv
chr1 LOC105378940.csv
chr1 LOC105378941.csv
chr1 LOC105378943.csv
chr1 LOC105378951.csv
chr1 LOC10

Counting sequence length:   5%|▌         | 994/19235 [00:03<00:53, 342.28it/s]

chr1 LOC107984933.csv
chr1 LOC107984934.csv
chr1 LOC107984935.csv
chr1 LOC107984940.csv
chr1 LOC107984945.csv
chr1 LOC107984948.csv
chr1 LOC107984951.csv
chr1 LOC107984952.csv
chr1 LOC107984954.csv
chr1 LOC107984956.csv
chr1 LOC107984962.csv
chr1 LOC107984963.csv
chr1 LOC107984964.csv
chr1 LOC107985041.csv
chr1 LOC107985043.csv
chr1 LOC107985076.csv
chr1 LOC107985095.csv
chr1 LOC107985096.csv
chr1 LOC107985115.csv
chr1 LOC107985116.csv
chr1 LOC107985174.csv
chr1 LOC107985180.csv
chr1 LOC107985191.csv
chr1 LOC107985193.csv
chr1 LOC107985203.csv
chr1 LOC107985207.csv
chr1 LOC107985211.csv
chr1 LOC107985216.csv
chr1 LOC107985219.csv
chr1 LOC107985220.csv
chr1 LOC107985227.csv
chr1 LOC107985230.csv
chr1 LOC107985232.csv
chr1 LOC107985233.csv
chr1 LOC107985238.csv
chr1 LOC107985239.csv
chr1 LOC107985241.csv
chr1 LOC107985250.csv
chr1 LOC107985253.csv
chr1 LOC107985262.csv
chr1 LOC107985272.csv
chr1 LOC107985281.csv
chr1 LOC107985300.csv
chr1 LOC107985318.csv
chr1 LOC107985352.csv
chr1 LOC10

Counting sequence length:   5%|▌         | 1029/19235 [00:03<00:53, 340.58it/s]

chr1 LOC107985454.csv
chr1 LOC107985458.csv
chr1 LOC107985460.csv
chr1 LOC107985462.csv
chr1 LOC107985467.csv
chr1 LOC107985470.csv
chr1 LOC107985517.csv
chr1 LOC107985523.csv
chr1 LOC107985524.csv
chr1 LOC107985591.csv
chr1 LOC107985593.csv
chr1 LOC107985594.csv
chr1 LOC107985721.csv
chr1 LOC107985725.csv
chr1 LOC107985729.csv
chr1 LOC107985732.csv
chr1 LOC107985743.csv
chr1 LOC107985744.csv
chr1 LOC107985745.csv
chr1 LOC107985747.csv
chr1 LOC112267871.csv
chr1 LOC112267900.csv
chr1 LOC112268218.csv
chr1 LOC112268219.csv
chr1 LOC112268220.csv
chr1 LOC112268225.csv
chr1 LOC112268227.csv
chr1 LOC112268231.csv
chr1 LOC112268235.csv
chr1 LOC112268239.csv
chr1 LOC112268240.csv
chr1 LOC112268254.csv
chr1 LOC112268256.csv
chr1 LOC112268258.csv
chr1 LOC112268263.csv
chr1 LOC112268267.csv
chr1 LOC112268273.csv
chr1 LOC112268276.csv
chr1 LOC112268287.csv
chr1 LOC112543491.csv
chr1 LOC117779438.csv
chr1 LOC122526777.csv
chr1 LOC148696.csv
chr1 LOC284581.csv
chr1 LOC284600.csv
chr1 LOC339539.csv


Counting sequence length:   6%|▌         | 1097/19235 [00:03<01:02, 289.27it/s]

chr1 LURAP1.csv
chr1 LY9.csv
chr1 LYPLA2.csv
chr1 LYPLAL1.csv
chr1 MAB21L3.csv
chr1 MACF1.csv
chr1 MACO1.csv
chr1 MAEL.csv
chr1 MAGI3.csv
chr1 MAGOH-DT.csv
chr1 MAN1A2.csv
chr1 MAN1C1.csv
chr1 MANEAL.csv
chr1 MAP10.csv
chr1 MAP3K21.csv
chr1 MAP7D1.csv
chr1 MAPKAPK2.csv
chr1 MARK1.csv
chr1 MAST2.csv
chr1 MATN1-AS1.csv
chr1 MDM4.csv
chr1 MDS2.csv
chr1 MED18.csv
chr1 MED8-AS1.csv
chr1 METTL13.csv
chr1 METTL25B.csv
chr1 MFN2.csv
chr1 MFSD14A.csv
chr1 MFSD2A.csv
chr1 MFSD4A.csv
chr1 MGC27382.csv
chr1 MGST3.csv
chr1 MIA3.csv
chr1 MIB2.csv
chr1 MICOS10-NBL1.csv
chr1 MICOS10.csv
chr1 MIER1.csv
chr1 MIGA1.csv
chr1 MIIP.csv
chr1 MIR11399.csv
chr1 MIR12116.csv
chr1 MIR12132.csv
chr1 MIR12133.csv
chr1 MIR1231.csv
chr1 MIR1255B2.csv
chr1 MIR1278.csv
chr1 MIR1295B.csv


Counting sequence length:   6%|▌         | 1180/19235 [00:04<00:51, 348.27it/s]

chr1 MIR1302-2.csv
chr1 MIR1302-2HG.csv
chr1 MIR197.csv
chr1 MIR1976.csv
chr1 MIR200A.csv
chr1 MIR200B.csv
chr1 MIR205.csv
chr1 MIR205HG.csv
chr1 MIR30C1.csv
chr1 MIR30E.csv
chr1 MIR3115.csv
chr1 MIR3116-1.csv
chr1 MIR3117.csv
chr1 MIR3119-2.csv
chr1 MIR3120.csv
chr1 MIR3122.csv
chr1 MIR3123.csv
chr1 MIR3124.csv
chr1 MIR320B1.csv
chr1 MIR3620.csv
chr1 MIR3658.csv
chr1 MIR3659.csv
chr1 MIR3659HG.csv
chr1 MIR378F.csv
chr1 MIR3972.csv
chr1 MIR4251.csv
chr1 MIR4255.csv
chr1 MIR4257.csv
chr1 MIR4258.csv
chr1 MIR429.csv
chr1 MIR4418.csv
chr1 MIR4421.csv
chr1 MIR4422.csv
chr1 MIR4422HG.csv
chr1 MIR4423.csv
chr1 MIR4424.csv
chr1 MIR4425.csv
chr1 MIR4426.csv
chr1 MIR4427.csv
chr1 MIR4428.csv
chr1 MIR4632.csv
chr1 MIR4654.csv
chr1 MIR4666A.csv
chr1 MIR4671.csv
chr1 MIR4677.csv
chr1 MIR4684.csv
chr1 MIR4781.csv
chr1 MIR4794.csv
chr1 MIR5187.csv
chr1 MIR5191.csv
chr1 MIR553.csv
chr1 MIR554.csv
chr1 MIR556.csv
chr1 MIR557.csv
chr1 MIR5584.csv
chr1 MIR5585.csv
chr1 MIR5697.csv
chr1 MIR6077.csv
chr1 

Counting sequence length:   6%|▋         | 1216/19235 [00:04<00:53, 338.29it/s]

chr1 MNDA.csv
chr1 MOV10.csv
chr1 MPL.csv
chr1 MPZL1.csv
chr1 MR1.csv
chr1 MROH7-TTC4.csv
chr1 MROH7.csv
chr1 MROH9.csv
chr1 MRPL20-AS1.csv
chr1 MRPL20-DT.csv
chr1 MRPL37.csv
chr1 MRPS21.csv
chr1 MRTO4.csv
chr1 MSH4.csv
chr1 MSTO1.csv
chr1 MTARC1.csv
chr1 MTARC2.csv
chr1 MTF2.csv
chr1 MTFR1L.csv
chr1 MTOR-AS1.csv
chr1 MTR.csv
chr1 MTX1.csv
chr1 MYCL-AS1.csv
chr1 MYOCOS.csv
chr1 MYOM3-AS1.csv
chr1 MYOPARR.csv
chr1 NASP.csv
chr1 NAV1.csv
chr1 NAXE.csv
chr1 NBL1.csv
chr1 NBPF12.csv
chr1 NBPF19.csv
chr1 NBPF26.csv
chr1 NBPF3.csv
chr1 NBPF6.csv
chr1 NBPF8.csv
chr1 NCDN.csv
chr1 NCMAP.csv
chr1 NCSTN.csv
chr1 NDUFS2.csv
chr1 NDUFS5.csv
chr1 NECAP2.csv
chr1 NECTIN4-AS1.csv
chr1 NEK2-DT.csv
chr1 NEK7.csv
chr1 NENF.csv
chr1 NEXN.csv
chr1 NFASC.csv


Counting sequence length:   7%|▋         | 1282/19235 [00:04<01:01, 291.40it/s]

chr1 NFIA.csv
chr1 NFYC.csv
chr1 NGF-AS1.csv
chr1 NHLH1.csv
chr1 NIPAL3.csv
chr1 NIT1.csv
chr1 NLRP3.csv
chr1 NMNAT1.csv
chr1 NOS1AP.csv
chr1 NOTCH2NLC.csv
chr1 NOTCH2NLR.csv
chr1 NPL.csv
chr1 NPPA-AS1.csv
chr1 NPR1.csv
chr1 NR5A2.csv
chr1 NSUN4.csv
chr1 NTMT2.csv
chr1 NTNG1.csv
chr1 NTPCR.csv
chr1 NTRK1.csv
chr1 NUDC.csv
chr1 NUDT17.csv
chr1 NUDT4B.csv
chr1 NUF2.csv
chr1 NUP133-DT.csv
chr1 OAZ3.csv
chr1 OBSCN.csv
chr1 ODR4.csv
chr1 OLFML3.csv
chr1 OPRD1.csv
chr1 OPTC.csv
chr1 OR10J1.csv
chr1 OR10K1.csv
chr1 OR10R2.csv
chr1 OR10Z1.csv
chr1 OR14C36.csv
chr1 OR14K1.csv
chr1 OR14L1P.csv
chr1 OR2AJ1.csv
chr1 OR2AK2.csv
chr1 OR2G2.csv
chr1 OR2G3.csv
chr1 OR2G6.csv
chr1 OR2L13.csv
chr1 OR2L2.csv
chr1 OR2L3.csv
chr1 OR2L5.csv
chr1 OR2L8.csv
chr1 OR2M2.csv
chr1 OR2M3.csv
chr1 OR2M4.csv


Counting sequence length:   7%|▋         | 1313/19235 [00:04<01:00, 296.25it/s]

chr1 OR2M5.csv
chr1 OR2T1.csv
chr1 OR2T2.csv
chr1 OR2T3.csv
chr1 OR2T4.csv
chr1 OR2T5.csv
chr1 OR2T6.csv
chr1 OR2T7.csv
chr1 OR2T8.csv
chr1 OR2W3.csv
chr1 OR4F5.csv
chr1 OR6K6.csv
chr1 OR9H1P.csv
chr1 OSBPL9.csv
chr1 OTUD3.csv
chr1 OVAAL.csv
chr1 PABPC4-AS1.csv
chr1 PACERR.csv
chr1 PADI1.csv
chr1 PADI3.csv
chr1 PADI4.csv
chr1 PADI6.csv
chr1 PALMD.csv
chr1 PAPPA2.csv
chr1 PARK7.csv
chr1 PATJ.csv
chr1 PAX7.csv
chr1 PBX1.csv
chr1 PCAT6.csv
chr1 PCP4L1.csv
chr1 PCSK9.csv
chr1 PDC-AS1.csv
chr1 PDE4B.csv
chr1 PDE4DIP.csv
chr1 PDIK1L.csv
chr1 PDPN.csv
chr1 PEA15.csv
chr1 PEAR1.csv
chr1 PEF1-AS1.csv
chr1 PER3.csv
chr1 PEX14.csv
chr1 PFKFB2.csv
chr1 PGBD2.csv
chr1 PGD.csv
chr1 PGM1.csv
chr1 PHACTR4.csv


Counting sequence length:   7%|▋         | 1372/19235 [00:04<01:05, 270.85it/s]

chr1 PHC2-AS1.csv
chr1 PHF13.csv
chr1 PHGDH.csv
chr1 PIFO.csv
chr1 PIGV.csv
chr1 PIK3CD.csv
chr1 PINK1.csv
chr1 PIP5K1A.csv
chr1 PITHD1.csv
chr1 PKN2.csv
chr1 PKP1.csv
chr1 PLA2G2F.csv
chr1 PLA2G4A.csv
chr1 PLA2G5.csv
chr1 PLCH2.csv
chr1 PLEKHM2.csv
chr1 PLEKHN1.csv
chr1 PLEKHO1.csv
chr1 PLK3.csv
chr1 PLOD1.csv
chr1 PLPPR4.csv
chr1 PLPPR5-AS1.csv
chr1 PMF1-BGLAP.csv
chr1 PMF1.csv
chr1 PNRC2.csv
chr1 PODN.csv
chr1 POGK.csv
chr1 POLR3C.csv
chr1 POLR3GL.csv
chr1 POU2F1.csv
chr1 PPCS.csv
chr1 PPFIA4.csv
chr1 PPIAL4A.csv
chr1 PPIAL4C.csv
chr1 PPIAL4F.csv
chr1 PPIE.csv
chr1 PPIH.csv
chr1 PPM1J-DT.csv
chr1 PPOX.csv
chr1 PPP1R12B.csv
chr1 PPP1R8.csv
chr1 PPP2R5A.csv
chr1 PRAMEF1.csv
chr1 PRAMEF12.csv
chr1 PRAMEF15.csv
chr1 PRAMEF17.csv
chr1 PRAMEF2.csv
chr1 PRAMEF20.csv
chr1 PRAMEF25.csv
chr1 PRAMEF33.csv
chr1 PRAMEF5.csv
chr1 PRAMEF7.csv
chr1 PRAMEF9.csv
chr1 PRCC.csv


Counting sequence length:   7%|▋         | 1428/19235 [00:05<01:10, 252.85it/s]

chr1 PRDM16.csv
chr1 PRDM2.csv
chr1 PRDX6.csv
chr1 PRELP.csv
chr1 PRG4.csv
chr1 PRKAA2.csv
chr1 PRKACB.csv
chr1 PRKCZ.csv
chr1 PRMT6.csv
chr1 PROK1.csv
chr1 PROX1.csv
chr1 PRPF3.csv
chr1 PRPF38A.csv
chr1 PRPF38B.csv
chr1 PRR9.csv
chr1 PRRC2C.csv
chr1 PRRX1.csv
chr1 PRSS38.csv
chr1 PRUNE1.csv
chr1 PRXL2B.csv
chr1 PSEN2.csv
chr1 PSMB4.csv
chr1 PSMD4.csv
chr1 PTBP2.csv
chr1 PTGFR.csv
chr1 PTGFRN.csv
chr1 PTPRC.csv
chr1 PTPRF.csv
chr1 PTPRU.csv
chr1 PUSL1.csv
chr1 PYHIN1.csv
chr1 QSOX1.csv
chr1 RAB25.csv
chr1 RAB42.csv
chr1 RAB4A.csv
chr1 RABGAP1L.csv
chr1 RABGGTB.csv
chr1 RAD54L.csv
chr1 RALGPS2.csv
chr1 RAP1A.csv


Counting sequence length:   8%|▊         | 1454/19235 [00:05<01:10, 253.40it/s]

chr1 RASAL2.csv
chr1 RASSF5.csv
chr1 RAVER2.csv
chr1 RBBP4.csv
chr1 RBM15.csv
chr1 RBP7.csv
chr1 RC3H1-DT.csv
chr1 RCAN3.csv
chr1 RCC1.csv
chr1 RCC2-AS1.csv
chr1 RCOR3.csv
chr1 RCSD1.csv
chr1 RER1.csv
chr1 RERE-AS1.csv
chr1 RFX5-AS1.csv
chr1 RGL1.csv
chr1 RGS1.csv
chr1 RGS13.csv
chr1 RGS18.csv
chr1 RGS2.csv
chr1 RGS21.csv
chr1 RGS4.csv
chr1 RGS5-AS1.csv
chr1 RGSL1.csv
chr1 RHBG.csv
chr1 RHD.csv
chr1 RHEX.csv
chr1 RHOU.csv
chr1 RIIAD1.csv
chr1 RIMKLA.csv
chr1 RLF.csv
chr1 RNF11.csv
chr1 RNF187.csv
chr1 RNF2.csv
chr1 RNF207.csv
chr1 RNF220.csv
chr1 RNPC3.csv
chr1 RNPEP.csv
chr1 RNU1-2.csv
chr1 RNU1-4.csv
chr1 RNU11.csv
chr1 RNU5E-1.csv
chr1 RNVU1-14.csv
chr1 RNVU1-15.csv
chr1 RNVU1-3.csv
chr1 RNVU1-4.csv
chr1 RO60.csv
chr1 ROR1.csv


Counting sequence length:   8%|▊         | 1506/19235 [00:05<01:09, 255.11it/s]

chr1 RPAP2.csv
chr1 RPF1.csv
chr1 RPL11.csv
chr1 RPL5.csv
chr1 RPRD2.csv
chr1 RPS27.csv
chr1 RPS6KA1.csv
chr1 RPS6KC1.csv
chr1 RPS8.csv
chr1 RRP15.csv
chr1 RSC1A1.csv
chr1 RTCA.csv
chr1 RUNX3-AS1.csv
chr1 RUSC1.csv
chr1 RWDD3.csv
chr1 RXFP4.csv
chr1 RYR2.csv
chr1 S100A1.csv
chr1 S100A7A.csv
chr1 S100A9.csv
chr1 S100PBP.csv
chr1 S1PR1.csv
chr1 SAMD11.csv
chr1 SAMD13.csv
chr1 SARS1.csv
chr1 SCARNA1.csv
chr1 SCARNA2.csv
chr1 SCARNA21B.csv
chr1 SCCPDH.csv
chr1 SCNM1.csv
chr1 SCNN1D.csv
chr1 SCP2.csv
chr1 SDCCAG8.csv
chr1 SDHC.csv
chr1 SELENON.csv
chr1 SEMA4A.csv
chr1 SERINC2.csv
chr1 SERTAD4.csv
chr1 SESN2.csv
chr1 SETDB1.csv
chr1 SFN.csv
chr1 SFT2D2.csv
chr1 SGIP1.csv
chr1 SH3BGRL3.csv
chr1 SH3D21.csv
chr1 SH3GLB1.csv
chr1 SHISA4.csv
chr1 SHISAL2A.csv


Counting sequence length:   8%|▊         | 1561/19235 [00:05<01:14, 236.32it/s]

chr1 SKI.csv
chr1 SLAMF7.csv
chr1 SLAMF8.csv
chr1 SLC16A1-AS1.csv
chr1 SLC22A15.csv
chr1 SLC25A33.csv
chr1 SLC25A34.csv
chr1 SLC25A44.csv
chr1 SLC26A9-AS1.csv
chr1 SLC27A3.csv
chr1 SLC2A1-DT.csv
chr1 SLC30A7.csv
chr1 SLC35A3.csv
chr1 SLC35F3.csv
chr1 SLC44A3.csv
chr1 SLC45A1.csv
chr1 SLC50A1.csv
chr1 SLC5A9.csv
chr1 SLC66A1.csv
chr1 SLC6A17.csv
chr1 SLFNL1-AS1.csv
chr1 SMAP2.csv
chr1 SMCP.csv
chr1 SMG7.csv
chr1 SMIM1.csv
chr1 SMPDL3B.csv
chr1 SMYD2.csv
chr1 SMYD3-AS1.csv
chr1 SNAP47.csv
chr1 SNAPIN.csv
chr1 SNHG3.csv
chr1 SNORA16B.csv
chr1 SNORA58B.csv
chr1 SNORA59A.csv
chr1 SNORA66.csv
chr1 SNORA70H.csv
chr1 SNORA73A.csv
chr1 SNORA73B.csv
chr1 SNORA77.csv
chr1 SNORD13C.csv
chr1 SNORD160.csv
chr1 SNORD21.csv
chr1 SNORD38A.csv
chr1 SNORD38B.csv
chr1 SNORD3G.csv


Counting sequence length:   8%|▊         | 1592/19235 [00:05<01:09, 254.15it/s]

chr1 SNORD45A.csv
chr1 SNORD45B.csv
chr1 SNORD45C.csv
chr1 SNORD46.csv
chr1 SNORD55.csv
chr1 SNRPE.csv
chr1 SNX27.csv
chr1 SNX7.csv
chr1 SOAT1.csv
chr1 SOX13.csv
chr1 SPATA1.csv
chr1 SPATA17.csv
chr1 SPATA42.csv
chr1 SPEN.csv
chr1 SPRR1A.csv
chr1 SPRR1B.csv
chr1 SPRR3.csv
chr1 SPRR4.csv
chr1 SPRR5.csv
chr1 SPRTN.csv
chr1 SPSB1.csv
chr1 SRARP.csv
chr1 SRGAP2.csv
chr1 SRGAP2C.csv
chr1 SRP9.csv
chr1 SRRM1.csv
chr1 SRSF11.csv
chr1 SSBP3-AS1.csv
chr1 ST3GAL3.csv
chr1 ST6GALNAC3.csv
chr1 ST6GALNAC5.csv
chr1 STRIP1.csv
chr1 STUM.csv
chr1 STX12.csv
chr1 STXBP3.csv
chr1 STYXL2.csv
chr1 SUCO.csv
chr1 SWT1.csv
chr1 SYCP1.csv
chr1 SYPL2.csv
chr1 SYT11.csv


Counting sequence length:   9%|▊         | 1646/19235 [00:05<01:11, 247.03it/s]

chr1 SYT14.csv
chr1 SYTL1.csv
chr1 SZRD1.csv
chr1 SZT2.csv
chr1 TAF12-DT.csv
chr1 TAF1A-AS1.csv
chr1 TAFA3.csv
chr1 TARDBP.csv
chr1 TARS2.csv
chr1 TAS1R1.csv
chr1 TAS1R3.csv
chr1 TATDN3.csv
chr1 TBCE.csv
chr1 TBX19.csv
chr1 TCEANC2.csv
chr1 TDRD10.csv
chr1 TDRD5.csv
chr1 TDRKH-AS1.csv
chr1 TEKT2.csv
chr1 TENT5C.csv
chr1 TEX35.csv
chr1 TEX38.csv
chr1 TEX50.csv
chr1 TFAP2E.csv
chr1 TGFB2-OT1.csv
chr1 TGFB2.csv
chr1 THAP3.csv
chr1 THBS3-AS1.csv
chr1 THEMIS2.csv
chr1 THRAP3.csv
chr1 TIE1.csv
chr1 TIMM17A.csv
chr1 TINAGL1.csv
chr1 TIPRL.csv
chr1 TLCD4-RWDD3.csv
chr1 TLCD4.csv
chr1 TMCC2.csv
chr1 TMCO1-AS1.csv
chr1 TMCO2.csv
chr1 TMEM125.csv
chr1 TMEM167B.csv
chr1 TMEM183A.csv
chr1 TMEM201.csv
chr1 TMEM222.csv
chr1 TMEM269.csv
chr1 TMEM39B.csv
chr1 TMEM50A.csv
chr1 TMEM51.csv
chr1 TMEM61.csv
chr1 TMEM69.csv
chr1 TMEM78.csv
chr1 TMEM79.csv
chr1 TMEM82.csv
chr1 TMEM88B.csv
chr1 TNFAIP8L2-SCNM1.csv
chr1 TNFAIP8L2.csv
chr1 TNFRSF14.csv
chr1 TNFRSF1B.csv


Counting sequence length:   9%|▉         | 1720/19235 [00:06<00:57, 306.57it/s]

chr1 TNFRSF8.csv
chr1 TNN.csv
chr1 TNNI3K.csv
chr1 TOE1.csv
chr1 TOMM40L.csv
chr1 TOR1AIP1.csv
chr1 TOR3A.csv
chr1 TP73.csv
chr1 TPRG1L.csv
chr1 TRAF3IP3.csv
chr1 TRAF5.csv
chr1 TRE-CTC1-1.csv
chr1 TRE-CTC2-1.csv
chr1 TRE-TTC3-1.csv
chr1 TRG-CCC1-2.csv
chr1 TRG-CCC5-1.csv
chr1 TRG-GCC1-1.csv
chr1 TRG-GCC1-2.csv
chr1 TRG-GCC1-3.csv
chr1 TRG-GCC1-4.csv
chr1 TRG-GCC4-1.csv
chr1 TRG-TCC2-1.csv
chr1 TRG-TCC2-6.csv
chr1 TRH-GTG1-1.csv
chr1 TRH-GTG1-2.csv
chr1 TRH-GTG1-3.csv
chr1 TRIM46.csv
chr1 TRIM58.csv
chr1 TRIM67.csv
chr1 TRK-CTT2-1.csv
chr1 TRK-TTT3-1.csv
chr1 TRK-TTT8-1.csv
chr1 TRL-CAA4-1.csv
chr1 TRL-CAG1-1.csv
chr1 TRL-CAG1-2.csv
chr1 TRL-CAG1-3.csv
chr1 TRL-CAG1-4.csv
chr1 TRL-CAG1-5.csv
chr1 TRMT13.csv
chr1 TRN-GTT1-1.csv
chr1 TRN-GTT10-1.csv
chr1 TRN-GTT13-1.csv
chr1 TRN-GTT2-1.csv
chr1 TRN-GTT2-7.csv
chr1 TRN-GTT24-1.csv
chr1 TRN-GTT3-1.csv
chr1 TRN-GTT4-1.csv
chr1 TRNAU1AP.csv
chr1 TRNAV-CAC.csv
chr1 TRNP1.csv
chr1 TRP-CGG1-1.csv
chr1 TRQ-CTG3-1.csv
chr1 TRQ-CTG3-2.csv
chr1 TRQ

Counting sequence length:   9%|▉         | 1784/19235 [00:06<00:59, 294.66it/s]

chr1 TTC39A-AS1.csv
chr1 TTC4.csv
chr1 TTF2.csv
chr1 TTLL10.csv
chr1 TUFT1.csv
chr1 TXLNA.csv
chr1 TXNDC12-AS1.csv
chr1 TYW3.csv
chr1 UAP1.csv
chr1 UBAP2L.csv
chr1 UBE2Q1-AS1.csv
chr1 UBE2U.csv
chr1 UBE4B.csv
chr1 UBIAD1.csv
chr1 UBL4B.csv
chr1 UBXN10.csv
chr1 UCK2.csv
chr1 UFC1.csv
chr1 UHMK1.csv
chr1 UQCRH.csv
chr1 URB2.csv
chr1 UROD.csv
chr1 USH2A-AS1.csv
chr1 USH2A-AS2.csv
chr1 USP1.csv
chr1 USP21.csv
chr1 UTP11.csv
chr1 UTP25.csv
chr1 VAMP3.csv
chr1 VANGL1.csv
chr1 VANGL2.csv
chr1 VASH2.csv
chr1 VAV3-AS1.csv
chr1 VCAM1.csv
chr1 VPS13D.csv
chr1 VPS45.csv
chr1 VWA1.csv
chr1 VWA5B1.csv
chr1 WARS2-AS1.csv
chr1 WARS2-IT1.csv
chr1 WDR3.csv
chr1 WDR64.csv
chr1 WDTC1.csv
chr1 WNT2B.csv
chr1 WNT3A.csv
chr1 XCL1.csv
chr1 XKR8.csv
chr1 XPR1.csv
chr1 YBX1.csv
chr1 YTHDF2.csv
chr1 ZBED6.csv
chr1 ZBTB18.csv


Counting sequence length:   9%|▉         | 1814/19235 [00:06<01:00, 288.84it/s]

chr1 ZBTB37.csv
chr1 ZBTB40.csv
chr1 ZBTB48.csv
chr1 ZBTB7B.csv
chr1 ZBTB8A.csv
chr1 ZBTB8B.csv
chr1 ZC3H11A.csv
chr1 ZC3H12A.csv
chr1 ZCCHC17.csv
chr1 ZDHHC18.csv
chr1 ZFP69.csv
chr1 ZFP69B.csv
chr1 ZFYVE9.csv
chr1 ZMPSTE24.csv
chr1 ZMYM1.csv
chr1 ZMYM4.csv
chr1 ZNF326.csv
chr1 ZNF362.csv
chr1 ZNF436-AS1.csv
chr1 ZNF593.csv
chr1 ZNF672.csv
chr1 ZNF678.csv
chr1 ZNF684.csv
chr1 ZNF687.csv
chr1 ZNF691.csv
chr1 ZPLD2P.csv
chr1 ZRANB2-AS1.csv
chr1 ZRANB2-DT.csv
chr1 ZSCAN20.csv
chr1 ZYG11A.csv
chr1 ZYG11B.csv
chr2 ABCG8.csv
chr2 ABHD1.csv
chr2 ABI2.csv
chr2 ACKR3.csv
chr2 ACMSD.csv
chr2 ACOXL.csv
chr2 ACP1.csv
chr2 ACSL3.csv
chr2 ACTG2.csv
chr2 ACTR2.csv
chr2 ACTR3.csv
chr2 ACVR2A.csv
chr2 ACYP2.csv
chr2 ADAM23.csv


Counting sequence length:  10%|▉         | 1844/19235 [00:06<01:09, 251.66it/s]

chr2 AFTPH.csv
chr2 AGAP1-IT1.csv
chr2 AGAP1.csv
chr2 AGBL5.csv
chr2 AGFG1.csv
chr2 AGPS.csv
chr2 AGXT.csv
chr2 ALLC.csv
chr2 ALMS1-IT1.csv
chr2 ALMS1.csv
chr2 ALPG.csv
chr2 ALPI.csv
chr2 ALPP.csv
chr2 AMER3.csv
chr2 ANKAR.csv
chr2 ANKRD36.csv
chr2 ANKRD44-AS1.csv
chr2 ANKRD44-DT.csv
chr2 ANKRD53.csv
chr2 ANKZF1.csv
chr2 ANO7.csv
chr2 ANTXR1.csv
chr2 ANXA4.csv
chr2 AOX1.csv
chr2 APLF.csv
chr2 AQP12A.csv
chr2 AQP7B.csv
chr2 ARHGAP15.csv
chr2 ARHGAP25.csv
chr2 ARHGEF33.csv
chr2 ARHGEF4.csv
chr2 ARID5A.csv
chr2 ARL6IP6.csv
chr2 ARMC9.csv
chr2 ARPC2.csv


Counting sequence length:  10%|▉         | 1896/19235 [00:06<01:14, 232.09it/s]

chr2 ASAP2.csv
chr2 ASB1.csv
chr2 ASDURF.csv
chr2 ASIC4.csv
chr2 ASNSD1.csv
chr2 ATG16L1.csv
chr2 ATG4B.csv
chr2 ATIC.csv
chr2 ATOH8.csv
chr2 ATP6V1B1.csv
chr2 ATP6V1C2.csv
chr2 ATRAID.csv
chr2 B3GALT1.csv
chr2 B3GNT2.csv
chr2 B3GNT7.csv
chr2 BABAM2.csv
chr2 BAZ2B-AS1.csv
chr2 BBS5.csv
chr2 BCL2L11.csv
chr2 BCS1L.csv
chr2 BCYRN1.csv
chr2 BIRC6.csv
chr2 BMPR2.csv
chr2 BOK.csv
chr2 BOLA3-DT.csv
chr2 BZW1.csv
chr2 C2orf15.csv
chr2 C2orf16.csv
chr2 C2orf27A.csv
chr2 C2orf49.csv
chr2 C2orf50.csv
chr2 C2orf69.csv
chr2 C2orf72.csv
chr2 C2orf73.csv
chr2 C2orf74.csv
chr2 C2orf78.csv
chr2 C2orf88.csv
chr2 C2orf92.csv
chr2 CAB39.csv
chr2 CAD.csv
chr2 CALCRL-AS1.csv
chr2 CAMKMT.csv


Counting sequence length:  10%|█         | 1946/19235 [00:07<01:12, 239.57it/s]

chr2 CAPN10.csv
chr2 CARF.csv
chr2 CASP10.csv
chr2 CASP8.csv
chr2 CATIP.csv
chr2 CAVIN2-AS1.csv
chr2 CBWD2.csv
chr2 CCDC138.csv
chr2 CCDC140.csv
chr2 CCDC148-AS1.csv
chr2 CCDC150.csv
chr2 CCDC74A.csv
chr2 CCDC85A.csv
chr2 CCL20.csv
chr2 CCNT2.csv
chr2 CCNYL1.csv
chr2 CCT7.csv
chr2 CD28.csv
chr2 CD8B2.csv
chr2 CDCA7.csv
chr2 CDK15.csv
chr2 CDK5R2.csv
chr2 CEBPZOS.csv
chr2 CENPA.csv
chr2 CENPO.csv
chr2 CEP68.csv
chr2 CERS6.csv
chr2 CFAP221.csv
chr2 CFAP36.csv
chr2 CFC1B.csv
chr2 CFLAR.csv
chr2 CHAC2.csv
chr2 CHCHD5.csv
chr2 CHMP3-AS1.csv
chr2 CHRND.csv
chr2 CHRNG.csv
chr2 CHROMR.csv
chr2 CIAO1.csv
chr2 CLASP1-AS1.csv
chr2 CLIP4.csv
chr2 CMKLR2-AS.csv
chr2 CNGA3.csv
chr2 CNNM3.csv
chr2 CNNM4.csv
chr2 CNOT11.csv
chr2 CNOT9.csv
chr2 CNTNAP5.csv


Counting sequence length:  10%|█         | 1994/19235 [00:07<01:16, 224.17it/s]

chr2 COL3A1.csv
chr2 COL4A3.csv
chr2 COLEC11.csv
chr2 COMMD1.csv
chr2 COPS7B.csv
chr2 COPS8.csv
chr2 COQ10B.csv
chr2 COX5B.csv
chr2 CPO.csv
chr2 CPS1-IT1.csv
chr2 CPS1.csv
chr2 CPSF3.csv
chr2 CREB1.csv
chr2 CRIM1.csv
chr2 CRIPT.csv
chr2 CROCC2.csv
chr2 CSRNP3.csv
chr2 CT75.csv
chr2 CTDSP1.csv
chr2 CTLA4.csv
chr2 CTNNA2.csv
chr2 CXCR2.csv
chr2 CYBRD1.csv
chr2 CYP1B1-AS1.csv
chr2 CYP20A1.csv
chr2 CYP27A1.csv
chr2 CYTOR.csv
chr2 D2HGDH.csv
chr2 DAPL1.csv
chr2 DARS1-AS1.csv
chr2 DAW1.csv
chr2 DBI.csv
chr2 DCAF17.csv
chr2 DCDC2C.csv
chr2 DCTN1-AS1.csv
chr2 DDX1.csv
chr2 DDX18.csv
chr2 DES.csv
chr2 DGKD.csv


Counting sequence length:  10%|█         | 2017/19235 [00:07<01:21, 210.70it/s]

chr2 DGUOK.csv
chr2 DHRS9.csv
chr2 DIRC3-AS1.csv
chr2 DIS3L2.csv
chr2 DLX1.csv
chr2 DLX2-DT.csv
chr2 DNAH6.csv
chr2 DNAJB2.csv
chr2 DNAJC10.csv
chr2 DNAJC27-AS1.csv
chr2 DNAJC5G.csv
chr2 DNPEP-AS1.csv
chr2 DOK1.csv
chr2 DPP10.csv
chr2 DPP4-DT.csv
chr2 DPYSL5.csv
chr2 DRC1.csv
chr2 DTNB-AS1.csv
chr2 DUSP19.csv
chr2 DUSP28.csv
chr2 DYNC1I2.csv
chr2 DYNC2LI1.csv
chr2 DYSF.csv
chr2 ECRG4.csv
chr2 EEF1B2.csv
chr2 EFHD1.csv
chr2 EFR3B.csv
chr2 EHBP1.csv
chr2 EHD3.csv
chr2 EIF2AK3-DT.csv
chr2 EIF4E2.csv
chr2 EIF5B.csv
chr2 ELMOD3.csv
chr2 EMILIN1.csv
chr2 EML4.csv


Counting sequence length:  11%|█         | 2060/19235 [00:07<01:24, 203.61it/s]

chr2 EML6.csv
chr2 EMX1.csv
chr2 EPAS1.csv
chr2 EPB41L5.csv
chr2 EPC2.csv
chr2 EPCAM.csv
chr2 ERFE.csv
chr2 ERICH2.csv
chr2 ERLEC1.csv
chr2 ESPNL.csv
chr2 ETAA1.csv
chr2 EVA1A-AS.csv
chr2 FAHD2A.csv
chr2 FAM117B.csv
chr2 FAM138B.csv
chr2 FAM166C.csv
chr2 FAM171B.csv
chr2 FAM201B.csv
chr2 FAM228A.csv
chr2 FAM228B.csv
chr2 FAM237A.csv
chr2 FARP2.csv
chr2 FASTKD2.csv
chr2 FBLN7.csv
chr2 FBXO36.csv
chr2 FER1L5.csv
chr2 FKBP1B.csv
chr2 FMNL2.csv
chr2 FN1-DT.csv
chr2 FOSL2.csv
chr2 FOXD4L1.csv
chr2 FOXN2.csv
chr2 FSIP2.csv
chr2 FZD7.csv
chr2 G6PC2.csv
chr2 GACAT1.csv
chr2 GACAT3.csv
chr2 GAD1.csv
chr2 GAL3ST2.csv


Counting sequence length:  11%|█         | 2110/19235 [00:07<01:16, 225.15it/s]

chr2 GALM.csv
chr2 GALNT13.csv
chr2 GALNT5.csv
chr2 GAREM2.csv
chr2 GCA.csv
chr2 GCC2.csv
chr2 GCKR.csv
chr2 GCSIR.csv
chr2 GDF7.csv
chr2 GEMIN6.csv
chr2 GEN1.csv
chr2 GIGYF2.csv
chr2 GKN1.csv
chr2 GLI2.csv
chr2 GLS.csv
chr2 GMCL1.csv
chr2 GMPPA.csv
chr2 GNLY.csv
chr2 GORASP2.csv
chr2 GPATCH11.csv
chr2 GPBAR1.csv
chr2 GPC1.csv
chr2 GPD2.csv
chr2 GPN1.csv
chr2 GPR148.csv
chr2 GPR155-DT.csv
chr2 GPR17.csv
chr2 GPR35.csv
chr2 GPR39.csv
chr2 GPR45.csv
chr2 GREB1.csv
chr2 GRHL1.csv
chr2 GTF2A1L.csv
chr2 GTF3C2-AS1.csv
chr2 GTF3C2-AS2.csv
chr2 GULP1.csv
chr2 GYPC.csv
chr2 HADHB.csv
chr2 HAGLROS.csv
chr2 HAT1.csv
chr2 HDAC4-AS1.csv
chr2 HECW2-AS1.csv
chr2 HK2.csv
chr2 HNMT.csv
chr2 HNRNPA3.csv


Counting sequence length:  11%|█         | 2133/19235 [00:07<01:16, 222.72it/s]

chr2 HOXD1.csv
chr2 HOXD10.csv
chr2 HOXD11.csv
chr2 HOXD12.csv
chr2 HOXD13.csv
chr2 HOXD3.csv
chr2 HOXD4.csv
chr2 HOXD8.csv
chr2 HOXD9.csv
chr2 HPCAL1.csv
chr2 HSPE1-MOB4.csv
chr2 HSPE1.csv
chr2 HTRA2.csv
chr2 IAH1.csv
chr2 ICOS.csv
chr2 ID2.csv
chr2 IDH1-AS1.csv
chr2 IGFBP-AS1.csv
chr2 IGFBP2.csv
chr2 IGK.csv
chr2 IGKV1D-12.csv
chr2 IGKV1D-13.csv
chr2 IGKV1D-16.csv
chr2 IGKV1D-17.csv
chr2 IGKV1D-33.csv
chr2 IGKV1D-37.csv
chr2 IGKV1D-39.csv
chr2 IGKV1D-42.csv
chr2 IGKV1D-43.csv
chr2 IGKV1D-8.csv
chr2 IGKV2D-24.csv
chr2 IGKV2D-28.csv
chr2 IGKV2D-29.csv
chr2 IGKV2D-30.csv
chr2 IGKV2D-40.csv
chr2 IGKV3D-11.csv
chr2 IGKV3D-15.csv
chr2 IGKV3D-20.csv
chr2 IGKV3D-7.csv
chr2 IGKV4-1.csv
chr2 IGKV5-2.csv
chr2 IGKV6D-21.csv
chr2 IGKV6D-41.csv
chr2 IL18R1.csv
chr2 IL18RAP.csv
chr2 IL1F10.csv
chr2 IL1R1.csv
chr2 IL1R2.csv
chr2 IL1RL1.csv
chr2 IL1RL2.csv
chr2 IL1RN.csv
chr2 IL36A.csv
chr2 IL36G.csv
chr2 IL36RN.csv
chr2 IL37.csv


Counting sequence length:  11%|█▏        | 2200/19235 [00:08<01:05, 259.24it/s]

chr2 IMP4.csv
chr2 ING5.csv
chr2 INHA.csv
chr2 INHBB.csv
chr2 INO80B-WBP1.csv
chr2 INO80B.csv
chr2 INPP1.csv
chr2 INPP4A.csv
chr2 INPP5D.csv
chr2 INSIG2.csv
chr2 ITGA4.csv
chr2 ITGA6.csv
chr2 ITGAV.csv
chr2 ITM2C.csv
chr2 ITPRID2.csv
chr2 ITPRIPL1.csv
chr2 KANSL1L-AS1.csv
chr2 KCMF1.csv
chr2 KCNE4.csv
chr2 KCNF1.csv
chr2 KCNH7-AS1.csv
chr2 KCNIP3.csv
chr2 KCNJ3.csv
chr2 KCNK3.csv
chr2 KCNS3.csv
chr2 KDM3A.csv
chr2 KHK.csv
chr2 KIAA2012.csv
chr2 KIF5C.csv
chr2 KLF11.csv
chr2 KLHL23.csv
chr2 KLHL29.csv
chr2 KLHL30.csv
chr2 KLHL41.csv
chr2 KRTCAP3.csv
chr2 KYNU.csv
chr2 LANCL1-AS1.csv
chr2 LAPTM4A-DT.csv
chr2 LBH.csv
chr2 LBX2-AS1.csv
chr2 LCLAT1.csv
chr2 LCT-AS1.csv
chr2 LGALSL.csv
chr2 LIMS1.csv
chr2 LIMS3-LOC440895.csv
chr2 

Counting sequence length:  12%|█▏        | 2253/19235 [00:08<01:09, 243.50it/s]

LIMS3.csv
chr2 LINC00486.csv
chr2 LINC00570.csv
chr2 LINC00608.csv
chr2 LINC00954.csv
chr2 LINC01087.csv
chr2 LINC01102.csv
chr2 LINC01104.csv
chr2 LINC01117.csv
chr2 LINC01118.csv
chr2 LINC01119.csv
chr2 LINC01120.csv
chr2 LINC01122.csv
chr2 LINC01123.csv
chr2 LINC01126.csv
chr2 LINC01127.csv
chr2 LINC01143.csv
chr2 LINC01173.csv
chr2 LINC01191.csv
chr2 LINC01237.csv
chr2 LINC01238.csv
chr2 LINC01291.csv
chr2 LINC01293.csv
chr2 LINC01305.csv
chr2 LINC01317.csv
chr2 LINC01318.csv
chr2 LINC01320.csv
chr2 LINC01381.csv
chr2 LINC01412.csv
chr2 LINC01614.csv
chr2 LINC01790.csv
chr2 LINC01793.csv
chr2 LINC01794.csv
chr2 LINC01795.csv
chr2 LINC01798.csv
chr2 LINC01799.csv
chr2 LINC01802.csv
chr2 LINC01804.csv
chr2 LINC01805.csv
chr2 LINC01806.csv
chr2 LINC01808.csv
chr2 LINC01809.csv
chr2 LINC01818.csv
chr2 LINC01821.csv


Counting sequence length:  12%|█▏        | 2285/19235 [00:08<01:04, 263.91it/s]

chr2 LINC01823.csv
chr2 LINC01824.csv
chr2 LINC01828.csv
chr2 LINC01832.csv
chr2 LINC01849.csv
chr2 LINC01850.csv
chr2 LINC01851.csv
chr2 LINC01856.csv
chr2 LINC01857.csv
chr2 LINC01865.csv
chr2 LINC01866.csv
chr2 LINC01867.csv
chr2 LINC01870.csv
chr2 LINC01871.csv
chr2 LINC01873.csv
chr2 LINC01875.csv
chr2 LINC01877.csv
chr2 LINC01878.csv
chr2 LINC01881.csv
chr2 LINC01883.csv
chr2 LINC01884.csv
chr2 LINC01886.csv
chr2 LINC01888.csv
chr2 LINC01889.csv
chr2 LINC01911.csv
chr2 LINC01913.csv
chr2 LINC01918.csv
chr2 LINC01921.csv
chr2 LINC01934.csv
chr2 LINC01936.csv
chr2 LINC01937.csv
chr2 LINC01954.csv
chr2 LINC01958.csv
chr2 LINC01960.csv
chr2 LINC01961.csv
chr2 LINC01963.csv
chr2 LINC01965.csv
chr2 LINC02576.csv
chr2 LINC02579.csv
chr2 LINC02583.csv
chr2 LINC02612.csv
chr2 LINC02831.csv
chr2 LINC02832.csv
chr2 LINC02850.csv
chr2 LINC02934.csv
chr2 LIPT1.csv
chr2 LOC100129175.csv
chr2 LOC100505498.csv
chr2 LOC100506076.csv
chr2 LOC100506235.csv
chr2 LOC100506274.csv
chr2 LOC100506405.cs

Counting sequence length:  12%|█▏        | 2340/19235 [00:08<01:04, 262.36it/s]

chr2 LOC100507562.csv
chr2 LOC100996506.csv
chr2 LOC100996637.csv
chr2 LOC101926959.csv
chr2 LOC101926969.csv
chr2 LOC101926974.csv
chr2 LOC101927053.csv
chr2 LOC101927055.csv
chr2 LOC101927073.csv
chr2 LOC101927089.csv
chr2 LOC101927213.csv
chr2 LOC101927235.csv
chr2 LOC101927262.csv
chr2 LOC101927283.csv
chr2 LOC101927366.csv
chr2 LOC101927383.csv
chr2 LOC101927400.csv
chr2 LOC101927454.csv
chr2 LOC101927509.csv
chr2 LOC101927533.csv
chr2 LOC101927661.csv
chr2 LOC101927881.csv
chr2 LOC101927948.csv
chr2 LOC101927960.csv
chr2 LOC101928156.csv
chr2 LOC101928180.csv
chr2 LOC101928361.csv
chr2 LOC101928371.csv
chr2 LOC101928386.csv
chr2 LOC101928526.csv
chr2 LOC101928881.csv
chr2 LOC101929386.csv
chr2 LOC101929510.csv
chr2 LOC101929532.csv
chr2 LOC101929633.csv
chr2 LOC101929643.csv
chr2 LOC101929667.csv
chr2 LOC101929908.csv
chr2 LOC102723413.csv
chr2 LOC102723730.csv
chr2 LOC102724008.csv
chr2 LOC102724058.csv
chr2 LOC102724072.csv
chr2 LOC102724340.csv
chr2 LOC102724389.csv
chr2 LOC10

Counting sequence length:  12%|█▏        | 2394/19235 [00:08<01:04, 260.54it/s]

chr2 LOC102724849.csv
chr2 LOC102724861.csv
chr2 LOC102724875.csv
chr2 LOC102724955.csv
chr2 LOC105369167.csv
chr2 LOC105373351.csv
chr2 LOC105373352.csv
chr2 LOC105373389.csv
chr2 LOC105373392.csv
chr2 LOC105373394.csv
chr2 LOC105373395.csv
chr2 LOC105373397.csv
chr2 LOC105373398.csv
chr2 LOC105373406.csv
chr2 LOC105373407.csv
chr2 LOC105373409.csv
chr2 LOC105373414.csv
chr2 LOC105373416.csv
chr2 LOC105373418.csv
chr2 LOC105373419.csv
chr2 LOC105373429.csv
chr2 LOC105373430.csv
chr2 LOC105373436.csv
chr2 LOC105373437.csv
chr2 LOC105373438.csv
chr2 LOC105373440.csv
chr2 LOC105373447.csv
chr2 LOC105373454.csv
chr2 LOC105373456.csv
chr2 LOC105373461.csv
chr2 LOC105373466.csv
chr2 LOC105373480.csv
chr2 LOC105373481.csv
chr2 LOC105373484.csv
chr2 LOC105373487.csv
chr2 LOC105373488.csv
chr2 LOC105373493.csv
chr2 LOC105373504.csv
chr2 LOC105373508.csv
chr2 LOC105373510.csv
chr2 LOC105373512.csv
chr2 LOC105373525.csv
chr2 LOC105373526.csv
chr2 LOC105373527.csv
chr2 LOC105373531.csv
chr2 LOC10

Counting sequence length:  13%|█▎        | 2457/19235 [00:09<00:58, 285.41it/s]

chr2 LOC105373595.csv
chr2 LOC105373601.csv
chr2 LOC105373604.csv
chr2 LOC105373605.csv
chr2 LOC105373611.csv
chr2 LOC105373612.csv
chr2 LOC105373613.csv
chr2 LOC105373616.csv
chr2 LOC105373618.csv
chr2 LOC105373622.csv
chr2 LOC105373623.csv
chr2 LOC105373629.csv
chr2 LOC105373640.csv
chr2 LOC105373641.csv
chr2 LOC105373643.csv
chr2 LOC105373645.csv
chr2 LOC105373648.csv
chr2 LOC105373649.csv
chr2 LOC105373651.csv
chr2 LOC105373663.csv
chr2 LOC105373666.csv
chr2 LOC105373668.csv
chr2 LOC105373674.csv
chr2 LOC105373679.csv
chr2 LOC105373685.csv
chr2 LOC105373689.csv
chr2 LOC105373691.csv
chr2 LOC105373693.csv
chr2 LOC105373698.csv
chr2 LOC105373699.csv
chr2 LOC105373700.csv
chr2 LOC105373707.csv
chr2 LOC105373709.csv
chr2 LOC105373714.csv
chr2 LOC105373715.csv
chr2 LOC105373716.csv
chr2 LOC105373717.csv
chr2 LOC105373724.csv
chr2 LOC105373727.csv
chr2 LOC105373728.csv
chr2 LOC105373741.csv
chr2 LOC105373752.csv
chr2 LOC105373756.csv
chr2 LOC105373766.csv
chr2 LOC105373768.csv
chr2 LOC10

Counting sequence length:  13%|█▎        | 2520/19235 [00:09<00:57, 292.17it/s]

chr2 LOC105373835.csv
chr2 LOC105373845.csv
chr2 LOC105373851.csv
chr2 LOC105373854.csv
chr2 LOC105373864.csv
chr2 LOC105373870.csv
chr2 LOC105373874.csv
chr2 LOC105373880.csv
chr2 LOC105373887.csv
chr2 LOC105373888.csv
chr2 LOC105373890.csv
chr2 LOC105373891.csv
chr2 LOC105373893.csv
chr2 LOC105373895.csv
chr2 LOC105373896.csv
chr2 LOC105373899.csv
chr2 LOC105373902.csv
chr2 LOC105373903.csv
chr2 LOC105373907.csv
chr2 LOC105373908.csv
chr2 LOC105373909.csv
chr2 LOC105373910.csv
chr2 LOC105373918.csv
chr2 LOC105373919.csv
chr2 LOC105373920.csv
chr2 LOC105373921.csv
chr2 LOC105373928.csv
chr2 LOC105373932.csv
chr2 LOC105373937.csv
chr2 LOC105373938.csv
chr2 LOC105373940.csv
chr2 LOC105373945.csv
chr2 LOC105373947.csv
chr2 LOC105373949.csv
chr2 LOC105373950.csv
chr2 LOC105373952.csv
chr2 LOC105373953.csv
chr2 LOC105373958.csv
chr2 LOC105373966.csv
chr2 LOC105373976.csv
chr2 LOC105373977.csv
chr2 LOC105373989.csv
chr2 LOC105373991.csv
chr2 LOC105374317.csv
chr2 LOC105374318.csv
chr2 LOC10

Counting sequence length:  13%|█▎        | 2550/19235 [00:09<00:57, 288.57it/s]

chr2 LOC105374383.csv
chr2 LOC105374435.csv
chr2 LOC105374455.csv
chr2 LOC105374464.csv
chr2 LOC105374468.csv
chr2 LOC105374470.csv
chr2 LOC105374491.csv
chr2 LOC105374567.csv
chr2 LOC105374570.csv
chr2 LOC105374571.csv
chr2 LOC105374573.csv
chr2 LOC105374576.csv
chr2 LOC105374577.csv
chr2 LOC105374584.csv
chr2 LOC105374588.csv
chr2 LOC105374589.csv
chr2 LOC105374590.csv
chr2 LOC105374592.csv
chr2 LOC105374594.csv
chr2 LOC105374595.csv
chr2 LOC105374615.csv
chr2 LOC105374653.csv
chr2 LOC105374662.csv
chr2 LOC105374690.csv
chr2 LOC105374697.csv
chr2 LOC105374760.csv
chr2 LOC105374761.csv
chr2 LOC105374764.csv
chr2 LOC105374775.csv
chr2 LOC105374776.csv
chr2 LOC105374794.csv
chr2 LOC105374796.csv
chr2 LOC105374803.csv
chr2 LOC105374808.csv
chr2 LOC105374810.csv
chr2 LOC105374811.csv
chr2 LOC105374813.csv
chr2 LOC105374814.csv
chr2 LOC105374815.csv
chr2 LOC105374820.csv
chr2 LOC105374821.csv
chr2 LOC105374829.csv
chr2 LOC105374831.csv
chr2 LOC105374834.csv
chr2 LOC105374837.csv
chr2 LOC10

Counting sequence length:  14%|█▎        | 2618/19235 [00:09<00:53, 311.35it/s]

chr2 LOC105376787.csv
chr2 LOC105376810.csv
chr2 LOC105377635.csv
chr2 LOC107984020.csv
chr2 LOC107984111.csv
chr2 LOC107985769.csv
chr2 LOC107985771.csv
chr2 LOC107985772.csv
chr2 LOC107985774.csv
chr2 LOC107985776.csv
chr2 LOC107985779.csv
chr2 LOC107985780.csv
chr2 LOC107985782.csv
chr2 LOC107985784.csv
chr2 LOC107985788.csv
chr2 LOC107985795.csv
chr2 LOC107985807.csv
chr2 LOC107985808.csv
chr2 LOC107985811.csv
chr2 LOC107985812.csv
chr2 LOC107985815.csv
chr2 LOC107985817.csv
chr2 LOC107985821.csv
chr2 LOC107985822.csv
chr2 LOC107985830.csv
chr2 LOC107985831.csv
chr2 LOC107985833.csv
chr2 LOC107985838.csv
chr2 LOC107985842.csv
chr2 LOC107985845.csv
chr2 LOC107985847.csv
chr2 LOC107985850.csv
chr2 LOC107985851.csv
chr2 LOC107985852.csv
chr2 LOC107985853.csv
chr2 LOC107985854.csv
chr2 LOC107985855.csv
chr2 LOC107985857.csv
chr2 LOC107985869.csv
chr2 LOC107985877.csv
chr2 LOC107985883.csv
chr2 LOC107985889.csv
chr2 LOC107985892.csv
chr2 LOC107985902.csv
chr2 LOC107985903.csv
chr2 LOC10

Counting sequence length:  14%|█▍        | 2683/19235 [00:09<00:53, 312.23it/s]

chr2 LOC107985953.csv
chr2 LOC107985955.csv
chr2 LOC107985959.csv
chr2 LOC107985960.csv
chr2 LOC107985961.csv
chr2 LOC107985968.csv
chr2 LOC107985972.csv
chr2 LOC107985976.csv
chr2 LOC107985978.csv
chr2 LOC107985980.csv
chr2 LOC107985981.csv
chr2 LOC107985983.csv
chr2 LOC107985984.csv
chr2 LOC107985986.csv
chr2 LOC107985990.csv
chr2 LOC107985991.csv
chr2 LOC107985992.csv
chr2 LOC107985995.csv
chr2 LOC107985996.csv
chr2 LOC107985997.csv
chr2 LOC107986000.csv
chr2 LOC107986001.csv
chr2 LOC107986002.csv
chr2 LOC112268411.csv
chr2 LOC112268412.csv
chr2 LOC112268413.csv
chr2 LOC112268416.csv
chr2 LOC112268418.csv
chr2 LOC112268420.csv
chr2 LOC112268422.csv
chr2 LOC112268426.csv
chr2 LOC112268429.csv
chr2 LOC112268430.csv
chr2 LOC112268433.csv
chr2 LOC112268434.csv
chr2 LOC112268436.csv
chr2 LOC112268438.csv
chr2 LOC112268439.csv
chr2 LOC150935.csv
chr2 LOC284950.csv
chr2 LOC285097.csv
chr2 LOC285191.csv
chr2 LOC344065.csv
chr2 LOC400940.csv
chr2 LOC401021.csv
chr2 LOC401040.csv
chr2 LOC4408

Counting sequence length:  14%|█▍        | 2715/19235 [00:10<01:03, 260.33it/s]

chr2 LPIN1.csv
chr2 LRATD1.csv
chr2 LRRFIP1.csv
chr2 LRRTM4-AS1.csv
chr2 LTBP1.csv
chr2 LYPD6.csv
chr2 LYPD6B.csv
chr2 MAIP1.csv
chr2 MAL.csv
chr2 MAP2.csv
chr2 MAP3K2-DT.csv
chr2 MAP3K20.csv
chr2 MAP4K3-DT.csv
chr2 MAP4K4.csv
chr2 MAPRE3.csv
chr2 MARCHF7.csv
chr2 MARCO.csv
chr2 MARS2.csv
chr2 MAT2A.csv
chr2 MBD5.csv
chr2 MDH1.csv
chr2 MEIS1.csv
chr2 MERTK.csv
chr2 METAP1D.csv
chr2 MFF.csv
chr2 MFSD2B.csv
chr2 MFSD6.csv
chr2 MGAT5.csv
chr2 MIR10B.csv
chr2 MIR1244-1.csv
chr2 MIR1245A.csv
chr2 MIR128-1.csv
chr2 MIR149.csv
chr2 MIR26B.csv
chr2 MIR3125.csv
chr2 MIR3126.csv


Counting sequence length:  14%|█▍        | 2784/19235 [00:10<00:55, 294.70it/s]

chr2 MIR3127.csv
chr2 MIR3130-2.csv
chr2 MIR3133.csv
chr2 MIR3606.csv
chr2 MIR3679.csv
chr2 MIR3681.csv
chr2 MIR3681HG.csv
chr2 MIR4263.csv
chr2 MIR4269.csv
chr2 MIR4430.csv
chr2 MIR4433A.csv
chr2 MIR4434.csv
chr2 MIR4435-1.csv
chr2 MIR4436A.csv
chr2 MIR4436B2.csv
chr2 MIR4438.csv
chr2 MIR4444-1.csv
chr2 MIR4757.csv
chr2 MIR4765.csv
chr2 MIR4771-1.csv
chr2 MIR4772.csv
chr2 MIR4773-1.csv
chr2 MIR4774.csv
chr2 MIR4775.csv
chr2 MIR4776-1.csv
chr2 MIR4777.csv
chr2 MIR5000.csv
chr2 MIR5192.csv
chr2 MIR548AD.csv
chr2 MIR548AE1.csv
chr2 MIR548BA.csv
chr2 MIR548S.csv
chr2 MIR558.csv
chr2 MIR559.csv
chr2 MIR5590.csv
chr2 MIR561.csv
chr2 MIR562.csv
chr2 MIR5696.csv
chr2 MIR5703.csv
chr2 MIR6810.csv
chr2 MIR6811.csv
chr2 MIR6888.csv
chr2 MIR7158.csv
chr2 MIR7515.csv
chr2 MIR7704.csv
chr2 MIR7845.csv
chr2 MIR9500.csv
chr2 MIR9986.csv
chr2 MLPH.csv
chr2 MMADHC-DT.csv
chr2 MOB4.csv
chr2 MOGAT1.csv
chr2 MORN2.csv
chr2 MPHOSPH10.csv
chr2 MROH2A.csv
chr2 MRPL19.csv
chr2 MRPL30.csv
chr2 MRPL33.csv
chr2 

Counting sequence length:  15%|█▍        | 2815/19235 [00:10<00:58, 279.86it/s]

chr2 MSH6.csv
chr2 MTA3.csv
chr2 MTHFD2.csv
chr2 MTX2.csv
chr2 MXD1.csv
chr2 MYCN.csv
chr2 MYCNUT.csv
chr2 MYO1B.csv
chr2 MYO3B.csv
chr2 MYO7B.csv
chr2 MYOSLID.csv
chr2 MYT1L-AS1.csv
chr2 MZT2B.csv
chr2 NAB1.csv
chr2 NABP1-OT1.csv
chr2 NABP1.csv
chr2 NAGK.csv
chr2 NBEAL1.csv
chr2 NCAPH.csv
chr2 NCK2.csv
chr2 NCKAP5-AS1.csv
chr2 NCKAP5-AS2.csv
chr2 NCOA1.csv
chr2 NDUFAF7.csv
chr2 NDUFB3.csv
chr2 NEMP2-DT.csv
chr2 NEU2.csv
chr2 NEU4.csv
chr2 NIF3L1.csv
chr2 NIFK-AS1.csv
chr2 NMS.csv
chr2 NOP58.csv
chr2 NOSTRIN.csv
chr2 NOTO.csv
chr2 NPAS2.csv
chr2 NRBP1.csv
chr2 NRP2.csv
chr2 NT5DC4.csv
chr2 NUP35.csv
chr2 NYAP2.csv
chr2 ODC1-DT.csv
chr2 OSBPL6.csv


Counting sequence length:  15%|█▍        | 2872/19235 [00:10<01:13, 222.62it/s]

chr2 OSGEPL1-AS1.csv
chr2 OTX1.csv
chr2 PAPOLG.csv
chr2 PARD3B.csv
chr2 PAX8-AS1.csv
chr2 PCBP1.csv
chr2 PCGEM1.csv
chr2 PCYOX1.csv
chr2 PDCL3.csv
chr2 PDE11A-AS1.csv
chr2 PDK1.csv
chr2 PEX13.csv
chr2 PHOSPHO2-KLHL23.csv
chr2 PHOSPHO2.csv
chr2 PIKFYVE.csv
chr2 PJVK.csv
chr2 PKDCC.csv
chr2 PKP4.csv
chr2 PLB1.csv
chr2 PLCD4.csv
chr2 PLCL1.csv
chr2 PLEK.csv
chr2 PLEKHA3.csv
chr2 PLEKHB2.csv
chr2 PLEKHH2.csv
chr2 PLGLB2.csv
chr2 PMS1.csv
chr2 PNKD.csv
chr2 PNO1.csv
chr2 POLE4.csv
chr2 POLR1B.csv


Counting sequence length:  15%|█▌        | 2896/19235 [00:10<01:12, 225.14it/s]

chr2 POTEE.csv
chr2 POTEF-AS1.csv
chr2 POTEJ.csv
chr2 POU3F3.csv
chr2 PPIG.csv
chr2 PPM1B.csv
chr2 PPP1CB.csv
chr2 PPP1R1C.csv
chr2 PPP1R21.csv
chr2 PPP1R7.csv
chr2 PPP4R3B-DT.csv
chr2 PRKCE.csv
chr2 PRKD3-DT.csv
chr2 PRLH.csv
chr2 PROC.csv
chr2 PROKR1.csv
chr2 PROM2.csv
chr2 PRSS56.csv
chr2 PSD4.csv
chr2 PSMD1.csv
chr2 PSMD14.csv
chr2 PTCD3.csv
chr2 PTH2R.csv
chr2 PTMA.csv
chr2 PTPN18.csv
chr2 PTPN4.csv
chr2 QPCT.csv
chr2 R3HDM1.csv
chr2 RAB10.csv
chr2 RAB3GAP1.csv
chr2 RAB6C.csv
chr2 RABL2A.csv
chr2 RALB.csv
chr2 RAMP1.csv
chr2 RANBP2.csv
chr2 RAPGEF4.csv
chr2 RASGRP3.csv
chr2 RBM44.csv
chr2 RBM45.csv
chr2 REG1A.csv
chr2 REG3G.csv


Counting sequence length:  15%|█▌        | 2945/19235 [00:11<01:11, 228.11it/s]

chr2 REL.csv
chr2 RETREG2.csv
chr2 RGPD1.csv
chr2 RGPD4.csv
chr2 RGPD5.csv
chr2 RHBDD1.csv
chr2 RHOB.csv
chr2 RHOQ.csv
chr2 RIF1.csv
chr2 RMDN2.csv
chr2 RMND5A.csv
chr2 RNASEH1-DT.csv
chr2 RNF144A.csv
chr2 RNF181.csv
chr2 RNPEPL1.csv
chr2 RNU4ATAC.csv
chr2 RPE.csv
chr2 RPIA.csv
chr2 RPL31.csv
chr2 RPL37A.csv
chr2 RPS27A.csv
chr2 RPS7.csv
chr2 RRM2.csv
chr2 RSAD2.csv
chr2 RTP5.csv
chr2 RUFY4.csv
chr2 SAG.csv
chr2 SANBR.csv
chr2 SATB2-AS1.csv
chr2 SCARNA5.csv
chr2 SCARNA6.csv
chr2 SCHLAP1.csv
chr2 SCLY.csv
chr2 SCN1A-AS1.csv
chr2 SCN2A.csv
chr2 SCRN3.csv
chr2 SCTR-AS1.csv
chr2 SCYGR2.csv
chr2 SCYGR4.csv
chr2 SCYGR5.csv
chr2 SCYGR7.csv
chr2 SCYGR8.csv
chr2 SELENOI.csv
chr2 SEMA4F.csv
chr2 SEPTIN2.csv
chr2 SFT2D3.csv
chr2 SGO2.csv
chr2 SGPP2.csv


Counting sequence length:  16%|█▌        | 2995/19235 [00:11<01:09, 234.92it/s]

chr2 SH2D6.csv
chr2 SH3BP4.csv
chr2 SH3RF3.csv
chr2 SILC1.csv
chr2 SIX3.csv
chr2 SLC11A1.csv
chr2 SLC1A4.csv
chr2 SLC20A1.csv
chr2 SLC30A6.csv
chr2 SLC35F6.csv
chr2 SLC39A10.csv
chr2 SLC3A1.csv
chr2 SLC4A10.csv
chr2 SLC4A1AP.csv
chr2 SLC4A3.csv
chr2 SLC5A7.csv
chr2 SLC66A3.csv
chr2 SLC8A1-AS1.csv
chr2 SLC9A2.csv
chr2 SLC9A4.csv
chr2 SMARCAL1.csv
chr2 SMIM39.csv
chr2 SMYD1.csv
chr2 SMYD5.csv
chr2 SNED1.csv
chr2 SNHG31.csv
chr2 SNORA10B.csv
chr2 SNORA112.csv
chr2 SNORA40B.csv
chr2 SNORA41.csv
chr2 SNORA70I.csv
chr2 SNORC.csv
chr2 SNORD11.csv
chr2 SNORD11B.csv
chr2 SNORD3K.csv
chr2 SNORD51.csv
chr2 SNORD53.csv
chr2 SNORD53B.csv
chr2 SNORD70.csv
chr2 SNORD70B.csv
chr2 SNORD92.csv
chr2 SNORD94.csv
chr2 SNRNP27.csv
chr2 SNTG2.csv
chr2 SNX17.csv
chr2 SOCAR.csv


Counting sequence length:  16%|█▌        | 3043/19235 [00:11<01:10, 230.35it/s]

chr2 SOCS5.csv
chr2 SOWAHC.csv
chr2 SOX11.csv
chr2 SP100.csv
chr2 SP140.csv
chr2 SP140L.csv
chr2 SP5.csv
chr2 SP9.csv
chr2 SPAG16.csv
chr2 SPAST.csv
chr2 SPATA3.csv
chr2 SPATS2L.csv
chr2 SPDYA.csv
chr2 SPEG.csv
chr2 SPEGNB.csv
chr2 SPOPL.csv
chr2 SPP2.csv
chr2 SPR.csv
chr2 SPTBN1.csv
chr2 SSB.csv
chr2 ST3GAL5-AS1.csv
chr2 STAMBP.csv
chr2 STARD7-AS1.csv
chr2 STAT4-AS1.csv
chr2 STEAP3.csv
chr2 STK11IP.csv
chr2 STK16.csv
chr2 STK36.csv
chr2 STON1-GTF2A1L.csv
chr2 STON1.csv
chr2 STRADB.csv
chr2 SULT1C2.csv
chr2 SULT1C3.csv
chr2 SULT1C4.csv
chr2 TAF1B.csv
chr2 TANC1.csv
chr2 TANK.csv
chr2 TBC1D8-AS1.csv
chr2 TBR1.csv
chr2 TCF23.csv


Counting sequence length:  16%|█▌        | 3067/19235 [00:11<01:10, 230.45it/s]

chr2 TCF7L1-IT1.csv
chr2 TCF7L1.csv
chr2 TDRD15.csv
chr2 TEKT4.csv
chr2 TET3.csv
chr2 TEX37.csv
chr2 TEX41.csv
chr2 TEX44.csv
chr2 TEX51.csv
chr2 THNSL2.csv
chr2 THSD7B.csv
chr2 TLX2.csv
chr2 TMEM169.csv
chr2 TMEM177.csv
chr2 TMEM178A.csv
chr2 TMEM18-DT.csv
chr2 TMEM182.csv
chr2 TMEM198.csv
chr2 TMEM214.csv
chr2 TMEM247.csv
chr2 TMEM37.csv
chr2 TMEM87B.csv
chr2 TMSB10.csv
chr2 TNFAIP6.csv
chr2 TNS1-AS1.csv
chr2 TOGARAM2.csv
chr2 TPO.csv
chr2 TRA-CGC3-1.csv
chr2 TRAF3IP1.csv
chr2 TRAPPC12.csv
chr2 TRI-TAT2-1.csv
chr2 TRIB2.csv
chr2 TRIM43.csv
chr2 TRIM54.csv
chr2 TRPM8.csv
chr2 TRY-ATA1-1.csv
chr2 TRY-GTA2-1.csv
chr2 TSN.csv
chr2 TTC21B-AS1.csv
chr2 TTC27.csv
chr2 TTC31.csv
chr2 TTC7A.csv
chr2 TTL.csv
chr2 TTLL4.csv
chr2 TTN-AS1.csv


Counting sequence length:  16%|█▌        | 3119/19235 [00:11<01:13, 219.83it/s]

chr2 TUBA3D.csv
chr2 TUBA4B.csv
chr2 TWIST2.csv
chr2 UBE2E3.csv
chr2 UBE2F-SCLY.csv
chr2 UBE2F.csv
chr2 UBR3.csv
chr2 UBXN2A.csv
chr2 UBXN4.csv
chr2 UGGT1.csv
chr2 UGP2.csv
chr2 UGT1A.csv
chr2 UGT1A1.csv
chr2 UGT1A10.csv
chr2 UGT1A3.csv
chr2 UGT1A4.csv
chr2 UGT1A5.csv
chr2 UGT1A6.csv
chr2 UGT1A7.csv
chr2 UGT1A8.csv
chr2 UGT1A9.csv
chr2 UNC50.csv
chr2 UNC80.csv
chr2 UPP2.csv
chr2 USP34-DT.csv
chr2 USP39.csv
chr2 VAMP5.csv
chr2 VAMP8.csv
chr2 VAX2.csv
chr2 VIL1.csv
chr2 VIT.csv
chr2 VRK2.csv
chr2 VSNL1.csv
chr2 VWA3B.csv
chr2 VWC2L-IT1.csv


Counting sequence length:  16%|█▋        | 3170/19235 [00:11<01:08, 235.52it/s]

chr2 VWC2L.csv
chr2 WBP1.csv
chr2 WDR35-DT.csv
chr2 WDR43.csv
chr2 WDR54.csv
chr2 WDR75.csv
chr2 WNT10A.csv
chr2 WNT6.csv
chr2 XIRP2.csv
chr2 XRCC5.csv
chr2 YIPF4.csv
chr2 YPEL5.csv
chr2 ZAP70.csv
chr2 ZC3H15.csv
chr2 ZC3H6.csv
chr2 ZDBF2.csv
chr2 ZEB2-AS1.csv
chr2 ZFAND2B.csv
chr2 ZNF2.csv
chr2 ZNF512.csv
chr2 ZNF638.csv
chr2 ZNF804A.csv
chr2 ZNF806.csv
chr3 AADAC.csv
chr3 AADACL2.csv
chr3 ABCC5-AS1.csv
chr3 ABCF3.csv
chr3 ABHD10.csv
chr3 ABHD14A-ACY1.csv
chr3 ABHD14A.csv
chr3 ABHD5.csv
chr3 ABHD6.csv
chr3 ABTB1.csv
chr3 ACAD9.csv
chr3 ACKR2.csv
chr3 ACKR4.csv
chr3 ACP3.csv
chr3 ACTL6A.csv
chr3 ACVR2B.csv
chr3 ACY1.csv
chr3 ADAMTS9-AS1.csv
chr3 ADAMTS9-AS2.csv
chr3 ADGRG7.csv
chr3 ADIPOQ.csv
chr3 ADPRH.csv
chr3 AGTR1.csv
chr3 AHSG.csv
chr3 ALAS1.csv
chr3 ALCAM.csv
chr3 ALDH1L1-AS1.csv
chr3 ALDH1L1-AS2.csv


Counting sequence length:  17%|█▋        | 3197/19235 [00:12<01:05, 243.89it/s]

chr3 ALG1L2.csv
chr3 AP2M1.csv
chr3 APEH.csv
chr3 APPL1.csv
chr3 APRG1.csv
chr3 ARF4-AS1.csv
chr3 ARGFX.csv
chr3 ARHGAP31.csv
chr3 ARHGEF26.csv
chr3 ARHGEF3-AS1.csv
chr3 ARIH2.csv
chr3 ARL13B.csv
chr3 ARL14.csv
chr3 ARL6.csv
chr3 ARL6IP5.csv
chr3 ARL8B.csv
chr3 ARMC8.csv
chr3 ARPC4-TTLL3.csv
chr3 ARPC4.csv
chr3 ARPP21.csv
chr3 ATG7.csv
chr3 ATP11B.csv
chr3 ATP13A3-DT.csv
chr3 ATP13A4-AS1.csv
chr3 ATP13A5-AS1.csv
chr3 ATP1B3.csv
chr3 ATP2C1.csv
chr3 ATP6V1A.csv
chr3 ATRIP-TREX1.csv
chr3 ATRIP.csv
chr3 ATXN7.csv
chr3 B3GNT5.csv
chr3 B4GALT4-AS1.csv
chr3 BBX.csv
chr3 BFSP2.csv
chr3 BHLHE40.csv
chr3 BOC.csv
chr3 BPESC1.csv
chr3 BRK1.csv
chr3 BRPF1.csv
chr3 BSN.csv
chr3 BTD.csv


Counting sequence length:  17%|█▋        | 3246/19235 [00:12<01:15, 211.60it/s]

chr3 C3orf14.csv
chr3 C3orf20.csv
chr3 C3orf38.csv
chr3 C3orf49.csv
chr3 C3orf52.csv
chr3 C3orf56.csv
chr3 C3orf80.csv
chr3 C3orf85.csv
chr3 CACNA1D.csv
chr3 CACNA2D3.csv
chr3 CADM2.csv
chr3 CAMP.csv
chr3 CAND2.csv
chr3 CAPN7.csv
chr3 CASR.csv
chr3 CAV3.csv
chr3 CCDC13-AS1.csv
chr3 CCDC13-AS2.csv
chr3 CCDC174.csv
chr3 CCDC50.csv
chr3 CCDC54.csv
chr3 CCDC66.csv
chr3 CCR2.csv
chr3 CCR3.csv
chr3 CCR4.csv
chr3 CCR5.csv
chr3 CCR8.csv
chr3 CCR9.csv
chr3 CCRL2.csv
chr3 CD200.csv
chr3 CD200R1L-AS1.csv
chr3 CD86.csv
chr3 CD96.csv
chr3 CDV3.csv
chr3 CEP63.csv


Counting sequence length:  17%|█▋        | 3268/19235 [00:12<01:17, 205.13it/s]

chr3 CEP97.csv
chr3 CFAP100.csv
chr3 CFAP20DC-AS1.csv
chr3 CFAP20DC-DT.csv
chr3 CFAP44-AS1.csv
chr3 CFAP91.csv
chr3 CHCHD6.csv
chr3 CHL1.csv
chr3 CHMP2B.csv
chr3 CHRD.csv
chr3 CHST13.csv
chr3 CHST2.csv
chr3 CLDN11.csv
chr3 CLDN16.csv
chr3 CLDN18.csv
chr3 CLEC3B.csv
chr3 CLRN1-AS1.csv
chr3 CLSTN2.csv
chr3 CMC1.csv
chr3 CMSS1.csv
chr3 CMTM7.csv
chr3 CMTM8.csv
chr3 CNOT10.csv
chr3 CNTN4.csv
chr3 CNTN6.csv
chr3 COL6A5.csv
chr3 COL6A6.csv


Counting sequence length:  17%|█▋        | 3308/19235 [00:12<01:28, 179.77it/s]

chr3 COL8A1.csv
chr3 COPB2-DT.csv
chr3 COPG1.csv
chr3 CPA3.csv
chr3 CPB1.csv
chr3 CPNE9.csv
chr3 CRELD1.csv
chr3 CRTAP.csv
chr3 CRYBG3.csv
chr3 CSNKA2IP.csv
chr3 CSTA.csv
chr3 CTDSPL.csv
chr3 CTNNB1.csv
chr3 CXCR6.csv
chr3 CYB561D2.csv
chr3 DAG1.csv
chr3 DENND6A-DT.csv
chr3 DHX30.csv
chr3 DIPK2A.csv
chr3 DLEC1.csv
chr3 DLG1-AS1.csv
chr3 DNAH1.csv
chr3 DNAJB11.csv
chr3 DNAJB8-AS1.csv
chr3 DNAJC13.csv
chr3 DOCK3.csv
chr3 DTX3L.csv
chr3 DUBR.csv
chr3 DVL3.csv
chr3 DZIP3.csv
chr3 EAF1.csv
chr3 EAF2.csv
chr3 EBLN2.csv
chr3 ECE2.csv
chr3 ECT2.csv


Counting sequence length:  17%|█▋        | 3357/19235 [00:12<01:19, 200.24it/s]

chr3 EDEM1.csv
chr3 EEF1AKMT4-ECE2.csv
chr3 EEF1AKMT4.csv
chr3 EEFSEC.csv
chr3 EFCC1.csv
chr3 EHHADH-AS1.csv
chr3 EIF1B.csv
chr3 EIF2A.csv
chr3 EIF2B5.csv
chr3 EIF4A2.csv
chr3 EIF4G1.csv
chr3 EMC3-AS1.csv
chr3 ENTPD3.csv
chr3 EPHA3.csv
chr3 EPHA6.csv
chr3 EPHB1.csv
chr3 EPHB3.csv
chr3 ERICH6-AS1.csv
chr3 ESYT3.csv
chr3 EXOG.csv
chr3 EXOSC7.csv
chr3 FAIM.csv
chr3 FAM131A.csv
chr3 FAM157A.csv
chr3 FAM162A.csv
chr3 FAM240A.csv
chr3 FAM3D-AS1.csv
chr3 FAM43A.csv
chr3 FANCD2.csv
chr3 FBLN2.csv
chr3 FBXL2.csv
chr3 FBXO40.csv
chr3 FBXO45.csv
chr3 FBXW12.csv
chr3 FETUB.csv
chr3 FGD5.csv
chr3 FGF12-AS1.csv
chr3 FGF12-AS2.csv
chr3 FLJ42393.csv
chr3 FLNB.csv


Counting sequence length:  18%|█▊        | 3382/19235 [00:13<01:14, 213.63it/s]

chr3 FNDC3B.csv
chr3 FOXL2NB.csv
chr3 FOXP1-AS1.csv
chr3 FRG2C.csv
chr3 FXR1.csv
chr3 FYTTD1.csv
chr3 GALNT15.csv
chr3 GAP43.csv
chr3 GASK1A.csv
chr3 GATA2-AS1.csv
chr3 GFM1.csv
chr3 GHRLOS.csv
chr3 GLYCTK.csv
chr3 GMPS.csv
chr3 GNAI2.csv
chr3 GNAT1.csv
chr3 GNL3.csv
chr3 GOLGA4.csv
chr3 GP9.csv
chr3 GPD1L.csv
chr3 GPR15.csv
chr3 GPR160.csv
chr3 GPR27.csv
chr3 GPR62.csv
chr3 GRAMD1C.csv
chr3 GRK7.csv
chr3 GRM2.csv
chr3 GRM7.csv
chr3 GSK3B-DT.csv
chr3 GTF2E1.csv
chr3 GTPBP8.csv
chr3 GXYLT2.csv
chr3 GYG1.csv
chr3 H1-10-AS1.csv
chr3 H1-8.csv
chr3 HDAC11.csv
chr3 HEMK1.csv
chr3 HES1.csv
chr3 HHATL-AS1.csv
chr3 HHLA2.csv
chr3 HLTF-AS1.csv
chr3 HMCES.csv
chr3 HPS3.csv
chr3 HRG.csv
chr3 HRH1.csv
chr3 HTD2.csv
chr3 HTR1F.csv


Counting sequence length:  18%|█▊        | 3431/19235 [00:13<01:13, 214.82it/s]

chr3 HTR3C.csv
chr3 HTR3D.csv
chr3 HTR3E.csv
chr3 IFT122.csv
chr3 IGF2BP2-AS1.csv
chr3 IGSF11-AS1.csv
chr3 IHO1.csv
chr3 IL12A.csv
chr3 IL17RB.csv
chr3 IL17RC.csv
chr3 IL17RE.csv
chr3 IL1RAP.csv
chr3 IL20RB.csv
chr3 INKA1.csv
chr3 IQCF2.csv
chr3 IQCF3.csv
chr3 IQCF5-AS1.csv
chr3 IQCJ-SCHIP1.csv
chr3 IQCJ.csv
chr3 IRAK2.csv
chr3 ITGA9.csv
chr3 ITIH1.csv
chr3 ITIH3.csv
chr3 ITIH4-AS1.csv
chr3 ITPR1.csv
chr3 JAGN1.csv
chr3 KALRN.csv
chr3 KAT2B.csv
chr3 KBTBD12.csv
chr3 KBTBD8.csv
chr3 KCNAB1.csv
chr3 KCNH8.csv
chr3 KCNMB2.csv
chr3 KCTD6.csv
chr3 KIF15.csv
chr3 KIF9-AS1.csv


Counting sequence length:  18%|█▊        | 3477/19235 [00:13<01:13, 215.71it/s]

chr3 KLHDC8B.csv
chr3 KLHL18.csv
chr3 KLHL24.csv
chr3 KLHL40.csv
chr3 KLHL6-AS1.csv
chr3 KNG1.csv
chr3 KRBOX1.csv
chr3 LARS2.csv
chr3 LEKR1.csv
chr3 LIMD1.csv
chr3 LINC00312.csv
chr3 LINC00488.csv
chr3 LINC00501.csv
chr3 LINC00506.csv
chr3 LINC00578.csv
chr3 LINC00620.csv
chr3 LINC00636.csv
chr3 LINC00690.csv
chr3 LINC00693.csv
chr3 LINC00698.csv
chr3 LINC00852.csv
chr3 LINC00870.csv
chr3 LINC00879.csv
chr3 LINC00881.csv
chr3 LINC00885.csv
chr3 LINC00888.csv
chr3 LINC00901.csv
chr3 LINC00903.csv
chr3 LINC00960.csv
chr3 LINC01014.csv
chr3 LINC01100.csv
chr3 LINC01205.csv
chr3 LINC01206.csv
chr3 LINC01210.csv
chr3 LINC01213.csv
chr3 LINC01215.csv
chr3 LINC01266.csv
chr3 LINC01322.csv
chr3 LINC01327.csv
chr3 LINC01487.csv
chr3 LINC01811.csv
chr3 LINC01839.csv
chr3 LINC01968.csv


Counting sequence length:  18%|█▊        | 3530/19235 [00:13<01:06, 237.54it/s]

chr3 LINC01980.csv
chr3 LINC01985.csv
chr3 LINC01986.csv
chr3 LINC01988.csv
chr3 LINC01990.csv
chr3 LINC01997.csv
chr3 LINC02004.csv
chr3 LINC02005.csv
chr3 LINC02008.csv
chr3 LINC02010.csv
chr3 LINC02015.csv
chr3 LINC02017.csv
chr3 LINC02018.csv
chr3 LINC02019.csv
chr3 LINC02020.csv
chr3 LINC02021.csv
chr3 LINC02025.csv
chr3 LINC02027.csv
chr3 LINC02029.csv
chr3 LINC02030.csv
chr3 LINC02031.csv
chr3 LINC02032.csv
chr3 LINC02034.csv
chr3 LINC02035.csv
chr3 LINC02037.csv
chr3 LINC02040.csv
chr3 LINC02041.csv
chr3 LINC02046.csv
chr3 LINC02047.csv
chr3 LINC02048.csv
chr3 LINC02050.csv
chr3 LINC02051.csv
chr3 LINC02053.csv
chr3 LINC02054.csv
chr3 LINC02066.csv
chr3 LINC02067.csv
chr3 LINC02070.csv
chr3 LINC02082.csv
chr3 LINC02083.csv
chr3 LINC02084.csv
chr3 LINC02085.csv
chr3 LINC02585.csv
chr3 LINC02618.csv
chr3 LINC02877.csv
chr3 LINCR-0002.csv
chr3 LMCD1.csv
chr3 LMLN.csv
chr3 LNP1.csv
chr3 LOC100131635.csv


Counting sequence length:  19%|█▊        | 3568/19235 [00:13<01:00, 257.95it/s]

chr3 LOC100287290.csv
chr3 LOC100507389.csv
chr3 LOC101926886.csv
chr3 LOC101926923.csv
chr3 LOC101926953.csv
chr3 LOC101926968.csv
chr3 LOC101927010.csv
chr3 LOC101927296.csv
chr3 LOC101927374.csv
chr3 LOC101927439.csv
chr3 LOC101927467.csv
chr3 LOC101927518.csv
chr3 LOC101927647.csv
chr3 LOC101927829.csv
chr3 LOC101927942.csv
chr3 LOC101927995.csv
chr3 LOC101928236.csv
chr3 LOC101928263.csv
chr3 LOC101928529.csv
chr3 LOC101928636.csv
chr3 LOC101928711.csv
chr3 LOC101929130.csv
chr3 LOC101929411.csv
chr3 LOC102723430.csv
chr3 LOC102723512.csv
chr3 LOC102723596.csv
chr3 LOC102724048.csv
chr3 LOC102724104.csv
chr3 LOC102724120.csv
chr3 LOC102724289.csv
chr3 LOC102724438.csv
chr3 LOC102724479.csv
chr3 LOC102724817.csv
chr3 LOC102724877.csv
chr3 LOC102724949.csv
chr3 LOC105369194.csv





KeyboardInterrupt: 

In [None]:
import pandas as pd
_cols = ['chr', 'gene']
df_path = os.path.join('workspace', 'sequential-labelling', 'gene_index.csv')
_dir = os.path.join('workspace', 'sequential-labelling', 'duplicate genes')
os.makedirs(_dir, exist_ok=True)
df = pd.read_csv(df_path)
train_df = pd.DataFrame(columns=_cols)
valid_df = pd.DataFrame(columns=_cols)
test_df = pd.DataFrame(columns=_cols)
genes_unique = df['gene'].unique()
for g in genes_unique:
    filtered_df = df[df['gene'] == g]
    if filtered_df.shape[0] > 1:
        _g = "{}.csv".format(g)
        filtered_df.to_csv(os.path.join(_dir, _g), index=False)

In [None]:
from data_preparation import gff_to_csvs, gff_to_csv
from data_dir import (annotated_grch38_gff, annotated_grch38_gff_dir, annotated_grch38_gff_csv)

print(annotated_grch38_gff)
print(annotated_grch38_gff_csv)
print(annotated_grch38_gff_dir)

In [None]:
from data_dir import chr21_fasta, chr21_index_csv, data_genome_grch38_labels_dir
from data_preparation import generate_sequence_labelling
import os

target_path = os.path.join(data_genome_grch38_labels_dir, 'chr21.csv')
print("Generate sequential labelling {} => {}: {}".format(chr21_index_csv, target_path, generate_sequence_labelling(chr21_index_csv, chr21_fasta, target_path)))


In [None]:
from data_dir import (
    chr1_index_csv, chr2_index_csv, chr3_index_csv, chr4_index_csv, chr5_index_csv, chr6_index_csv, chr7_index_csv, chr8_index_csv, chr9_index_csv, chr10_index_csv,
    chr11_index_csv, chr12_index_csv, chr13_index_csv, chr14_index_csv, chr15_index_csv, chr16_index_csv, chr17_index_csv, chr18_index_csv, chr19_index_csv, chr20_index_csv,
    chr21_index_csv, chr22_index_csv, chr23_index_csv, chr24_index_csv
)
from data_dir import (
    chr1_fasta, chr2_fasta, chr3_fasta, chr4_fasta, chr5_fasta, chr6_fasta, chr7_fasta, chr8_fasta, chr9_fasta, chr10_fasta, 
	chr11_fasta, chr12_fasta, chr13_fasta, chr14_fasta, chr15_fasta, chr16_fasta, chr17_fasta, chr18_fasta, chr19_fasta, chr20_fasta, 
	chr21_fasta, chr22_fasta, chr23_fasta, chr24_fasta,
)
chr_fastas = [
    chr1_fasta,
	#chr2_fasta, chr3_fasta, chr4_fasta, chr5_fasta, chr6_fasta, chr7_fasta, chr8_fasta, chr9_fasta, chr10_fasta,
	#chr11_fasta, chr12_fasta, chr13_fasta, chr14_fasta, chr15_fasta, chr16_fasta, chr17_fasta, chr18_fasta, chr19_fasta, chr20_fasta,
	#chr21_fasta, chr22_fasta, chr23_fasta, chr24_fasta
]
from data_dir import labseq_dir, labseq_names
from data_preparation import generate_sequence_labelling
chr_indices = [
    chr1_index_csv, 
	#chr2_index_csv, chr3_index_csv, chr4_index_csv, chr5_index_csv, chr6_index_csv, chr7_index_csv, chr8_index_csv, chr9_index_csv, chr10_index_csv,
    #chr11_index_csv, chr12_index_csv, chr13_index_csv, chr14_index_csv, chr15_index_csv, chr16_index_csv, chr17_index_csv, chr18_index_csv, chr19_index_csv, chr20_index_csv,
    #chr21_index_csv, chr22_index_csv, chr23_index_csv, chr24_index_csv
]
chr_labseq_path = [os.path.join(labseq_dir, fname) for fname in labseq_names[0:1]]
for src, fasta, target in zip(chr_indices, chr_fastas, chr_labseq_path):
    print(src, fasta, target)
    #print("Generating sequential labelling for index {}, from fasta {}, to {}: {}".format(src, fasta, target, generate_sequence_labelling(src, fasta, target, do_expand=True, expand_size=512)))

In [None]:
from sequential_labelling import Label_Dictionary
import pandas as pd
from tqdm import tqdm
import os

def prepare_sequence_from_csv(src_csv, label_dictionary=Label_Dictionary):
    """
    Convert sequence into tokenized DNA sequence and label sequence.
    CSV source has columns `sequence` and `label`. 
    `sequence` contains tokenized DNA sequence and `label` contains sequence of labels.
    @param      src_csv (string): path to CSV source.
    @param      label_dictionary (dict): dictionary to convert label into number.
    @return     sequence, labels
    """
    if not os.path.exists(src_csv):
        raise FileNotFoundError(src_csv)
    if label_dictionary == None:
        raise Exception("Argument `label_dictionary` cannot be empty!")
    
    df = pd.read_csv(src_csv)
    return list(df['sequence']), list(df['label'])
    

In [None]:
"""Create bundle from all expanded sequence from all genes from all chromosomes."""
import os
import pandas as pd
from tqdm import tqdm

src_dirs = [os.path.join("data", "genome", "seqlab.positive.strand", f"chr{i+1}.expanded") for i in range(24)]
bundle_path = os.path.join("data", "genome", "seqlab.positive.strand", "bundle.csv")
if os.path.exists(bundle_path):
    os.remove(bundle_path)
bundle = open(bundle_path, "x")
bundle.write("sequence,label\n")
for srcdir in tqdm(src_dirs, total=24):
    files = [os.path.join(srcdir, f) for f in os.listdir(srcdir)]
    for f in files:
        df = pd.read_csv(f)
        for i, r in df.iterrows():
            bundle.write(f"{r['sequence']},{r['label']}\n")
bundle.close()

In [1]:
# 17 May 2022
# From non overlapping kmer-ized genes, make bundle.
import pandas as pd
import os
chrs = [f"chr{i + 1}" for i in range(24)]
srcs = [os.path.join("workspace", "genlab", "seqlab.strand-positive.kmer.stride-510", chr) for chr in chrs]
dest = os.path.join("workspace", "seqlab", "seqlab.strand-positive.kmer.stride-510", "bundle.csv")

if os.path.exists(dest):
    os.remove(dest)
fdest = open(dest, "x")
fdest.write("sequence,label\n")
for src in srcs:
    files = os.listdir(src)
    files = [os.path.join(src, f) for f in files]
    for f in files:
        df = pd.read_csv(f)
        for i,r in df.iterrows():
            fdest.write(f"{r['sequence']},{r['label']}\n")


fdest.close()

In [10]:
# Creating bundle from index.
import os
import pandas as pd
from tqdm import tqdm

gene_dir = os.path.join("data", "gene_dir_c510_k3")
index_dir = os.path.join("index")
#train_index = os.path.join(gene_dir, "gene_train_index.csv")
#validation_index = os.path.join(gene_dir, "gene_validation_index.csv")
#test_index = os.path.join(gene_dir, "gene_test_index.csv")
whole_index = os.path.join(index_dir, "gene_index.csv")

# Create training bundle from index.
# Store file in by_sequence folder.
bundle_path = os.path.join("workspace", "seqlab", "seqlab-3")
#train_bundle = os.path.join(bundle_path, "gene_train_bundle.csv")
#validation_bundle = os.path.join(bundle_path, "gene_validation_bundle.csv")
#test_bundle = os.path.join(bundle_path, "gene_test_bundle.csv")
whole_bundle = os.path.join(bundle_path, "gene_bundle.csv")
for s, d in zip([whole_index], [whole_bundle]):
    df = pd.read_csv(s)
    ddirname = os.path.dirname(d)
    if not os.path.exists(ddirname):
        os.makedirs(ddirname)
    if os.path.exists(d):
        os.remove(d)
    dest_file = open(d, "x")
    dest_file.write("sequence,label\n")
    for i, r in tqdm(df.iterrows(), total=df.shape[0], desc="Bundling"):
        chr = r["chr"]
        gene = r["gene"]
        gene_path = os.path.join(gene_dir, chr, gene)
        gene_df = pd.read_csv(gene_path)
        for j, k in gene_df.iterrows():
            dest_file.write(f"{k['sequence']},{k['label']}\n")

    dest_file.close()

# 19235 genes seqlab
# 19235 genes seqlab-3

Bundling: 100%|██████████| 19235/19235 [11:28<00:00, 27.95it/s] 


In [2]:
# Split gene_bundle into train, validation, and test set.
import os 
import pandas as pd

dirpath = os.path.join("workspace", "seqlab", "seqlab-3")
gene_bundle_path = os.path.join(dirpath, "gene_bundle.csv")
gene_train_bundle_path = os.path.join(dirpath, "gene_train_bundle.csv")
gene_validation_bundle_path = os.path.join(dirpath, "gene_validation_bundle.csv")
gene_test_bundle_path = os.path.join(dirpath, "gene_test_bundle.csv")

df = pd.read_csv(gene_bundle_path)
train_df = df.sample(frac=0.8, random_state=1337)
validation_df = df.drop(train_df.index)
test_df = validation_df.sample(frac=0.5, random_state=1337)
validation_df = validation_df.drop(test_df.index)
train_df.to_csv(gene_train_bundle_path, index=False)
validation_df.to_csv(gene_validation_bundle_path, index=False)
test_df.to_csv(gene_test_bundle_path, index=False)

In [6]:
# Create small version of seqlab-3 sequence.
import os 
import pandas as pd

fractions = [0.1, 0.25]
seqlab_3_dir = os.path.join("workspace", "seqlab", "seqlab-3")
train_df = pd.read_csv(os.path.join(seqlab_3_dir, "gene_train_bundle.csv"))
validation_df = pd.read_csv(os.path.join(seqlab_3_dir, "gene_validation_bundle.csv"))
test_df = pd.read_csv(os.path.join(seqlab_3_dir, "gene_test_bundle.csv"))

for frac in fractions:
    train_df.sample(frac=frac).to_csv(os.path.join(seqlab_3_dir, f"gene_train_bundle.{frac * 100}.csv"))
    validation_df.sample(frac=frac).to_csv(os.path.join(seqlab_3_dir, f"gene_validation_bundle.{frac * 100}.csv"))
    test_df.sample(frac=frac).to_csv(os.path.join(seqlab_3_dir, f"gene_test_bundle.{frac * 100}.csv"))

In [4]:
# Generate SPLICE SITES, INTRON, and EXON only bundles.
def at_least_one_exists(list, target_list):
    # Check if at leats one element of list exists in target_list.
    found = False
    for elem in list:
        if elem in target_list:
            found = True
    return found

splice_sites = ['iiE', 'iEi', 'Eii', 'iEE', 'EEi', 'EiE']

import os
import pandas as pd
from tqdm import tqdm

gene_bundle_dirpath = os.path.join("workspace", "seqlab", "seqlab-3")
gene_bundle = os.path.join(gene_bundle_dirpath, "gene_bundle.csv")
gene_ss_bundle = os.path.join(gene_bundle_dirpath, "gene_ss_bundle.csv")
gene_exon_bundle = os.path.join(gene_bundle_dirpath, "gene_exon_bundle.csv")
gene_intron_bundle = os.path.join(gene_bundle_dirpath, "gene_intron_bundle.csv")
for p in [gene_bundle]:
    df = pd.read_csv(p)
    for a in [gene_ss_bundle, gene_exon_bundle, gene_intron_bundle]:
        if os.path.exists(a):
            os.remove(a)

    gene_ss_bundle = open(gene_ss_bundle, "x")
    gene_exon_bundle = open(gene_exon_bundle, "x")
    gene_intron_bundle = open(gene_intron_bundle, "x")

    for a in [gene_ss_bundle, gene_exon_bundle, gene_intron_bundle]:
        a.write("sequence,label\n")

    from datetime import datetime
    cur_date = datetime.now()
    for i, r in tqdm(df.iterrows(), total=df.shape[0], desc=f"Processing ({cur_date})"):
        arr_labels = r["label"].split(" ")
        if at_least_one_exists(splice_sites, arr_labels):
            # raise NotImplementedError("TODO: write sequence into splice site bundle.")
            # Extract splice sites from gene_train_bundle.csv, gene_validation_bundle.csv, and gene_test_bundle.csv.
            gene_ss_bundle.write(f"{r['sequence']},{r['label']}\n")
        elif all([a == "iii" for a in arr_labels]):
            # raise NotImplementedError("TODO: write sequence into intron bundle.")
            # Extract introns.
            gene_intron_bundle.write(f"{r['sequence']},{r['label']}\n")
        elif all([a == "EEE" for a in arr_labels]):
            # raise NotImplementedError("TODO: write sequence into exon bundle.")
            # Extract exons.
            gene_exon_bundle.write(f"{r['sequence']},{r['label']}\n")

    for a in [gene_ss_bundle, gene_intron_bundle, gene_exon_bundle]:
        a.close()

# Split SPLICE SITES bundle into train, validation, and test set.
import os 
import pandas as pd

dirpath = os.path.join("workspace", "seqlab", "seqlab-3")
splice_site_bundle_path = os.path.join(dirpath, "gene_ss_bundle.csv")
splice_site_train_bundle_path = os.path.join(dirpath, "gene_ss_train_bundle.csv")
splice_site_validation_bundle_path = os.path.join(dirpath, "gene_ss_validation_bundle.csv")
splice_site_test_bundle_path = os.path.join(dirpath, "gene_ss_test_bundle.csv")

df = pd.read_csv(splice_site_bundle_path)
train_df = df.sample(frac=0.8, random_state=1337)
validation_df = df.drop(train_df.index)
test_df = validation_df.sample(frac=0.5, random_state=1337)
validation_df = validation_df.drop(test_df.index)
train_df.to_csv(splice_site_train_bundle_path, index=False)
validation_df.to_csv(splice_site_validation_bundle_path, index=False)
test_df.to_csv(splice_site_test_bundle_path, index=False)

# 1753964 
train_len = train_df.shape[0]
validation_len = validation_df.shape[0]
test_len = test_df.shape[0]
total = train_len + validation_len + test_len
print(f"# Training instance {train_len} {train_len/total}")
print(f"# Validation instance {validation_len} {validation_len/total}")
print(f"# Test instance {test_len} {test_len/total}")


Processing (2022-08-12 09:47:26.822747): 100%|██████████| 1753964/1753964 [13:33<00:00, 2156.62it/s]


In [None]:
train_len = train_df.shape[0]
validation_len = validation_df.shape[0]
test_len = test_df.shape[0]
total = train_len + validation_len + test_len
print(f"# Training instance {train_len} {train_len/total}")
print(f"# Validation instance {validation_len} {validation_len/total}")
print(f"# Test instance {test_len} {test_len/total}")


In [16]:
# Gene sequential labelling.
# Create 10% and 25% sample of gene indices.

import os
import pandas as pd

index_dir = os.path.join("index")
gene_train_index = os.path.join(index_dir, "gene_train_index.csv")
gene_validation_index = os.path.join(index_dir, "gene_validation_index.csv")
gene_test_index = os.path.join(index_dir, "gene_test_index.csv")

gene_train_index_10 = os.path.join(index_dir, "gene_train_index.10.csv")
gene_validation_index_10 = os.path.join(index_dir, "gene_validation_index.10.csv")
gene_test_index_10 = os.path.join(index_dir, "gene_test_index.10.csv")

gene_train_index_25 = os.path.join(index_dir, "gene_train_index.25.csv")
gene_validation_index_25 = os.path.join(index_dir, "gene_validation_index.25.csv")
gene_test_index_25 = os.path.join(index_dir, "gene_test_index.25.csv")

srcs = [gene_train_index, gene_validation_index, gene_test_index]
tens = [gene_train_index_10, gene_validation_index_10, gene_test_index_10]
quarter = [gene_train_index_25, gene_validation_index_25, gene_test_index_25]

for src, t, q in zip(srcs, tens, quarter):
    src_df = pd.read_csv(src)
    src_10_df = src_df.sample(frac=0.1, random_state=1337)
    src_10_df.to_csv(t, index=False)
    src_25_df = src_df.sample(frac=0.25, random_state=1337)
    src_25_df.to_csv(q, index=False)


In [20]:
from tqdm import tqdm
import pandas as pd

def generate_bundle_with_marker(src_index, target_bundle, gene_dir):
    gene_train_index = src_index
    gene_train_bundle_csv = target_bundle
    if os.path.exists(gene_train_bundle_csv):
        os.remove(gene_train_bundle_csv)

    target_bundle_dir = os.path.dirname(target_bundle)
    os.makedirs(target_bundle_dir, exist_ok=True)

    gene_train_bundle = open(gene_train_bundle_csv, "x")
    gene_train_bundle.write("sequence,label,marker\n")

    df = pd.read_csv(gene_train_index)
    marker = True
    for i, r in tqdm(df.iterrows(), total=df.shape[0], desc= f"Processing {os.path.basename(src_index)}"):
        chr_dir = r["chr"]
        gene_file = r["gene"]
        gene_name = gene_file.split(".")[0]
        gene_csv = os.path.join(gene_dir, chr_dir, gene_file)
        gene_df = pd.read_csv(gene_csv)
        for j, k in gene_df.iterrows():
            sequence = k["sequence"]
            label = k["label"]
            gene_train_bundle.write(f"{sequence},{label},{int(marker)}\n")
        
        marker = not marker

    gene_train_bundle.close()

In [21]:
# Gene labelling: create bundle with marker that indicates a set of sequence belong to one long sequence.
import os
import pandas as pd
from tqdm import tqdm

index_dir = os.path.join("index")
srcs = [
    os.path.join(index_dir, "gene_train_index.10.csv"),
    os.path.join(index_dir, "gene_train_index.25.csv"),
    os.path.join(index_dir, "gene_train_index.csv"),
    os.path.join(index_dir, "gene_validation_index.10.csv"),
    os.path.join(index_dir, "gene_validation_index.25.csv"),
    os.path.join(index_dir, "gene_validation_index.csv"),
    os.path.join(index_dir, "gene_test_index.10.csv"),
    os.path.join(index_dir, "gene_test_index.25.csv"),
    os.path.join(index_dir, "gene_test_index.csv"),
]
workspace_dir = os.path.join("workspace", "genlab", "genlab-3")
dests = [
    os.path.join(workspace_dir, "gene_train_index_bundle.10.csv"),
    os.path.join(workspace_dir, "gene_train_index_bundle.25.csv"),
    os.path.join(workspace_dir, "gene_train_index_bundle.csv"),
    os.path.join(workspace_dir, "gene_validation_index_bundle.10.csv"),
    os.path.join(workspace_dir, "gene_validation_index_bundle.25.csv"),
    os.path.join(workspace_dir, "gene_validation_index_bundle.csv"),
    os.path.join(workspace_dir, "gene_test_index_bundle.10.csv"),
    os.path.join(workspace_dir, "gene_test_index_bundle.25.csv"),
    os.path.join(workspace_dir, "gene_test_index_bundle.csv"),
]
# Generate gene bundles based on above.
import os

gene_dir = os.path.join("data", "gene_dir_c510_k3")
for a, b in zip(srcs, dests):
    generate_bundle_with_marker(a, b, gene_dir)

Processing gene_train_index.10.csv: 100%|██████████| 1539/1539 [02:20<00:00, 10.98it/s]
Processing gene_train_index.25.csv: 100%|██████████| 3847/3847 [03:28<00:00, 18.42it/s]
Processing gene_train_index.csv: 100%|██████████| 15388/15388 [14:23<00:00, 17.83it/s]
Processing gene_validation_index.10.csv: 100%|██████████| 192/192 [00:10<00:00, 18.67it/s]
Processing gene_validation_index.25.csv: 100%|██████████| 481/481 [00:22<00:00, 21.73it/s]
Processing gene_validation_index.csv: 100%|██████████| 1924/1924 [01:46<00:00, 18.13it/s]
Processing gene_test_index.10.csv: 100%|██████████| 192/192 [00:12<00:00, 15.86it/s]
Processing gene_test_index.25.csv: 100%|██████████| 481/481 [00:23<00:00, 20.22it/s]
Processing gene_test_index.csv: 100%|██████████| 1923/1923 [01:38<00:00, 19.52it/s]


In [3]:
# Create gene sequence which contains splice sites at all position.
import pandas as pd
import os

from utils.utils import is_exists_splice_site_in_sequence

is_exists_splice_site_in_sequence(["iii", "EEE", "EEE"])


False

In [2]:
# Create gene sequence which contains splice sites at all position.
import pandas as pd
import os

from tqdm import tqdm
from utils.utils import kmer, str_kmer, is_exists_splice_site_in_sequence

def generate_splice_site_all_pos_bundle(source_gene_dir, bundle_dest_dir, chunk_size, kmer_size):
    """
    `source_gene_dir` - contains directories which each corresponds to chromosome.
    Genes in chromosome folder is in raw format, not kmerized.
    `bundle_dest_dir` - folder where the resulting bundle will be written.
    """
    chr_names = os.listdir(source_gene_dir)
    chr_dirs = [os.path.join(source_gene_dir, a) for a in chr_names]
    chr_dirs = [a for a in chr_dirs if os.path.isdir(a)]

    os.makedirs(bundle_dest_dir, exist_ok=True)
    bundle_path = os.path.join(bundle_dest_dir, "splice_site_all_pos.csv")
    if os.path.exists(bundle_path):
        os.remove(bundle_path)
    
    bundle_file = open(bundle_path, "x")
    bundle_file.write("sequence,label\n")

    for d in tqdm(chr_dirs, total=len(chr_dirs), desc="Processing Chromosome"):
        filenames = os.listdir(d)
        filepaths = [os.path.join(d, a) for a in filenames]
        filepaths = [a for a in filepaths if os.path.isfile(a)]
        
        for f in filepaths:
            df = pd.read_csv(f)
            for i, r in df.iterrows():
                sequence = r["sequence"]
                label = r["label"]
                len_sequence = len(sequence)
                for i in range(0, len_sequence - chunk_size, 1):
                    sublabel = label[i:i+chunk_size]
                    arr_sublabel = kmer(sublabel, kmer_size)
                    if is_exists_splice_site_in_sequence(arr_sublabel):
                        subsequence = sequence[i:i+chunk_size]
                        bundle_file.write(f"{str_kmer(subsequence, kmer_size)},{' '.join(arr_sublabel)}\n")
                    
    bundle_file.close()                


In [3]:
# Create sample index.
import os 
gene_dir_path = os.path.join("data", "gene_dir_sample")
bundle_dest_dir = os.path.join("data", "gene_dir_sample")
chunk_size = 512
kmer_size = 3
generate_splice_site_all_pos_bundle(gene_dir_path, bundle_dest_dir, chunk_size, kmer_size)

Processing Chromosome: 100%|██████████| 2/2 [08:54<00:00, 267.35s/it]
