In [4]:
# Generate gene index.
import os
import pandas as pd
from tqdm import tqdm

def generate_gene_index(gene_dir, index_dir, n_chr=24):
    """
    Write `gene_index.csv`, listing every gene file with its total sequence length.

    Scans `gene_dir/chr1` .. `gene_dir/chr<n_chr>`, reads every regular file in each
    folder as a CSV with a `sequence` column, and writes one `chr,gene,size` row per
    file, where `size` is the summed length of all values in that column.

    @param      gene_dir (string): directory holding the `chr<N>` folders.
    @param      index_dir (string): directory receiving `gene_index.csv`; created if
                missing. An existing index file is overwritten.
    @param      n_chr (int): number of chromosome folders to scan (default 24).
    @return     None
    """
    chr_paths = [os.path.join(gene_dir, f"chr{i + 1}") for i in range(n_chr)]

    index_path = os.path.join(index_dir, "gene_index.csv")
    # An empty `index_dir` means the current directory; os.makedirs("") would raise.
    if index_dir:
        os.makedirs(index_dir, exist_ok=True)

    # "w" truncates a previous index; the context manager closes the file even
    # when one of the gene files cannot be read.
    with open(index_path, "w") as index:
        index.write("chr,gene,size\n")
        for chr_dir in tqdm(chr_paths, total=len(chr_paths), desc="Creating index"):
            chr_name = os.path.basename(chr_dir)
            # Sorted so the row order is reproducible (os.listdir order is arbitrary).
            for fname in sorted(os.listdir(chr_dir)):
                fpath = os.path.join(chr_dir, fname)
                if not os.path.isfile(fpath):
                    continue
                # Count gene length: sum over every row of the `sequence` column.
                sequences = pd.read_csv(fpath)["sequence"]
                len_sequence = sum(len(s) for s in sequences)
                index.write(f"{chr_name},{fname},{len_sequence}\n")

# The gene CSVs and the generated index live in the same directory.
gene_dir = index_dir = os.path.join("data", "genome", "seqlab.positive.strand")
generate_gene_index(gene_dir, index_dir)

Creating index: 100%|██████████| 24/24 [06:53<00:00, 17.21s/it]


In [None]:
"""
Merge genes.
"""
import os
from data_preparation import merge_csv
# NOTE(review): the loop starts at index 4 (chr5), so chr1-chr4 are not merged by
# this cell — looks like a leftover resume point; confirm before a full re-run.
for c in range(4, 24):
    # Named `chr_name`, not `chr`, so the builtin chr() stays usable in later cells.
    chr_name = f"chr{c+1}"
    src_dir = os.path.join("workspace", "seq2seq-stride.384", chr_name)
    dest_file = os.path.join(src_dir, "bundle.csv")
    # The bundle is written into src_dir itself. Skip bundle outputs of earlier
    # runs (bundle.csv, bundle.sample.csv, bundle.train.csv, ...) so that a re-run
    # does not merge previous results into the new bundle.
    src_files = [
        os.path.join(src_dir, fname)
        for fname in sorted(os.listdir(src_dir))
        if not fname.startswith("bundle.")
    ]
    status = merge_csv(src_files, dest_file)
    if not status:
        print(f"Something wrong with merging files in directory {src_dir}.")
        raise Exception("something wrong.")


In [None]:
"""
Make sample from bundle.csv.
"""
from utils.utils import create_n_sample
import os

# Number of rows drawn from each chromosome bundle.
n_sample = 100
for c in range(24):
    chr_workspace = os.path.join("workspace", "seq2seq-stride.384", f"chr{c+1}")
    bundle_csv = os.path.join(chr_workspace, "bundle.csv")
    bundle_sample_csv = os.path.join(chr_workspace, "bundle.sample.csv")
    create_n_sample(bundle_csv, n_sample, bundle_sample_csv)

In [None]:
"""
Make train, validation, and test bundle for each chr.
"""
from utils.utils import split_and_store_csv
import os

# Train / validation / test proportions, in the order of the destination files.
fractions = [0.7, 0.2, 0.1]
file_types = ["train", "validation", "test"]

# One workspace folder per chromosome; every source and destination file lives in it.
dest_dir = [os.path.join("workspace", "seq2seq-stride.384", f"chr{c+1}") for c in range(24)]
src_files = [os.path.join(d, "bundle.csv") for d in dest_dir]
dest_train_files = [os.path.join(d, "bundle.train.csv") for d in dest_dir]
dest_valid_files = [os.path.join(d, "bundle.validation.csv") for d in dest_dir]
dest_test_files = [os.path.join(d, "bundle.test.csv") for d in dest_dir]

per_chr_jobs = zip(src_files, dest_train_files, dest_valid_files, dest_test_files)
for src, dest_train, dest_valid, dest_test in per_chr_jobs:
    destinations = [dest_train, dest_valid, dest_test]
    split_and_store_csv(src, fractions, destinations)

In [None]:
"""
Merge bundle.sample.csv from every chr.
"""
from data_preparation import merge_csv
import os
# Per-chromosome samples in chromosome order, merged into one workspace-level file.
chr_bundle_sample_csvs = []
for c in range(24):
    chr_bundle_sample_csvs.append(
        os.path.join("workspace", "seq2seq-stride.384", f"chr{c+1}", "bundle.sample.csv")
    )
merged_bundle_sample = os.path.join("workspace", "seq2seq-stride.384", "bundle.sample.csv")
merge_csv(chr_bundle_sample_csvs, merged_bundle_sample)

In [None]:
"""
Merge bundle from every chr.
"""
from data_preparation import merge_csv
import os
# Per-chromosome bundles in chromosome order, merged into one workspace-level file.
chr_bundle_csvs = []
for c in range(24):
    chr_bundle_csvs.append(
        os.path.join("workspace", "seq2seq-stride.384", f"chr{c+1}", "bundle.csv")
    )
merged_bundle = os.path.join("workspace", "seq2seq-stride.384", "bundle.csv")
merge_csv(chr_bundle_csvs, merged_bundle)

In [None]:
"""
Merge bundle train, validation, and test from every chr.
"""
from data_preparation import merge_csv
import os

# For each split, gather the 24 per-chromosome files and merge them into one file.
for t in ["train", "validation", "test"]:
    split_name = f"bundle.{t}.csv"
    chr_bundle_csvs = []
    for c in range(24):
        chr_bundle_csvs.append(os.path.join("workspace", "seq2seq-stride.384", f"chr{c+1}", split_name))
    merged_bundle = os.path.join("workspace", "seq2seq-stride.384", split_name)
    merge_csv(chr_bundle_csvs, merged_bundle)

In [None]:
"""
Merge bundle training, validation, and test from every chr.
"""
from data_preparation import merge_csv
import os
# NOTE(review): this cell performs the same merge as the preceding cell; running
# both simply rewrites the same three merged files.
for t in ["train", "validation", "test"]:
    split_file = f"bundle.{t}.csv"
    chr_bundle_csvs = [
        os.path.join("workspace", "seq2seq-stride.384", f"chr{c+1}", split_file)
        for c in range(24)
    ]
    merged_bundle = os.path.join("workspace", "seq2seq-stride.384", split_file)
    merge_csv(chr_bundle_csvs, merged_bundle)

In [None]:
from data_preparation import kmer
import pandas as pd
import os

_cols = ['sequence', 'label']
_stride = 256
_length = 512
for c in ['chr{}'.format(i+1) for i in range(24)]:
    genes_dir_by_chr = os.path.join('data', 'genome', 'sequential-labelling-positive-strand', c)
    genes_expansion_dir_by_chr = os.path.join('workspace', f"sequential-labelling-stride.{_stride}", c)
    os.makedirs(genes_expansion_dir_by_chr, exist_ok=True)
    for gene_file in os.listdir(genes_dir_by_chr):
        gene_file_path = os.path.join(genes_dir_by_chr, gene_file)
        # NOTE(review): split('.')[0] keeps only the text before the FIRST dot, so
        # gene files whose names contain extra dots would map to the same target.
        gene_expansion_file_path = os.path.join(genes_expansion_dir_by_chr, f"{gene_file.split('.')[0]}.expanded.csv")
        print(f"Working on {c} {gene_file_path}                                 ", end='\r')
        # Parse the source before touching the target, so a read failure does not
        # leave a header-only file behind.
        df = pd.read_csv(gene_file_path)
        # "w" replaces an existing target; the context manager always closes it.
        with open(gene_expansion_file_path, 'w') as target_file:
            target_file.write("sequence,label\n")
            for sequence, label in zip(df['sequence'], df['label']):
                seq_chunks = kmer(sequence.strip(), _length, _stride)
                label_chunks = kmer(label.strip(), _length, _stride)
                # One output row per (sequence chunk, label chunk) pair.
                for seq_chunk, label_chunk in zip(seq_chunks, label_chunks):
                    target_file.write(f"{seq_chunk},{label_chunk}\n")


In [None]:
"""
Filter index based on gene name.
"""
import os
import pandas as pd
# Per-chromosome index CSVs, in the order chr1 .. chr24.
_chr_indices = [
    'NC_000001.11.csv',
    'NC_000002.12.csv',
    'NC_000003.12.csv',
    'NC_000004.12.csv',
    'NC_000005.10.csv',
    'NC_000006.12.csv',
    'NC_000007.14.csv',
    'NC_000008.11.csv',
    'NC_000009.12.csv',
    'NC_000010.11.csv',
    'NC_000011.10.csv',
    'NC_000012.12.csv',
    'NC_000013.11.csv',
    'NC_000014.9.csv',
    'NC_000015.10.csv',
    'NC_000016.10.csv',
    'NC_000017.11.csv',
    'NC_000018.10.csv',
    'NC_000019.10.csv',
    'NC_000020.11.csv',
    'NC_000021.9.csv',
    'NC_000022.11.csv',
    'NC_000023.11.csv',
    'NC_000024.10.csv']
_chr_dir = ["chr{}".format(i+1) for i in range(len(_chr_indices))]

# `chr_index_file` rather than `chr`, so the builtin chr() is not shadowed.
for chr_index_file, chr_number in zip(_chr_indices, _chr_dir):
    index_path = os.path.join('data', 'genome', 'grch38', 'csvs_strand', chr_index_file)
    df = pd.read_csv(index_path)
    genes_dir = os.path.join('data', 'genome', 'grch38', 'genes', chr_number)
    # makedirs also creates a missing parent `genes` folder (os.mkdir would fail).
    os.makedirs(genes_dir, exist_ok=True)
    # One pass over the frame instead of one boolean filter per gene. groupby
    # drops rows with a null gene name; sort=False keeps first-appearance order.
    for g, ndf in df.groupby('gene', sort=False):
        path = os.path.join(genes_dir, "{}.csv".format(g))
        ndf.to_csv(path, index=False)
        print("Success: {}                                                      ".format(path), end="\r")

In [8]:
"""
Generate sequence from fasta based on gene indices.
"""
import os
from Bio import SeqIO
import pandas as pd
from tqdm import tqdm
from data_dir import chr_fasta_mapname

strand = '+'
chrs = ['chr{}'.format(i+1) for i in range(24)] # Folder name.

_columns = ['sequence', 'label']
for c in chrs:
    chr_genes_indices_dir = os.path.join("data", "genome", "grch38", "genes", c)
    chr_genes_sequence_csv_dir = os.path.join("data", "genome", "seqlab.positive.strand",c)
    chr_fasta = os.path.join("data", "chr", chr_fasta_mapname[c])

    if not os.path.exists(chr_fasta):
        raise FileNotFoundError("Fasta {} not found.".format(chr_fasta))

    # Read the first record of the chromosome FASTA and keep its complete
    # sequence in memory (yes, it's long).
    records = SeqIO.parse(chr_fasta, "fasta")
    chr_records = next(records)
    genome_sequence = str(chr_records.seq)

    for fname in os.listdir(chr_genes_indices_dir):
        print("Working on chr {} gene {}                                    ".format(c, fname), end='\r')
        fpath = os.path.join(chr_genes_indices_dir, fname)
        if not os.path.isfile(fpath):
            continue

        # Gene index found. Read it and cross-reference with the genome sequence.
        index_df = pd.read_csv(fpath)
        gene_region = index_df[index_df['region'] == "gene"]
        if strand is not None:
            gene_region = gene_region[gene_region['strand'] == strand]
        if len(gene_region) == 0:
            continue

        # Exon spans are not filtered by strand (same as before).
        exons = index_df[index_df['region'] == "exon"]
        exon_spans = [(int(s), int(e)) for s, e in zip(exons['start_index'], exons['end_index'])]

        # Collect rows in a list and build the frame once (no concat in a loop).
        gene_rows = []
        for gene_start, gene_end in zip(gene_region['start_index'], gene_region['end_index']):
            gene_start_index = int(gene_start)
            gene_end_index = int(gene_end)
            # NOTE(review): indices are used as 0-based with an inclusive end —
            # confirm this matches how the index CSVs were generated.
            gene_sequence = genome_sequence[gene_start_index:gene_end_index + 1]
            # A string slice is never None; an empty slice is what signals that
            # the gene is not available in the chromosome sequence.
            if not gene_sequence:
                continue

            # Every position starts as intron ('i'); exon spans become 'E'.
            labels = ['i'] * len(gene_sequence)
            for start_index, end_index in exon_spans:
                if start_index >= gene_start_index and end_index <= gene_end_index:
                    first = start_index - gene_start_index
                    # Clamp to the label length so a truncated sequence never
                    # makes the slice assignment grow the list.
                    stop = min(end_index - gene_start_index + 1, len(labels))
                    if stop > first:
                        labels[first:stop] = ['E'] * (stop - first)
            gene_rows.append([gene_sequence, ''.join(labels)])

        # Only write if there is at least one labelled gene.
        if gene_rows:
            gene_df = pd.DataFrame(gene_rows, columns=_columns)
            target_path = os.path.join(chr_genes_sequence_csv_dir, fname)
            os.makedirs(chr_genes_sequence_csv_dir, exist_ok=True)
            gene_df.to_csv(target_path, index=False)
                    

Working on chr chr24 gene ZNF92P1Y.csv                                                     

In [1]:
"""Chunk each gene into 512 characters, for each chromosome."""
import pandas as pd
from data_preparation import kmer
import os

chrs = [f"chr{i + 1}" for i in range(24)]
chr_paths = [os.path.join("data", "genome", "seqlab.positive.strand", name) for name in chrs]
dest_paths = [os.path.join("data", "genome", "seqlab.strand-positive.stride-512", name) for name in chrs]
for cp, dp in zip(chr_paths, dest_paths):
    # Skip files that carry an "expanded" component in their name.
    genes = [g for g in os.listdir(cp) if "expanded" not in g.split('.')]
    for gene in genes:
        gene_path = os.path.join(cp, gene)
        gene_df = pd.read_csv(gene_path)
        print(f"Processing {os.path.basename(cp)} {gene}                    ", end="\r")
        # A gene file without rows produces no output file.
        if gene_df.empty:
            continue
        dest_path = os.path.join(dp, f"{gene.split('.')[0]}.csv")
        os.makedirs(dp, exist_ok=True)
        # Open the destination ONCE per gene. Re-creating it for every row
        # discarded the chunks of all rows except the last one.
        with open(dest_path, "w") as dest:
            dest.write("sequence,label\n")
            for sequence, label in zip(gene_df["sequence"], gene_df["label"]):
                seq_chunks = kmer(sequence, 512, 512)
                label_chunks = kmer(label, 512, 512)
                for seq_chunk, label_chunk in zip(seq_chunks, label_chunks):
                    dest.write(f"{seq_chunk},{label_chunk}\n")


Processing chr24 ZFY.csv                                       

In [None]:
"""Move file around"""

import os
# Move every file with an "expanded" name component from chr<N> into chr<N>.expanded.
for i in range(24):
    k = i + 1
    src_path = os.path.join("data", "genome", "seqlab.positive.strand", f"chr{k}")
    dest_path = os.path.join("data", "genome", "seqlab.positive.strand", f"chr{k}.expanded")
    if not os.path.exists(dest_path):
        os.makedirs(dest_path, exist_ok=True)
    files = []
    for name in os.listdir(src_path):
        if "expanded" in name.split('.'):
            files.append(name)
    for f in files:
        oldpath = os.path.join(src_path, f)
        newpath = os.path.join(dest_path, f)
        # Never overwrite a file that is already in the destination folder.
        if os.path.exists(newpath):
            print(f"Skip {oldpath}                                                    ", end="\r")
            continue
        os.rename(oldpath, newpath)

In [None]:
"""
Rename files in expanded folder
filename.expanded.csv => filename.csv
"""
src_paths = []
for i in range(24):
    src_paths.append(os.path.join("data", "genome", "seqlab.positive.strand", f"chr{i + 1}.expanded"))

for srcdir in src_paths:
    for f in os.listdir(srcdir):
        # Keep only the text before the first dot and append ".csv".
        stem = f.split('.')[0]
        oldpath = os.path.join(srcdir, f)
        newpath = os.path.join(srcdir, f"{stem}.csv")
        os.rename(oldpath, newpath)

In [4]:
"""Generate kmer version from sequence."""
import os
import pandas as pd
from data_preparation import str_kmer

# Alternative input/output (unchunked genes), kept for reference:
# src_paths = [os.path.join("data", "genome", "seqlab.positive.strand", f"chr{i + 1}") for i in range(24)]
# dest_paths = [os.path.join("data", "genome", "seqlab.strand-positive.kmer", f"chr{i + 1}") for i in range(24)]
src_paths = [os.path.join("data", "genome", "seqlab.strand-positive.stride-512", f"chr{i + 1}") for i in range(24)]
dest_paths = [os.path.join("data", "genome", "seqlab.strand-positive.kmer.stride-512", f"chr{i + 1}") for i in range(24)]
for srcdir, destdir in zip(src_paths, dest_paths):
    os.makedirs(destdir, exist_ok=True)
    for f in os.listdir(srcdir):
        src = os.path.join(srcdir, f)
        # Path and file handle get separate names (previously both were `dest`).
        dest_path = os.path.join(destdir, f)
        # Parse the source first so a read failure leaves no header-only file.
        df = pd.read_csv(src)
        with open(dest_path, "w") as dest:
            dest.write("sequence,label\n")
            for sequence, label in zip(df["sequence"], df["label"]):
                dest.write(f"{str_kmer(sequence, 3)},{str_kmer(label, 3)}\n")
            

In [1]:
"""Chunk each gene into 510 token. Since sequence is already in token form, ``kmer`` function cannot be used, had to create another script."""
import pandas as pd
from data_preparation import kmer
import os

def chunk_kmer_sequence(chunk: str, size: int, stride: int) -> list:
    """
    Split a space-separated token string into windows of tokens.

    The string is split on single spaces and the token list is handed to
    `kmer(tokens, size, window_size=stride)`; whatever windows `kmer` yields are
    returned as a list (callers join each window back with spaces).

    @param      chunk (string): space-separated tokens.
    @param      size (int): forwarded to `kmer` as its size argument.
    @param      stride (int): forwarded to `kmer` as `window_size`.
    @return     (list): the windows produced by `kmer`.
    """
    tokens = chunk.split(' ')
    return list(kmer(tokens, size, window_size=stride))

chrs = [f"chr{i + 1}" for i in range(24)]
chr_paths = [os.path.join("data", "genome", "seqlab.strand-positive.kmer", name) for name in chrs]
dest_paths = [os.path.join("data", "genome", "seqlab.strand-positive.kmer.stride-205", name) for name in chrs]
for cp, dp in zip(chr_paths, dest_paths):
    # Skip files that carry an "expanded" component in their name.
    genes = [g for g in os.listdir(cp) if "expanded" not in g.split('.')]
    for gene in genes:
        gene_path = os.path.join(cp, gene)
        gene_df = pd.read_csv(gene_path)
        print(f"Processing {os.path.basename(cp)} {gene}                    ", end="\r")
        # A gene file without rows produces no output file.
        if gene_df.empty:
            continue
        dest_path = os.path.join(dp, f"{gene.split('.')[0]}.csv")
        os.makedirs(dp, exist_ok=True)
        # Open the destination ONCE per gene. Re-creating it for every row
        # discarded the chunks of all rows except the last one.
        with open(dest_path, "w") as dest:
            dest.write("sequence,label\n")
            for sequence, label in zip(gene_df["sequence"], gene_df["label"]):
                # Windows of 510 tokens, advancing 205 tokens at a time.
                seq_chunks = chunk_kmer_sequence(sequence, 510, 205)
                label_chunks = chunk_kmer_sequence(label, 510, 205)
                for seq_tokens, label_tokens in zip(seq_chunks, label_chunks):
                    dest.write(f"{' '.join(seq_tokens)},{' '.join(label_tokens)}\n")

Processing chr24 ZFY.csv                                       

In [4]:
"""
Split gene_index for training, validation, and testing.
"""
import os
index_dir = os.path.join("index")
gene_index = os.path.join(index_dir, "gene_index.csv")
train_index = os.path.join(index_dir, "gene_train_index.csv")
val_index = os.path.join(index_dir, "gene_validation_index.csv")
test_index = os.path.join(index_dir, "gene_test_index.csv")

import pandas as pd
df = pd.read_csv(gene_index)
# Fixed seed (the same 1337 used by the other sampling cells) so that re-running
# the notebook reproduces the same 80/10/10 split.
train_df = df.sample(frac=0.8, random_state=1337)
# Rows not drawn for training are shared equally by validation and test.
holdout_df = df.drop(train_df.index)
val_df = holdout_df.sample(frac=0.5, random_state=1337)
test_df = holdout_df.drop(val_df.index)
train_df.to_csv(train_index, index=False)
val_df.to_csv(val_index, index=False)
test_df.to_csv(test_index, index=False)

In [None]:
# Gene sequential labelling.
# Create 10% and 25% sample of gene indices.

import os
import pandas as pd

index_dir = os.path.join("index")

# Full train / validation / test indices.
gene_train_index = os.path.join(index_dir, "gene_train_index.csv")
gene_validation_index = os.path.join(index_dir, "gene_validation_index.csv")
gene_test_index = os.path.join(index_dir, "gene_test_index.csv")

# 10% samples.
gene_train_index_10 = os.path.join(index_dir, "gene_train_index.10.csv")
gene_validation_index_10 = os.path.join(index_dir, "gene_validation_index.10.csv")
gene_test_index_10 = os.path.join(index_dir, "gene_test_index.10.csv")

# 25% samples.
gene_train_index_25 = os.path.join(index_dir, "gene_train_index.25.csv")
gene_validation_index_25 = os.path.join(index_dir, "gene_validation_index.25.csv")
gene_test_index_25 = os.path.join(index_dir, "gene_test_index.25.csv")

srcs = [gene_train_index, gene_validation_index, gene_test_index]
tens = [gene_train_index_10, gene_validation_index_10, gene_test_index_10]
quarter = [gene_train_index_25, gene_validation_index_25, gene_test_index_25]

for position in range(min(len(srcs), len(tens), len(quarter))):
    src, t, q = srcs[position], tens[position], quarter[position]
    src_df = pd.read_csv(src)
    # Both samples use the same seed and are drawn from the same source frame.
    src_10_df = src_df.sample(frac=0.1, random_state=1337)
    src_10_df.to_csv(t, index=False)
    src_25_df = src_df.sample(frac=0.25, random_state=1337)
    src_25_df.to_csv(q, index=False)


In [3]:
# Enrich index with gene length.
import os
import pandas as pd
from tqdm import tqdm

index_dir = os.path.join("index")
gene_index = os.path.join(index_dir, "gene_index.csv")
gene_train_index = os.path.join(index_dir, "gene_train_index.csv")
gene_validation_index = os.path.join(index_dir, "gene_validation_index.csv")
gene_test_index = os.path.join(index_dir, "gene_test_index.csv")
gene_dir = os.path.join("data", "gene_dir")

# Each index file is rewritten in place with an added (or refreshed) `length` column.
for index in [gene_index, gene_train_index, gene_validation_index, gene_test_index]:
    df = pd.read_csv(index)
    len_sequences = []
    gene_refs = zip(df["chr"], df["gene"])
    for chr_name, gene_filename in tqdm(gene_refs, total=df.shape[0], desc="Counting sequence length"):
        gene_filepath = os.path.join(gene_dir, chr_name, gene_filename)
        gene_df = pd.read_csv(gene_filepath)
        # Total length over every row of the gene file's `sequence` column.
        len_sequences.append(sum(len(s) for s in gene_df["sequence"]))
    df["length"] = len_sequences
    df.to_csv(index, index=False)


Counting sequence length: 100%|██████████| 19235/19235 [06:53<00:00, 46.55it/s]
Counting sequence length: 100%|██████████| 15388/15388 [04:54<00:00, 52.30it/s]
Counting sequence length: 100%|██████████| 1924/1924 [01:00<00:00, 32.01it/s]
Counting sequence length: 100%|██████████| 1923/1923 [00:56<00:00, 34.27it/s]


In [None]:
import pandas as pd
import os  # used below; previously this cell relied on an earlier cell's import

_cols = ['chr', 'gene']
df_path = os.path.join('workspace', 'sequential-labelling', 'gene_index.csv')
_dir = os.path.join('workspace', 'sequential-labelling', 'duplicate genes')
os.makedirs(_dir, exist_ok=True)
df = pd.read_csv(df_path)
# NOTE(review): these three empty frames are not used in this cell.
train_df = pd.DataFrame(columns=_cols)
valid_df = pd.DataFrame(columns=_cols)
test_df = pd.DataFrame(columns=_cols)
# keep=False flags every row whose gene name occurs more than once; grouping the
# flagged rows replaces one boolean filter per unique gene. sort=False keeps
# first-appearance order.
duplicated_df = df[df['gene'].duplicated(keep=False)]
for g, filtered_df in duplicated_df.groupby('gene', sort=False):
    _g = "{}.csv".format(g)
    filtered_df.to_csv(os.path.join(_dir, _g), index=False)

In [None]:
from data_preparation import gff_to_csvs, gff_to_csv
from data_dir import (annotated_grch38_gff, annotated_grch38_gff_dir, annotated_grch38_gff_csv)

# Show the configured GFF source, its CSV export and its directory, one per line.
for gff_location in (annotated_grch38_gff, annotated_grch38_gff_csv, annotated_grch38_gff_dir):
    print(gff_location)

In [None]:
from data_dir import chr21_fasta, chr21_index_csv, data_genome_grch38_labels_dir
from data_preparation import generate_sequence_labelling
import os

target_path = os.path.join(data_genome_grch38_labels_dir, 'chr21.csv')
# Run the labelling first, then report its return value alongside the paths.
labelling_result = generate_sequence_labelling(chr21_index_csv, chr21_fasta, target_path)
print("Generate sequential labelling {} => {}: {}".format(chr21_index_csv, target_path, labelling_result))


In [None]:
from data_dir import (
    chr1_index_csv, chr2_index_csv, chr3_index_csv, chr4_index_csv, chr5_index_csv, chr6_index_csv, chr7_index_csv, chr8_index_csv, chr9_index_csv, chr10_index_csv,
    chr11_index_csv, chr12_index_csv, chr13_index_csv, chr14_index_csv, chr15_index_csv, chr16_index_csv, chr17_index_csv, chr18_index_csv, chr19_index_csv, chr20_index_csv,
    chr21_index_csv, chr22_index_csv, chr23_index_csv, chr24_index_csv
)
from data_dir import (
    chr1_fasta, chr2_fasta, chr3_fasta, chr4_fasta, chr5_fasta, chr6_fasta, chr7_fasta, chr8_fasta, chr9_fasta, chr10_fasta, 
	chr11_fasta, chr12_fasta, chr13_fasta, chr14_fasta, chr15_fasta, chr16_fasta, chr17_fasta, chr18_fasta, chr19_fasta, chr20_fasta, 
	chr21_fasta, chr22_fasta, chr23_fasta, chr24_fasta,
)
# Only chromosome 1 is enabled; the other chromosomes stay commented out.
chr_fastas = [
    chr1_fasta,
    #chr2_fasta, chr3_fasta, chr4_fasta, chr5_fasta, chr6_fasta, chr7_fasta, chr8_fasta, chr9_fasta, chr10_fasta,
    #chr11_fasta, chr12_fasta, chr13_fasta, chr14_fasta, chr15_fasta, chr16_fasta, chr17_fasta, chr18_fasta, chr19_fasta, chr20_fasta,
    #chr21_fasta, chr22_fasta, chr23_fasta, chr24_fasta
]
from data_dir import labseq_dir, labseq_names
from data_preparation import generate_sequence_labelling
chr_indices = [
    chr1_index_csv,
    #chr2_index_csv, chr3_index_csv, chr4_index_csv, chr5_index_csv, chr6_index_csv, chr7_index_csv, chr8_index_csv, chr9_index_csv, chr10_index_csv,
    #chr11_index_csv, chr12_index_csv, chr13_index_csv, chr14_index_csv, chr15_index_csv, chr16_index_csv, chr17_index_csv, chr18_index_csv, chr19_index_csv, chr20_index_csv,
    #chr21_index_csv, chr22_index_csv, chr23_index_csv, chr24_index_csv
]
# Target path for the first labelled-sequence name only.
chr_labseq_path = []
for fname in labseq_names[:1]:
    chr_labseq_path.append(os.path.join(labseq_dir, fname))

# Dry run: print each (index, fasta, target) triple; the generation call is disabled.
for src, fasta, target in zip(chr_indices, chr_fastas, chr_labseq_path):
    print(src, fasta, target)
    #print("Generating sequential labelling for index {}, from fasta {}, to {}: {}".format(src, fasta, target, generate_sequence_labelling(src, fasta, target, do_expand=True, expand_size=512)))

In [None]:
from sequential_labelling import Label_Dictionary
import pandas as pd
from tqdm import tqdm
import os

def prepare_sequence_from_csv(src_csv, label_dictionary=Label_Dictionary):
    """
    Read the `sequence` and `label` columns of a CSV file as two lists.
    The values are returned exactly as read from the file: no tokenization and
    no label-to-number conversion happens here. `label_dictionary` is only
    checked for being present; it is not applied to the labels.
    @param      src_csv (string): path to CSV source with columns `sequence` and `label`.
    @param      label_dictionary (dict): must not be None; otherwise unused.
    @return     sequence, labels (two lists, one entry per CSV row)
    @raises     FileNotFoundError if `src_csv` does not exist.
    @raises     Exception if `label_dictionary` is None.
    """
    if not os.path.exists(src_csv):
        raise FileNotFoundError(src_csv)
    # Identity check: `== None` would invoke a custom __eq__ on the argument.
    if label_dictionary is None:
        raise Exception("Argument `label_dictionary` cannot be empty!")

    df = pd.read_csv(src_csv)
    return list(df['sequence']), list(df['label'])
    

In [None]:
"""Create bundle from all expanded sequence from all genes from all chromosomes."""
import os
import pandas as pd
from tqdm import tqdm

src_dirs = [os.path.join("data", "genome", "seqlab.positive.strand", f"chr{i+1}.expanded") for i in range(24)]
bundle_path = os.path.join("data", "genome", "seqlab.positive.strand", "bundle.csv")
# "w" replaces a previous bundle; the context manager closes the file even when
# one of the source files cannot be read.
with open(bundle_path, "w") as bundle:
    bundle.write("sequence,label\n")
    for srcdir in tqdm(src_dirs, total=len(src_dirs)):
        # Sorted so the bundle row order is reproducible across runs.
        for fname in sorted(os.listdir(srcdir)):
            df = pd.read_csv(os.path.join(srcdir, fname))
            for sequence, label in zip(df["sequence"], df["label"]):
                bundle.write(f"{sequence},{label}\n")

In [1]:
# 17 May 2022
# From non overlapping kmer-ized genes, make bundle.
import pandas as pd
import os
chrs = [f"chr{i + 1}" for i in range(24)]
# NOTE(review): sources are read from workspace/genlab but the bundle is written
# to workspace/seqlab — confirm that the differing folder names are intended.
srcs = [os.path.join("workspace", "genlab", "seqlab.strand-positive.kmer.stride-510", name) for name in chrs]
dest = os.path.join("workspace", "seqlab", "seqlab.strand-positive.kmer.stride-510", "bundle.csv")

# The destination folder is not created anywhere else in this cell.
os.makedirs(os.path.dirname(dest), exist_ok=True)
# "w" replaces a previous bundle; the context manager closes the file on errors too.
with open(dest, "w") as fdest:
    fdest.write("sequence,label\n")
    for src in srcs:
        # Sorted so the bundle row order is reproducible across runs.
        for fname in sorted(os.listdir(src)):
            df = pd.read_csv(os.path.join(src, fname))
            for sequence, label in zip(df["sequence"], df["label"]):
                fdest.write(f"{sequence},{label}\n")

In [1]:
# Creating bundle from index.
import os
import pandas as pd
from tqdm import tqdm

gene_dir = os.path.join("data", "gene_dir_c512_k3")
index_dir = os.path.join("index")
#train_index = os.path.join(gene_dir, "gene_train_index.csv")
#validation_index = os.path.join(gene_dir, "gene_validation_index.csv")
#test_index = os.path.join(gene_dir, "gene_test_index.csv")
whole_index = os.path.join(index_dir, "gene_index.csv")

# Create bundle from index.
bundle_path = os.path.join("workspace", "seqlab", "seqlab-3")
#train_bundle = os.path.join(bundle_path, "gene_train_bundle.csv")
#validation_bundle = os.path.join(bundle_path, "gene_validation_bundle.csv")
#test_bundle = os.path.join(bundle_path, "gene_test_bundle.csv")
whole_bundle = os.path.join(bundle_path, "gene_bundle.csv")
for s, d in zip([whole_index], [whole_bundle]):
    df = pd.read_csv(s)
    os.makedirs(os.path.dirname(d), exist_ok=True)
    # "w" replaces a previous bundle. The context manager closes the file even
    # when a gene file listed in the index is missing (FileNotFoundError).
    with open(d, "w") as dest_file:
        dest_file.write("sequence,label\n")
        gene_refs = zip(df["chr"], df["gene"])
        # `chr_name` rather than `chr`, so the builtin chr() is not shadowed.
        for chr_name, gene in tqdm(gene_refs, total=df.shape[0], desc="Bundling"):
            gene_path = os.path.join(gene_dir, chr_name, gene)
            gene_df = pd.read_csv(gene_path)
            for sequence, label in zip(gene_df["sequence"], gene_df["label"]):
                dest_file.write(f"{sequence},{label}\n")

# 19235 genes seqlab
# 19235 genes seqlab-3

Bundling:   0%|          | 0/19235 [00:00<?, ?it/s]


FileNotFoundError: [Errno 2] No such file or directory: 'data\\gene_dir_c512_k3\\chr1\\AADACL3.csv'

In [2]:
# Split gene_bundle into train, validation, and test set.
import os 
import pandas as pd

dirpath = os.path.join("workspace", "seqlab", "seqlab-3")
gene_bundle_path = os.path.join(dirpath, "gene_bundle.csv")
gene_train_bundle_path = os.path.join(dirpath, "gene_train_bundle.csv")
gene_validation_bundle_path = os.path.join(dirpath, "gene_validation_bundle.csv")
gene_test_bundle_path = os.path.join(dirpath, "gene_test_bundle.csv")

df = pd.read_csv(gene_bundle_path)
# 80% of the rows go to training (seeded).
train_df = df.sample(frac=0.8, random_state=1337)
# The remaining rows are halved: one half test (seeded), the other validation.
remainder_df = df.drop(train_df.index)
test_df = remainder_df.sample(frac=0.5, random_state=1337)
validation_df = remainder_df.drop(test_df.index)

outputs = (
    (train_df, gene_train_bundle_path),
    (validation_df, gene_validation_bundle_path),
    (test_df, gene_test_bundle_path),
)
for split_df, split_path in outputs:
    split_df.to_csv(split_path, index=False)

In [1]:
import os
import pandas as pd

dirpath = os.path.join("workspace", "seqlab", "seqlab-3")
gene_bundle_path = os.path.join(dirpath, "gene_bundle.csv")

# Full train / validation / test bundles.
gene_train_bundle_path = os.path.join(dirpath, "gene_train_bundle.csv")
gene_validation_bundle_path = os.path.join(dirpath, "gene_validation_bundle.csv")
gene_test_bundle_path = os.path.join(dirpath, "gene_test_bundle.csv")

# 10% and 25% samples of each bundle.
gene_train_bundle_10_path = os.path.join(dirpath, "gene_train_bundle.10.csv")
gene_train_bundle_25_path = os.path.join(dirpath, "gene_train_bundle.25.csv")
gene_validation_bundle_10_path = os.path.join(dirpath, "gene_validation_bundle.10.csv")
gene_validation_bundle_25_path = os.path.join(dirpath, "gene_validation_bundle.25.csv")
gene_test_bundle_10_path = os.path.join(dirpath, "gene_test_bundle.10.csv")
gene_test_bundle_25_path = os.path.join(dirpath, "gene_test_bundle.25.csv")

# (source bundle, 10% target, 25% target)
sampling_plan = [
    (gene_train_bundle_path, gene_train_bundle_10_path, gene_train_bundle_25_path),
    (gene_validation_bundle_path, gene_validation_bundle_10_path, gene_validation_bundle_25_path),
    (gene_test_bundle_path, gene_test_bundle_10_path, gene_test_bundle_25_path),
]
for source_path, path_10, path_25 in sampling_plan:
    source_df = pd.read_csv(source_path)
    # Draw both samples (same seed) before writing either of them.
    p10_df = source_df.sample(frac=0.1, random_state=1337)
    p25_df = source_df.sample(frac=0.25, random_state=1337)
    p10_df.to_csv(path_10, index=False)
    p25_df.to_csv(path_25, index=False)
    


In [6]:
# Create small version of seqlab-3 sequence.
import os 
import pandas as pd

fractions = [0.1, 0.25]
seqlab_3_dir = os.path.join("workspace", "seqlab", "seqlab-3")
train_df = pd.read_csv(os.path.join(seqlab_3_dir, "gene_train_bundle.csv"))
validation_df = pd.read_csv(os.path.join(seqlab_3_dir, "gene_validation_bundle.csv"))
test_df = pd.read_csv(os.path.join(seqlab_3_dir, "gene_test_bundle.csv"))

for frac in fractions:
    # Integer percent: `frac * 100` is a float and produced names such as
    # "gene_train_bundle.10.0.csv" instead of "gene_train_bundle.10.csv".
    percent = round(frac * 100)
    for split_name, split_df in (("train", train_df), ("validation", validation_df), ("test", test_df)):
        sample_path = os.path.join(seqlab_3_dir, f"gene_{split_name}_bundle.{percent}.csv")
        # Seeded like the other sampling cells, and written without the index
        # column so the files keep the `sequence,label` layout of the bundles.
        split_df.sample(frac=frac, random_state=1337).to_csv(sample_path, index=False)

In [4]:
# Generate SPLICE SITES, INTRON, and EXON only bundles.
def at_least_one_exists(list, target_list):
    """Return True when at least one element of `list` is contained in `target_list`."""
    return any(elem in target_list for elem in list)

def is_intron(label_sequence):
    """Return True when every element of `label_sequence` equals "iii" (also True when empty)."""
    for token in label_sequence:
        if token != "iii":
            return False
    return True

def is_exon(label_sequence):
    """Return True when every element of `label_sequence` equals "EEE" (also True when empty)."""
    for token in label_sequence:
        if token != "EEE":
            return False
    return True

splice_sites = ['iiE', 'iEi', 'Eii', 'iEE', 'EEi', 'EiE']

import os
import pandas as pd
from tqdm import tqdm
from datetime import datetime

gene_bundle_dirpath = os.path.join("workspace", "seqlab", "seqlab-3")
gene_bundle = os.path.join(gene_bundle_dirpath, "gene_bundle.csv")
gene_ss_bundle = os.path.join(gene_bundle_dirpath, "gene_ss_bundle.csv")
gene_exon_bundle = os.path.join(gene_bundle_dirpath, "gene_exon_bundle.csv")
gene_intron_bundle = os.path.join(gene_bundle_dirpath, "gene_intron_bundle.csv")
for p in [gene_bundle]:
    df = pd.read_csv(p)
    cur_date = datetime.now()
    # The path variables stay paths (they used to be overwritten with file
    # handles). "w" replaces previous outputs, and the context manager closes
    # all three files even when a row fails.
    with open(gene_ss_bundle, "w") as ss_file, \
            open(gene_exon_bundle, "w") as exon_file, \
            open(gene_intron_bundle, "w") as intron_file:
        for out_file in (ss_file, exon_file, intron_file):
            out_file.write("sequence,label\n")

        rows = zip(df["sequence"], df["label"])
        for sequence, label in tqdm(rows, total=df.shape[0], desc=f"Processing ({cur_date})"):
            arr_labels = label.split(" ")
            # All-intron and all-exon rows get their own bundles; every other
            # row (mixed labels) goes to the splice-site bundle.
            if is_intron(arr_labels):
                intron_file.write(f"{sequence},{label}\n")
            elif is_exon(arr_labels):
                exon_file.write(f"{sequence},{label}\n")
            else:
                ss_file.write(f"{sequence},{label}\n")

# Split SPLICE SITES bundle into train, validation, and test set.
import os 
import pandas as pd

dirpath = os.path.join("workspace", "seqlab", "seqlab-3")
splice_site_bundle_path = os.path.join(dirpath, "gene_ss_bundle.csv")
splice_site_train_bundle_path = os.path.join(dirpath, "gene_ss_train_bundle.csv")
splice_site_validation_bundle_path = os.path.join(dirpath, "gene_ss_validation_bundle.csv")
splice_site_test_bundle_path = os.path.join(dirpath, "gene_ss_test_bundle.csv")

# 80% of the rows go to training; the remaining rows are halved into test and validation.
df = pd.read_csv(splice_site_bundle_path)
train_df = df.sample(frac=0.8, random_state=1337)
holdout_df = df.drop(train_df.index)
test_df = holdout_df.sample(frac=0.5, random_state=1337)
validation_df = holdout_df.drop(test_df.index)

for split_df, split_path in (
    (train_df, splice_site_train_bundle_path),
    (validation_df, splice_site_validation_bundle_path),
    (test_df, splice_site_test_bundle_path),
):
    split_df.to_csv(split_path, index=False)

# 1753964
train_len, validation_len, test_len = len(train_df), len(validation_df), len(test_df)
total = train_len + validation_len + test_len
print(f"# Training instance {train_len} {train_len/total}")
print(f"# Validation instance {validation_len} {validation_len/total}")
print(f"# Test instance {test_len} {test_len/total}")


Processing (2022-08-12 09:47:26.822747): 100%|██████████| 1753964/1753964 [13:33<00:00, 2156.62it/s]


In [None]:
# Report the size and share of each split. Expects train_df, validation_df and
# test_df to already exist in the kernel from an earlier cell.
train_len, validation_len, test_len = (
    len(split_df) for split_df in (train_df, validation_df, test_df)
)
total = sum((train_len, validation_len, test_len))
print(f"# Training instance {train_len} {train_len/total}")
print(f"# Validation instance {validation_len} {validation_len/total}")
print(f"# Test instance {test_len} {test_len/total}")


In [5]:
# Gene sequential labelling.
# Create 10% and 25% sample of gene indices.

import os
import pandas as pd

index_dir = os.path.join("index")

# Full index of each split, then the destinations of its 10% and 25% samples.
srcs = [
    os.path.join(index_dir, name)
    for name in ("gene_train_index.csv", "gene_validation_index.csv", "gene_test_index.csv")
]
tens = [
    os.path.join(index_dir, name)
    for name in ("gene_train_index.10.csv", "gene_validation_index.10.csv", "gene_test_index.10.csv")
]
quarter = [
    os.path.join(index_dir, name)
    for name in ("gene_train_index.25.csv", "gene_validation_index.25.csv", "gene_test_index.25.csv")
]

# Individual names are kept available for later cells.
gene_train_index, gene_validation_index, gene_test_index = srcs
gene_train_index_10, gene_validation_index_10, gene_test_index_10 = tens
gene_train_index_25, gene_validation_index_25, gene_test_index_25 = quarter

for src, t, q in zip(srcs, tens, quarter):
    src_df = pd.read_csv(src)
    # Both fractions are drawn with the same seed.
    src_10_df = src_df.sample(frac=0.1, random_state=1337)
    src_25_df = src_df.sample(frac=0.25, random_state=1337)
    src_10_df.to_csv(t, index=False)
    src_25_df.to_csv(q, index=False)


In [20]:
from tqdm import tqdm
import pandas as pd

def generate_bundle_with_marker(src_index, target_bundle, gene_dir):
    """
    Write the rows of every gene listed in an index into a single bundle CSV.

    `src_index` - path to an index CSV; its `chr` and `gene` columns are read.
    `target_bundle` - path of the bundle CSV to write. An existing file is
    replaced and missing parent folders are created.
    `gene_dir` - root folder; each gene is read from `gene_dir/<chr>/<gene>` and
    must provide the columns `sequence` and `label`.

    The bundle has the columns `sequence,label,marker`. All rows of one gene
    share the same marker (1 or 0) and the marker flips between one written gene
    and the next, so a change of marker separates two genes. A gene file without
    rows writes nothing and does not flip the marker; otherwise the genes before
    and after it would end up with the same marker.
    """
    target_bundle_dir = os.path.dirname(target_bundle)
    # A bare file name has no directory part and os.makedirs("") would raise.
    if target_bundle_dir:
        os.makedirs(target_bundle_dir, exist_ok=True)

    index_df = pd.read_csv(src_index)
    marker = True
    # Mode "w" replaces an existing bundle; the `with` block closes the file
    # even when reading one of the gene files fails.
    with open(target_bundle, "w") as bundle_file:
        bundle_file.write("sequence,label,marker\n")
        index_rows = zip(index_df["chr"], index_df["gene"])
        for chr_dir, gene_file in tqdm(index_rows, total=index_df.shape[0], desc= f"Processing {os.path.basename(src_index)}"):
            gene_df = pd.read_csv(os.path.join(gene_dir, chr_dir, gene_file))
            if gene_df.empty:
                continue
            for sequence, label in zip(gene_df["sequence"], gene_df["label"]):
                bundle_file.write(f"{sequence},{label},{int(marker)}\n")

            marker = not marker

In [21]:
# Gene labelling: create bundle with marker that indicates a set of sequence belong to one long sequence.
import os
import pandas as pd
from tqdm import tqdm

index_dir = os.path.join("index")
workspace_dir = os.path.join("workspace", "genlab", "genlab-3")

# One entry per (split, sample suffix). For each split the order is the 10%
# sample, the 25% sample, and then the full index.
bundle_variants = [
    (split, suffix)
    for split in ("train", "validation", "test")
    for suffix in (".10", ".25", "")
]
srcs = [
    os.path.join(index_dir, f"gene_{split}_index{suffix}.csv")
    for split, suffix in bundle_variants
]
dests = [
    os.path.join(workspace_dir, f"gene_{split}_index_bundle{suffix}.csv")
    for split, suffix in bundle_variants
]
# Generate gene bundles based on above.
import os

gene_dir = os.path.join("data", "gene_dir_c510_k3")
for src_index, target_bundle in zip(srcs, dests):
    generate_bundle_with_marker(src_index, target_bundle, gene_dir)

Processing gene_train_index.10.csv: 100%|██████████| 1539/1539 [02:20<00:00, 10.98it/s]
Processing gene_train_index.25.csv: 100%|██████████| 3847/3847 [03:28<00:00, 18.42it/s]
Processing gene_train_index.csv: 100%|██████████| 15388/15388 [14:23<00:00, 17.83it/s]
Processing gene_validation_index.10.csv: 100%|██████████| 192/192 [00:10<00:00, 18.67it/s]
Processing gene_validation_index.25.csv: 100%|██████████| 481/481 [00:22<00:00, 21.73it/s]
Processing gene_validation_index.csv: 100%|██████████| 1924/1924 [01:46<00:00, 18.13it/s]
Processing gene_test_index.10.csv: 100%|██████████| 192/192 [00:12<00:00, 15.86it/s]
Processing gene_test_index.25.csv: 100%|██████████| 481/481 [00:23<00:00, 20.22it/s]
Processing gene_test_index.csv: 100%|██████████| 1923/1923 [01:38<00:00, 19.52it/s]


In [3]:
# Sanity check of the splice-site helper on a label sequence made only of
# "iii" and "EEE" tokens; the recorded output of this cell is False.
import pandas as pd
import os

from utils.utils import is_exists_splice_site_in_sequence

is_exists_splice_site_in_sequence(["iii", "EEE", "EEE"])


False

In [2]:
# Create gene sequence which contains splice sites at all position.
import pandas as pd
import os

from tqdm import tqdm
from utils.utils import kmer, str_kmer, is_exists_splice_site_in_sequence

def generate_splice_site_all_pos_bundle(source_gene_dir, bundle_dest_dir, chunk_size, kmer_size):
    """
    Slide a window over every gene sequence and keep the windows whose labels
    contain a splice site.

    `source_gene_dir` - contains directories which each corresponds to chromosome.
    Genes in chromosome folder is in raw format, not kmerized.
    `bundle_dest_dir` - folder where the resulting bundle will be written.
    `chunk_size` - window length; the window advances one position at a time.
    `kmer_size` - passed to `kmer` / `str_kmer` when a window is converted.

    The result is written to `splice_site_all_pos.csv` inside `bundle_dest_dir`
    with the columns `sequence,label`; an existing file is replaced. Folders and
    files are visited in sorted order so that repeated runs give the same file.
    """
    chr_dirs = sorted(
        path
        for path in (os.path.join(source_gene_dir, name) for name in os.listdir(source_gene_dir))
        if os.path.isdir(path)
    )

    os.makedirs(bundle_dest_dir, exist_ok=True)
    bundle_path = os.path.join(bundle_dest_dir, "splice_site_all_pos.csv")

    # Mode "w" replaces an existing bundle; the `with` block closes the file
    # even when one of the gene files cannot be processed.
    with open(bundle_path, "w") as bundle_file:
        bundle_file.write("sequence,label\n")

        for d in tqdm(chr_dirs, total=len(chr_dirs), desc="Processing Chromosome"):
            filepaths = sorted(
                path
                for path in (os.path.join(d, name) for name in os.listdir(d))
                if os.path.isfile(path)
            )

            for f in filepaths:
                df = pd.read_csv(f)
                for sequence, label in zip(df["sequence"], df["label"]):
                    # The last full window starts at len(sequence) - chunk_size,
                    # hence the "+ 1". Without it the final window is skipped and
                    # a sequence of exactly chunk_size yields nothing.
                    for start in range(len(sequence) - chunk_size + 1):
                        end = start + chunk_size
                        arr_sublabel = kmer(label[start:end], kmer_size)
                        if is_exists_splice_site_in_sequence(arr_sublabel):
                            subsequence = sequence[start:end]
                            bundle_file.write(f"{str_kmer(subsequence, kmer_size)},{' '.join(arr_sublabel)}\n")


In [3]:
# Build the splice-site bundle for the sample gene directory; the bundle is
# written into that same directory.
import os

chunk_size = 512
kmer_size = 3
gene_dir_path = os.path.join("data", "gene_dir_sample")
bundle_dest_dir = os.path.join("data", "gene_dir_sample")
generate_splice_site_all_pos_bundle(
    source_gene_dir=gene_dir_path,
    bundle_dest_dir=bundle_dest_dir,
    chunk_size=chunk_size,
    kmer_size=kmer_size,
)

Processing Chromosome: 100%|██████████| 2/2 [08:54<00:00, 267.35s/it]
