In [14]:
"""
Filter index based on gene name.
"""
import os
import pandas as pd
_chr_indices = [
    'NC_000001.11.csv',
    'NC_000002.12.csv',
    'NC_000003.12.csv',
    'NC_000004.12.csv',
    'NC_000005.10.csv',
    'NC_000006.12.csv',
    'NC_000007.14.csv',
    'NC_000008.11.csv',
    'NC_000009.12.csv',
    'NC_000010.11.csv',
    'NC_000011.10.csv',
    'NC_000012.12.csv',
    'NC_000013.11.csv',
    'NC_000014.9.csv',
    'NC_000015.10.csv',
    'NC_000016.10.csv',
    'NC_000017.11.csv',
    'NC_000018.10.csv',
    'NC_000019.10.csv',
    'NC_000020.11.csv',
    'NC_000021.9.csv',
    'NC_000022.11.csv',
    'NC_000023.11.csv',
    'NC_000024.10.csv']
_chr_dir = ["chr{}".format(i+1) for i in range(len(_chr_indices))]

# Split every chromosome index CSV into one CSV per gene, written to
# data/genome/grch38/genes/<chrN>/<gene>.csv.
# The loop variables are named index_csv / chr_name so the builtin `chr` is
# not shadowed for the rest of the kernel session.
for index_csv, chr_name in zip(_chr_indices, _chr_dir):
    index_path = os.path.join('data', 'genome', 'grch38', 'csvs', index_csv)
    df = pd.read_csv(index_path)

    # Create the chromosome output directory once per chromosome. makedirs
    # also creates a missing `genes` parent (os.mkdir would fail there), and
    # exist_ok=True makes re-running the cell safe.
    gene_dir = os.path.join('data', 'genome', 'grch38', 'genes', chr_name)
    os.makedirs(gene_dir, exist_ok=True)

    # A single groupby pass replaces re-filtering the whole frame per gene.
    # Rows with a null gene are dropped by groupby's default, matching the
    # explicit null filter of the previous implementation; sort=False keeps
    # genes in order of first appearance.
    for gene, gene_df in df.groupby('gene', sort=False):
        path = os.path.join(gene_dir, "{}.csv".format(gene))
        gene_df.to_csv(path, index=False)
        # "\r" rewrites the same console line instead of printing one line per gene.
        print("Success: {}".format(path), end="\r")

Success: data\genome\grch38\genes\chr1\KAZN-AS1.csv.csv

In [2]:
from data_preparation import gff_to_csvs, gff_to_csv
from data_dir import (annotated_grch38_gff, annotated_grch38_gff_dir, annotated_grch38_gff_csv)

# Show the three GRCh38 annotation paths imported from data_dir, one per line.
print(annotated_grch38_gff, annotated_grch38_gff_csv, annotated_grch38_gff_dir, sep="\n")

./data/genome/grch38/GRCh38_latest_genomic.gff
./data/genome/grch38/grch38_gff.csv
./data/genome/grch38/csvs


In [2]:
from data_dir import chr21_fasta, chr21_index_csv, data_genome_grch38_labels_dir
from data_preparation import generate_sequence_labelling
import os

# Destination CSV for the chromosome 21 labelling output.
target_path = os.path.join(data_genome_grch38_labels_dir, 'chr21.csv')
# Run the labelling first, then report source, target and the call's return value.
labelling_result = generate_sequence_labelling(chr21_index_csv, chr21_fasta, target_path)
print("Generate sequential labelling {} => {}: {}".format(
    chr21_index_csv,
    target_path,
    labelling_result,
))


Generate sequential labelling data/genome/grch38/csvs/NC_000021.9.csv => data/genome/grch38/labels\chr21.csv: True data/genome/grch38/labels\chr21.csv


In [4]:
from data_dir import (
    chr1_index_csv, chr2_index_csv, chr3_index_csv, chr4_index_csv, chr5_index_csv, chr6_index_csv, chr7_index_csv, chr8_index_csv, chr9_index_csv, chr10_index_csv,
    chr11_index_csv, chr12_index_csv, chr13_index_csv, chr14_index_csv, chr15_index_csv, chr16_index_csv, chr17_index_csv, chr18_index_csv, chr19_index_csv, chr20_index_csv,
    chr21_index_csv, chr22_index_csv, chr23_index_csv, chr24_index_csv
)
from data_dir import (
    chr1_fasta, chr2_fasta, chr3_fasta, chr4_fasta, chr5_fasta, chr6_fasta, chr7_fasta, chr8_fasta, chr9_fasta, chr10_fasta, 
	chr11_fasta, chr12_fasta, chr13_fasta, chr14_fasta, chr15_fasta, chr16_fasta, chr17_fasta, chr18_fasta, chr19_fasta, chr20_fasta, 
	chr21_fasta, chr22_fasta, chr23_fasta, chr24_fasta,
)
# FASTA inputs for this run. Only chromosome 1 is listed; chr2_fasta ..
# chr24_fasta are imported above but currently left out of the list.
# This list is zipped position-by-position with chr_indices further down.
chr_fastas = [chr1_fasta]
from data_dir import labseq_dir, labseq_names
from data_preparation import generate_sequence_labelling
# Index CSV inputs for this run. Only chromosome 1 is listed; chr2_index_csv ..
# chr24_index_csv are imported above but currently left out of the list.
# This list is zipped position-by-position with chr_fastas further down.
chr_indices = [chr1_index_csv]
# One output path per selected chromosome: the first entry of labseq_names
# joined onto labseq_dir (the slice keeps the result a list).
chr_labseq_path = [
    os.path.join(labseq_dir, labseq_name)
    for labseq_name in labseq_names[:1]
]
for src, fasta, target in zip(chr_indices, chr_fastas, chr_labseq_path):
    # Dry run: only show the (index csv, fasta, target) triple.
    print(src, fasta, target)
    # The actual generation call is kept disabled:
    #print("Generating sequential labelling for index {}, from fasta {}, to {}: {}".format(src, fasta, target, generate_sequence_labelling(src, fasta, target, do_expand=True, expand_size=512)))

data/genome/grch38/csvs/NC_000001.11.csv data/chr/NC_000001.11.fasta data/genome/labseq\chr1.csv


In [2]:
from sequential_labelling import Label_Dictionary
import pandas as pd
from tqdm import tqdm
import os

def prepare_sequence_from_csv(src_csv, label_dictionary=Label_Dictionary):
    """
    Load tokenized DNA sequences and their label sequences from a CSV file.

    CSV source has columns `sequence` and `label`.
    `sequence` contains tokenized DNA sequence and `label` contains sequence of labels.
    Both columns are returned exactly as read; `label_dictionary` is only
    validated here and is not applied to the labels.

    @param      src_csv (string): path to CSV source.
    @param      label_dictionary (dict): dictionary to convert label into number.
                Must not be None.
    @return     sequence, labels: two lists holding the values of the
                `sequence` column and of the `label` column.
    @raise      FileNotFoundError: if `src_csv` does not exist.
    @raise      Exception: if `label_dictionary` is None.
    """
    if not os.path.exists(src_csv):
        raise FileNotFoundError(src_csv)
    # Identity check: `== None` would invoke the argument's own __eq__, which
    # can return a non-boolean (or raise) for objects that overload equality.
    if label_dictionary is None:
        raise Exception("Argument `label_dictionary` cannot be empty!")

    df = pd.read_csv(src_csv)
    return df['sequence'].tolist(), df['label'].tolist()
    

['a', 'a', 'a', 'a', 'a', 'b', 'a', 'a', 'a', 'a']