In [19]:
import os
import pandas as pd

from data_dir import genome_grch37, grch37_sample_dir, genome_grch37_dir
from data_dir import genome_grch38, grch38_sample_dir, genome_grch38_dir

def _gff_parseline(line, regions):
    if line[0] == '#':
        return False
    else:
        words = line.split('\t')
        sequence_id = words[0]
        refseq = words[1]
        region = words[2]
        start = int(words[3]) # one-based numbering.
        start_index = start-1 # zero-based numbering.
        end = int(words[4]) 
        end_index = end-1
        if regions is None:
            return {'sequence_id': sequence_id, 'refseq': refseq, 'region': region, 'start': start, 'start_index': start_index, 'end': end, 'end_index': end_index}
        elif region in regions:
            return {'sequence_id': sequence_id, 'refseq': refseq, 'region': region, 'start': start, 'start_index': start_index, 'end': end, 'end_index': end_index}
        else:
            return False

def gff_to_csv(file, csv_output, regions):
    """Convert the feature lines of a GFF file into a single CSV file.

    Each line of ``file`` is passed to ``_gff_parseline``; every line that
    yields a record becomes one CSV row with the columns
    ``sequence_id, refseq, region, start_index, end_index, start, end``.

    Parameters
    ----------
    file : str
        Path of the GFF file to read.  If it does not exist the function
        returns without doing anything (and without touching
        ``csv_output``).
    csv_output : str
        Path of the CSV file to write.  An existing file is overwritten.
    regions : collection of str or None
        Forwarded to ``_gff_parseline``; ``None`` keeps every feature.

    Returns
    -------
    None

    Notes
    -----
    * Both files are managed by ``with`` so they are closed even when
      parsing or writing fails, and such errors now propagate instead of
      being swallowed (which used to leave a silently truncated CSV).
    * Header and rows are written without the trailing space that used to
      precede the newline, so the last column is named ``end`` rather than
      ``'end '``.
    """
    if not os.path.exists(file):
        # A missing input is deliberately a silent no-op.
        return

    colnames = ['sequence_id', 'refseq', 'region', 'start_index', 'end_index', 'start', 'end']

    # Mode 'w' truncates an existing output, replacing the earlier
    # remove-then-create ('x') sequence.
    with open(file, 'r') as gff, open(csv_output, 'w') as out:
        out.write(",".join(colnames) + "\n")
        for line in gff:
            d = _gff_parseline(line, regions)
            if d:
                out.write(",".join(str(d[name]) for name in colnames) + "\n")

# Show the configured input files and sample directories, one per line.
print(
    genome_grch37,
    genome_grch38,
    grch37_sample_dir,
    grch38_sample_dir,
    sep="\n",
)


./data/genome/grch37/GRCh37_latest_genomic.gff
./data/genome/grch38/GRCh38_latest_genomic.gff
./sample/grch37
./sample/grch38


In [10]:
# Sanity check of the region filter: the third column of this line is
# 'region', which is not in ['exon'], so the parser returns False.
s = "NC_000001.11	RefSeq	region	1	248956422	.	+	.	ID=NC_000001.11:1..248956422;Dbxref=taxon:9606;Name=1;chromosome=1;gbkey=Src;genome=chromosome;mol_type=genomic DNA"
d = _gff_parseline(s, ['exon'])
d

False

In [11]:
# GRCh37: export every feature line (no region filter).
gff_to_csv(
    file=genome_grch37,
    csv_output=grch37_sample_dir + "/grch37_all.csv",
    regions=None,
)

In [12]:
# GRCh38: export every feature line (no region filter).
gff_to_csv(
    file=genome_grch38,
    csv_output=grch38_sample_dir + "/grch38_all.csv",
    regions=None,
)

In [13]:
# GRCh37: export only the lines whose third column is 'exon'.
gff_to_csv(
    file=genome_grch37,
    csv_output=grch37_sample_dir + "/grch37_exon_only.csv",
    regions=['exon'],
)

In [14]:
# GRCh38: export only the lines whose third column is 'exon'.
gff_to_csv(
    file=genome_grch38,
    csv_output=grch38_sample_dir + "/grch38_exon_only.csv",
    regions=['exon'],
)

In [3]:
# Column order used for the CSV exports, and the header line built from it.
colnames = [
    'sequence_id',
    'refseq',
    'region',
    'start_index',
    'end_index',
    'start',
    'end',
]
header = str.join(",", colnames)
header

'sequence_id,refseq,region,start_index,end_index,start,end'

In [16]:
def gff_to_csvs(gff_file, target_folder, regions):
    """Split a GFF file into one CSV per sequence id.

    Each line of ``gff_file`` is passed to ``_gff_parseline``; every record
    is written to ``<target_folder>/<sequence_id>.csv`` with the columns
    ``sequence_id, refseq, region, start_index, end_index, start, end``.

    Parameters
    ----------
    gff_file : str
        Path of the GFF file to read.
    target_folder : str
        Directory receiving the per-sequence CSV files.  It is created if
        it does not exist.
    regions : collection of str or None
        Forwarded to ``_gff_parseline``; ``None`` keeps every feature.

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError
        If ``gff_file`` does not exist.

    Notes
    -----
    * Every record is written, including the first record of each new
      sequence id, which used to be dropped when the sequence changed.
    * A CSV left over from an earlier run is overwritten the first time its
      sequence id is seen.  If the same id shows up again later in the same
      run, its rows are appended to the file started in this run.
    * A run that matches no record finishes cleanly and writes no file.
    * Rows are written without a trailing space before the newline, which
      matches the header line.
    """
    colnames = ['sequence_id', 'refseq', 'region', 'start_index', 'end_index', 'start', 'end']
    header = ",".join(colnames)

    started = set()   # sequence ids whose CSV has been (re)created in this run
    cur_seq = None    # sequence id the open output file belongs to
    out = None        # currently open output file, if any

    try:
        with open(gff_file) as gff:
            if target_folder:
                os.makedirs(target_folder, exist_ok=True)

            for line in gff:
                d = _gff_parseline(line, regions)
                if not d:
                    continue

                seq = d['sequence_id']
                if seq != cur_seq:
                    # Sequence changed: switch output files *before*
                    # writing so the current record is not lost.
                    if out is not None:
                        out.close()
                    output_file = target_folder + '/' + seq + '.csv'
                    if seq in started:
                        out = open(output_file, 'a')
                    else:
                        out = open(output_file, 'w')
                        out.write(header + "\n")
                        started.add(seq)
                    cur_seq = seq

                out.write(",".join(str(d[name]) for name in colnames) + "\n")
    finally:
        # Close the last output even when parsing or writing failed.
        if out is not None:
            out.close()
            



In [20]:
# GRCh38: write one exon CSV per sequence id.
gff_to_csvs(
    gff_file=genome_grch38,
    target_folder=genome_grch38_dir + '/exon',
    regions=['exon'],
)

In [22]:
# GRCh37: write one exon CSV per sequence id.
gff_to_csvs(
    gff_file=genome_grch37,
    target_folder=genome_grch37_dir + '/exon',
    regions=['exon'],
)