In [1]:
#tss
import gzip

# Gzipped GTF annotation read by this cell (opened with gzip.open in text mode).
gtf_file = "gencode.v48.basic.annotation.gtf.gz"
# BED file written by this cell: one 1 bp interval per transcript TSS.
output_bed = "gencode_v48_TSSs.bed"

def parse_gtf_attributes(attr_str):
    """Extract gene_id and gene_name from a GTF attributes column.

    The column is a ';'-separated list of ``key "value"`` pairs. Every double
    quote is removed from a pair, which is then split on its first space.

    Fragments that are empty, whitespace-only, or a bare key without a value
    are skipped, so a stray ``;`` or a valueless flag does not raise
    ``ValueError`` for the whole record.

    Args:
        attr_str: Raw text of the attributes column.

    Returns:
        Tuple ``(gene_id, gene_name)``; each element is ``"."`` when the
        corresponding key is absent.
    """
    attrs = {}
    for item in attr_str.split(';'):
        # partition never fails: `sep` is empty when the fragment has no space.
        key, sep, value = item.strip().replace('"', '').partition(' ')
        if key and sep:
            attrs[key] = value
    return attrs.get("gene_id", "."), attrs.get("gene_name", ".")

with gzip.open(gtf_file, 'rt') as infile, open(output_bed, 'w') as outfile:
    for line in infile:
        # Lines beginning with '#' are skipped before any column parsing.
        if line.startswith("#"):
            continue
        cols = line.strip().split('\t')
        # Only rows whose third column is "transcript" contribute a TSS.
        if cols[2] != "transcript":
            continue

        chrom = cols[0]
        start, end = int(cols[3]), int(cols[4])
        strand, attr = cols[6], cols[8]
        gene_id, gene_name = parse_gtf_attributes(attr)

        # The TSS is the transcript start on '+', otherwise the transcript end;
        # subtracting 1 gives the 0-based BED start coordinate.
        if strand == '+':
            tss = start - 1
        else:
            tss = end - 1

        # One BED6 row per transcript: a 1 bp interval at the TSS.
        bed_line = f"{chrom}\t{tss}\t{tss+1}\t{gene_name}\t.\t{strand}\n"
        outfile.write(bed_line)

In [2]:
#promoter annotations
import gzip

# Gzipped GTF annotation read by this cell.
gtf_file = "gencode.v48.basic.annotation.gtf.gz"
# BED file written by this cell: one promoter window per transcript.
output_bed = "gencode_v48_promoters.bed"
# Number of bases added on each side of the TSS when building the window.
promoter_window = 1000  # ±1 kb

def parse_gtf_attributes(attr_str):
    """Return ``(gene_id, gene_name)`` parsed from a GTF attributes column.

    The column is split on ';' into ``key "value"`` pairs; double quotes are
    removed and each pair is split on its first space.

    Empty or whitespace-only fragments and keys without a value are ignored
    rather than raising ``ValueError``.

    Args:
        attr_str: Raw text of the attributes column.

    Returns:
        Tuple ``(gene_id, gene_name)``, with ``"."`` standing in for a
        missing key.
    """
    attrs = {}
    for item in attr_str.split(';'):
        # `sep` is empty when the fragment contains no space (no value).
        key, sep, value = item.strip().replace('"', '').partition(' ')
        if key and sep:
            attrs[key] = value
    return attrs.get("gene_id", "."), attrs.get("gene_name", ".")

# Write one promoter window per transcript as a BED6 row.
with gzip.open(gtf_file, 'rt') as infile, open(output_bed, 'w') as outfile:
    for line in infile:
        if line.startswith("#"):
            continue
        cols = line.strip().split('\t')
        if cols[2] != "transcript":
            continue

        chrom = cols[0]
        start = int(cols[3])
        end = int(cols[4])
        strand = cols[6]
        attr = cols[8]

        gene_id, gene_name = parse_gtf_attributes(attr)

        # GTF coordinates are 1-based inclusive, BED is 0-based half-open.
        # Convert the TSS base to its 0-based position first; using the 1-based
        # value directly shifts the whole window by one base.
        tss = start - 1 if strand == '+' else end - 1

        # The TSS occupies [tss, tss + 1); extend that interval by
        # promoter_window bases on each side, clamping at the chromosome start.
        promoter_start = max(0, tss - promoter_window)
        promoter_end = tss + 1 + promoter_window

        bed_line = f"{chrom}\t{promoter_start}\t{promoter_end}\t{gene_name}\t.\t{strand}\n"
        outfile.write(bed_line)


In [3]:
#gene bodies
import gzip

# Gzipped GTF annotation read by this cell.
gtf_file = "gencode.v48.basic.annotation.gtf.gz"
# BED file written by this cell: one interval per "gene" record.
output_bed = "gencode_v48_gene_bodies.bed"

def parse_gtf_attributes(attr_str):
    """Pull gene_id and gene_name out of a GTF attributes column.

    Each ';'-separated fragment is stripped, has its double quotes removed,
    and is split into key and value at the first space.

    Fragments with no key or no value (empty, whitespace-only, bare flags)
    are skipped so they cannot raise ``ValueError``.

    Args:
        attr_str: Raw text of the attributes column.

    Returns:
        Tuple ``(gene_id, gene_name)``; ``"."`` is used for a missing key.
    """
    attrs = {}
    for item in attr_str.split(';'):
        # partition yields an empty `sep` when there is no space to split on.
        key, sep, value = item.strip().replace('"', '').partition(' ')
        if key and sep:
            attrs[key] = value
    return attrs.get("gene_id", "."), attrs.get("gene_name", ".")

with gzip.open(gtf_file, 'rt') as infile, open(output_bed, 'w') as outfile:
    for line in infile:
        # Lines beginning with '#' are skipped before any column parsing.
        if line.startswith("#"):
            continue
        cols = line.strip().split('\t')
        # Only rows whose third column is "gene" are exported.
        if cols[2] != "gene":
            continue

        chrom = cols[0]
        # Start is shifted down by one for BED; the end is used unchanged.
        start, end = int(cols[3]) - 1, int(cols[4])
        strand, attr = cols[6], cols[8]
        gene_id, gene_name = parse_gtf_attributes(attr)

        # BED6 row: score column is the placeholder '.'.
        bed_line = f"{chrom}\t{start}\t{end}\t{gene_name}\t.\t{strand}\n"
        outfile.write(bed_line)


In [4]:
#exons
import gzip

# Gzipped GTF annotation read by this cell.
gtf_file = "gencode.v48.basic.annotation.gtf.gz"
# BED file written by this cell: one interval per "exon" record.
output_bed = "gencode_v48_exons.bed"

def parse_gtf_attributes(attr_str):
    """Extract ``(gene_id, gene_name)`` from a GTF attributes column.

    Fragments are separated by ';'. After stripping whitespace and removing
    double quotes, each fragment is split into key and value at its first
    space.

    Fragments that are blank or that carry a key with no value are skipped,
    so malformed separators do not raise ``ValueError``.

    Args:
        attr_str: Raw text of the attributes column.

    Returns:
        Tuple ``(gene_id, gene_name)``; a missing key is reported as ``"."``.
    """
    attrs = {}
    for item in attr_str.split(';'):
        # An empty `sep` means the fragment had no space, i.e. no value.
        key, sep, value = item.strip().replace('"', '').partition(' ')
        if key and sep:
            attrs[key] = value
    return attrs.get("gene_id", "."), attrs.get("gene_name", ".")

with gzip.open(gtf_file, 'rt') as infile, open(output_bed, 'w') as outfile:
    for line in infile:
        # Lines beginning with '#' are skipped before any column parsing.
        if line.startswith("#"):
            continue
        cols = line.strip().split('\t')
        # Only rows whose third column is "exon" are exported.
        if cols[2] != "exon":
            continue

        chrom = cols[0]
        # Start is shifted down by one for BED; the end is used unchanged.
        start, end = int(cols[3]) - 1, int(cols[4])
        strand, attr = cols[6], cols[8]
        gene_id, gene_name = parse_gtf_attributes(attr)

        # One BED6 row per exon record, named by its gene.
        bed_line = f"{chrom}\t{start}\t{end}\t{gene_name}\t.\t{strand}\n"
        outfile.write(bed_line)


In [5]:
#introns
from collections import defaultdict

# Gzipped GTF annotation read by this cell.
gtf_file = "gencode.v48.basic.annotation.gtf.gz"
# BED file written by this cell: gaps between consecutive exons of each transcript.
output_bed = "gencode_v48_introns.bed"

def parse_gtf_attributes(attr_str):
    """Extract transcript_id and gene_name from a GTF attributes column.

    Unlike the other cells' helper of the same name, the first element of the
    result is the transcript ID, not the gene ID.

    Each ';'-separated fragment is stripped, has its double quotes removed,
    and is split into key and value at the first space. Blank fragments and
    keys without a value are skipped so they cannot raise ``ValueError``.

    Args:
        attr_str: Raw text of the attributes column.

    Returns:
        Tuple ``(transcript_id, gene_name)``; each element is ``"."`` when
        the corresponding key is absent.
    """
    attrs = {}
    for item in attr_str.split(';'):
        # partition yields an empty `sep` when the fragment has no value.
        key, sep, value = item.strip().replace('"', '').partition(' ')
        if key and sep:
            attrs[key] = value
    return attrs.get("transcript_id", "."), attrs.get("gene_name", ".")

# Group exon intervals by transcript ID, remembering each transcript's
# chromosome, strand and gene name alongside its exon list.
transcripts = defaultdict(lambda: {'chrom': '', 'strand': '', 'exons': [], 'gene_name': ''})

with gzip.open(gtf_file, 'rt') as infile:
    for line in infile:
        # Lines beginning with '#' are skipped before any column parsing.
        if line.startswith("#"):
            continue
        cols = line.strip().split('\t')
        if cols[2] != "exon":
            continue

        chrom = cols[0]
        # Start is shifted down by one for BED; the end is used unchanged.
        start, end = int(cols[3]) - 1, int(cols[4])
        strand, attr = cols[6], cols[8]
        tx_id, gene_name = parse_gtf_attributes(attr)

        # Every exon row overwrites the transcript metadata and appends its
        # own (start, end) pair.
        record = transcripts[tx_id]
        record.update(chrom=chrom, strand=strand, gene_name=gene_name)
        record['exons'].append((start, end))

# Write the gap between each pair of coordinate-adjacent exons as an intron.
with open(output_bed, 'w') as out:
    for tx_id, info in transcripts.items():
        chrom, strand, gene_name = info['chrom'], info['strand'], info['gene_name']
        # Exons are ordered by coordinate, regardless of strand.
        exons = sorted(info['exons'])

        for i, (left_exon, right_exon) in enumerate(zip(exons, exons[1:])):
            intron_start = left_exon[1]
            intron_end = right_exon[0]
            # Touching or overlapping exons leave no gap to report.
            if intron_start >= intron_end:
                continue
            out.write(f"{chrom}\t{intron_start}\t{intron_end}\t{gene_name}\t.\t{strand}\n")


In [6]:
# gene biotypes: every gene is written, with its biotype (e.g. protein_coding, lncRNA) appended to the name

# Gzipped GTF annotation read by this cell.
gtf_file = "gencode.v48.basic.annotation.gtf.gz"
# BED file written by this cell: gene intervals named "gene_name|gene_type".
output_bed = "gencode_v48_gene_biotypes.bed"

def parse_gtf_attributes(attr_str):
    """Parse a GTF attributes column into a dict.

    The column is split on ';'. Each fragment is stripped and split into key
    and value at its first space; leading and trailing double quotes are
    removed from the value. When a key occurs more than once, the last
    occurrence wins.

    Fragments that are blank or that consist of a key without a value are
    skipped, so they do not raise ``ValueError`` during unpacking.

    Args:
        attr_str: Raw text of the attributes column.

    Returns:
        Dict mapping attribute key to its unquoted value.
    """
    attrs = {}
    for item in attr_str.split(';'):
        # partition never fails: `sep` is empty when there is no value.
        key, sep, val = item.strip().partition(' ')
        if key and sep:
            attrs[key] = val.strip('"')
    return attrs

with gzip.open(gtf_file, 'rt') as infile, open(output_bed, 'w') as outfile:
    for line in infile:
        # Lines beginning with '#' are skipped before any column parsing.
        if line.startswith("#"):
            continue
        cols = line.strip().split('\t')
        # Only rows whose third column is "gene" are exported.
        if cols[2] != "gene":
            continue

        chrom = cols[0]
        # Start is shifted down by one for BED; the end is used unchanged.
        start, end = int(cols[3]) - 1, int(cols[4])
        strand = cols[6]
        attr = parse_gtf_attributes(cols[8])

        gene_name = attr.get("gene_name", ".")
        # Prefer "gene_type"; fall back to "gene_biotype", then to '.'.
        fallback_type = attr.get("gene_biotype", ".")
        gene_type = attr.get("gene_type", fallback_type)

        # The name column carries both fields joined by '|'.
        bed_line = f"{chrom}\t{start}\t{end}\t{gene_name}|{gene_type}\t.\t{strand}\n"
        outfile.write(bed_line)


In [7]:
#gene annotations
import gzip

# Gzipped GTF annotation read by this cell.
gtf_file = "gencode.v48.basic.annotation.gtf.gz"
# BED file written by this cell: gene intervals named "gene_name|gene_id".
output_bed = "gencode_v48_gene_symbols_ids.bed"

def parse_gtf_attributes(attr_str):
    """Turn a GTF attributes column into a key -> value dict.

    Fragments are separated by ';'. Each one is stripped and split at its
    first space into key and value, and surrounding double quotes are removed
    from the value. A repeated key keeps its last value.

    Blank fragments and keys that have no value are skipped, so they do not
    raise ``ValueError`` during unpacking.

    Args:
        attr_str: Raw text of the attributes column.

    Returns:
        Dict mapping attribute key to its unquoted value.
    """
    attrs = {}
    for item in attr_str.split(';'):
        # An empty `sep` means the fragment had no space, i.e. no value.
        key, sep, val = item.strip().partition(' ')
        if key and sep:
            attrs[key] = val.strip('"')
    return attrs

with gzip.open(gtf_file, 'rt') as infile, open(output_bed, 'w') as outfile:
    for line in infile:
        # Lines beginning with '#' are skipped before any column parsing.
        if line.startswith("#"):
            continue
        cols = line.strip().split('\t')
        # Only rows whose third column is "gene" are exported.
        if cols[2] != "gene":
            continue

        chrom = cols[0]
        # Start is shifted down by one for BED; the end is used unchanged.
        start, end = int(cols[3]) - 1, int(cols[4])
        strand = cols[6]
        attr = parse_gtf_attributes(cols[8])

        # Either field falls back to '.' when the attribute is missing.
        gene_id, gene_name = attr.get("gene_id", "."), attr.get("gene_name", ".")

        # The name column carries symbol and ID joined by '|'.
        bed_line = f"{chrom}\t{start}\t{end}\t{gene_name}|{gene_id}\t.\t{strand}\n"
        outfile.write(bed_line)


In [8]:
# Count the lines of the gene symbol/ID BED file that contain non-whitespace text.
count = 0
with open("gencode_v48_gene_symbols_ids.bed") as f:
    for line in f:
        if not line.strip():
            # Blank line: not a record.
            continue
        count += 1
print(count)

78686
