In [None]:
import pandas as pd
from gtfparse import read_gtf

# Turn on pandas copy-on-write for the whole notebook, so frames and series
# derived from another frame never write back into their parent.
pd.set_option("mode.copy_on_write", True)

In [None]:

# The hg38 GTF annotation is downloaded from
# https://www.ncbi.nlm.nih.gov/datasets/taxonomy/9606/ and is expected at the
# path below (gzip-compressed).
gtf_path = "./genome_files/hg38.gtf.gz"

# Parse the annotation into a table and print it as a quick sanity check.
gtf_df = read_gtf(gtf_path)
print(gtf_df)

INFO:root:Extracted GTF attributes: ['gene_id', 'transcript_id', 'db_xref', 'description', 'gbkey', 'gene', 'gene_biotype', 'pseudo', 'product', 'transcript_biotype', 'exon_number', 'gene_synonym', 'model_evidence', 'tag', 'protein_id', 'experiment', 'inference', 'note', 'part', 'exception', 'isoform', 'anticodon', 'partial', 'The', 'transl_except', 'non-AUG', 'standard_name', 'deleted', 'source', 'similar', 'substituted', 'transferase', 'codons', '12S', '16S', 'transl_table', 'ATPase']


shape: (4_697_665, 44)
┌──────────────┬────────┬────────────┬───────┬───┬─────┬─────┬──────────────┬────────┐
│ seqname      ┆ source ┆ feature    ┆ start ┆ … ┆ 12S ┆ 16S ┆ transl_table ┆ ATPase │
│ ---          ┆ ---    ┆ ---        ┆ ---   ┆   ┆ --- ┆ --- ┆ ---          ┆ ---    │
│ cat          ┆ str    ┆ cat        ┆ i64   ┆   ┆ str ┆ str ┆ str          ┆ str    │
╞══════════════╪════════╪════════════╪═══════╪═══╪═════╪═════╪══════════════╪════════╡
│ NC_000001.11 ┆        ┆ gene       ┆ 11874 ┆ … ┆     ┆     ┆              ┆        │
│ NC_000001.11 ┆        ┆ transcript ┆ 11874 ┆ … ┆     ┆     ┆              ┆        │
│ NC_000001.11 ┆        ┆ exon       ┆ 11874 ┆ … ┆     ┆     ┆              ┆        │
│ NC_000001.11 ┆        ┆ exon       ┆ 12613 ┆ … ┆     ┆     ┆              ┆        │
│ NC_000001.11 ┆        ┆ exon       ┆ 13221 ┆ … ┆     ┆     ┆              ┆        │
│ …            ┆ …      ┆ …          ┆ …     ┆ … ┆ …   ┆ …   ┆ …            ┆ …      │
│ NC_012920.1  ┆    

In [None]:
# BUILD MAP OF ALL GENES AND THEIR ASSOCIATED TRANSCRIPTS - HG38_TRANSCRIPTS.TSV
# Output: one row per gene_id with its chr, strand and the list of its
# CDS-bearing transcript ids (MANE-tagged transcripts first).

# Columns needed for the transcript/CDS tables. Defined once so the selection
# and the column labels applied afterwards cannot drift apart.
transcript_cols = ['seqname', 'gene_id', 'transcript_id', 'feature', 'start', 'end', 'tag', 'strand', 'exon_number']

# keep only relevant columns for transcripts and re-wrap them as a pandas frame
# NOTE(review): the labels are re-applied after the conversion, presumably
# because wrapping the parsed table in pd.DataFrame does not carry them over.
gtf_df2 = pd.DataFrame(gtf_df[transcript_cols])
gtf_df2.columns = transcript_cols

# number of distinct transcript_id values before any filtering
print(gtf_df2['transcript_id'].unique().shape)

# keep only transcripts that have a cds entry, one row (the first CDS row) per transcript
gtf_tx = (
    gtf_df2[gtf_df2['feature'] == 'CDS']
    .drop(columns=['feature'])
    .drop_duplicates("transcript_id")
)
gtf_tx['chr'] = gtf_tx['seqname']

# Keep only sequences whose accession starts with "NC_". An anchored prefix
# test is used instead of an unanchored regex search for "NC"; na=False drops
# rows with a missing seqname instead of failing on them.
gtf_tx = gtf_tx[gtf_tx['chr'].str.startswith("NC_", na=False)]
print(gtf_tx['transcript_id'].unique().shape)

# Flag MANE-tagged transcripts column-wise (no row-by-row apply). na=False
# treats a missing tag as "not MANE" instead of raising.
is_mane = gtf_tx['tag'].str.contains("MANE", regex=False, na=False)
# -1 for MANE, 0 otherwise, so an ascending sort puts MANE transcripts first
gtf_tx['MANE'] = -is_mane.astype(int)
# mark MANE transcripts in the id itself so the flag survives in the output list
gtf_tx['transcript_id'] = gtf_tx['transcript_id'].mask(is_mane, gtf_tx['transcript_id'] + " (MANE)")

# keep minimal data in csvs
# sort within each gene: MANE first, then by transcript id
gtf_tx = gtf_tx.sort_values(['gene_id', 'MANE', "transcript_id"])
# group by gene id and turn transcript ids into a list (row order is kept inside each group)
gtf_tx_prnt = gtf_tx.groupby("gene_id").agg({"chr": 'max', 'strand':'max', "transcript_id": list}).reset_index()
gtf_tx_prnt.to_csv("./genome_files/hg38_transcripts.tsv", sep="\t", index=False)


(201190,)
(131141,)


In [None]:
# BUILD MAP OF ALL TRANSCRIPTS AND ASSOCIATED CODING EXONS LENGTHS/NUMS - HG38_TRANSCRIPT_CDS_LENGTHS.TSV
# Output: one row per transcript_id with two parallel lists, in table row order:
#   cds_lengths - end - start + 1 for each CDS row (both endpoints counted)
#   cds_exons   - exon_number of each CDS row
# create df with transcript id and start stop coords
gtf_coords = gtf_df2

# Filter for CDS rows; this frame is reused by the CDS-coords cell further down
transcripts_with_codons = gtf_coords[(gtf_coords['feature'] == 'CDS')]

# Compute the per-row values column-wise first. The rows are already CDS-only,
# so no second feature filter is needed.
cds_segments = pd.DataFrame({
    'transcript_id': transcripts_with_codons['transcript_id'],
    'cds_lengths': transcripts_with_codons['end'].astype(int) - transcripts_with_codons['start'].astype(int) + 1,
    'cds_exons': transcripts_with_codons['exon_number'].astype(int),
})

# Collect the per-row values into one list per transcript. Named aggregation is
# used instead of groupby.apply with a dict-returning lambda: that form read the
# grouping column inside the lambda, which pandas has deprecated.
coding_lengths_df = (
    cds_segments
    .groupby('transcript_id')
    .agg(cds_lengths=('cds_lengths', list), cds_exons=('cds_exons', list))
    .reset_index()
)
# list-of-dicts view, kept under its original name for any code that still reads it
coding_lengths = coding_lengths_df.to_dict('records')
coding_lengths_df.to_csv("./genome_files/hg38_transcript_cds_lengths.tsv", sep="\t", index=False)



  coding_lengths = transcripts_with_codons.groupby('transcript_id').apply(lambda x: {


In [None]:
# BUILD MAP OF ALL TRANSCRIPTS AND ASSOCIATED CODING EXONS HG38 COORDS/NUMS - HG38_TRANSCRIPT_CDS_COORDS.TSV
# Output: one row per transcript_id with two parallel lists, in table row order:
#   cds_coords - (start, end) tuple of each CDS row
#   cds_exons  - exon_number of each CDS row
# NOTE: relies on transcripts_with_codons (CDS-only rows) built in the
# CDS-lengths cell; run that cell first.

# Build the per-row values column-wise. The rows are already CDS-only, so no
# second feature filter is needed.
cds_segment_coords = transcripts_with_codons[['transcript_id']].assign(
    cds_coords=list(zip(
        transcripts_with_codons['start'].astype(int),
        transcripts_with_codons['end'].astype(int),
    )),
    cds_exons=transcripts_with_codons['exon_number'].astype(int),
)

# Collect the per-row values into one list per transcript. Named aggregation is
# used instead of groupby.apply with a dict-returning lambda: that form read the
# grouping column inside the lambda, which pandas has deprecated.
coding_coords_df = (
    cds_segment_coords
    .groupby('transcript_id')
    .agg(cds_coords=('cds_coords', list), cds_exons=('cds_exons', list))
    .reset_index()
)
# list-of-dicts view, kept under its original name for any code that still reads it
coding_coords = coding_coords_df.to_dict('records')
coding_coords_df.to_csv("./genome_files/hg38_transcript_cds_coords.tsv", sep="\t", index=False)


  coding_coords = transcripts_with_codons.groupby('transcript_id').apply(lambda x: {
