# Testing NCBI hg38.p14 GTF file

*Anusha Aggarwal*

In [42]:
import numpy as np
import pandas as pd
import re

In [45]:
gtf_path = '/mnt/home/aaggarwal/ceph/gates_proj/ncbi_genome_hg38.p14/GCF_000001405.40_GRCh38.p14_genomic.final_CDS.gtf'

col_names = [
    "seqname", "source", "feature", "start", "end",
    "score", "strand", "frame", "attributes"
]

gtf = pd.read_csv(
    gtf_path,
    sep='\t',
    comment='#',
    names=col_names,
    dtype=str
)

gtf['start'] = gtf['start'].astype(int)
gtf['end'] = gtf['end'].astype(int)

gtf['gene_id'] = gtf['attributes'].str.extract(r'gene_id "([^"]+)"')
gtf['transcript_id'] = gtf['attributes'].str.extract(r'transcript_id "([^"]+)"')
gtf['entrez_id'] = gtf['attributes'].str.extract(r'GeneID:(\d+)')

gtf

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attributes,gene_id,transcript_id,entrez_id
0,chr1,BestRefSeq,CDS,65565,65573,.,+,0,"gene_id ""OR4F5""; transcript_id ""NM_001005484.2...",OR4F5,NM_001005484.2,79501
1,chr1,BestRefSeq,CDS,69037,70005,.,+,0,"gene_id ""OR4F5""; transcript_id ""NM_001005484.2...",OR4F5,NM_001005484.2,79501
2,chr1,BestRefSeq,start_codon,65565,65567,.,+,0,"gene_id ""OR4F5""; transcript_id ""NM_001005484.2...",OR4F5,NM_001005484.2,79501
3,chr1,BestRefSeq,stop_codon,70006,70008,.,+,0,"gene_id ""OR4F5""; transcript_id ""NM_001005484.2...",OR4F5,NM_001005484.2,79501
4,chr1,BestRefSeq,CDS,450743,451678,.,-,0,"gene_id ""OR4F29""; transcript_id ""NM_001005221....",OR4F29,NM_001005221.2,729759
...,...,...,...,...,...,...,...,...,...,...,...,...
230230,chrY,BestRefSeq,stop_codon,25038098,25038100,.,-,0,"gene_id ""BPY2C""; transcript_id ""NM_001002761.1...",BPY2C,NM_001002761.1,442868
230231,chrY,BestRefSeq,CDS,25622443,25624034,.,+,0,"gene_id ""CDY1""; transcript_id ""NM_004680.3""; d...",CDY1,NM_004680.3,9085
230232,chrY,BestRefSeq,CDS,25624455,25624524,.,+,1,"gene_id ""CDY1""; transcript_id ""NM_004680.3""; d...",CDY1,NM_004680.3,9085
230233,chrY,BestRefSeq,start_codon,25622443,25622445,.,+,0,"gene_id ""CDY1""; transcript_id ""NM_004680.3""; d...",CDY1,NM_004680.3,9085


In [46]:
gtf["length"] = gtf["end"] - gtf["start"] + 1

gtf["is_multiple_of_3"] = gtf["length"] % 3 == 0

In [47]:
gtf

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attributes,gene_id,transcript_id,entrez_id,length,is_multiple_of_3
0,chr1,BestRefSeq,CDS,65565,65573,.,+,0,"gene_id ""OR4F5""; transcript_id ""NM_001005484.2...",OR4F5,NM_001005484.2,79501,9,True
1,chr1,BestRefSeq,CDS,69037,70005,.,+,0,"gene_id ""OR4F5""; transcript_id ""NM_001005484.2...",OR4F5,NM_001005484.2,79501,969,True
2,chr1,BestRefSeq,start_codon,65565,65567,.,+,0,"gene_id ""OR4F5""; transcript_id ""NM_001005484.2...",OR4F5,NM_001005484.2,79501,3,True
3,chr1,BestRefSeq,stop_codon,70006,70008,.,+,0,"gene_id ""OR4F5""; transcript_id ""NM_001005484.2...",OR4F5,NM_001005484.2,79501,3,True
4,chr1,BestRefSeq,CDS,450743,451678,.,-,0,"gene_id ""OR4F29""; transcript_id ""NM_001005221....",OR4F29,NM_001005221.2,729759,936,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230230,chrY,BestRefSeq,stop_codon,25038098,25038100,.,-,0,"gene_id ""BPY2C""; transcript_id ""NM_001002761.1...",BPY2C,NM_001002761.1,442868,3,True
230231,chrY,BestRefSeq,CDS,25622443,25624034,.,+,0,"gene_id ""CDY1""; transcript_id ""NM_004680.3""; d...",CDY1,NM_004680.3,9085,1592,False
230232,chrY,BestRefSeq,CDS,25624455,25624524,.,+,1,"gene_id ""CDY1""; transcript_id ""NM_004680.3""; d...",CDY1,NM_004680.3,9085,70,False
230233,chrY,BestRefSeq,start_codon,25622443,25622445,.,+,0,"gene_id ""CDY1""; transcript_id ""NM_004680.3""; d...",CDY1,NM_004680.3,9085,3,True


In [48]:
print(gtf['is_multiple_of_3'].value_counts())

is_multiple_of_3
True     118609
False    111626
Name: count, dtype: int64


In [49]:
breakdown = gtf.groupby(['frame', 'is_multiple_of_3']).size().unstack(fill_value=0)
print(breakdown)

is_multiple_of_3  False  True 
frame                         
0                 45337  91248
1                 29780   8025
2                 36509  19336


In [50]:
# Ensure correct types
gtf["start"] = gtf["start"].astype(int)
gtf["end"] = gtf["end"].astype(int)
gtf["frame"] = gtf["frame"].fillna("0").astype(str)

# Adjust length based on frame
def adjust_length(row):
    frame = int(row["frame"]) if row["frame"].isdigit() else 0
    length = row["end"] - row["start"] + 1
    return length - frame

gtf["adjusted_length"] = gtf.apply(adjust_length, axis=1)
gtf["is_multiple_of_3"] = gtf["adjusted_length"] % 3 == 0

# Generate new stats
adjusted_stats = gtf.groupby(["frame", "is_multiple_of_3"]).size().unstack(fill_value=0)
adjusted_stats


is_multiple_of_3,False,True
frame,Unnamed: 1_level_1,Unnamed: 2_level_1
0,45337,91248
1,18175,19630
2,30139,25706


In [51]:
# Subset GTF
cds_df = gtf[gtf['feature'] == 'CDS'].copy()
start_df = gtf[gtf['feature'] == 'start_codon'].copy()
stop_df = gtf[gtf['feature'] == 'stop_codon'].copy()

# Convert to numeric
for col in ['start', 'end']:
    cds_df[col] = pd.to_numeric(cds_df[col])
    start_df[col] = pd.to_numeric(start_df[col])
    stop_df[col] = pd.to_numeric(stop_df[col])

# Group CDS per transcript
cds_grp = cds_df.groupby('entrez_id').agg({
    'start': ['min', 'max'],
    'end': ['min', 'max'],
    'strand': 'first'
}).reset_index()

cds_grp.columns = ['entrez_id', 'cds_start_min', 'cds_start_max', 'cds_end_min', 'cds_end_max', 'strand']

# Get start_codon and stop_codon positions
start_codon_pos = start_df.groupby('entrez_id')['start'].first().rename('start_codon_start')
stop_codon_pos = stop_df.groupby('entrez_id')['end'].first().rename('stop_codon_end')

# Merge all
merged = cds_grp.merge(start_codon_pos, on='entrez_id', how='left')
merged = merged.merge(stop_codon_pos, on='entrez_id', how='left')

# Define check conditions
def check_start(row):
    if row['strand'] == '+':
        return row['start_codon_start'] == row['cds_start_min']
    else:
        return row['start_codon_start'] == row['cds_end_max']

def check_stop(row):
    if row['strand'] == '+':
        return row['stop_codon_end'] == row['cds_end_max']
    else:
        return row['stop_codon_end'] == row['cds_start_min']

merged['start_matches'] = merged.apply(check_start, axis=1)
merged['stop_matches'] = merged.apply(check_stop, axis=1)

# Summary stats
start_match_pct = merged['start_matches'].mean() * 100
stop_match_pct = merged['stop_matches'].mean() * 100

print(f"Start codon matches CDS start in {start_match_pct:.2f}% of cases")
print(f"Stop codon matches CDS end in {stop_match_pct:.2f}% of cases")


Start codon matches CDS start in 50.69% of cases
Stop codon matches CDS end in 0.00% of cases


In [56]:
# Filter for start and stop codons
codon_df = gtf[gtf['feature'].isin(['start_codon', 'stop_codon'])]

# Check if all start/stop codons have frame == 0
codon_frame_counts = codon_df['frame'].value_counts(dropna=False)

codon_frame_counts


frame
0    38438
1       42
2       11
Name: count, dtype: int64

In [57]:
import pandas as pd

# Filter for start_codon and stop_codon features
start_stop_codons = gtf[gtf["feature"].isin(["start_codon", "stop_codon"])].copy()

# Convert start and end to integers
start_stop_codons["start"] = start_stop_codons["start"].astype(int)
start_stop_codons["end"] = start_stop_codons["end"].astype(int)

# Calculate lengths
start_stop_codons["length"] = start_stop_codons["end"] - start_stop_codons["start"] + 1

# Group by feature and summarize length distribution
length_summary = start_stop_codons.groupby("feature")["length"].value_counts().unstack().fillna(0).astype(int)


In [58]:
length_summary

length,1,2,3
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
start_codon,33,33,19186
stop_codon,20,20,19199


In [54]:
# Ensure all numeric columns are proper integers
gtf[["start", "end"]] = gtf[["start", "end"]].astype(int)

# Split into feature types
cds_df = gtf[gtf["feature"] == "CDS"]
stop_df = gtf[gtf["feature"] == "stop_codon"]

# Get last and first CDS entries per transcript
last_cds = cds_df.sort_values(by=["transcript_id", "end"]).groupby("transcript_id").last().reset_index()
first_cds = cds_df.sort_values(by=["transcript_id", "start"]).groupby("transcript_id").first().reset_index()

# Keep necessary columns only
last_cds = last_cds[["transcript_id", "end"]].rename(columns={"end": "cds_end"})
first_cds = first_cds[["transcript_id", "start"]].rename(columns={"start": "cds_start"})

# Prepare stop codons with transcript and strand
stop_check = stop_df[["transcript_id", "start", "end", "strand"]].copy()

# Merge
stop_check = stop_check.merge(last_cds, on="transcript_id", how="left")
stop_check = stop_check.merge(first_cds, on="transcript_id", how="left")

# Check for match, considering strand
stop_check["stop_matches"] = (
    ((stop_check["strand"] == "+") & (stop_check["start"] == stop_check["cds_end"] + 1)) |
    ((stop_check["strand"] == "-") & (stop_check["end"] == stop_check["cds_start"] - 1))
)

# Result
print(f"Stop codon follows CDS end correctly in {stop_check['stop_matches'].mean():.2%} of transcripts")


Stop codon follows CDS end correctly in 99.82% of transcripts


In [39]:
mane_select_count = gtf['attributes'].str.contains('MANE Select', na=False).sum()
print("Number of entries with MANE Select:", mane_select_count)

Number of entries with MANE Select: 230235


In [23]:
transcript_counts = gtf.groupby("gene_id")["transcript_id"].nunique()

multiple_transcripts = transcript_counts[transcript_counts > 1]

In [24]:
multiple_transcripts

Series([], Name: transcript_id, dtype: int64)

In [26]:
unique_gene_ids_count = gtf["gene_id"].nunique()
unique_gene_ids_count

19219

In [27]:
unique_gene_ids_count = gtf["entrez_id"].nunique()
unique_gene_ids_count

19219

In [28]:
unique_gene_ids_count = gtf["transcript_id"].nunique()
unique_gene_ids_count

19219

In [30]:
mane_df = gtf[gtf['attributes'].str.contains('tag "MANE Select"')]

gene_mane_counts = mane_df.groupby('gene_id')['transcript_id'].nunique()

multiple_mane = gene_mane_counts[gene_mane_counts > 1]
print("Genes with multiple MANE Select transcripts:")
print(multiple_mane)

all_gene_ids = gtf['gene_id'].unique()
mane_gene_ids = mane_df['gene_id'].unique()
missing_mane = set(all_gene_ids) - set(mane_gene_ids)
print("Genes with no MANE Select transcript:")
print(missing_mane)

Genes with multiple MANE Select transcripts:
Series([], Name: transcript_id, dtype: int64)
Genes with no MANE Select transcript:
set()


In [40]:
gtf_path = '/mnt/home/aaggarwal/ceph/gates_proj/ncbi_genome_hg38.p14/GCF_000001405.40_GRCh38.p14_genomic.final_transcript.gtf'

col_names = [
    "seqname", "source", "feature", "start", "end",
    "score", "strand", "frame", "attributes"
]

gtf = pd.read_csv(
    gtf_path,
    sep='\t',
    comment='#',
    names=col_names,
    dtype=str
)

gtf['gene_id'] = gtf['attributes'].str.extract(r'gene_id "([^"]+)"')
gtf['transcript_id'] = gtf['attributes'].str.extract(r'transcript_id "([^"]+)"')
gtf['entrez_id'] = gtf['attributes'].str.extract(r'GeneID:(\d+)')

gtf

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attributes,gene_id,transcript_id,entrez_id
0,chr1,BestRefSeq,transcript,65419,71585,.,+,.,"gene_id ""OR4F5""; transcript_id ""NM_001005484.2...",OR4F5,NM_001005484.2,79501
1,chr1,BestRefSeq,transcript,450740,451678,.,-,.,"gene_id ""OR4F29""; transcript_id ""NM_001005221....",OR4F29,NM_001005221.2,729759
2,chr1,BestRefSeq,transcript,685716,686654,.,-,.,"gene_id ""OR4F16""; transcript_id ""NM_001005277....",OR4F16,NM_001005277.1,81399
3,chr1,BestRefSeq,transcript,923923,944574,.,+,.,"gene_id ""SAMD11""; transcript_id ""NM_001385641....",SAMD11,NM_001385641.1,148398
4,chr1,BestRefSeq,transcript,944203,959256,.,-,.,"gene_id ""NOC2L""; transcript_id ""NM_015658.4""; ...",NOC2L,NM_015658.4,26155
...,...,...,...,...,...,...,...,...,...,...,...,...
19214,chrY,BestRefSeq,transcript,24618004,24639207,.,+,.,"gene_id ""BPY2B""; transcript_id ""NM_001002760.1...",BPY2B,NM_001002760.1,442867
19215,chrY,BestRefSeq,transcript,24763069,24813393,.,-,.,"gene_id ""DAZ3""; transcript_id ""NM_020364.4""; d...",DAZ3,NM_020364.4,57054
19216,chrY,BestRefSeq,transcript,24833919,24907040,.,+,.,"gene_id ""DAZ4""; transcript_id ""NM_001388484.1""...",DAZ4,NM_001388484.1,57135
19217,chrY,BestRefSeq,transcript,25030901,25052104,.,-,.,"gene_id ""BPY2C""; transcript_id ""NM_001002761.1...",BPY2C,NM_001002761.1,442868


In [32]:
transcript_counts = gtf.groupby("gene_id")["transcript_id"].nunique()

multiple_transcripts = transcript_counts[transcript_counts > 1]

In [33]:
multiple_transcripts

Series([], Name: transcript_id, dtype: int64)

In [34]:
unique_gene_ids_count = gtf["gene_id"].nunique()
unique_gene_ids_count

19219

In [35]:
unique_gene_ids_count = gtf["entrez_id"].nunique()
unique_gene_ids_count

19219

In [36]:
unique_gene_ids_count = gtf["transcript_id"].nunique()
unique_gene_ids_count

19219

In [41]:
mane_select_count = gtf['attributes'].str.contains('MANE Select', na=False).sum()
print("Number of entries with MANE Select:", mane_select_count)

Number of entries with MANE Select: 19219


## Check the csv GTF files

In [59]:
gtf_cds = pd.read_csv('/mnt/home/aaggarwal/ceph/gates_proj/ncbi_genome_hg38.p14/hg38.p14.ncbiRefSeq.CDS_final_gtf.csv')
gtf_trans = pd.read_csv('/mnt/home/aaggarwal/ceph/gates_proj/ncbi_genome_hg38.p14/hg38.p14.ncbiRefSeq.transcript_final_gtf.csv')

In [60]:
gtf_cds

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attributes,gene_id,transcript_id,entrez_id,length
0,chr1,BestRefSeq,CDS,65565,65573,.,+,0,"gene_id ""OR4F5""; transcript_id ""NM_001005484.2...",OR4F5,NM_001005484.2,79501,9
1,chr1,BestRefSeq,CDS,69037,70005,.,+,0,"gene_id ""OR4F5""; transcript_id ""NM_001005484.2...",OR4F5,NM_001005484.2,79501,969
2,chr1,BestRefSeq,start_codon,65565,65567,.,+,0,"gene_id ""OR4F5""; transcript_id ""NM_001005484.2...",OR4F5,NM_001005484.2,79501,3
3,chr1,BestRefSeq,stop_codon,70006,70008,.,+,0,"gene_id ""OR4F5""; transcript_id ""NM_001005484.2...",OR4F5,NM_001005484.2,79501,3
4,chr1,BestRefSeq,CDS,450743,451678,.,-,0,"gene_id ""OR4F29""; transcript_id ""NM_001005221....",OR4F29,NM_001005221.2,729759,936
...,...,...,...,...,...,...,...,...,...,...,...,...,...
229586,chrY,BestRefSeq,stop_codon,25038098,25038100,.,-,0,"gene_id ""BPY2C""; transcript_id ""NM_001002761.1...",BPY2C,NM_001002761.1,442868,3
229587,chrY,BestRefSeq,CDS,25622443,25624034,.,+,0,"gene_id ""CDY1""; transcript_id ""NM_004680.3""; d...",CDY1,NM_004680.3,9085,1592
229588,chrY,BestRefSeq,CDS,25624455,25624524,.,+,1,"gene_id ""CDY1""; transcript_id ""NM_004680.3""; d...",CDY1,NM_004680.3,9085,70
229589,chrY,BestRefSeq,start_codon,25622443,25622445,.,+,0,"gene_id ""CDY1""; transcript_id ""NM_004680.3""; d...",CDY1,NM_004680.3,9085,3


In [61]:
gtf_trans

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attributes,gene_id,transcript_id,entrez_id
0,chr1,BestRefSeq,transcript,65419,71585,.,+,.,"gene_id ""OR4F5""; transcript_id ""NM_001005484.2...",OR4F5,NM_001005484.2,79501
1,chr1,BestRefSeq,transcript,450740,451678,.,-,.,"gene_id ""OR4F29""; transcript_id ""NM_001005221....",OR4F29,NM_001005221.2,729759
2,chr1,BestRefSeq,transcript,685716,686654,.,-,.,"gene_id ""OR4F16""; transcript_id ""NM_001005277....",OR4F16,NM_001005277.1,81399
3,chr1,BestRefSeq,transcript,923923,944574,.,+,.,"gene_id ""SAMD11""; transcript_id ""NM_001385641....",SAMD11,NM_001385641.1,148398
4,chr1,BestRefSeq,transcript,944203,959256,.,-,.,"gene_id ""NOC2L""; transcript_id ""NM_015658.4""; ...",NOC2L,NM_015658.4,26155
...,...,...,...,...,...,...,...,...,...,...,...,...
19161,chrY,BestRefSeq,transcript,24618004,24639207,.,+,.,"gene_id ""BPY2B""; transcript_id ""NM_001002760.1...",BPY2B,NM_001002760.1,442867
19162,chrY,BestRefSeq,transcript,24763069,24813393,.,-,.,"gene_id ""DAZ3""; transcript_id ""NM_020364.4""; d...",DAZ3,NM_020364.4,57054
19163,chrY,BestRefSeq,transcript,24833919,24907040,.,+,.,"gene_id ""DAZ4""; transcript_id ""NM_001388484.1""...",DAZ4,NM_001388484.1,57135
19164,chrY,BestRefSeq,transcript,25030901,25052104,.,-,.,"gene_id ""BPY2C""; transcript_id ""NM_001002761.1...",BPY2C,NM_001002761.1,442868


In [62]:
unique_gene_ids_count = gtf_cds["gene_id"].nunique()
unique_gene_ids_count

19166

In [63]:
unique_gene_ids_count = gtf_cds["entrez_id"].nunique()
unique_gene_ids_count

19166

In [66]:
from tqdm import tqdm
tqdm.pandas()

In [67]:
# Filter only start_codon and CDS
start_codons = gtf_cds[gtf_cds['feature'] == 'start_codon'].copy()
cds_regions = gtf_cds[gtf_cds['feature'] == 'CDS'].copy()

# Ensure all coordinates are integers
start_codons["start"] = start_codons["start"].astype(int)
start_codons["end"] = start_codons["end"].astype(int)
cds_regions["start"] = cds_regions["start"].astype(int)
cds_regions["end"] = cds_regions["end"].astype(int)

# For each start codon, count how many CDS regions it overlaps with
def count_overlaps(row, cds):
    subset = cds[cds['transcript_id'] == row['transcript_id']]
    return ((subset['start'] <= row['end']) & (subset['end'] >= row['start'])).sum()

start_codons['cds_overlap_count'] = start_codons.progress_apply(lambda row: count_overlaps(row, cds_regions), axis=1)

# Count how many start codons have exactly 1 overlapping CDS
overlap_counts = start_codons['cds_overlap_count'].value_counts().sort_index()
overlap_counts


100%|██████████| 19166/19166 [03:04<00:00, 103.71it/s]


cds_overlap_count
1    19166
Name: count, dtype: int64

In [68]:
# Filter stop codons
stop_codons = gtf_cds[gtf_cds['feature'] == 'stop_codon'].copy()

# Ensure coordinates are ints
stop_codons["start"] = stop_codons["start"].astype(int)
stop_codons["end"] = stop_codons["end"].astype(int)

# Preprocess CDS
cds_regions = gtf_cds[gtf_cds['feature'] == 'CDS'].copy()
cds_regions["start"] = cds_regions["start"].astype(int)
cds_regions["end"] = cds_regions["end"].astype(int)

# For each stop codon, check for:
# - overlap with any CDS
# - or being exactly adjacent to the last CDS (strand-aware)
def is_valid_stop(row, cds):
    tx = row['transcript_id']
    strand = row['strand']
    tx_cds = cds[cds['transcript_id'] == tx]
    
    # Check overlap
    overlap = ((tx_cds['start'] <= row['end']) & (tx_cds['end'] >= row['start'])).any()
    if overlap:
        return True

    # Strand-aware adjacency
    if strand == '+':
        last_cds_end = tx_cds['end'].max()
        return row['start'] == last_cds_end + 1
    elif strand == '-':
        first_cds_start = tx_cds['start'].min()
        return row['end'] == first_cds_start - 1

    return False  # fallback

from tqdm import tqdm
tqdm.pandas()

stop_codons['is_valid_stop'] = stop_codons.progress_apply(lambda row: is_valid_stop(row, cds_regions), axis=1)

# Summary
valid_pct = stop_codons['is_valid_stop'].mean() * 100
print(f"✔️ Valid stop codons: {valid_pct:.2f}%")

100%|██████████| 19166/19166 [03:12<00:00, 99.72it/s] 

✔️ Valid stop codons: 99.92%





In [69]:
cds_df = gtf_cds[gtf_cds['feature'] == 'CDS']
start_codon_df = gtf_cds[gtf_cds['feature'] == 'start_codon']

# Collect suspicious transcripts
mismatched_transcripts = []

# Iterate through all unique transcripts with start codons
for transcript_id in tqdm(start_codon_df['transcript_id'].unique(), desc="Checking transcripts"):
    start_rows = start_codon_df[start_codon_df['transcript_id'] == transcript_id]
    cds_rows = cds_df[cds_df['transcript_id'] == transcript_id]

    if start_rows.empty or cds_rows.empty:
        continue

    # Assume only one start_codon row per transcript (standard for well-formed GTFs)
    start_row = start_rows.iloc[0]
    start_coord = start_row['start']
    strand = start_row['strand']

    # Find overlapping CDS
    overlapping = cds_rows[
        (cds_rows['start'] <= start_coord) &
        (cds_rows['end'] >= start_coord)
    ]

    if not overlapping.empty:
        frame = overlapping.iloc[0]['frame']
        if str(frame).isdigit() and int(frame) != 0:
            mismatched_transcripts.append({
                'transcript_id': transcript_id,
                'frame': frame,
                'cds_start': overlapping.iloc[0]['start'],
                'cds_end': overlapping.iloc[0]['end'],
                'start_codon_pos': start_coord,
                'strand': strand
            })

# Show results
print(f"\nFound {len(mismatched_transcripts)} transcripts where start codon overlaps CDS with frame > 0")


Checking transcripts: 100%|██████████| 19166/19166 [03:26<00:00, 92.63it/s]


Found 0 transcripts where start codon overlaps CDS with frame > 0





In [70]:
mismatched_transcripts

[]

In [213]:
def get_adjusted_cds_windows(transcript_id, chrom, strand, gtf_df):
    # Filter relevant entries
    transcript_df = gtf_df[gtf_df['transcript_id'] == transcript_id]
    cds_df = transcript_df[transcript_df['feature'] == 'CDS']
    start_df = transcript_df[transcript_df['feature'] == 'start_codon']
    stop_df = transcript_df[transcript_df['feature'] == 'stop_codon']

    if cds_df.empty or start_df.empty or stop_df.empty:
        return None  # Missing critical annotations

    # Get coordinates
    start_codon_start = start_df.iloc[0]['start']
    stop_codon_end = stop_df.iloc[0]['end']

    # Keep only CDSs that intersect coding region
    if strand == '+':
        cds_df = cds_df[
            (cds_df['end'] >= start_codon_start) &
            (cds_df['start'] <= stop_codon_end)
        ].sort_values(by='start')
    else:
        start_codon_end = start_df.iloc[0]['end']
        stop_codon_start = stop_df.iloc[0]['start']
    
        cds_df = cds_df[
            (cds_df['end'] >= stop_codon_start) &
            (cds_df['start'] <= start_codon_end)
        ].sort_values(by='end', ascending=False)

    if cds_df.empty:
        return None

    # Build trimmed CDS windows
    cds_windows = []
    for i, row in cds_df.iterrows():
        start = row['start']
        end = row['end']

        # First CDS: adjust start if needed
        if strand == '+':
            if start <= start_codon_start <= end:
                start = start_codon_start
        else:
            if start <= start_codon_end <= end:
                end = start_codon_end

        # Last CDS: adjust end if needed
        if strand == '+':
            if start <= stop_codon_end <= end:
                end = stop_codon_end
        else:
            if start <= stop_codon_start <= end:
                start = stop_codon_start

        # Sanity check
        if end > start:
            cds_windows.append((start, end, int(row['frame'])))

    return cds_windows if cds_windows else None


In [232]:
gtf_cds[gtf_cds['entrez_id'] == 339456]

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attributes,gene_id,transcript_id,entrez_id,length
539,chr1,BestRefSeq,CDS,1919182,1919265,.,-,0,"gene_id ""TMEM52""; transcript_id ""NM_178545.4"";...",TMEM52,NM_178545.4,339456,84
540,chr1,BestRefSeq,CDS,1919045,1919088,.,-,0,"gene_id ""TMEM52""; transcript_id ""NM_178545.4"";...",TMEM52,NM_178545.4,339456,44
541,chr1,BestRefSeq,CDS,1918893,1918934,.,-,1,"gene_id ""TMEM52""; transcript_id ""NM_178545.4"";...",TMEM52,NM_178545.4,339456,42
542,chr1,BestRefSeq,CDS,1918253,1918431,.,-,1,"gene_id ""TMEM52""; transcript_id ""NM_178545.4"";...",TMEM52,NM_178545.4,339456,179
543,chr1,BestRefSeq,CDS,1917885,1918162,.,-,2,"gene_id ""TMEM52""; transcript_id ""NM_178545.4"";...",TMEM52,NM_178545.4,339456,278
544,chr1,BestRefSeq,start_codon,1919263,1919265,.,-,0,"gene_id ""TMEM52""; transcript_id ""NM_178545.4"";...",TMEM52,NM_178545.4,339456,3
545,chr1,BestRefSeq,stop_codon,1917882,1917884,.,-,0,"gene_id ""TMEM52""; transcript_id ""NM_178545.4"";...",TMEM52,NM_178545.4,339456,3


In [216]:
gtf_cds[gtf_cds['transcript_id'] == 'NM_014188.3']

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attributes,gene_id,transcript_id,entrez_id,length
415,chr1,BestRefSeq,CDS,1574478,1574557,.,-,0,"gene_id ""SSU72""; transcript_id ""NM_014188.3""; ...",SSU72,NM_014188.3,29101,80
416,chr1,BestRefSeq,CDS,1564773,1564916,.,-,1,"gene_id ""SSU72""; transcript_id ""NM_014188.3""; ...",SSU72,NM_014188.3,29101,144
417,chr1,BestRefSeq,CDS,1544863,1545002,.,-,1,"gene_id ""SSU72""; transcript_id ""NM_014188.3""; ...",SSU72,NM_014188.3,29101,140
418,chr1,BestRefSeq,CDS,1543869,1543987,.,-,2,"gene_id ""SSU72""; transcript_id ""NM_014188.3""; ...",SSU72,NM_014188.3,29101,119
419,chr1,BestRefSeq,CDS,1542069,1542167,.,-,0,"gene_id ""SSU72""; transcript_id ""NM_014188.3""; ...",SSU72,NM_014188.3,29101,99
420,chr1,BestRefSeq,start_codon,1574555,1574557,.,-,0,"gene_id ""SSU72""; transcript_id ""NM_014188.3""; ...",SSU72,NM_014188.3,29101,3
421,chr1,BestRefSeq,stop_codon,1542066,1542068,.,-,0,"gene_id ""SSU72""; transcript_id ""NM_014188.3""; ...",SSU72,NM_014188.3,29101,3


In [217]:
windows = get_adjusted_cds_windows("NM_014188.3", "chr1", "-", gtf_cds)

In [218]:
windows

[(1574478, 1574557, 0),
 (1564773, 1564916, 1),
 (1544863, 1545002, 1),
 (1543869, 1543987, 2),
 (1542069, 1542167, 0)]

In [219]:
import os
from pyfaidx import Fasta
from Bio.Seq import Seq


In [220]:
genome_fasta_dir = "/mnt/home/aaggarwal/ceph/gates_proj/ncbi_genome_hg38.p14"

# load the genome
fasta_files = {
    f.replace(".fa", ""): os.path.join(genome_fasta_dir, f)
    for f in os.listdir(genome_fasta_dir) if f.endswith(".fa")
}
fasta_index = {k: Fasta(v) for k, v in fasta_files.items()}

In [221]:
cds_windows = sorted(windows, key=lambda x: x[0], reverse=True)
cds_windows

[(1574478, 1574557, 0),
 (1564773, 1564916, 1),
 (1544863, 1545002, 1),
 (1543869, 1543987, 2),
 (1542069, 1542167, 0)]

In [222]:
genome = fasta_index['chr1']

In [223]:
chrom = genome['chr1'] 

In [224]:
seq_parts = []

In [225]:
for i, (start, end, frame) in enumerate(cds_windows):
    start = int(start)
    end = int(end)
    frame = int(frame)

    exon_seq = chrom[start-1:end].seq.upper()

    if i == 0:
        exon_seq = exon_seq[frame:]

    seq_parts.append(exon_seq)

In [226]:
full_seq = ''.join(seq_parts[::-1])  # reverse exon order
full_seq_rc = str(Seq(full_seq).reverse_complement())

In [227]:
protein = str(Seq(full_seq_rc).translate(to_stop=False))

In [228]:
print(f"First codon: {full_seq_rc[:3]}")
print(f"Protein start: {protein[:10]}")
print(f"Full protein: {protein}")

First codon: ATG
Protein start: MPSSPLRVAV
Full protein: MPSSPLRVAVVCSSNQNRSMEAHNILSKRGFSVRSFGTGTHVKLPGPAPDKPNVYDFKTTYDQMYNDLLRKDKELYTQNGILHMLDRNKRIKPRPERFQNCKDLFDLILTCEERVYDQVVEDLNSREQETCQPVHVVNVDIQDNHEEATLGAFLICELCQCIQHTEDMENEIDELLQEFEEKSGRTFLHTVCFY


In [229]:
seq_parts = []
for i, (start, end, frame) in enumerate(cds_windows):
    start = int(start)
    end = int(end)
    frame = int(frame)

    exon_seq = chrom[start-1:end].seq.upper()

    exon_seq = exon_seq[frame:]

    seq_parts.append(exon_seq)

In [230]:
full_seq = ''.join(seq_parts[::-1])  # reverse exon order
full_seq_rc = str(Seq(full_seq).reverse_complement())

protein = str(Seq(full_seq_rc).translate(to_stop=False))

print(f"First codon: {full_seq_rc[:3]}")
print(f"Protein start: {protein[:10]}")
print(f"Full protein: {protein}")

First codon: ATG
Protein start: MPSSPLRVAV
Full protein: MPSSPLRVAVVCSSNQNRSMEAHNILSKRGFSVRSFGTGTHVKLPGPAPDKPNVYDFKTTYDQMYNDLLRKDKEPIHRMGFYICWTEIRESSPGQKDSRTAKTCLI*SSLAKRECMTRWWKSEFQRTGDLPACARGQCGHPGQPRGGHPGGVSHL*ALPVSSTRKTWRTRSTSCCRSSRRRVAAPFCTPSAS




In [176]:
genome = fasta_index['chr1']

chrom = genome['chr1']  # OR just use fasta_index['chr1']

# Now slice the sequence: note that pyfaidx uses 0-based, end-inclusive indexing
seq = chrom[1407215 - 1 : 1407217].seq.upper()
rc = str(Seq(seq).reverse_complement())

In [177]:
rc

'ATG'

In [130]:
gtf_cds[gtf_cds['transcript_id'] == 'NM_001205252.2']

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attributes,gene_id,transcript_id,entrez_id,length
132,chr1,BestRefSeq,CDS,1071820,1072566,.,-,0,"gene_id ""RNF223""; transcript_id ""NM_001205252....",RNF223,NM_001205252.2,401934,747
133,chr1,BestRefSeq,start_codon,1072564,1072566,.,-,0,"gene_id ""RNF223""; transcript_id ""NM_001205252....",RNF223,NM_001205252.2,401934,3
134,chr1,BestRefSeq,stop_codon,1071817,1071819,.,-,0,"gene_id ""RNF223""; transcript_id ""NM_001205252....",RNF223,NM_001205252.2,401934,3


In [131]:
windows = get_adjusted_cds_windows("NM_001205252.2", "chr1", "-", gtf_cds)

In [132]:
windows

[(1071820, 1072566, 0)]

In [82]:
from tqdm import tqdm

start_codons = gtf_cds[gtf_cds['feature'] == 'start_codon']
stop_codons = gtf_cds[gtf_cds['feature'] == 'stop_codon']
cds = gtf_cds[gtf_cds['feature'] == 'CDS']

start_in_middle = []
stop_in_middle = []

for transcript_id in tqdm(gtf_cds['transcript_id'].unique(), desc="Scanning transcripts"):
    cds_rows = cds[cds['transcript_id'] == transcript_id].sort_values(by='start')
    if cds_rows.empty:
        continue

    start_row = start_codons[start_codons['transcript_id'] == transcript_id]
    stop_row = stop_codons[stop_codons['transcript_id'] == transcript_id]
    if start_row.empty or stop_row.empty:
        continue

    start_pos = start_row.iloc[0]['start']
    stop_pos = stop_row.iloc[0]['end']
    strand = cds_rows.iloc[0]['strand']

    if strand == '+':
        if start_pos > cds_rows.iloc[0]['start']:
            start_in_middle.append(transcript_id)
        if stop_pos < cds_rows.iloc[-1]['end']:
            stop_in_middle.append(transcript_id)
    else:
        if start_pos < cds_rows.iloc[-1]['end']:
            start_in_middle.append(transcript_id)
        if stop_pos > cds_rows.iloc[0]['start']:
            stop_in_middle.append(transcript_id)

print("Transcript with start codon in middle of CDS:", start_in_middle[:1])
print("Transcript with stop codon in middle of CDS:", stop_in_middle[:1])


Scanning transcripts: 100%|██████████| 19166/19166 [03:48<00:00, 83.95it/s]

Transcript with start codon in middle of CDS: ['NM_001005221.2']
Transcript with stop codon in middle of CDS: []





In [86]:
start_in_middle

['NM_001005221.2',
 'NM_001005277.1',
 'NM_015658.4',
 'NM_001394713.1',
 'NM_021170.4',
 'NM_001205252.2',
 'NM_017891.5',
 'NM_004195.3',
 'NM_003327.4',
 'NM_016176.6',
 'NM_001014980.3',
 'NM_058167.3',
 'NM_030649.3',
 'NM_017871.6',
 'NM_001330311.2',
 'NM_032348.4',
 'NM_017900.3',
 'NM_030937.6',
 'NM_017971.4',
 'NM_001145210.3',
 'NM_001114748.2',
 'NM_014188.3',
 'NM_001242659.2',
 'NM_033486.3',
 'NM_001290264.2',
 'NM_024011.4',
 'NM_023018.5',
 'NM_002074.5',
 'NM_178545.4',
 'NM_001304360.2',
 'NM_182533.4',
 'NM_024848.3',
 'NM_002617.4',
 'NM_018216.4',
 'NM_001010926.4',
 'NM_033467.4',
 'NM_001242672.3',
 'NM_001409.4',
 'NM_017818.4',
 'NM_020710.3',
 'NM_014704.4',
 'NM_207356.3',
 'NM_015102.5',
 'NM_015557.3',
 'NM_000983.4',
 'NM_012405.4',
 'NM_207370.4',
 'NM_007274.4',
 'NM_019089.5',
 'NM_003790.3',
 'NM_020631.6',
 'NM_024654.5',
 'NM_014851.4',
 'NM_018198.4',
 'NM_006786.4',
 'NM_001561.6',
 'NM_018948.4',
 'NM_001042681.2',
 'NM_001428.5',
 'NM_207420.3'

In [117]:
gtf_cds[gtf_cds['strand'] == '-'].head(50)

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attributes,gene_id,transcript_id,entrez_id,length
4,chr1,BestRefSeq,CDS,450743,451678,.,-,0,"gene_id ""OR4F29""; transcript_id ""NM_001005221....",OR4F29,NM_001005221.2,729759,936
5,chr1,BestRefSeq,start_codon,451676,451678,.,-,0,"gene_id ""OR4F29""; transcript_id ""NM_001005221....",OR4F29,NM_001005221.2,729759,3
6,chr1,BestRefSeq,stop_codon,450740,450742,.,-,0,"gene_id ""OR4F29""; transcript_id ""NM_001005221....",OR4F29,NM_001005221.2,729759,3
7,chr1,BestRefSeq,CDS,685719,686654,.,-,0,"gene_id ""OR4F16""; transcript_id ""NM_001005277....",OR4F16,NM_001005277.1,81399,936
8,chr1,BestRefSeq,start_codon,686652,686654,.,-,0,"gene_id ""OR4F16""; transcript_id ""NM_001005277....",OR4F16,NM_001005277.1,81399,3
9,chr1,BestRefSeq,stop_codon,685716,685718,.,-,0,"gene_id ""OR4F16""; transcript_id ""NM_001005277....",OR4F16,NM_001005277.1,81399,3
26,chr1,BestRefSeq,CDS,959215,959240,.,-,0,"gene_id ""NOC2L""; transcript_id ""NM_015658.4""; ...",NOC2L,NM_015658.4,26155,26
27,chr1,BestRefSeq,CDS,958929,959081,.,-,1,"gene_id ""NOC2L""; transcript_id ""NM_015658.4""; ...",NOC2L,NM_015658.4,26155,153
28,chr1,BestRefSeq,CDS,957099,957273,.,-,1,"gene_id ""NOC2L""; transcript_id ""NM_015658.4""; ...",NOC2L,NM_015658.4,26155,175
29,chr1,BestRefSeq,CDS,956894,957025,.,-,0,"gene_id ""NOC2L""; transcript_id ""NM_015658.4""; ...",NOC2L,NM_015658.4,26155,132
