In [2]:
import gzip
import pickle
import pandas as pd
from Bio import SeqIO
from pathlib import Path
from tqdm import tqdm
from gtfparse import read_gtf

# Opt in to pandas copy-on-write: frames derived by filtering/selecting in later
# cells behave as independent copies, so assigning new columns to them does not
# modify (or warn about) the frame they were derived from.
pd.options.mode.copy_on_write = True


In [3]:
# NOTE: NOT CURRENTLY USED - precomputing every PAM site may be memory-inefficient; the resulting PAM-site files are roughly half the size of the FASTAs.

def find_all_NGG_pam_sites(chrom, fasta_path):
    """Find every NGG PAM site on both strands of one chromosome and pickle the result.

    The sequence is read from a gzipped, single-record FASTA and upper-cased
    before scanning, so lower-case bases are scanned like any other. Every
    3-base window is classified as:

    * ``NGG`` -> strand ``"+"``; ``pam_start_loc`` is the 1-based position of
      the window's first base.
    * ``CCN`` -> strand ``"-"`` (reverse complement of NGG); ``pam_start_loc``
      is the 1-based position of the window's last base.

    A window can never be both (its middle base would have to be G and C).

    Args:
        chrom: Chromosome label, used only to name the output file.
        fasta_path: Path to the gzipped FASTA file (str or path-like).

    Returns:
        None. Writes a DataFrame with columns ``pam_start_loc`` and ``strand``
        to ``./pam_sites/{chrom}_pam_sites.pkl``; the columns are present even
        when no site is found.
    """
    with gzip.open(fasta_path, "rt") as handle:
        fasta_str = str(SeqIO.read(handle, "fasta").seq).upper()

    # Two flat lists instead of one dict per site: a chromosome can hold tens of
    # millions of sites and a list of dicts is by far the largest allocation here.
    pam_start_locs = []
    strands = []
    for i in tqdm(range(len(fasta_str) - 2)):
        if fasta_str.startswith("GG", i + 1):
            # window is NGG: report the N, converted to 1 indexing
            pam_start_locs.append(i + 1)
            strands.append("+")
        elif fasta_str.startswith("CC", i):
            # window is CCN: report the N (last base of the window), 1 indexed
            pam_start_locs.append(i + 3)
            strands.append("-")

    pam_sites_df = pd.DataFrame({"pam_start_loc": pam_start_locs, "strand": strands})

    # Create the output directory up front so a long scan is not lost to a
    # missing-folder error at write time.
    Path("./pam_sites").mkdir(parents=True, exist_ok=True)
    pam_sites_df.to_pickle(f"./pam_sites/{chrom}_pam_sites.pkl")

def preprocess_pams():
    """Pre-compute the PAM-site table for every gzipped FASTA under ./genome_fastas.

    All ``*.fa.gz`` files are discovered recursively. For each one the
    chromosome label is the file name up to its first dot (``chr1.fa.gz`` ->
    ``chr1``); the label is printed and the file is handed to
    ``find_all_NGG_pam_sites``.

    Returns:
        None.
    """
    # list all genome fastas (sorted so the processing order is deterministic)
    genome_fastas = sorted(Path("./genome_fastas").rglob("*.fa.gz"))

    # iterate through them
    for fasta_path in genome_fastas:
        chrom = fasta_path.stem.split(".")[0]
        print(chrom)
        # Pass the path that was actually discovered: rebuilding it from `chrom`
        # points at a non-existent file whenever the FASTA sits in a
        # sub-directory or has extra dots in its name.
        find_all_NGG_pam_sites(chrom, str(fasta_path))
    # after all are done, run gzip *.pkl in terminal to compress all the pam_sites files



In [4]:

# Data provenance: download the hg38 GTF from
# https://www.ncbi.nlm.nih.gov/datasets/taxonomy/9606/ and place it (gzipped) at
# ./genome_files/hg38.gtf.gz before running this cell.
#
# Read in the GTF file with gtfparse. The output saved with this cell shows one
# row per GTF record, with the extracted attribute keys as additional columns.
# Legacy plain-pandas loader, kept for reference only (not executed):
#gtf = pd.read_csv("./hg38.gtf", sep="\t", comment="#", header=None)
gtf_df = read_gtf("./genome_files/hg38.gtf.gz")


# Legacy de-duplication of transcript ids for the pandas loader above, kept for
# reference only (not executed):
#gtf["transcript_id"] = gtf[1].str.split(" ", expand=True)[1].str.replace('"', "")
#gtf = gtf.drop_duplicates("transcript_id")
# Quick sanity check of what was parsed (shape, column names, first/last rows).
print(gtf_df)



INFO:root:Extracted GTF attributes: ['gene_id', 'transcript_id', 'db_xref', 'description', 'gbkey', 'gene', 'gene_biotype', 'pseudo', 'product', 'transcript_biotype', 'exon_number', 'gene_synonym', 'model_evidence', 'tag', 'protein_id', 'experiment', 'inference', 'note', 'part', 'exception', 'isoform', 'anticodon', 'partial', 'The', 'transl_except', 'non-AUG', 'standard_name', 'deleted', 'source', 'similar', 'substituted', 'transferase', 'codons', '12S', '16S', 'transl_table', 'ATPase']


shape: (4_697_665, 44)
┌──────────────┬────────┬────────────┬───────┬───┬─────┬─────┬──────────────┬────────┐
│ seqname      ┆ source ┆ feature    ┆ start ┆ … ┆ 12S ┆ 16S ┆ transl_table ┆ ATPase │
│ ---          ┆ ---    ┆ ---        ┆ ---   ┆   ┆ --- ┆ --- ┆ ---          ┆ ---    │
│ cat          ┆ str    ┆ cat        ┆ i64   ┆   ┆ str ┆ str ┆ str          ┆ str    │
╞══════════════╪════════╪════════════╪═══════╪═══╪═════╪═════╪══════════════╪════════╡
│ NC_000001.11 ┆        ┆ gene       ┆ 11874 ┆ … ┆     ┆     ┆              ┆        │
│ NC_000001.11 ┆        ┆ transcript ┆ 11874 ┆ … ┆     ┆     ┆              ┆        │
│ NC_000001.11 ┆        ┆ exon       ┆ 11874 ┆ … ┆     ┆     ┆              ┆        │
│ NC_000001.11 ┆        ┆ exon       ┆ 12613 ┆ … ┆     ┆     ┆              ┆        │
│ NC_000001.11 ┆        ┆ exon       ┆ 13221 ┆ … ┆     ┆     ┆              ┆        │
│ …            ┆ …      ┆ …          ┆ …     ┆ … ┆ …   ┆ …   ┆ …            ┆ …      │
│ NC_012920.1  ┆    

In [9]:
# Build ./genome_files/hg38_transcripts.tsv: one row per gene holding the list of
# its coding transcript ids, MANE-tagged transcripts first.

# keep only relevant columns for transcripts
gtf_columns = ['gene_id', 'transcript_id', 'feature', 'start', 'end', 'tag', 'strand', 'exon_number']
gtf_df2 = gtf_df[gtf_columns]
# Convert to a pandas DataFrame and re-apply the labels explicitly.
# NOTE(review): presumably the constructor does not carry over the column names
# of the frame returned by read_gtf - confirm; the re-labelling is harmless either way.
gtf_df2 = pd.DataFrame(gtf_df2)
gtf_df2.columns = gtf_columns

# number of distinct transcript ids before filtering
print(gtf_df2['transcript_id'].unique().shape)

# keep only transcripts that have a cds entry; the first CDS row of each
# transcript is the one that survives drop_duplicates
gtf_tx = (
    gtf_df2[gtf_df2['feature'] == 'CDS']
    .drop(columns=['feature'])
    .drop_duplicates("transcript_id")
)
print(gtf_tx['transcript_id'].unique().shape)

# Flag MANE transcripts with a vectorised substring test. na=False treats a
# missing tag as "not MANE" instead of raising, which the row-wise `in` test did.
is_mane = gtf_tx['tag'].str.contains("MANE", regex=False, na=False)
# -1 for MANE, 0 otherwise, so an ascending sort puts MANE transcripts first
gtf_tx['MANE'] = -is_mane.astype(int)
gtf_tx.loc[is_mane, 'transcript_id'] = gtf_tx.loc[is_mane, 'transcript_id'] + " (MANE)"

# keep minimal data in csvs
# sort by gene, then MANE-first, then transcript id
gtf_tx = gtf_tx.sort_values(['gene_id', 'MANE', "transcript_id"])
# group by gene id and turn transcript ids into a list
gtf_tx_prnt = gtf_tx.groupby("gene_id").agg({"transcript_id": list}).reset_index()
gtf_tx_prnt.to_csv("./genome_files/hg38_transcripts.tsv", sep="\t", index=False)


(201190,)
(145439,)


In [10]:

# Build ./genome_files/hg38_transcript_coords.tsv: one row per coding transcript
# with the length and exon number of each of its CDS segments.

# create df with transcript id and start stop coords
gtf_coords = gtf_df2

# Fail early with an actionable message if gtf_df2 is stale. The output saved with
# this cell shows KeyError: 'exon_number', which is what happens when gtf_df2 was
# built by an older version of the previous cell that did not select that column.
required_columns = ['transcript_id', 'feature', 'start', 'end', 'exon_number']
missing_columns = [col for col in required_columns if col not in gtf_coords.columns]
if missing_columns:
    raise KeyError(
        f"gtf_df2 is missing columns {missing_columns}; "
        "re-run the cell that builds gtf_df2 before this one"
    )

# Filter for the CDS rows
transcripts_with_codons = gtf_coords[gtf_coords['feature'] == 'CDS']

# Per-segment values, computed once for the whole table rather than per group.
# GTF coordinates are 1-based and inclusive, hence the +1 in the length.
cds_segments = pd.DataFrame({
    'transcript_id': transcripts_with_codons['transcript_id'],
    'cds_lengths': (
        transcripts_with_codons['end'].astype(int)
        - transcripts_with_codons['start'].astype(int)
        + 1
    ),
    'cds_exons': transcripts_with_codons['exon_number'].astype(int),
})

# Group by transcript_id and collect the per-segment values into lists (in file
# order). A plain agg avoids groupby.apply, whose handling of the grouping
# column inside the applied function is deprecated in pandas 2.2.
coding_coords_df = (
    cds_segments
    .groupby('transcript_id')
    .agg({'cds_lengths': list, 'cds_exons': list})
    .reset_index()
)
# kept for backward compatibility: the same records as a list of dicts
coding_coords = coding_coords_df.to_dict("records")

coding_coords_df.to_csv("./genome_files/hg38_transcript_coords.tsv", sep="\t", index=False)

KeyError: 'exon_number'