In [1]:
""" This script finds every possible SNV within the regions of interest.
It also annotates the trinucleotide context around each SNV.

It takes the output of a bedtools getfasta command (tsv) as input.

NB the coordinates of each feature should be extended by 1 each side with
bedtools slop, so that the 3nt context around the most 5' and 3' positions
is still accessible.
""";

In [2]:
# Import relevant modules
import numpy as np
import pandas as pd

In [3]:
def get_tri_contexts(getfasta_output):
    """Enumerate every possible SNV, with trinucleotide context, in a set of features.

    Parameters
    ----------
    getfasta_output : str, path object or file-like
        Header-less, tab-separated table with two columns (feature id,
        sequence), i.e. the .tsv output of a bedtools getfasta command.
        Each id must have the form ``chr:start-end`` with a 0-based start;
        the sequence is expected to hold one base per position in the
        interval.

        NB the coordinates of each feature should be extended by 1 each side
        with bedtools slop, so that the 3nt context around the most 5' and 3'
        positions is still accessible. The first and last base of every
        sequence are used as context only and are dropped from the output.

    Returns
    -------
    pandas.DataFrame
        Columns ``chr``, ``pos`` (1-based), ``ref``, ``alt`` and ``tri``
        (the reference base flanked by its two neighbours). One row per
        distinct SNV, sorted by ``chr`` then ``pos``. Positions whose
        reference base is "N" are excluded. The index is not contiguous.

    Notes
    -----
    Sequences are upper-cased before processing. Without this, lowercase
    (e.g. soft-masked) bases would never compare equal to the uppercase
    alt alleles, producing spurious ref-to-same-base "variants".

    Progress and summary counts are printed to stdout.
    """

    print(f"Getting trinucleotide contexts for {getfasta_output}")

    # Read data
    df = pd.read_csv(
        getfasta_output,
        sep="\t",
        header=None,
        names=["id", "seq"],
    )
    print(f"There are {len(df)} features")

    # Normalise case so that ref/alt comparison and the "N" filter below are
    # not defeated by lowercase bases.
    df["seq"] = df["seq"].str.upper()

    # Extract chr, start, and end information from ids of the form chr:start-end
    region = df["id"].str.split(":")
    span = region.str[1].str.split("-")
    df["chr"] = region.str[0]
    df["start"] = span.str[0].astype(int) + 1  # revert to 1-based
    df["end"] = span.str[1].astype(int)

    # Pair every position within a feature with its reference allele.
    df["ref"] = [
        list(zip(range(start, end + 1), seq))
        for start, end, seq in zip(df["start"], df["end"], df["seq"])
    ]

    # One row per nucleotide; the index still identifies the source feature.
    df = df[["chr", "ref"]].explode("ref")
    df["pos"] = [pair[0] for pair in df["ref"]]
    df["ref"] = [pair[1] for pair in df["ref"]]

    print(f"They span {len(df)} nt before trimming of the most 3' and 5' positions")

    # Get trinucleotide context around each position.
    df = df.rename_axis("cds").reset_index()  # Unique ID for each feature
    df = df.sort_values(["cds", "pos"])  # Ensure order of positions

    # Shifting within each feature keeps neighbouring features from leaking
    # into each other's contexts; the feature ends get NaN instead.
    ref_by_feature = df.groupby("cds")["ref"]
    first = ref_by_feature.shift(1)
    last = ref_by_feature.shift(-1)
    df["tri"] = first.str.cat(others=[df["ref"], last])  # NaN if either flank is missing

    print(
        f"There are {df.tri.isna().sum()} positions at the extreme ends of the features."
    )
    df = df.dropna(subset=["tri"])  # Drop extreme end positions
    df = df.sort_values(["chr", "pos"])  # Sort for faster VEP annotation
    print(f"There are {len(df)} nt after trimming.")

    # Get possible alt alleles for each position.
    df["alt"] = [["A", "T", "C", "G"]] * len(df)
    df = df.explode("alt")
    df = df[df["ref"] != df["alt"]]
    df = df[["chr", "pos", "ref", "alt", "tri"]].reset_index(drop=True)
    print(f"There are {df.duplicated().sum()} identical duplicate positions")

    # Tidy the dataframe
    df = df.drop_duplicates()  # Overlapping features yield the same SNV more than once
    # NOTE: only the reference base is checked here, so a retained position
    # can still have an "N" in its flanking context.
    df = df[df["ref"] != "N"]  # Mainly chrY positions
    print(f"There are {len(df)} possible SNVs")

    return df

In [4]:
if __name__ == "__main__":

    # Table of feature sequences that is handed to get_tri_contexts().
    get_fasta_output = "../outputs/gencode_v39_canonical_cds_seq.tsv"

    # Enumerate every possible SNV together with its trinucleotide context.
    df = get_tri_contexts(get_fasta_output)
    print(f"There are {len(df)} distinct possible SNVs in all CDS features.")

    # Save the full table (header row kept, index omitted).
    print("Writing trinucleotide contexts to .tsv")
    df.to_csv("../outputs/cds_trinucleotide_contexts.tsv", sep="\t", index=False)

    # Create a VCF file for VEP annotation: the fields that carry no
    # information here are filled with "." and the columns are put into
    # VCF order. The file is written without a header row.
    vcf_columns = ["chr", "pos", "ID", "ref", "alt", "QUAL", "FILTER", "INFO"]
    placeholders = dict.fromkeys(["ID", "QUAL", "FILTER", "INFO"], ".")
    vcf = df.assign(**placeholders)[vcf_columns]
    print("Writing SNVs to .vcf")
    vcf.to_csv(
        "../outputs/cds_all_possible_snvs.vcf",
        sep="\t",
        index=False,
        header=False,
    )

Getting trinucleotide contexts for ../outputs/gencode_v39_canonical_cds_seq.tsv

There are 196885 features

They span 34571741 nt before trimming of the most 3' and 5' positions

There are 393770 positions at the extreme ends of the features.

There are 34177971 nt after trimming.

There are 1813980 identical duplicate positions

There are 100651272 possible SNVs

There are 100651272 distinct possible SNVs in all CDS features.

Writing .tsv

Writing .vcf
