In [1]:
import pandas as pd 
import requests 
import pickle
import glob

In [2]:
# Exon table for the MANE GRCh38 v1.0 release; the loop further down reads its
# gene_name, exon_number, chr, start and end columns.
df = pd.read_csv(filepath_or_buffer="MANE.GRCh38.v1.0.ensembl_exon.csv")

def get_sequence(chrom, start, end, timeout=60):
    """Fetch the human reference sequence for a genomic region from Ensembl REST.

    Parameters
    ----------
    chrom : int or str
        Chromosome name without the ``chr`` prefix (interpolated into the URL).
    start, end : int
        Region bounds, interpolated as ``start..end`` in the request.
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Returns
    -------
    str
        The plain-text sequence with surrounding whitespace removed.

    Raises
    ------
    requests.HTTPError
        If Ensembl answers with a 4xx/5xx status. Without this check the error
        body would be returned and later treated as nucleotides.
    """
    # The trailing ":1" is the strand field of the region string.
    req_string = f"https://rest.ensembl.org/sequence/region/human/{chrom}:{start}..{end}:1?content-type=text/plain"
    res = requests.get(req_string, timeout=timeout)
    res.raise_for_status()
    # Guard against a trailing newline being iterated as if it were a base.
    return res.text.strip()

def get_mutation(chrom, pos, ref, timeout=60):
    """Query Ensembl VEP for a single-nucleotide substitution at one position.

    The alternate allele is a placeholder chosen only so that it differs from
    the reference: ``C`` for every reference base except ``C`` itself, which is
    paired with ``G``.

    Parameters
    ----------
    chrom : int or str
        Chromosome name without the ``chr`` prefix.
    pos : int
        Genomic position used in the HGVS ``g.`` expression.
    ref : str
        Reference base at ``pos``; upper-cased before use so that a lower-case
        ``c`` does not turn into the no-op variant ``c>C``.
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Returns
    -------
    requests.Response
        The successful VEP response; call ``.json()`` on it for the payload.

    Raises
    ------
    requests.HTTPError
        If VEP answers with a 4xx/5xx status (e.g. a rejected HGVS string).
    """
    ref = ref.upper()
    alt = "G" if ref == "C" else "C"
    req_string = f"https://rest.ensembl.org/vep/human/hgvs/{chrom}:g.{pos}{ref}>{alt}?canonical=1&content-type=application/json"
    res = requests.get(req_string, timeout=timeout)
    # Fail here with the HTTP status rather than later on a malformed payload.
    res.raise_for_status()
    return res


def get_cds_from_response(res, gene):
    """Pick the canonical transcript consequence for ``gene`` from a VEP payload.

    Parameters
    ----------
    res : list of dict
        Decoded JSON from the VEP endpoint; only the first element is examined.
    gene : str
        Gene symbol the consequence must carry in its ``gene_symbol`` field.

    Returns
    -------
    dict or None
        The first entry of ``transcript_consequences`` that has a ``canonical``
        key and a matching ``gene_symbol``. ``None`` when the payload is empty,
        when the record has no ``transcript_consequences`` at all, or when no
        entry matches.
    """
    if not res:
        return None
    # A record may legitimately lack transcript consequences; treat that as
    # "nothing found" instead of raising KeyError.
    for tc in res[0].get("transcript_consequences", []):
        if "canonical" in tc and "gene_symbol" in tc and tc["gene_symbol"] == gene:
            return tc
    return None

In [7]:
genes = ["PALB2"]

# Pickle files already present in the working directory. An exon whose result
# file is listed here is not queried again.
covered = glob.glob("*.pickle")

for gene in genes:
    # Canonical consequence records for this gene across all of its exons.
    res_lst = []
    gene_df = df.loc[df["gene_name"] == gene]
    print(gene, len(gene_df))
    if len(gene_df) == 0:
        print("missing")
        continue
    for _, row in gene_df.iterrows():
        exon_num = row["exon_number"]
        fn = f"{gene}_exon_{exon_num}.pickle"
        if fn in covered:
            print(f"skipping {gene}_exon_{exon_num}")
            # Reload the cached exon so the combined file written below stays
            # complete; otherwise a re-run would overwrite it with only the
            # exons fetched in this run (possibly none).
            # NOTE(security): pickle.load can execute arbitrary code. This is
            # only acceptable for files this notebook wrote itself.
            with open(fn, "rb") as f:
                res_lst.extend(pickle.load(f))
            continue
        # Numeric chromosomes keep the integer form; non-numeric names such as
        # "X" or "Y" are kept as strings instead of crashing int().
        chrom_label = str(row["chr"]).replace("chr", "")
        chrom = int(chrom_label) if chrom_label.isdigit() else chrom_label
        start = row["start"]
        end = row["end"]
        sequence = get_sequence(chrom, start, end)
        print(gene, exon_num, 'seqeunce--', sequence)
        exon_lst = []
        # One VEP query per base; enumerate keeps pos correct even when a
        # position is skipped below.
        for offset, nt in enumerate(sequence.strip()):
            pos = start + offset
            res = get_mutation(chrom, pos, nt)
            tc = get_cds_from_response(res.json(), gene)
            if tc is None:
                print("found none ")
                print(chrom, pos, nt, gene)
                # Nothing to annotate for this position; report it and move on
                # rather than failing on item assignment to None.
                continue
            tc["genomic_pos"] = pos
            tc["chrom"] = chrom
            res_lst.append(tc)
            exon_lst.append(tc)
        with open(fn, "wb") as f:
            pickle.dump(exon_lst, f)

    with open(f"{gene}_all.pickle", "wb") as f:
        pickle.dump(res_lst, f)
        

PALB2 13
skipping PALB2_exon_1
skipping PALB2_exon_2
skipping PALB2_exon_3
skipping PALB2_exon_4
skipping PALB2_exon_5
skipping PALB2_exon_6
skipping PALB2_exon_7
skipping PALB2_exon_8
skipping PALB2_exon_9
skipping PALB2_exon_10
skipping PALB2_exon_11
skipping PALB2_exon_12
skipping PALB2_exon_13
