In [1]:
import pandas as pd 
import requests 
import pickle
import glob

In [2]:
# Exon table loaded from a local CSV. The loop cell further down filters it on
# the "gene_name" column and reads "exon_number", "chr", "start" and "end" from
# each matching row.
df = pd.read_csv("MANE.GRCh38.v1.0.ensembl_exon.csv")

def get_sequence(chrom, start, end, timeout=60):
    """Fetch the sequence of a genomic region from the Ensembl REST API.

    The region is requested as ``{chrom}:{start}..{end}:1`` (the trailing
    ``1`` is the strand field of the Ensembl region syntax) with a
    ``text/plain`` content type, so the response body is the bare sequence.

    Parameters
    ----------
    chrom : int or str
        Chromosome name without a "chr" prefix, as interpolated into the URL.
    start, end : int
        Region bounds, passed through to the API unchanged.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` can block forever on a stalled connection.

    Returns
    -------
    str
        The response body with leading/trailing whitespace removed.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Raising here stops the
        caller from treating an error message as nucleotides.
    requests.Timeout
        If no response arrives within ``timeout`` seconds.
    """
    req_string = f"https://rest.ensembl.org/sequence/region/human/{chrom}:{start}..{end}:1?content-type=text/plain"
    res = requests.get(req_string, timeout=timeout)
    # Fail loudly on rate limiting / bad regions instead of returning the
    # error body as if it were sequence.
    res.raise_for_status()
    # Strip so a trailing newline is never iterated as a "base" by callers.
    return res.text.strip()

def get_mutation(chrom, pos, ref, timeout=60):
    """Query Ensembl VEP for a single-base substitution at a genomic position.

    The alternate allele is chosen mechanically: "C", unless the reference
    base is itself "C", in which case "G" is used so that ref and alt differ.
    The variant is sent in HGVS genomic notation (``{chrom}:g.{pos}{ref}>{alt}``)
    with ``canonical=1`` and a JSON content type.

    Parameters
    ----------
    chrom : int or str
        Chromosome name without a "chr" prefix.
    pos : int
        Genomic position of the substituted base.
    ref : str
        Reference base at ``pos``.
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Returns
    -------
    requests.Response
        The successful HTTP response; callers decode it with ``.json()``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status (for example a rejected
        variant or rate limiting), rather than handing back an error payload
        that only fails later when indexed.
    requests.Timeout
        If no response arrives within ``timeout`` seconds.
    """
    # Any alt that differs from ref is acceptable here.
    alt = "G" if ref == "C" else "C"
    req_string = f"https://rest.ensembl.org/vep/human/hgvs/{chrom}:g.{pos}{ref}>{alt}?canonical=1&content-type=application/json"
    res = requests.get(req_string, timeout=timeout)
    res.raise_for_status()
    return res


def get_cds_from_response(res, gene):
    """Pick the canonical transcript consequence for ``gene`` from a VEP reply.

    Only the first element of ``res`` is inspected. Its
    "transcript_consequences" entries are scanned in order and the first one
    that carries a "canonical" key and whose "gene_symbol" equals ``gene`` is
    returned.

    Parameters
    ----------
    res : list
        Decoded JSON payload; expected to be a list whose first item is a dict.
    gene : str
        Gene symbol to match against each consequence's "gene_symbol".

    Returns
    -------
    dict or None
        The matching consequence dict (the same object held by ``res``, not a
        copy), or None when nothing matches. None is also returned when the
        payload is empty, is not a list (e.g. an error dict), or has no
        "transcript_consequences" entry, instead of raising
        KeyError/IndexError.
    """
    # Malformed or empty payloads are treated as "no match".
    if not isinstance(res, list) or not res:
        return None
    first = res[0]
    if not isinstance(first, dict):
        return None

    for tc in first.get("transcript_consequences") or []:
        if "canonical" in tc and "gene_symbol" in tc and tc["gene_symbol"] == gene:
            return tc
    return None



In [3]:
# Genes to scan. For every exon of each gene, every base is queried against
# VEP and the canonical transcript consequence is recorded.
genes = ["BRCA2"]

# Pickle files already present in the working directory. An exon whose output
# file is listed here is not re-queried, which lets an interrupted run resume.
covered = glob.glob("*.pickle")

for gene in genes:
    res_lst = []  # consequences for the whole gene, across all exons
    gene_df = df.loc[df["gene_name"] == gene]
    print(gene, len(gene_df))
    if len(gene_df) == 0:
        print("missing")
        continue

    for _, row in gene_df.iterrows():
        exon_num = row["exon_number"]
        fn = f"{gene}_exon_{exon_num}.pickle"

        if fn in covered:
            print(f"skipping {gene}_exon_{exon_num}")
            # Fold the cached exon back into the gene-level list; otherwise a
            # resumed run would overwrite `{gene}_all.pickle` with only the
            # exons fetched in this session.
            # NOTE: pickle.load executes arbitrary code from the file; this is
            # only safe because these files were written by this notebook.
            with open(fn, "rb") as f:
                res_lst.extend(pickle.load(f))
            continue

        # Keep autosomes as int (as before), but allow names such as "X", "Y"
        # or "M" that cannot be converted to int.
        chrom_label = row["chr"].replace("chr", "")
        chrom = int(chrom_label) if chrom_label.isdigit() else chrom_label
        start = row["start"]
        end = row["end"]

        # Strip so stray whitespace in the response is never treated as a base.
        sequence = get_sequence(chrom, start, end).strip()
        print(gene, exon_num, 'seqeunce--', sequence)

        exon_lst = []
        for counter, nt in enumerate(sequence):
            # The i-th base of the fetched region sits at start + i.
            pos = start + counter
            res = get_mutation(chrom, pos, nt)
            tc = get_cds_from_response(res.json(), gene)
            if tc is None:
                print("found none ")
                print(chrom, pos, nt, gene)
                # No canonical consequence for this gene at this position:
                # log it and move on rather than crashing on tc[...] below.
                continue
            tc["genomic_pos"] = pos
            tc["chrom"] = chrom
            res_lst.append(tc)
            exon_lst.append(tc)

        # The `with` block closes the file; no explicit close() is needed.
        with open(fn, "wb") as f:
            pickle.dump(exon_lst, f)

    with open(f"{gene}_all.pickle", "wb") as f:
        pickle.dump(res_lst, f)
        

BRCA2 27
skipping BRCA2_exon_1
skipping BRCA2_exon_2
skipping BRCA2_exon_3
skipping BRCA2_exon_4
skipping BRCA2_exon_5
skipping BRCA2_exon_6
skipping BRCA2_exon_7
skipping BRCA2_exon_8
skipping BRCA2_exon_9
skipping BRCA2_exon_10
BRCA2 11 seqeunce-- GTTTATTGCATTCTTCTGTGAAAAGAAGCTGTTCACAGAATGATTCTGAAGAACCAACTTTGTCCTTAACTAGCTCTTTTGGGACAATTCTGAGGAAATGTTCTAGAAATGAAACATGTTCTAATAATACAGTAATCTCTCAGGATCTTGATTATAAAGAAGCAAAATGTAATAAGGAAAAACTACAGTTATTTATTACCCCAGAAGCTGATTCTCTGTCATGCCTGCAGGAAGGACAGTGTGAAAATGATCCAAAAAGCAAAAAAGTTTCAGATATAAAAGAAGAGGTCTTGGCTGCAGCATGTCACCCAGTACAACATTCAAAAGTGGAATACAGTGATACTGACTTTCAATCCCAGAAAAGTCTTTTATATGATCATGAAAATGCCAGCACTCTTATTTTAACTCCTACTTCCAAGGATGTTCTGTCAAACCTAGTCATGATTTCTAGAGGCAAAGAATCATACAAAATGTCAGACAAGCTCAAAGGTAACAATTATGAATCTGATGTTGAATTAACCAAAAATATTCCCATGGAAAAGAATCAAGATGTATGTGCTTTAAATGAAAATTATAAAAACGTTGAGCTGTTGCCACCTGAAAAATACATGAGAGTAGCATCACCTTCAAGAAAGGTACAATTCAACCAAAACACAAATCTAAGAGTAATCCAAAAAAATCAAGAAGAAACTACTTCAATTTCAAAAATAACTGTCAATCCAGACTCTGAAGAACTTTTCTCAGACAATG

In [49]:
# Ad-hoc single VEP query: chromosome 17, reference base "C" (so get_mutation
# requests a C>G substitution). `start` is not defined in this cell; it is
# whatever value the exon loop cell last assigned, so this cell cannot run on
# a fresh kernel by itself.
# NOTE(review): chromosome 17 is hard-coded while the loop cell above is set to
# genes = ["BRCA2"] -- confirm which gene/run this check was meant for.
res = get_mutation(17, start, "C")

In [86]:
# Tabulate the consequence dicts accumulated in `res_lst` by the exon loop
# cell (one row per dict, one column per key).
pd.DataFrame(res_lst)

Unnamed: 0,biotype,hgnc_id,strand,cdna_start,gene_id,gene_symbol_source,transcript_id,variant_allele,cdna_end,canonical,...,codons,sift_score,cds_start,amino_acids,sift_prediction,cds_end,polyphen_prediction,protein_start,protein_end,polyphen_score
0,protein_coding,HGNC:1100,-1,94,ENSG00000012048,HGNC,ENST00000357654,G,94,1,...,,,,,,,,,,
1,protein_coding,HGNC:1100,-1,93,ENSG00000012048,HGNC,ENST00000357654,C,93,1,...,,,,,,,,,,
2,protein_coding,HGNC:1100,-1,92,ENSG00000012048,HGNC,ENST00000357654,C,92,1,...,,,,,,,,,,
3,protein_coding,HGNC:1100,-1,91,ENSG00000012048,HGNC,ENST00000357654,C,91,1,...,,,,,,,,,,
4,protein_coding,HGNC:1100,-1,90,ENSG00000012048,HGNC,ENST00000357654,C,90,1,...,,,,,,,,,,
5,protein_coding,HGNC:1100,-1,89,ENSG00000012048,HGNC,ENST00000357654,G,89,1,...,,,,,,,,,,
6,protein_coding,HGNC:1100,-1,88,ENSG00000012048,HGNC,ENST00000357654,G,88,1,...,,,,,,,,,,
7,protein_coding,HGNC:1100,-1,87,ENSG00000012048,HGNC,ENST00000357654,G,87,1,...,,,,,,,,,,
8,protein_coding,HGNC:1100,-1,86,ENSG00000012048,HGNC,ENST00000357654,C,86,1,...,,,,,,,,,,
9,protein_coding,HGNC:1100,-1,85,ENSG00000012048,HGNC,ENST00000357654,C,85,1,...,,,,,,,,,,


In [87]:
# Show the exon rows that the loop cell selected from `df` for the last gene
# it processed (`gene_df` is assigned inside that loop).
gene_df

Unnamed: 0,chr,strand,start,end,source,type,score,phase,gene_id,gene_type,gene_name,transcript_id,transcript_type,transcript_name,tag,protein_id,db_xref,exon_number,exon_id
168313,chr17,-,43125271,43125364,ensembl_havana,exon,,,ENSG00000012048.24,protein_coding,BRCA1,ENST00000357654.9,protein_coding,BRCA1-203,MANE_Select,ENSP00000350283.3,RefSeq:NM_007294.4,1,ENSE00001852567.1
168314,chr17,-,43124017,43124115,ensembl_havana,exon,,,ENSG00000012048.24,protein_coding,BRCA1,ENST00000357654.9,protein_coding,BRCA1-203,MANE_Select,ENSP00000350283.3,RefSeq:NM_007294.4,2,ENSE00003559512.1
168315,chr17,-,43115726,43115779,ensembl_havana,exon,,,ENSG00000012048.24,protein_coding,BRCA1,ENST00000357654.9,protein_coding,BRCA1-203,MANE_Select,ENSP00000350283.3,RefSeq:NM_007294.4,3,ENSE00003510592.1
168316,chr17,-,43106456,43106533,ensembl_havana,exon,,,ENSG00000012048.24,protein_coding,BRCA1,ENST00000357654.9,protein_coding,BRCA1-203,MANE_Select,ENSP00000350283.3,RefSeq:NM_007294.4,4,ENSE00003541068.1
168317,chr17,-,43104868,43104956,ensembl_havana,exon,,,ENSG00000012048.24,protein_coding,BRCA1,ENST00000357654.9,protein_coding,BRCA1-203,MANE_Select,ENSP00000350283.3,RefSeq:NM_007294.4,5,ENSE00003531836.1
168318,chr17,-,43104122,43104261,ensembl_havana,exon,,,ENSG00000012048.24,protein_coding,BRCA1,ENST00000357654.9,protein_coding,BRCA1-203,MANE_Select,ENSP00000350283.3,RefSeq:NM_007294.4,6,ENSE00003513709.1
168319,chr17,-,43099775,43099880,ensembl_havana,exon,,,ENSG00000012048.24,protein_coding,BRCA1,ENST00000357654.9,protein_coding,BRCA1-203,MANE_Select,ENSP00000350283.3,RefSeq:NM_007294.4,7,ENSE00003642045.1
168320,chr17,-,43097244,43097289,ensembl_havana,exon,,,ENSG00000012048.24,protein_coding,BRCA1,ENST00000357654.9,protein_coding,BRCA1-203,MANE_Select,ENSP00000350283.3,RefSeq:NM_007294.4,8,ENSE00003587679.1
168321,chr17,-,43095846,43095922,ensembl_havana,exon,,,ENSG00000012048.24,protein_coding,BRCA1,ENST00000357654.9,protein_coding,BRCA1-203,MANE_Select,ENSP00000350283.3,RefSeq:NM_007294.4,9,ENSE00003787101.1
168322,chr17,-,43091435,43094860,ensembl_havana,exon,,,ENSG00000012048.24,protein_coding,BRCA1,ENST00000357654.9,protein_coding,BRCA1-203,MANE_Select,ENSP00000350283.3,RefSeq:NM_007294.4,10,ENSE00003522602.1
