In [1]:
import pandas as pd 
import requests 
import pickle
import glob
import time

In [20]:
df = pd.read_csv("MANE.GRCh38.v1.0.ensembl_exon.csv")

def get_sequence(chrom, start, end):
    req_string = f"https://rest.ensembl.org/sequence/region/human/{chrom}:{start}..{end}:1?content-type=text/plain"
    res = requests.get(req_string)
    return res.text

def get_mutation(chrom, pos, ref):
    alt = "C"
    if ref == "C":
        alt = "G"
    req_string = f"https://rest.ensembl.org/vep/human/hgvs/{chrom}:g.{pos}{ref}>{alt}?canonical=1&content-type=application/json"
    res = requests.get(req_string)
    return res


def get_cds_from_response(res, gene):
    tcs = res[0]["transcript_consequences"]
    for tc in tcs:
        if "canonical" in tc.keys() and "gene_symbol" in tc.keys():
            if tc["gene_symbol"] == gene: 
                return tc           
    return None 

                    

# res = get_mutation(17, start, "C")
# get_cds_from_response(res.json(), "BRCA1")

In [21]:
genes = ["BRCA1"]

covered = glob.glob("*.pickle")

for gene in genes:
    res_lst = []
    gene_df = df.loc[df["gene_name"] == gene]
    print(gene, len(gene_df))
    if len(gene_df) == 0:
        print("missing")
        continue
    for exon_counter, (index, row) in enumerate(gene_df.iterrows()):
        t1 = time.time()
        exon_num = row["exon_number"]
        fn = f"{gene}_exon_{exon_num}.pickle"
        if fn in covered:
            print(f"skipping {gene}_exon_{exon_num}")
            continue
        chrom = int(row["chr"].replace("chr", ""))
        start = row["start"]
        end = row["end"]
        sequence = get_sequence(chrom, start, end)
        print(gene, exon_num, 'seqeunce--', sequence)
        print(len(sequence))
        exon_lst = []
        counter = 0
        for nt in list(sequence):
            pos = start + counter 
            res = get_mutation(chrom, pos, nt)
            tc = get_cds_from_response(res.json(), gene)
            if tc is None:
                print("found none ")
                print(chrom, pos, nt, gene)
            tc["genomic_pos"] = pos
            tc["chrom"] = chrom
            res_lst.append(tc)
            exon_lst.append(tc)
            counter += 1
        with open(f"{gene}_exon_{exon_num}.pickle", "wb") as f:
            pickle.dump(exon_lst, f)
        f.close()
        t2 = time.time()
        elapsed = t2 - t1
        print(elapsed, len(sequence)/elapsed)

    with open(f"{gene}_all.pickle", "wb") as f:
        pickle.dump(res_lst, f)
    f.close()
        

BRCA1 23
skipping BRCA1_exon_1
skipping BRCA1_exon_2
skipping BRCA1_exon_3
skipping BRCA1_exon_4
skipping BRCA1_exon_5
skipping BRCA1_exon_6
skipping BRCA1_exon_7
skipping BRCA1_exon_8
skipping BRCA1_exon_9
BRCA1 10 seqeunce-- CTAAGTTTGAATCCATGCTTTGCTCTTCTTGATTATTTTCTTCCAAGCCCGTTCCTCTTTCTTCATCATCTGAAACCAATTCCTTGTCACTCAGACCAACTCCCTGGCTTTCAGACTGATGCCTCATTTGTTTGGAAGAACCAATCAAGAAAGGATCCTGGGTGTTTGTATTTGCAGTCAAGTCTTCCAATTCACTGCACTGTGAAGAAAACAAGCTAGCAGAACATTTTGTTTCCTCACTAAGGTGATGTTCCTGAGATGCCTTTGCCAATATTACCTGGTTACTGCAGTCATTTAAGCTATTCTTCAATGATAATAAATTCTCCTCTGTGTTCTTAGACAGACACTCGGTAGCAACGGTGCTATGCCTAGTAGACTGAGAAGGTATATTGTTTACTTTACCAAATAACAAGTGTTGGAAGCAGGGAAGCTCTTCATCCTCACTAGATAAGTTCTCTTCTGAGGACTCTAATTTCTTGGCCCCTCTTCGGTAACCCTGAGCCAAATGTGTATGGGTGAAAGGGCTAGGACTCCTGCTAAGCTCTCCTTTCTGGACGCTTTTGCTAAAAACAGCAGAACTTTCCTTAATGTCATTTTCAGCAAAACTAGTATCTTCCTTTATTTCACCATCATCTAACAGGTCATCAGGTGTCTCAGAACAAACCTGAGATGCATGACTACTTCCCATAGGCTGTTCTAAGTTATCTGAAATCAGATATGGAGAGAAATCTGTATTAACAGTCTGAACTACTTCTTCATATTCTTGCTTTTTT

In [17]:
glob.glob("BRCA1*")

['BRCA1_exon_9.pickle',
 'BRCA1_exon_5.pickle',
 'BRCA1_exon_13.pickle',
 'BRCA1.ipynb',
 'BRCA1_exon_11.pickle',
 'BRCA1_exon_7.pickle',
 'BRCA1_all.pickle',
 'BRCA1_exon_23.pickle',
 'BRCA1_exon_15.pickle',
 'BRCA1_exon_19.pickle',
 'BRCA1_exon_3.pickle',
 'BRCA1_exon_1.pickle',
 'BRCA1_exon_21.pickle',
 'BRCA1_exon_17.pickle',
 'BRCA1_exon_12.pickle',
 'BRCA1_exon_8.pickle',
 'BRCA1_exon_4.pickle',
 'BRCA1_exon_6.pickle',
 'BRCA1_exon_2.pickle',
 'BRCA1_exon_14.pickle',
 'BRCA1_exon_18.pickle',
 'BRCA1_exon_22.pickle',
 'BRCA1_exon_16.pickle',
 'BRCA1_exon_20.pickle']

In [16]:
for i in range(1,24):
    
    with open(f"BRCA1_exon_{i}.pickle", "rb") as f:
        data = pickle.load(f)
    f.close()
    
    for d in data:
        if "canonical" in d.keys():
            print(d["cdna_start"], d["cdna_end"])

94 94
93 93
92 92
91 91
90 90
89 89
88 88
87 87
86 86
85 85
84 84
83 83
82 82
81 81
80 80
79 79
78 78
77 77
76 76
75 75
74 74
73 73
72 72
71 71
70 70
69 69
68 68
67 67
66 66
65 65
64 64
63 63
62 62
61 61
60 60
59 59
58 58
57 57
56 56
55 55
54 54
53 53
52 52
51 51
50 50
49 49
48 48
47 47
46 46
45 45
44 44
43 43
42 42
41 41
40 40
39 39
38 38
37 37
36 36
35 35
34 34
33 33
32 32
31 31
30 30
29 29
28 28
27 27
26 26
25 25
24 24
23 23
22 22
21 21
20 20
19 19
18 18
17 17
16 16
15 15
14 14
13 13
12 12
11 11
10 10
9 9
8 8
7 7
6 6
5 5
4 4
3 3
2 2
1 1
193 193
192 192
191 191
190 190
189 189
188 188
187 187
186 186
185 185
184 184
183 183
182 182
181 181
180 180
179 179
178 178
177 177
176 176
175 175
174 174
173 173
172 172
171 171
170 170
169 169
168 168
167 167
166 166
165 165
164 164
163 163
162 162
161 161
160 160
159 159
158 158
157 157
156 156
155 155
154 154
153 153
152 152
151 151
150 150
149 149
148 148
147 147
146 146
145 145
144 144
143 143
142 142
141 141
140 140
139 139
138 138
137 13

FileNotFoundError: [Errno 2] No such file or directory: 'BRCA1_exon_10.pickle'