In [1]:
import os
import sys
home_dir = "../"
module_path = os.path.abspath(os.path.join(home_dir))
if module_path not in sys.path:
    sys.path.append(module_path)

import fileinput
import pandas as pd
import fileinput
from utils.ncbi_proteins import download_protein_list_mpi, download_protein_list, create_combined_fasta

from Bio import SeqIO
from Bio.PDB.Polypeptide import protein_letters_3to1 # 20 amino acids

In [2]:
def three_to_one(aa):
    if str.upper(aa) in protein_letters_3to1:
        return protein_letters_3to1[str.upper(aa)]
    return aa


def filter_unknown_variants(x):
    return len(x)==1


def get_variants_df(dbsnps_df):      
    # columns = ["snp_id", "chrom_acc_version", "chrom_pos", "ref_allele", "alt_allele", "prot_acc_version", "prot_pos", "wt", "mut", "wt_population", "mut_poulation", "wt_freq", "mt_freq"]
    variations = []
    for i, tuple in enumerate(dbsnps_df.itertuples()):
        # print(tuple.snp_id, tuple.variations, tuple.SAMN10492705)
        
        if len(tuple.REF)>1 or len(tuple.ALT)>1: # only considering single neucleodite variants
            continue
        
        prot_variations = tuple.variations.split(",") # ie: NP_064505.1:p.Arg898Lys,NP_064505.1:p.Arg898Met
        chrom_variations = tuple.ALT.split(",") # alt_alleles
        
        wt_population, mut_poulations = int(tuple.SAMN10492705.split(":")[0]), tuple.SAMN10492705.split(":")[1].split(",")
        total_population = wt_population+sum(list(map(int, mut_poulations)))
        
        
        try:
            for j, v in enumerate(prot_variations):
                if j < len(mut_poulations):
                    mut_poulation = int(mut_poulations[j])
                else: mut_poulation = 0

                if len(prot_variations) == len(chrom_variations):
                    # The chromosomal variants create same number of corresponding protein variants.
                    alt_allele = chrom_variations[j]
                else: alt_allele = chrom_variations[0]
                
                new_v = {"snp_id": tuple.snp_id,
                         "gene_symbol": tuple.gene_symbol,
                         
                         "chrom_acc_version": tuple.CHROM,
                         "chrom_pos": tuple.POS, # 1-indexed
                         "ref_allele": tuple.REF,
                         "alt_allele": alt_allele,

                         "prot_acc_version": v.split(":")[0], # protein_accession.version
                         "prot_pos": int(v.split(":")[1][5:-3]), # NCBI prot variants are 1-indexed
                         "wt": three_to_one(v.split(":")[1][2:5]),
                         "mut": three_to_one(v.split(":")[1][-3:]), 

                         "wt_population": wt_population,
                         "mut_poulation": mut_poulation, 
                         "wt_freq": wt_population/total_population, # freq should be computed here, since we are decomposing multiple SNVs into separate independent SNVs.
                         "mt_freq": mut_poulation/total_population}
                variations.append(new_v)
                # print(new_v)
        except:
            print(i, tuple)
            raise
        # if i==5000: break
    variations_df = pd.DataFrame(variations)
    return variations_df

In [3]:
# loading raw variants data ...
inp_filepath = home_dir+"data/ALFA_population_freq/dbsnp_with_prots_and_population_freq_mapping.vcf.gz"
dbsnps_df = pd.read_csv(inp_filepath, sep="\t", compression='gzip')
print(dbsnps_df.shape)
print(dbsnps_df.columns)

(4674997, 33)
Index(['snp_id', 'acc', 'chrpos', 'spdi', 'create_date', 'update_date',
       'gene_symbol', 'HGNC_ID', 'symbol', 'name', 'RefSeq_prot', 'variations',
       'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT',
       'SAMN10492695', 'SAMN10492696', 'SAMN10492697', 'SAMN10492698',
       'SAMN10492699', 'SAMN10492700', 'SAMN10492701', 'SAMN10492702',
       'SAMN11605645', 'SAMN10492703', 'SAMN10492704', 'SAMN10492705'],
      dtype='object')


In [4]:
# parsed for single nucleotide variants (SNVs) and associated protein variants, computed allele population frequency
variations_df = get_variants_df(dbsnps_df)
print(variations_df.shape)
print(variations_df.columns)

(4629016, 14)
Index(['snp_id', 'gene_symbol', 'chrom_acc_version', 'chrom_pos', 'ref_allele',
       'alt_allele', 'prot_acc_version', 'prot_pos', 'wt', 'mut',
       'wt_population', 'mut_poulation', 'wt_freq', 'mt_freq'],
      dtype='object')


In [5]:
# defining classes: https://www.ncbi.nlm.nih.gov/snp/docs/gsr/alfa/ALFA_20201027095038/
variations_df = variations_df[variations_df["mut_poulation"]>=1] # excluding variants with population count <1
variations_df.loc[variations_df["mt_freq"]>=.01, "class"] = "Common"
variations_df.loc[(variations_df["mt_freq"]<.01) & (variations_df["mt_freq"]>=.001), "class"] = "Rare"
variations_df.loc[(variations_df["mt_freq"]<.001), "class"] = "Ultra-rare"
variations_df.loc[variations_df["mut_poulation"]==1, "class"] = "Singleton"
variations_df.drop_duplicates(keep="first", inplace=True, ignore_index=True)
print(variations_df["class"].value_counts())
print(variations_df.shape)

Singleton     738802
Ultra-rare    502863
Rare           45630
Common         28781
Name: class, dtype: int64
(1316076, 15)


In [6]:
# excluding unknown amino acid variants ...
variations_df = variations_df[variations_df["wt"].apply(filter_unknown_variants)]
variations_df = variations_df[variations_df["mut"].apply(filter_unknown_variants)]
print(variations_df["class"].value_counts())
print(variations_df.shape)
print(variations_df["prot_acc_version"].unique().shape[0])
print(variations_df["snp_id"].unique().shape[0])

Singleton     734912
Ultra-rare    500407
Rare           45411
Common         28560
Name: class, dtype: int64
(1309290, 15)
18302
1300759


In [7]:
# downloaing proteins ...
protein_acc_list = list(variations_df["prot_acc_version"].unique())
download_protein_list(protein_acc_list, start_i=0, home_dir=home_dir) # sequential downloading
# download_protein_list_mpi(protein_acc_list, len(protein_acc_list))
print("#-unique NCBI protein sequences downloaded: ", len(protein_acc_list))

0 NP_064505.1 Already existis
1 NP_002194.2 Already existis
2 NP_112509.3 Already existis
3 NP_064519.2 Already existis
4 NP_055673.2 Already existis
5 NP_004276.2 Already existis
6 NP_758872.1 Already existis
7 NP_003572.2 Already existis
8 NP_065806.1 Already existis
9 NP_003061.3 Already existis
10 NP_001158136.1 Already existis
11 NP_005980.1 Already existis
12 NP_065805.2 Already existis
13 NP_001185879.1 Already existis
14 NP_001839.2 Already existis
15 NP_001277116.1 Already existis
16 NP_003132.2 Already existis
17 NP_001124389.2 Already existis
18 NP_003357.2 Already existis
19 NP_003877.2 Already existis
20 NP_001804.2 Already existis
21 NP_078997.4 Already existis
22 NP_064560.2 Already existis
23 NP_612358.3 Already existis
24 NP_004548.3 Already existis
25 NP_660161.1 Already existis
26 NP_112228.1 Already existis
27 NP_006862.2 Already existis
28 NP_001363420.1 Already existis
29 NP_067022.1 Already existis
30 NP_000693.1 Already existis
31 NP_001020769.1 Already existis


In [8]:
# filtering on seq-len <= 1022
new_protein_acc_list = []
for i, prot in enumerate(protein_acc_list):
    filepath = home_dir+f"data/proteins/fastas/{prot}.fasta" 
    seq_record = SeqIO.parse(filepath, format="fasta").__next__()
    seq = str(seq_record.seq)
    seq_len = len(seq)
    if seq_len<=1022:
        print(f"{i}")
        new_protein_acc_list.append(prot)
        # the seq is not attached here, cause it takes lots of time
    # if i==10: break

variations_df = variations_df[variations_df["prot_acc_version"].isin(new_protein_acc_list)]
print(variations_df.shape)
variations_df["class"].value_counts()

2
3
4
5
6
7
8
11
16
17
18
19
22
23
25
26
27
28
29
30
32
33
35
36
37
38
40
41
42
43
44
45
49
50
51
53
56
57
59
60
63
64
66
68
69
70
71
72
78
80
81
84
85
87
88
91
92
93
96
97
99
100
101
104
106
107
110
111
112
115
116
119
120
121
123
125
126
127
129
130
131
132
134
135
136
137
142
143
145
146
149
150
152
155
156
157
158
159
162
163
164
167
168
169
170
171
172
173
175
176
177
178
179
180
181
182
183
184
185
187
188
190
191
193
195
197
198
200
203
204
205
206
207
209
210
211
212
213
215
216
218
219
220
222
223
225
226
227
228
229
230
231
233
234
235
236
240
243
244
246
247
248
249
250
252
253
255
256
257
258
260
261
262
263
265
266
267
268
269
270
271
275
277
278
280
281
283
284
285
286
288
290
291
292
293
294
295
296
297
298
300
301
302
304
305
306
307
309
312
313
314
315
318
320
322
323
324
326
329
330
332
333
334
335
338
340
341
343
345
346
349
350
351
354
356
357
358
359
363
364
365
367
368
369
370
372
373
377
378
379
380
381
382
384
385
386
387
388
389
390
391
392
393
394
397
399
400


Singleton     464584
Ultra-rare    315753
Rare           28778
Common         18239
Name: class, dtype: int64

In [10]:
# saving variants and combined fasta file
filename = "popu_freq"
out_filepath = home_dir+f"models/aa_common/datasets_popu_freq/{filename}"

print("\nLog: saving variants ...")
variations_df.to_csv(out_filepath+".tsv", index=False, sep="\t", header=True)

print("\nLog: Creating merged fasta document ...")
protein_acc_list = list(variations_df["prot_acc_version"].unique())
create_combined_fasta(protein_acc_list, out_filepath+".fasta", home_dir)


Log: saving variants ...

Log: Creating merged fasta document ...
0 NP_112509.3 Already existis
1 NP_064519.2 Already existis
2 NP_055673.2 Already existis
3 NP_004276.2 Already existis
4 NP_758872.1 Already existis
5 NP_003572.2 Already existis
6 NP_065806.1 Already existis
7 NP_005980.1 Already existis
8 NP_003132.2 Already existis
9 NP_001124389.2 Already existis
10 NP_003357.2 Already existis
11 NP_003877.2 Already existis
12 NP_064560.2 Already existis
13 NP_612358.3 Already existis
14 NP_660161.1 Already existis
15 NP_112228.1 Already existis
16 NP_006862.2 Already existis
17 NP_001363420.1 Already existis
18 NP_067022.1 Already existis
19 NP_000693.1 Already existis
20 NP_006270.1 Already existis
21 NP_001382572.1 Already existis
22 NP_001356746.1 Already existis
23 NP_071324.1 Already existis
24 NP_006173.2 Already existis
25 NP_001171754.1 Already existis
26 NP_005555.2 Already existis
27 NP_002941.1 Already existis
28 NP_079203.4 Already existis
29 NP_660308.1 Already existi