In [1]:
import os
import sys
# Put the parent directory on sys.path (once) so that packages located there can
# be imported from this notebook — presumably where the `utils` package used by
# later cells lives; confirm against the repository layout.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
# Relative prefix that later cells prepend to every "data/..." path.
home_dir = "../"

import pandas as pd

In [2]:
from Bio.PDB.Polypeptide import protein_letters_3to1
# Reverse lookup (one-letter code -> three-letter code) built by inverting
# protein_letters_3to1. Should two three-letter codes ever share a one-letter
# code, the one iterated last wins.
protein_letters_1to3 = {
    one_letter: three_letter
    for three_letter, one_letter in protein_letters_3to1.items()
}

def one_to_three(aa):
    """Translate a one-letter amino-acid code into its three-letter code.

    The lookup in ``protein_letters_1to3`` is done on the upper-cased input.
    When there is no entry for it, ``aa`` is handed back unchanged (original
    casing included).
    """
    lookup_key = str.upper(aa)
    return protein_letters_1to3.get(lookup_key, aa)

def three_to_one(aa):
    """Translate a three-letter amino-acid code into its one-letter code.

    The lookup in ``protein_letters_3to1`` is done on the upper-cased input.
    When there is no entry for it, ``aa`` is handed back unchanged (original
    casing included).
    """
    lookup_key = str.upper(aa)
    return protein_letters_3to1.get(lookup_key, aa)

def get_variants_df(df:pd.DataFrame, patho_type=None):
    """Expand a ClinVar tabular export into one record per protein/genomic variant pair.

    Every input row yields zero or more output rows:

    * When ``Name`` splits on whitespace into exactly two tokens, the second
      token is taken as the protein variant and the row's ``Canonical SPDI`` as
      the genomic variant. The slicing below assumes that token looks like
      ``(p.Val238Glu)``: three-letter residues around the position, wrapped in
      ``(p.`` and ``)``.
    * Otherwise ("mapped" rows) the protein variants come from
      ``Protein change`` split on ``", "`` (one-letter form such as ``V155G``)
      and are paired positionally with the ``|``-separated entries of
      ``Canonical SPDI``. If the two lists differ in length the row is printed,
      counted and skipped.

    Parameters
    ----------
    df : pd.DataFrame
        Needs the columns ``Clinical significance (Last reviewed)``, ``Name``,
        ``Protein change``, ``Canonical SPDI``, ``VariationID``, ``Symbol``,
        ``GeneID``, ``dbSNP ID``, ``RNA_nucleotide_accession.version``,
        ``RNA_nucleotide_gi`` and ``protein_accession.version``. Any index is
        accepted; rows are visited in positional order.
    patho_type : optional
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    pd.DataFrame
        One row per variant pair. ``1indexed_prot_mt_pos`` is left as a string
        because multi-residue changes (e.g. ``p.Ser151_Gly152delinsTyrGlu``)
        do not reduce to a single integer. ``chrom_pos`` is the SPDI position
        plus one. ``class`` is ``"Likely-pathogenic"`` when the clinical
        significance text contains ``"Likely pathogenic"``, else
        ``"Pathogenic"``.

    Side effects
    ------------
    Prints every skipped row and, at the end, the number of skipped rows.
    """
    variants = []
    n_datapoints_losing = 0

    # Walk over row records rather than df.loc[0..n-1]: label-based access with
    # positional integers only works for a default RangeIndex and would raise
    # KeyError (or read the wrong row) on a filtered or re-indexed frame.
    for row in df.to_dict("records"):
        clinical_sig = row["Clinical significance (Last reviewed)"]
        if "Likely pathogenic" in clinical_sig:
            class_name = "Likely-pathogenic"
        else:
            class_name = "Pathogenic"

        # mapping protein to genomic variant
        name_tokens = row["Name"].split()
        is_mapped = len(name_tokens) != 2
        if is_mapped:
            prot_variants = row["Protein change"].split(", ")
            chrom_variants = row["Canonical SPDI"].split("|")
            if len(prot_variants) != len(chrom_variants):
                # The protein and genomic lists cannot be paired up one-to-one,
                # so the whole row is reported and skipped.
                print(row["VariationID"], name_tokens, prot_variants, chrom_variants)
                n_datapoints_losing += 1
                continue
        else:
            # Exactly one protein variant and one SPDI, so the pairing is trivial.
            prot_variants = [name_tokens[1]]
            chrom_variants = [row["Canonical SPDI"]]

        for prot_v, chrom_v in zip(prot_variants, chrom_variants):
            if is_mapped:
                # One-letter form: <wt><position><mt>
                prot_pos = prot_v[1:-1]
                wt_aa_1letter, mt_aa_1letter = prot_v[0], prot_v[-1]
                wt_aa, mt_aa = one_to_three(wt_aa_1letter), one_to_three(mt_aa_1letter)
            else:
                # Three-letter form wrapped as "(p." ... ")"
                prot_pos = prot_v[6:-4]
                wt_aa, mt_aa = prot_v[3:6], prot_v[-4:-1]
                wt_aa_1letter, mt_aa_1letter = three_to_one(wt_aa), three_to_one(mt_aa)

            # SPDI fields: accession : 0-based position : ref allele : alt allele
            spdi_fields = chrom_v.split(":")
            chrom_acc = spdi_fields[0]

            variants.append({
                "clinvar_id": row["VariationID"],
                "gene_name": row["Symbol"],
                "gene_id": row["GeneID"],
                "snp_id": row["dbSNP ID"],
                "mrna_acc_version": row["RNA_nucleotide_accession.version"],
                "mrna_gi": row["RNA_nucleotide_gi"],

                "prot_variant": prot_v,
                "prot_acc_version": row["protein_accession.version"],
                "1indexed_prot_mt_pos": prot_pos,
                "wt_aa": wt_aa,
                "mt_aa": mt_aa,
                "wt_aa_1letter": wt_aa_1letter,
                "mt_aa_1letter": mt_aa_1letter,

                "chrom_variant": chrom_v,
                "chrom_acc_version": chrom_acc,
                # digits between '_' and '.' of the accession, e.g. NC_000003.12 -> 3
                "chrom_num": int(chrom_acc[chrom_acc.index('_')+1 : chrom_acc.index('.')]),
                # chr-pos in Canonical SPDI is 0-indexed, so adding 1 to make it 1-indexed
                "chrom_pos": int(spdi_fields[1])+1,
                "ref_allele": spdi_fields[2],
                "alt_allele": spdi_fields[3],

                "class": class_name
            })

    print("#-data-points not match protvariants vs genomicvariants:", n_datapoints_losing)
    variants_df = pd.DataFrame(variants)
    return variants_df

In [3]:
# Read the tab-separated ClinVar export named in the path below and turn it into
# a per-variant table with get_variants_df. The two prints show the raw table
# shape and the shape of the resulting per-variant table.
inp_filepath = home_dir+"data/clinvar/filtered/clinvar_HumanPathogenicMissenseVariants01012022To01072023.txt"
df = pd.read_csv(inp_filepath, sep="\t")
print(df.shape)
patho_variants_df = get_variants_df(df)
print(patho_variants_df.shape)

(4206, 34)
2018405 ['NM_001370658.1(BTD):c.399+2T>G'] ['V155G', 'V134G'] ['NC_000003.12:15642058:T:G']
1723345 ['NM_000500.9(CYP21A2):c.[710T>A;713T>A]'] ['V238E', 'V103E', 'V208E', 'I237N', 'I207N', 'I102N'] ['NC_000006.12:32039809:T:A', 'NC_000006.12:32039806:T:A']
1695398 ['NM_014251.3(SLC25A13):c.[1956C>A;1962del]'] ['N652K', 'N653K', 'F654fs', 'F655fs'] ['NC_000007.14:96121262:G:T', 'NC_000007.14:96121256:AAA:AA']
1702645 ['NM_014491.4(FOXP2):c.1266+1G>C'] ['V423L', 'V448L'] ['NC_000007.14:114654009:G:C']
1683486 ['NM_004318.4(ASPH):c.323-11619A>G'] ['K202R', 'K217R'] ['NC_000008.11:61665278:T:C']
1683485 ['NM_004318.4(ASPH):c.323-11779G>C'] ['D149H', 'D164H'] ['NC_000008.11:61665438:C:G']
1683484 ['NM_004318.4(ASPH):c.322+12720A>C'] ['K103T', 'K88T'] ['NC_000008.11:61668247:T:G']
1701017 ['NM_001375567.1(FOCAD):c.[1687C>T;3694G>C]'] ['A1232P', 'A1197P', 'R563C', 'R528C'] ['NC_000009.12:20948288:G:C', 'NC_000009.12:20820964:C:T']
1705146 ['NM_000155.4(GALT):c.[413C>T;469G>A]'] ['T

In [4]:
# Same processing as for the pathogenic export, applied to the likely-pathogenic
# file named in the path below. NOTE: `inp_filepath` and `df` are reassigned here,
# so after this cell they refer to the likely-pathogenic input.
inp_filepath = home_dir+"data/clinvar/filtered/clinvar_HumanLikelyPathogenicMissenseVariants01012022To01072023.txt"
df = pd.read_csv(inp_filepath, sep="\t")
print(df.shape)
likelypatho_variants_df = get_variants_df(df)
print(likelypatho_variants_df.shape)

(7815, 34)
1334358 ['NM_170707.4(LMNA):c.1698+1G>A'] ['V486M', 'V567M', 'V381M', 'V455M'] ['NC_000001.11:156137743:G:A']
1802572 ['NM_001386135.1(AFF3):c.[2151dup;3059A>G]'] ['G718fs', 'G743fs', 'Q1020R', 'Q1045R'] ['NC_000002.12:99593509:A:AA', 'NC_000002.12:99565546:T:C']
1342974 ['NM_001178015.2(SLC4A10):c.[1730A>T;3308A>T]'] ['K354M', 'K498M', 'K546M', 'K547M', 'K557M', 'K558M', 'K559M', 'K576M', 'K577M', 'K588M', 'K589M', 'N1024I', 'N1072I', 'N1073I', 'N1083I', 'N1084I', 'N1085I', 'N1102I', 'N1103I', 'N1114I', 'N1115I', 'N880I'] ['NC_000002.12:161904887:A:T', 'NC_000002.12:161976839:A:T']
1510215 ['NM_004100.5(EYA4):c.970+1G>A'] ['G324D', 'G270D'] ['NC_000006.12:133468731:G:A']
1802567 ['NM_152743.4(BRAT1):c.[638dup;803G>A]'] ['V214fs', 'V39fs', 'R268H', 'R93H'] ['NC_000007.14:2543754:TT:TTT', 'NC_000007.14:2543589:C:T']
1713142 ['NM_000238.4(KCNH2):c.2398+5G>A'] ['M461I', 'M701I', 'M705I', 'M742I', 'M801I'] ['NC_000007.14:150950162:C:T']
2441609 ['NM_000431.4(MVK):c.1039+1G>C'] [

In [5]:
# Merge the pathogenic and likely-pathogenic tables, then drop rows that are
# exact duplicates across every column (the first occurrence is kept).
# Shapes are printed before and after so the number of removed rows is visible.
variants_df = pd.concat((patho_variants_df, likelypatho_variants_df), ignore_index=True)
print(variants_df.shape)
variants_df = variants_df.drop_duplicates(keep="first", ignore_index=True)
print(variants_df.shape)
variants_df["class"].value_counts()

(12001, 20)
(11717, 20)


Likely-pathogenic    7753
Pathogenic           3964
Name: class, dtype: int64

In [6]:
# Downloading proteins: collect every distinct protein accession referenced by
# the variants and hand the list to the project's download helper.
from utils.ncbi_proteins import download_protein_list_mpi, download_protein_list
protein_acc_list = list(variants_df["prot_acc_version"].unique())
download_protein_list(protein_acc_list, start_i=0, home_dir=home_dir) # sequential downloading
# Alternative entry point imported above, left disabled:
# download_protein_list_mpi(protein_acc_list, len(protein_acc_list))
# NOTE: the number printed is the count of unique accessions requested; this cell
# does not itself verify that each download succeeded.
print("#-unique NCBI protein sequences downloaded: ", len(protein_acc_list))

0 NP_001381642.1 Already existis
1 NP_542172.2 Already existis
2 NP_060341.2 Already existis
3 NP_001164006.1 Already existis
4 NP_002065.1 Already existis
5 NP_000806.2 Already existis
6 NP_003027.1 Already existis
7 NP_002608.1 Already existis
8 NP_056030.1 Already existis
9 NP_009193.2 Already existis
10 NP_001036146.1 Already existis
11 NP_004276.2 Already existis
12 NP_005017.3 Already existis
13 NP_004949.1 Already existis
14 NP_005948.3 Already existis
15 NP_055689.1 Already existis
16 NP_055816.2 Already existis
17 NP_000076.2 Already existis
18 NP_002991.2 Already existis
19 NP_000469.3 Already existis
20 NP_115612.4 Already existis
21 NP_001782.1 Already existis
22 NP_005817.1 Already existis
23 NP_000138.2 Already existis
24 NP_065184.2 Already existis
25 NP_006006.3 Already existis
26 NP_060116.2 Already existis
27 NP_116182.2 Already existis
28 NP_000301.1 Already existis
29 NP_683763.2 Already existis
30 NP_006507.2 Already existis
31 NP_005364.1 Already existis
32 NP_001

In [7]:
# Keep only the variants whose protein sequence is at most 1022 residues long.
# The length comes from the first record of each protein's FASTA file; the
# index of every protein that passes is printed as "i/total".
from Bio import SeqIO
data = []
n_prots = len(protein_acc_list)
for i, prot in enumerate(protein_acc_list):
    filepath = home_dir+f"data/np_proteins/fastas/{prot}.fasta"
    seq_record = next(SeqIO.parse(filepath, format="fasta"))
    seq = str(seq_record.seq)
    seq_len = len(seq)
    if seq_len > 1022:
        continue  # too long: none of this protein's variants are kept
    print(f"{i}/{n_prots}")
    x = variants_df[variants_df['prot_acc_version']==prot].copy()
    data.append(x)
variants_df = pd.concat(data)
print(variants_df.shape)
variants_df["class"].value_counts()

0/2165
1/2165
2/2165
3/2165
4/2165
5/2165
6/2165
7/2165
9/2165
11/2165
14/2165
15/2165
17/2165
18/2165
19/2165
21/2165
22/2165
23/2165
24/2165
26/2165
27/2165
28/2165
29/2165
30/2165
31/2165
32/2165
34/2165
35/2165
36/2165
37/2165
38/2165
40/2165
41/2165
42/2165
43/2165
45/2165
46/2165
47/2165
48/2165
51/2165
53/2165
54/2165
55/2165
57/2165
58/2165
59/2165
60/2165
62/2165
63/2165
65/2165
66/2165
68/2165
70/2165
71/2165
72/2165
73/2165
75/2165
76/2165
77/2165
78/2165
79/2165
80/2165
81/2165
82/2165
83/2165
84/2165
85/2165
86/2165
87/2165
88/2165
89/2165
90/2165
91/2165
92/2165
94/2165
97/2165
100/2165
102/2165
103/2165
105/2165
106/2165
107/2165
110/2165
111/2165
112/2165
113/2165
114/2165
115/2165
116/2165
117/2165
119/2165
120/2165
121/2165
122/2165
124/2165
125/2165
126/2165
128/2165
130/2165
131/2165
132/2165
133/2165
135/2165
136/2165
138/2165
139/2165
141/2165
142/2165
143/2165
144/2165
145/2165
146/2165
147/2165
149/2165
151/2165
152/2165
153/2165
155/2165
156/2165
158/2165
159/2

Likely-pathogenic    4865
Pathogenic           2561
Name: class, dtype: int64

In [8]:
# Step 1: drop protein changes that contain "delins".
# Each step prints "<rows kept> <rows removed>".
is_delins = variants_df["prot_variant"].str.contains("delins", regex=False)
variants_removing_delins_df = variants_df[~is_delins]
print(variants_removing_delins_df.shape[0], variants_df.shape[0]-variants_removing_delins_df.shape[0])

# Step 2: keep only rows whose wild-type and mutant residues are single letters.
# Anything that was not translated to a one-letter code (e.g. "Ter") is longer
# than one character and is removed here.
print(variants_removing_delins_df["wt_aa_1letter"].value_counts())
print(variants_removing_delins_df["mt_aa_1letter"].value_counts())

is_single_aa = (
    variants_removing_delins_df["wt_aa_1letter"].str.len().eq(1)
    & variants_removing_delins_df["mt_aa_1letter"].str.len().eq(1)
)
variants_removing_unknownaa_df = variants_removing_delins_df[is_single_aa]
print(variants_removing_unknownaa_df.shape[0], variants_removing_delins_df.shape[0]-variants_removing_unknownaa_df.shape[0])

# Step 3: keep only single-nucleotide substitutions, i.e. ref and alt alleles
# that are exactly one character long.
print(variants_removing_unknownaa_df["ref_allele"].value_counts())
print(variants_removing_unknownaa_df["alt_allele"].value_counts())

is_single_nuc = (
    variants_removing_unknownaa_df["ref_allele"].str.len().eq(1)
    & variants_removing_unknownaa_df["alt_allele"].str.len().eq(1)
)
# .copy() gives an independent frame, so the column assignment below no longer
# writes to a slice of another DataFrame (the source of SettingWithCopyWarning).
variants_removing_unknownnuc_df = variants_removing_unknownaa_df[is_single_nuc].copy()
print(variants_removing_unknownnuc_df.shape[0], variants_removing_unknownaa_df.shape[0]-variants_removing_unknownnuc_df.shape[0])

# Step 4: after the filters above every remaining position is expected to be a
# plain number, so convert it from str to int (int() raises if one is not).
variants_removing_unknownnuc_df["1indexed_prot_mt_pos"] = variants_removing_unknownnuc_df["1indexed_prot_mt_pos"].map(int)

# Separate copy so that later in-place edits of filtered_variants_df do not
# touch the intermediate frame.
filtered_variants_df = variants_removing_unknownnuc_df.copy()

7425 1
R    1033
G     883
L     518
M     466
P     430
D     406
A     404
S     371
C     363
V     334
T     282
E     278
I     252
Y     247
H     242
N     213
F     212
K     189
Q     152
W     150
Name: wt_aa_1letter, dtype: int64
R      806
P      587
L      558
S      556
V      520
C      402
T      401
G      377
H      337
D      321
N      300
K      283
Y      282
I      281
E      269
Q      265
A      258
F      238
W      209
M      141
Ter     34
Name: mt_aa_1letter, dtype: int64
7391 34
G                        2327
C                        2211
T                        1463
A                        1331
GC                         17
TC                          5
GA                          4
TG                          4
CA                          3
CC                          3
AG                          2
CG                          2
GT                          2
GG                          2
CT                          2
AA                          1
CCC   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  variants_removing_unknownnuc_df["1indexed_prot_mt_pos"] = variants_removing_unknownnuc_df.apply(lambda row: int(row["1indexed_prot_mt_pos"]), axis=1).copy()


In [27]:
# chrom_num holds the integer parsed from the chromosome accession, so the sex
# chromosomes appear as 23 and 24; relabel them as "X" and "Y".
# Cast to object first: writing strings into an integer column relies on silent
# upcasting, which pandas >= 2.1 deprecates. The other values stay ints.
filtered_variants_df["chrom_num"] = filtered_variants_df["chrom_num"].astype(object)
filtered_variants_df.loc[filtered_variants_df["chrom_num"]==23, "chrom_num"] = "X"
filtered_variants_df.loc[filtered_variants_df["chrom_num"]==24, "chrom_num"] = "Y"

In [11]:
# Sanity-check the filtered table: value distributions of the key columns,
# the overall shape, the class balance, and the number of distinct ClinVar ids,
# genes and proteins (missing values excluded).
for column_name in (
    "wt_aa_1letter",
    "mt_aa_1letter",
    "ref_allele",
    "alt_allele",
    "1indexed_prot_mt_pos",
    "chrom_num",
):
    print(filtered_variants_df[column_name].value_counts())

print(filtered_variants_df.shape)
print(filtered_variants_df["class"].value_counts())

for column_name in ("clinvar_id", "gene_name", "prot_acc_version"):
    distinct_values = filtered_variants_df[column_name].dropna().unique()
    print(distinct_values.shape[0])

R    1026
G     872
L     514
M     463
P     424
D     402
A     397
S     366
C     356
V     331
T     281
E     267
I     251
H     241
Y     240
N     213
F     211
K     187
W     146
Q     144
Name: wt_aa_1letter, dtype: int64
R    805
P    584
S    553
L    550
V    514
T    401
C    398
G    374
H    335
D    319
N    296
Y    281
K    278
I    278
E    266
Q    263
A    257
F    234
W    208
M    138
Name: mt_aa_1letter, dtype: int64
G    2327
C    2211
T    1463
A    1331
Name: ref_allele, dtype: int64
A    1980
T    1950
G    1714
C    1688
Name: alt_allele, dtype: int64
1      276
101     28
57      27
202     27
156     26
      ... 
710      1
895      1
567      1
651      1
569      1
Name: 1indexed_prot_mt_pos, Length: 839, dtype: int64
23    815
1     732
11    518
2     451
3     427
17    424
19    418
12    351
7     343
6     321
9     313
16    296
5     290
4     235
14    219
10    216
8     194
20    194
15    173
22    120
21    107
13    101
18     66
24   

In [29]:
# saving variants and combined fasta file
from utils.ncbi_proteins import create_combined_fasta
filename = "patho_and_likelypatho"
# Common path stem; ".tsv" and ".fasta" are appended below.
out_filepath = home_dir+f"data/datasets_patho/{filename}"

print("\nLog: saving variants ...")
# Tab-separated, with a header row and without the DataFrame index.
filtered_variants_df.to_csv(out_filepath+".tsv", index=False, sep="\t", header=True)


print("\nLog: Creating merged fasta document ...")
# NOTE: protein_acc_list is reassigned here to the proteins that survived filtering.
protein_acc_list = list(filtered_variants_df["prot_acc_version"].unique())
create_combined_fasta(protein_acc_list, out_filepath+".fasta", home_dir)


Log: saving variants ...

Log: Creating merged fasta document ...
0 NP_001381642.1 Already existis
1 NP_542172.2 Already existis
2 NP_060341.2 Already existis
3 NP_001164006.1 Already existis
4 NP_002065.1 Already existis
5 NP_000806.2 Already existis
6 NP_003027.1 Already existis
7 NP_002608.1 Already existis
8 NP_009193.2 Already existis
9 NP_004276.2 Already existis
10 NP_005948.3 Already existis
11 NP_055689.1 Already existis
12 NP_000076.2 Already existis
13 NP_002991.2 Already existis
14 NP_000469.3 Already existis
15 NP_001782.1 Already existis
16 NP_005817.1 Already existis
17 NP_000138.2 Already existis
18 NP_065184.2 Already existis
19 NP_060116.2 Already existis
20 NP_116182.2 Already existis
21 NP_000301.1 Already existis
22 NP_683763.2 Already existis
23 NP_006507.2 Already existis
24 NP_005364.1 Already existis
25 NP_001246.2 Already existis
26 NP_060620.2 Already existis
27 NP_000365.3 Already existis
28 NP_116145.1 Already existis
29 NP_001041639.1 Already existis
30 N