In [48]:
import os
import sys
home_dir = "../"
module_path = os.path.abspath(os.path.join(home_dir))
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
from typing import List
# see notes/mysql to find from where and how the data [SNPs_dbSNP, SNPs_PMD and seqs_PMD] is downloaded

In [49]:
def summarize_functional_effect(mut_function:str):
    # print(mut_function)
    effects = []
    effect = ""
    flag = False
    for i, ch in enumerate(mut_function):
        if ch=='[':
            flag = True
            continue
        if flag and ch!="]":
            effect += ch
        if ch=="]":
            flag = False
            effects.append(effect)
            effect=""
    return effects

def get_functional_effect(effects:List):
    if len(effects)==0: return "No-effect"

    counts = [] 
    functional_effect = ""
    for ef in effects:
        if "0" in ef:
            return "Knock-out"
        elif "=" in ef:
            functional_effect = "No-effect"
        else:
            counts.append(len(ef)-2)

    
    if len(counts)==0 and functional_effect=="No-effect": return "No-effect"
    elif max(counts)<3: return "Effect"
    else: return "Knock-out"

In [58]:
pmd_df = pd.read_csv(home_dir+"data/SNPdbe/SNPs_PMD.csv", sep="\t;") # PMD: from SNPdbe # (127565, 48)
pmd_df = pmd_df[(pmd_df["function"]!=";\\N") & (pmd_df["function"]!="\\N")]  # (72738, 48)
pmd_df = pmd_df[pmd_df["function"].apply(lambda x: "[" in x)] # removing rows that does not have functional effect annotations (65128, 48)
print(pmd_df.shape) 
print(pmd_df.columns)

pmd_df["wt"] = pmd_df["mut_real"].apply(lambda x: x[0]) # 1-letter amino acid
pmd_df["mut"] = pmd_df["mut_real"].apply(lambda x: x[-1])
pmd_df["prot_pos"] = pmd_df["mut_real"].apply(lambda x: int(x[1:-1])) # mutation position is 0-indexed
pmd_df["prot_pos"] += 1 # NCBI mutation positions are 1-indexed, so keeping up with existing implementation making 1-indexed protein mutation position

pmd_df["function_summarized"] = pmd_df["function"].apply(summarize_functional_effect)
pmd_df["functional_effect"] = pmd_df["function_summarized"].apply(get_functional_effect)
pmd_df[["function_summarized", "functional_effect"]]
# txt = "-100 [-]: Ef ime [+]: Ch [+++]ime [+ + +]: Ch tivity [- - -]: NAD-GH activity [- - -]ddd"
# re.findall("\[[+-]+\s+\]", txt)

print(pmd_df.shape) 
print(pmd_df.columns)

  pmd_df = pd.read_csv(home_dir+"data/SNPdbe/SNPs_PMD.csv", sep="\t;") # PMD: from SNPdbe # (127565, 48)


(65128, 48)
Index(['mut_id', 'md5', 'pmd_id', 'nr', 'authors', 'journal', 'title',
       'medline', 'crossref', 'uniprot_id', 'ensembl_id', 'other_ref',
       'protein', 'source', 'expression_sys', 'mut_PMD', 'mut_real',
       'function', 'fb', 'structure', 'strB', 'stability', 'staB',
       'expression', 'eB', 'transport', 'tB', 'maturation', 'mB', 'disease',
       'dB', 'uni_real', 'uni_realid', 'uni_start', 'uni_finish', 'uniq_start',
       'uniq_finish', 'uni_loc', 'ens_real', 'ens_organism', 'ens_start',
       'ens_finish', 'ensq_start', 'ensq_finish', 'ens_loc', 'pos_real',
       'mt_real', 'taxid'],
      dtype='object')
(65128, 53)
Index(['mut_id', 'md5', 'pmd_id', 'nr', 'authors', 'journal', 'title',
       'medline', 'crossref', 'uniprot_id', 'ensembl_id', 'other_ref',
       'protein', 'source', 'expression_sys', 'mut_PMD', 'mut_real',
       'function', 'fb', 'structure', 'strB', 'stability', 'staB',
       'expression', 'eB', 'transport', 'tB', 'maturation', 'mB', 

### PMD data merging with corresponding sequences

In [59]:
seq_pmd_df = pd.read_csv(home_dir+"data/SNPdbe/seqs_PMD.csv", sep="\t;") # PMD: from SNPdbe # (127565, 48)
seq_pmd_df.drop(columns=["taxid"], inplace=True)
print(seq_pmd_df.shape) 
print(seq_pmd_df.columns)

pmd_df = pd.merge(left=pmd_df, right=seq_pmd_df, on=["pmd_id", "nr"], how="inner")
pmd_columns = ["mut_id", "pmd_id", "nr", "crossref", "uniprot_id", "ensembl_id", "taxid", "protein", "mut_PMD", "mut_real", 'wt', 'mut', 'prot_pos', 'function_summarized', 'functional_effect', "function", "seq"]
pmd_df = pmd_df[pmd_columns]
# pmd_df.drop_duplicates(keep="first", inplace=True)
print(pmd_df.shape) 
print(pmd_df.columns)
# pmd_df["mut_id"].value_counts() # multiple entries with same mut-id
# pmd_df[pmd_df["mut_id"]==952197]
pmd_df[["pmd_id", "mut_real", "seq"]]

  seq_pmd_df = pd.read_csv(home_dir+"data/SNPdbe/seqs_PMD.csv", sep="\t;") # PMD: from SNPdbe # (127565, 48)


(53124, 4)
Index(['md5', 'pmd_id', 'nr', 'seq'], dtype='object')
(65128, 17)
Index(['mut_id', 'pmd_id', 'nr', 'crossref', 'uniprot_id', 'ensembl_id',
       'taxid', 'protein', 'mut_PMD', 'mut_real', 'wt', 'mut', 'prot_pos',
       'function_summarized', 'functional_effect', 'function', 'seq'],
      dtype='object')


Unnamed: 0,pmd_id,mut_real,seq
0,A000006,C271Y,MVNSTHRGMHTSLHLWNRSSYRLHSNASESLGKGYSDGGCYEQLFV...
1,A000006,N62S,MVNSTHRGMHTSLHLWNRSSYRLHSNASESLGKGYSDGGCYEQLFV...
2,A000007,K395A,MAGAIENARKEIKRISLEDHAESEYGAIYSVSGPVVIAENMIGCAM...
3,A000007,K407A,MAGAIENARKEIKRISLEDHAESEYGAIYSVSGPVVIAENMIGCAM...
4,A000007,Y459A,MAGAIENARKEIKRISLEDHAESEYGAIYSVSGPVVIAENMIGCAM...
...,...,...,...
65123,A994629,Q3A,NIQMLLEAADYLE
65124,A994629,Q3R,NIQMLLEAADYLE
65125,A994629,Y11D,NIQMLLEAADYLE
65126,A994631,K245R,EALQRPVASDFEPQGLSEAARWNSKENLLAGPSENDPNLFVALYDF...


### SNPs correponding to PMD data

In [60]:
dbSNP_df = pd.read_csv(home_dir+"data/SNPdbe/SNPs_dbSNP.csv", sep="\t;") # dbSNP: from SNPdbe
dbSNP_df = dbSNP_df[(dbSNP_df["mut_id"]!=";\\N") & (dbSNP_df["mut_id"]!="\\N")] # (1780330, 34)
dbSNP_df.drop(columns=["taxid"], inplace=True)

dbSNP_df = dbSNP_df.astype({'mut_id': 'int64'})
print(dbSNP_df.shape) # (1526352, 34)
print(dbSNP_df.columns)
# dbSNP_df.loc[0]

# column description can be found here: https://www.rostlab.org/services/snpdbe/docu/schema.pdf
# protein pos is 0-indexed
dbSNP_cols = ["mut_id", "snp_id", "mrna_acc", "mrna_ver", "mrna_pos", "allele",  "protein_acc", "protein_ver", "verified"]
dbSNP_df = dbSNP_df[dbSNP_cols]
dbSNP_df.drop_duplicates(subset=["mut_id"], keep="first", inplace=True)
dbSNP_df

  dbSNP_df = pd.read_csv(home_dir+"data/SNPdbe/SNPs_dbSNP.csv", sep="\t;") # dbSNP: from SNPdbe


(1526352, 33)
Index(['mut_id', 'build', 'assembly', 'verified', 'md5', 'snp_id',
       'contig_acc', 'contig_ver', 'asn_from', 'asn_to', 'locus_id',
       'locus_symbol', 'mrna_acc', 'mrna_ver', 'protein_acc', 'protein_ver',
       'fxn_class', 'reading_frame', 'allele', 'mt', 'pos', 'ctg_id',
       'mrna_pos', 'mrna_start', 'mrna_stop', 'codon', 'protRes', 'contig_gi',
       'mrna_gi', 'mrna_orien', 'cp_mrna_ver', 'cp_mrna_gi', 'verComp'],
      dtype='object')


Unnamed: 0,mut_id,snp_id,mrna_acc,mrna_ver,mrna_pos,allele,protein_acc,protein_ver,verified
0,1,45045661.0,NM_001011564,1.0,40,G,NP_001011564,1.0,0
2,2,45045662.0,NM_001011564,1.0,166,A,NP_001011564,1.0,0
4,3,45045672.0,NM_001011564,1.0,1013,C,NP_001011564,1.0,0
6,4,45045684.0,NM_001011564,1.0,1082,A,NP_001011564,1.0,0
8,5,45045690.0,NM_001011564,1.0,1244,C,NP_001011564,1.0,0
...,...,...,...,...,...,...,...,...,...
1780325,1748926,140599940.0,NM_001243541,1.0,\N,C,NP_001230470,1.0,0
1780326,1748927,145329239.0,NM_001243541,1.0,\N,C,NP_001230470,1.0,0
1780327,1748928,149512341.0,NM_001243541,1.0,\N,T,NP_001230470,1.0,0
1780328,1748929,180840662.0,NM_001243541,1.0,\N,A,NP_001230470,1.0,0


### Merging PMD with SNPs

In [42]:
pmd_with_rsids_df = pd.merge(left=pmd_df, right=dbSNP_df, how="left", on="mut_id")
print(pmd_with_rsids_df.shape)
print(pmd_with_rsids_df.columns)

(65128, 25)
Index(['mut_id', 'pmd_id', 'nr', 'crossref', 'uniprot_id', 'ensembl_id',
       'taxid', 'protein', 'mut_PMD', 'mut_real', 'wt', 'mut', 'prot_pos',
       'function_summarized', 'functional_effect', 'function', 'seq', 'snp_id',
       'mrna_acc', 'mrna_ver', 'mrna_pos', 'allele', 'protein_acc',
       'protein_ver', 'verified'],
      dtype='object')


### Downloading SNPs using rs-id list (work on this only once)

In [43]:
snp_ids = pmd_with_rsids_df[~pd.isna(pmd_with_rsids_df["snp_id"])]["snp_id"].unique() # number of rows that is mapped to snp-id: 2877
print(len(snp_ids))
snp_ids = [str(int(snp_id)) for snp_id in snp_ids]
snp_ids = " ".join(snp_ids)
snp_ids # 1. copy output and run download_snps_from_snplist (did not work) 2. manually downloaded and merged 100 rs-ids at a time.

1107


'121913562 121913566 80358221 56053615 61751374 61750146 61751402 1800555 61750646 1800548 1800553 61750639 62645940 61753034 61751392 1801466 1800552 61748536 61750152 61749450 61750135 179363882 121917869 121917867 114025668 61732239 121908737 28941471 119450941 121908522 121908524 121908529 121908523 121908525 4426527 34116584 671 145078268 1800546 77718928 78340951 118204430 151052374 4547 138840536 121918008 34605986 63751122 121909571 121909556 139392083 121912713 1130409 33956927 121912727 147210663 1801689 1801690 121912442 121912438 121912432 139629762 71581996 104894897 104894888 59962885 121913000 121913002 61726464 121913001 61726465 121434566 79184941 78311289 121913105 28928868 1138272 1695 28936396 121909308 148640446 80356482 113993946 143517122 137852783 148698650 118204082 118204067 104893747 56268439 28934905 28934907 28934904 28935468 28934906 121913560 13447332 63750376 63750129 63751438 63750570 74315448 74315447 2234916 137853096 28937597 132630303 118204443 1182

### Loading downloaded SNPS using the above rs-ids

In [44]:
snps_df = pd.read_csv(home_dir+"data/SNPdbe/snps.tsv", sep="\t", comment="#")
print(snps_df.columns)
snps_df = snps_df[['chr', 'pos', 'variation', 'variant_type', 'snp_id']]
print(snps_df.columns)
# print(pmd_snps_df.head())

variations = []
for i, tuple in enumerate(snps_df.itertuples()):
    if tuple.variant_type != "snv": # only considering SNVs
        print(tuple)
        # break
        continue
    
    ref_allele, alt_allele = tuple.variation.split(">")
    alt_alleles = alt_allele.split(",")
    # print(ref_allele, ">>>", alt_alleles)

    for alt_allele in alt_alleles:
        row = dict(snps_df.loc[i])
        row["ref_allele"] = ref_allele
        row["alt_allele"] = alt_allele
        variations.append(row)

snps_df = pd.DataFrame(variations)
snps_df = snps_df[~pd.isna(snps_df["chr"])]
snps_df = snps_df[~pd.isna(snps_df["pos"])]
snps_df = snps_df[~pd.isna(snps_df["snp_id"])]
snps_df.rename(columns={"pos": "chrom_pos", "chr": "chrom"}, inplace=True)
snps_df

Index(['chr', 'pos', 'variation', 'variant_type', 'snp_id',
       'clinical_significance', 'validation_status', 'function_class', 'gene',
       'frequency'],
      dtype='object')
Index(['chr', 'pos', 'variation', 'variant_type', 'snp_id'], dtype='object')
Pandas(Index=473, chr=nan, pos=nan, variation='G>T', variant_type=nan, snp_id=75660264)
Pandas(Index=474, chr=nan, pos=nan, variation='C>T', variant_type=nan, snp_id=76871093)
Pandas(Index=482, chr=nan, pos=nan, variation='G>T', variant_type=nan, snp_id=75660264)
Pandas(Index=483, chr=nan, pos=nan, variation='C>T', variant_type=nan, snp_id=76871093)
Pandas(Index=1242, chr='9', pos=124503218.0, variation='GC>TT', variant_type='mnv', snp_id=121918654)


Unnamed: 0,chrom,chrom_pos,variation,variant_type,snp_id,ref_allele,alt_allele
0,12,111803962.0,G>A,snv,671,G,A
1,11,67585218.0,"A>G,T",snv,1695,A,G
2,11,67585218.0,"A>G,T",snv,1695,A,T
3,8,142914761.0,A>G,snv,4547,A,G
4,14,20456995.0,"T>A,C,G",snv,1130409,T,A
...,...,...,...,...,...,...,...
2023,13,113118770.0,"T>C,G",snv,121964938,T,G
2024,10,80274538.0,"C>G,T",snv,138742870,C,G
2025,10,80274538.0,"C>G,T",snv,138742870,C,T
2026,6,160741375.0,"G>A,T",snv,181030365,G,A


In [45]:
pmd_data_df = pd.merge(left=pmd_with_rsids_df, right=snps_df, on="snp_id", how="left") # multiple rows incurred for SNPs data under same snp_id
pmd_data_df["pmd_nr_id"] = pmd_data_df["pmd_id"]+pmd_data_df["nr"].apply(lambda x: "_"+str(int(x)))
pmd_data_df

Unnamed: 0,mut_id,pmd_id,nr,crossref,uniprot_id,ensembl_id,taxid,protein,mut_PMD,mut_real,...,protein_acc,protein_ver,verified,chrom,chrom_pos,variation,variant_type,ref_allele,alt_allele,pmd_nr_id
0,168938,A000006,2,MC4R_HUMAN,P32245,ENSP00000299766,9606,Melanocortin 4 receptor (MC4R); melanocortin-4...,CYS 271 TYR,C271Y,...,NP_005903,2.0,0,18,60371538.0,"C>A,T",snv,C,A,A000006_2
1,168938,A000006,2,MC4R_HUMAN,P32245,ENSP00000299766,9606,Melanocortin 4 receptor (MC4R); melanocortin-4...,CYS 271 TYR,C271Y,...,NP_005903,2.0,0,18,60371538.0,"C>A,T",snv,C,T,A000006_2
2,168942,A000006,2,MC4R_HUMAN,P32245,ENSP00000299766,9606,Melanocortin 4 receptor (MC4R); melanocortin-4...,ASN 62 SER,N62S,...,NP_005903,2.0,0,18,60372165.0,T>C,snv,T,C,A000006_2
3,979746,A000007,1,VATA_YEAST,P17255,YDL185W,559292,Endonuclease PI-SceI; vacuolar ATP synthase ca...,LYS 112 ALA,K395A,...,,,,,,,,,,A000007_1
4,979747,A000007,1,VATA_YEAST,P17255,YDL185W,559292,Endonuclease PI-SceI; vacuolar ATP synthase ca...,LYS 124 ALA,K407A,...,,,,,,,,,,A000007_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66276,1075115,A994629,3,MAD1_HUMAN,Q05195,ENSP00000264444,\N,Fragment (Met--Glu 8-20) of Mad1 protein (alte...,GLN 10 ALA,Q3A,...,,,,,,,,,,A994629_3
66277,1075116,A994629,3,MAD1_HUMAN,Q05195,ENSP00000264444,\N,Fragment (Met--Glu 8-20) of Mad1 protein (alte...,GLN 10 ARG,Q3R,...,,,,,,,,,,A994629_3
66278,1075117,A994629,3,MAD1_HUMAN,Q05195,ENSP00000264444,\N,Fragment (Met--Glu 8-20) of Mad1 protein (alte...,TYR 18 ASP,Y11D,...,,,,,,,,,,A994629_3
66279,1015339,A994631,2,ABL1_HUMAN,P00519,ENSP00000323315,\N,Fragment (Glu--Arg 27-1130) of proto-oncogene ...,LYS 271 ARG,K245R,...,,,,,,,,,,A994631_2


In [46]:
pmd_data_df.to_csv(home_dir+"models/aa_common/datasets_pmd_analysis/pmd_data.tsv", sep="\t", index=False)
sequences_df = pmd_data_df[['pmd_nr_id', 'crossref', 'seq']].drop_duplicates(keep="first")
print("#-of sequences", sequences_df.shape)

out_fasta_filepath = home_dir+"models/aa_common/datasets_pmd_analysis/pmd_sequences.fasta"
out_fasta_file_handle = open(out_fasta_filepath, "w")

for tuple in sequences_df.itertuples():
    out_fasta_file_handle.write(f">{tuple.pmd_nr_id} | {tuple.crossref}\n")
    out_fasta_file_handle.write(f"{tuple.seq}\n")
    # break
out_fasta_file_handle.close()  

#-of sequences (11714, 3)


In [47]:
print(pmd_data_df["functional_effect"].value_counts())
print(pmd_data_df[pd.isna(pmd_data_df["mut_real"])].shape)
print(pmd_data_df[pd.isna(pmd_data_df["function"])].shape)
print(pmd_data_df[pd.isna(pmd_data_df["functional_effect"])].shape)
print("#-snps", pmd_data_df[~pd.isna(pmd_data_df["snp_id"])].shape) # number of rows that is mapped to snp-id: 2877
print("#-human (crossref)", pmd_data_df[pmd_data_df["crossref"].apply(lambda x: True if "HUMAN" in x else False)].shape) # number of human entries: 20594
print("#-human (taxid)", pmd_data_df[pmd_data_df["taxid"]=="9606"].shape) # # number of human entries using taxid: 16088
pmd_data_df[pmd_data_df["crossref"].apply(lambda x: True if "HUMAN" in x else False)][['pmd_id', 'nr']].drop_duplicates(keep="first")
# print(merged_df[(merged_df["crossref"].apply(lambda x: True if "HUMAN" in x else False)) & ~pd.isna(merged_df["snp_id"])].shape) # number of human entries that is mapped to rs-id: 2829

Effect       31136
Knock-out    18273
No-effect    16872
Name: functional_effect, dtype: int64
(0, 32)
(0, 32)
(0, 32)
#-snps (2868, 32)
#-human (crossref) (18789, 32)
#-human (taxid) (14914, 32)


Unnamed: 0,pmd_id,nr
0,A000006,2
65,A000050,1
79,A000077,1
88,A000082,1
97,A000091,1
...,...,...
66266,A994620,1
66268,A994627,3
66272,A994629,3
66279,A994631,2
