In [1]:
import os
import sys
# Path of the repository root relative to this notebook.
home_dir = "../"
# Absolute form of that path; appended to sys.path (once) so modules under it can be imported.
module_path = os.path.abspath(home_dir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

import pandas as pd
from typing import List
# see notes/mysql for where and how the data [SNPs_dbSNP, SNPs_PMD and seqs_PMD] was downloaded

In [2]:
# extract functional annotation
def summarize_functional_effect(mut_function):
    """Collect the effect tokens found inside square brackets of an annotation.

    The input is converted with ``str()`` and scanned character by character.
    While inside a ``[...]`` group:

    * a run of ``+``/``-`` characters (each one followed by ``+``, ``-``, ``]``
      or a space) is accumulated and emitted as one token when the group closes,
      e.g. ``"[--]"`` -> ``"--"``;
    * every ``=`` emits the token ``"="``;
    * a ``0`` immediately followed by ``]`` emits the token ``"0"``.

    Args:
        mut_function: annotation text (any object; it is stringified first).

    Returns:
        A list of token strings in the order they were emitted, or the empty
        string ``""`` when no token was found.
    """
    text = str(mut_function)
    effects = []
    effect = ""
    inside_bracket = False
    last_index = len(text) - 1
    for i, ch in enumerate(text):
        if ch == '[':
            inside_bracket = True
            continue

        # Look-ahead character; "" at the end of the text so that an unterminated
        # group such as "[--" or "[0" no longer raises IndexError.
        next_ch = text[i + 1] if i < last_index else ""

        if inside_bracket and ch in ('-', '+') and next_ch in ('-', '+', ']', ' '):
            effect += ch
        elif inside_bracket and ch == '=':
            effects.append('=')
        elif inside_bracket and ch == '0' and next_ch == "]":
            effects.append('0')

        if ch == "]":
            # Closing the group flushes the accumulated +/- run, if any.
            inside_bracket = False
            if len(effect) > 0:
                effects.append(effect)
            effect = ""

    # Callers test len(result) > 0, so "" is kept as the "nothing found" value.
    if len(effects) > 0:
        return effects
    return ""

def get_functional_effect(effects):
    """Reduce a collection of effect tokens to one categorical label.

    Args:
        effects: iterable of token strings (e.g. ``"="``, ``"0"``, ``"+"``,
            ``"--"``). An empty iterable, including ``""``, is accepted.

    Returns:
        * ``"Knock-out"`` as soon as a token containing ``"0"`` is seen;
        * ``"No-effect"`` when only tokens containing ``"="`` were seen;
        * ``"Effect"`` / ``"Knock-out"`` for the remaining tokens, depending on
          whether the largest ``len(token) - 2`` is below 3 or not;
        * ``""`` when ``effects`` is empty (previously this raised
          ``ValueError`` from ``max()`` on an empty list).
    """
    counts = []
    saw_no_effect = False
    for ef in effects:
        if "0" in ef:
            return "Knock-out"
        elif "=" in ef:
            saw_no_effect = True
        else:
            # NOTE(review): the "- 2" looks like it assumes the token still carries its
            # surrounding brackets ("[---]"), while summarize_functional_effect emits
            # bracket-less tokens ("---") - confirm the intended threshold.
            counts.append(len(ef) - 2)

    if len(counts) == 0:
        # Nothing scored: either only "=" tokens were present, or no tokens at all.
        return "No-effect" if saw_no_effect else ""
    if max(counts) < 3:
        return "Effect"
    return "Knock-out"

In [3]:
# loading SNPs_PMD table
# The file is delimited by the two-character sequence "\t;". A multi-character separator is only
# handled by the python parsing engine, so it is requested explicitly instead of relying on the
# implicit fallback (which emits a ParserWarning).
raw_pmd_df = pd.read_csv(home_dir+"data/SNPdbe/SNPs_PMD.csv", sep="\t;", engine="python") # PMD: from SNPdbe # (127565, 48)
# dropping rows whose "function" field is the literal \N (presumably the database NULL marker - confirm)
raw_pmd_df = raw_pmd_df[(raw_pmd_df["function"]!=";\\N") & (raw_pmd_df["function"]!="\\N")]  # (72738, 48)
# removing rows that do not have functional effect annotations, i.e. no "[" in the text;
# na=False treats missing values as "no annotation" instead of raising a TypeError
raw_pmd_df = raw_pmd_df[raw_pmd_df["function"].str.contains("[", regex=False, na=False)] # (65128, 48)
# print(raw_pmd_df.shape) 

# raw_pmd_df = raw_pmd_df.copy(deep=True)
# raw_pmd_df["wt"] = raw_pmd_df["mut_real"].apply(lambda x: x[0]) # 1-letter amino acid
# raw_pmd_df["mut"] = raw_pmd_df["mut_real"].apply(lambda x: x[-1])
# raw_pmd_df["prot_pos"] = raw_pmd_df["mut_real"].apply(lambda x: int(x[1:-1])) # mutation position is 1-indexed

# raw_pmd_df["function_summarized"] = raw_pmd_df["function"].apply(summarize_functional_effect)
# raw_pmd_df = raw_pmd_df[raw_pmd_df["function_summarized"].apply(lambda x:  len(x)>0)] # removing rows that do not have any functional effect annotations (64750, 50)
# raw_pmd_df["functional_effect"] = raw_pmd_df["function_summarized"].apply(get_functional_effect)

print(raw_pmd_df.shape) 
print(raw_pmd_df.columns)

  raw_pmd_df = pd.read_csv(home_dir+"data/SNPdbe/SNPs_PMD.csv", sep="\t;") # PMD: from SNPdbe # (127565, 48)


(65128, 48)
Index(['mut_id', 'md5', 'pmd_id', 'nr', 'authors', 'journal', 'title',
       'medline', 'crossref', 'uniprot_id', 'ensembl_id', 'other_ref',
       'protein', 'source', 'expression_sys', 'mut_PMD', 'mut_real',
       'function', 'fb', 'structure', 'strB', 'stability', 'staB',
       'expression', 'eB', 'transport', 'tB', 'maturation', 'mB', 'disease',
       'dB', 'uni_real', 'uni_realid', 'uni_start', 'uni_finish', 'uniq_start',
       'uniq_finish', 'uni_loc', 'ens_real', 'ens_organism', 'ens_start',
       'ens_finish', 'ensq_start', 'ensq_finish', 'ens_loc', 'pos_real',
       'mt_real', 'taxid'],
      dtype='object')


In [45]:
### PMD data merging with gene information
# gene_df = pd.read_csv(home_dir+"data/SNPdbe/md52keywords.tsv", sep="\t")
# print(gene_df.shape) 
# print(gene_df.columns)
# gene_df = gene_df[(gene_df["gene_SP"]!=";\\N") & (gene_df["gene_SP"]!="\\N") & (~pd.isna(gene_df["gene_SP"]))] # (147349, 4)
# print(gene_df.shape) 
# print(gene_df.columns)
# pmd_df = pd.merge(left=pmd_df, right=gene_df, on="md5", how="left")
# print(pmd_df.shape) 
# print(pmd_df.columns)

# print(pmd_df["gene_SP"].unique().shape)
# pmd_df["gene_SP"].value_counts()

In [46]:
### PMD data merging with corresponding sequences
# seq_pmd_df = pd.read_csv(home_dir+"data/SNPdbe/seqs_PMD.csv", sep="\t;") # PMD: from SNPdbe # (127565, 48)
# seq_pmd_df.drop(columns=["taxid", "md5"], inplace=True)
# print(seq_pmd_df.shape) 
# print(seq_pmd_df.columns)

# pmd_df = pd.merge(left=pmd_df, right=seq_pmd_df, on=["pmd_id", "nr"], how="inner")
# # pmd_columns = ["mut_id", "pmd_id", "nr", "crossref", "uniprot_id", "ensembl_id", "taxid", "protein", "mut_PMD", "mut_real", 'wt', 'mut', 'prot_pos', 'function_summarized', 'functional_effect', "function", "seq"]
# # pmd_df = pmd_df[pmd_columns]
# print(pmd_df.shape) 
# print(pmd_df.columns)
# pmd_df[["pmd_id", "mut_real", "seq"]]

# # seq = seq_pmd_df[(seq_pmd_df["pmd_id"]=="A000006") & (seq_pmd_df["nr"]==2)]["seq"].values[0]
# # print(seq[270])
# # pmd_df.iloc[0]

In [4]:
# loading PMD SNPs_dbSNP table
# engine="python" is requested explicitly: the multi-character separator "\t;" is not supported by
# the default C engine and would otherwise trigger a ParserWarning on the implicit fallback.
dbSNP_df = pd.read_csv(home_dir+"data/SNPdbe/SNPs_dbSNP.csv", sep="\t;", engine="python") # dbSNP: from SNPdbe
# dropping rows whose mut_id is the literal \N, so the column can be cast to int64 below
dbSNP_df = dbSNP_df[(dbSNP_df["mut_id"]!=";\\N") & (dbSNP_df["mut_id"]!="\\N")] # (1780330, 34)
dbSNP_df = dbSNP_df.drop(columns=["taxid"])

dbSNP_df = dbSNP_df.astype({'mut_id': 'int64'})
print(dbSNP_df.shape) # (1526352, 33) - one column fewer because taxid was dropped above
# print(dbSNP_df.columns)

# column description can be found here: https://www.rostlab.org/services/snpdbe/docu/schema.pdf
# protein pos is 0-indexed
dbSNP_cols = ["mut_id", "snp_id", "mrna_acc", "mrna_ver", "mrna_pos", "allele",  "protein_acc", "protein_ver", "verified"]
dbSNP_df = dbSNP_df[dbSNP_cols]
print(dbSNP_df.columns)

  dbSNP_df = pd.read_csv(home_dir+"data/SNPdbe/SNPs_dbSNP.csv", sep="\t;") # dbSNP: from SNPdbe


(1526352, 33)
Index(['mut_id', 'snp_id', 'mrna_acc', 'mrna_ver', 'mrna_pos', 'allele',
       'protein_acc', 'protein_ver', 'verified'],
      dtype='object')


In [5]:
# distinct (mut_id, snp_id) combinations; the first occurrence of each pair is kept
mutid_snpid_pair_df = dbSNP_df.loc[:, ["mut_id", "snp_id"]].drop_duplicates()
mutid_snpid_pair_df.shape

(1376581, 2)

In [6]:
# mapping PMD rows to rs-ids
# inner join on mut_id: only PMD rows that have at least one (mut_id, snp_id) pair survive
pmd_with_rsids_df = raw_pmd_df.merge(mutid_snpid_pair_df, on="mut_id", how="inner")
print(pmd_with_rsids_df.shape)
print(pmd_with_rsids_df.columns)

(1817, 49)
Index(['mut_id', 'md5', 'pmd_id', 'nr', 'authors', 'journal', 'title',
       'medline', 'crossref', 'uniprot_id', 'ensembl_id', 'other_ref',
       'protein', 'source', 'expression_sys', 'mut_PMD', 'mut_real',
       'function', 'fb', 'structure', 'strB', 'stability', 'staB',
       'expression', 'eB', 'transport', 'tB', 'maturation', 'mB', 'disease',
       'dB', 'uni_real', 'uni_realid', 'uni_start', 'uni_finish', 'uniq_start',
       'uniq_finish', 'uni_loc', 'ens_real', 'ens_organism', 'ens_start',
       'ens_finish', 'ensq_start', 'ensq_finish', 'ens_loc', 'pos_real',
       'mt_real', 'taxid', 'snp_id'],
      dtype='object')


In [7]:
### Download SNPs using rs-id list (work on this only once)
# distinct, non-missing snp_id values in order of first appearance
rsid_values = pmd_with_rsids_df["snp_id"].dropna().unique()
print(len(rsid_values))
# space-separated list of integer rs-ids
snp_ids = " ".join(str(int(rsid)) for rsid in rsid_values)
snp_ids # 1. copy output and run download_snps_from_snplist (did not work) 2. manually downloaded and merged 100 rs-ids at a time.

1192


'121913562 121913566 80358221 56053615 61751374 61750146 61751402 1800555 61750646 1800548 1800553 61750639 62645940 76157638 121909203 61753034 61751392 1801466 1800552 61748536 61750152 61749450 61750135 179363882 121917869 121917867 114025668 61732239 121908737 28941471 119450941 121908522 121908524 121908529 121908523 121908525 4426527 34116584 671 145078268 1800546 77718928 78340951 118204430 151052374 4547 145467699 138840536 121918008 34605986 63751122 121909571 121909556 139392083 121912713 1130409 33956927 121912727 147210663 1801689 1801690 121912442 121912438 121912432 139629762 71581996 104894897 104894888 59962885 121913000 121913002 61726464 121913001 61726465 121913003 121434566 79184941 78311289 121913105 28928868 1138272 1695 28936396 121909308 148640446 80356482 113993946 143517122 137852783 148698650 118204082 118204067 104893747 56268439 28934905 28934907 28934904 28935468 28934906 121913560 13447332 63750376 63750129 63751438 63750570 74315448 74315447 2234916 1378

In [8]:
### Loading downloaded SNPs using the above rs-ids
snps_df = pd.read_csv(home_dir+"data/SNPdbe/snps.tsv", sep="\t", comment="#")
print(snps_df.columns)
snp_columns = ['chr', 'pos', 'variation', 'variant_type', 'snp_id']
snps_df = snps_df[snp_columns]
print(snps_df.columns)
# print(pmd_snps_df.head())

# Expand every SNV into one record per alternative allele, e.g. "A>G,T" -> (A, G) and (A, T).
# The loop variable is deliberately not called `tuple` (that would shadow the builtin for the rest
# of the session), and each record is built from the iterated row itself rather than through a
# label lookup with the enumerate counter, which is only correct for a default RangeIndex.
variations = []
for snp_row in snps_df.itertuples():
    if snp_row.variant_type != "snv": # only considering SNVs (a missing variant_type is skipped as well)
        print(snp_row)
        continue

    ref_allele, alt_allele_field = snp_row.variation.split(">")
    # print(ref_allele, ">>>", alt_allele_field)

    for alt_allele in alt_allele_field.split(","):
        row = {col: getattr(snp_row, col) for col in snp_columns}
        row["ref_allele"] = ref_allele
        row["alt_allele"] = alt_allele
        variations.append(row)

# Explicit columns keep the frame well-formed even when no SNV record was collected.
snps_df = pd.DataFrame(variations, columns=snp_columns + ["ref_allele", "alt_allele"])
# rows without chromosome, position or rs-id are dropped before renaming
snps_df = snps_df.dropna(subset=["chr", "pos", "snp_id"])
snps_df = snps_df.rename(columns={"pos": "chrom_pos", "chr": "chrom"})
snps_df

Index(['chr', 'pos', 'variation', 'variant_type', 'snp_id',
       'clinical_significance', 'validation_status', 'function_class', 'gene',
       'frequency'],
      dtype='object')
Index(['chr', 'pos', 'variation', 'variant_type', 'snp_id'], dtype='object')
Pandas(Index=473, chr=nan, pos=nan, variation='G>T', variant_type=nan, snp_id=75660264)
Pandas(Index=474, chr=nan, pos=nan, variation='C>T', variant_type=nan, snp_id=76871093)
Pandas(Index=482, chr=nan, pos=nan, variation='G>T', variant_type=nan, snp_id=75660264)
Pandas(Index=483, chr=nan, pos=nan, variation='C>T', variant_type=nan, snp_id=76871093)
Pandas(Index=1242, chr='9', pos=124503218.0, variation='GC>TT', variant_type='mnv', snp_id=121918654)


Unnamed: 0,chrom,chrom_pos,variation,variant_type,snp_id,ref_allele,alt_allele
0,12,111803962.0,G>A,snv,671,G,A
1,11,67585218.0,"A>G,T",snv,1695,A,G
2,11,67585218.0,"A>G,T",snv,1695,A,T
3,8,142914761.0,A>G,snv,4547,A,G
4,14,20456995.0,"T>A,C,G",snv,1130409,T,A
...,...,...,...,...,...,...,...
2023,13,113118770.0,"T>C,G",snv,121964938,T,G
2024,10,80274538.0,"C>G,T",snv,138742870,C,G
2025,10,80274538.0,"C>G,T",snv,138742870,C,T
2026,6,160741375.0,"G>A,T",snv,181030365,G,A


In [9]:
# merging chromosomal information from snp_id
# (a PMD row fans out into several rows when snps_df holds multiple records for the same snp_id;
#  exact duplicate rows are removed afterwards and the index is renumbered)
pmd_data_df = (
    pmd_with_rsids_df
    .merge(snps_df, on="snp_id", how="inner")
    .drop_duplicates(ignore_index=True)
)
print(pmd_data_df.shape)
print(pmd_data_df.columns)
pmd_data_df.to_csv(home_dir+"data/pmd/pmd_with_rsids.tsv", sep="\t", index=False)
# pmd_data_df.value_counts()

(2783, 55)
Index(['mut_id', 'md5', 'pmd_id', 'nr', 'authors', 'journal', 'title',
       'medline', 'crossref', 'uniprot_id', 'ensembl_id', 'other_ref',
       'protein', 'source', 'expression_sys', 'mut_PMD', 'mut_real',
       'function', 'fb', 'structure', 'strB', 'stability', 'staB',
       'expression', 'eB', 'transport', 'tB', 'maturation', 'mB', 'disease',
       'dB', 'uni_real', 'uni_realid', 'uni_start', 'uni_finish', 'uniq_start',
       'uniq_finish', 'uni_loc', 'ens_real', 'ens_organism', 'ens_start',
       'ens_finish', 'ensq_start', 'ensq_finish', 'ens_loc', 'pos_real',
       'mt_real', 'taxid', 'snp_id', 'chrom', 'chrom_pos', 'variation',
       'variant_type', 'ref_allele', 'alt_allele'],
      dtype='object')


In [53]:
# pmd_data_df.to_csv(home_dir+"models/aa_common/datasets_pmd_analysis/pmd_data.tsv", sep="\t", index=False)
# sequences_df = pmd_data_df[['pmd_nr_id', 'crossref', 'seq']].drop_duplicates(keep="first")
# print("#-of sequences", sequences_df.shape)

# out_fasta_filepath = home_dir+"models/aa_common/datasets_pmd_analysis/pmd_sequences.fasta"
# out_fasta_file_handle = open(out_fasta_filepath, "w")

# for tuple in sequences_df.itertuples():
#     out_fasta_file_handle.write(f">{tuple.pmd_nr_id} | {tuple.crossref}\n")
#     out_fasta_file_handle.write(f"{tuple.seq}\n")
#     # break
# out_fasta_file_handle.close()  

In [54]:
# print(pmd_data_df["functional_effect"].value_counts())
# print(pmd_data_df[pd.isna(pmd_data_df["mut_real"])].shape)
# print(pmd_data_df[pd.isna(pmd_data_df["function"])].shape)
# print(pmd_data_df[pd.isna(pmd_data_df["functional_effect"])].shape)
# print("#-snps", pmd_data_df[~pd.isna(pmd_data_df["snp_id"])].shape) # number of rows that is mapped to snp-id: 2877
# print("#-human (crossref)", pmd_data_df[pmd_data_df["crossref"].apply(lambda x: True if "HUMAN" in x else False)].shape) # number of human entries: 20594
# print("#-human (taxid)", pmd_data_df[pmd_data_df["taxid"]=="9606"].shape) # # number of human entries using taxid: 16088
# pmd_data_df[pmd_data_df["crossref"].apply(lambda x: True if "HUMAN" in x else False)][['pmd_id', 'nr']].drop_duplicates(keep="first")
# # print(merged_df[(merged_df["crossref"].apply(lambda x: True if "HUMAN" in x else False)) & ~pd.isna(merged_df["snp_id"])].shape) # number of human entries that is mapped to rs-id: 2829

# print("#-of unique genes: ", pmd_data_df["gene_SP"].unique().shape)